I created a neural network that takes two 14x14 grayscale images of digits (from the MNIST database) and returns 1 if the first digit is less than or equal to the second digit, and 0 otherwise. The code runs, but the initial weights are the same on every run, even though they should be random.
Forcing the initial weights to be random with the following line in the Net class does not help:
torch.nn.init.normal_(self.layer1.weight, mean=0.0, std=0.01)
Here is the code of the "main.py" file:
import os; os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import torch
import torch.nn as nn
from dlc_practical_prologue import *
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(2*14*14, 32)
        #torch.nn.init.normal_(self.layer1.weight, mean=0.0, std=0.01)
        #self.layer2 = nn.Linear(100, 100)
        #self.layer3 = nn.Linear(100, 100)
        self.layer2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        #x = torch.relu(self.layer2(x))
        #x = torch.relu(self.layer3(x))
        x = torch.sigmoid(self.layer2(x))
        return x
if __name__ == '__main__':
    # Data initialization
    N = 1000
    train_input, train_target, train_classes, _, _, _ = generate_pair_sets(N)
    _, _, _, test_input, test_target, test_classes = generate_pair_sets(N)
    train_input = train_input.view(-1, 2*14*14)
    test_input = test_input.view(-1, 2*14*14)
    train_target = train_target.view(-1, 1)
    test_target = test_target.view(-1, 1)

    # I convert the type to torch.float32
    train_input, train_target, train_classes, test_input, test_target, test_classes = \
        train_input.type(torch.float32), train_target.type(torch.float32), train_classes.type(torch.long), \
        test_input.type(torch.float32), test_target.type(torch.float32), test_classes.type(torch.long)

    # Create the neural network
    net = Net()

    # Training
    learning_rate = 0.01
    # Use MSELoss
    loss = nn.MSELoss()
    # Use Adam optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    EPOCHS = 50

    for param in net.parameters():
        print(param)

    for epoch in range(EPOCHS):
        target_predicted = net(train_input)
        l = loss(train_target, target_predicted)  # loss = nn.MSELoss()
        #l = loss(target_predicted, train_target)
        l.backward()
        optimizer.step()
        optimizer.zero_grad()
        #print(l)

    # Testing
    total = 1000
    correct = 0
    with torch.no_grad():
        correct = (test_target == net(test_input).round()).sum()

    print("Accuracy %.2f%%" % (correct / total * 100))
Here is the code for "dlc_practical_prologue.py":
import os; os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
from torchvision import datasets

import argparse
import os
import urllib

######################################################################

parser = argparse.ArgumentParser(description='DLC prologue file for practical sessions.')

parser.add_argument('--full',
                    action='store_true', default=False,
                    help='Use the full set, can take ages (default False)')

parser.add_argument('--tiny',
                    action='store_true', default=False,
                    help='Use a very small set for quick checks (default False)')

parser.add_argument('--seed',
                    type=int, default=0,
                    help='Random seed (default 0, < 0 is no seeding)')

parser.add_argument('--cifar',
                    action='store_true', default=False,
                    help='Use the CIFAR data-set and not MNIST (default False)')

parser.add_argument('--data_dir',
                    type=str, default=None,
                    help='Where are the PyTorch data located (default $PYTORCH_DATA_DIR or \'./data\')')

# Timur's fix
parser.add_argument('-f', '--file',
                    help='quick hack for jupyter')

args = parser.parse_args()

if args.seed >= 0:
    torch.manual_seed(args.seed)

######################################################################
# The data

def convert_to_one_hot_labels(input, target):
    tmp = input.new_zeros(target.size(0), target.max() + 1)
    tmp.scatter_(1, target.view(-1, 1), 1.0)
    return tmp

def load_data(cifar=None, one_hot_labels=False, normalize=False, flatten=True):

    if args.data_dir is not None:
        data_dir = args.data_dir
    else:
        data_dir = os.environ.get('PYTORCH_DATA_DIR')
        if data_dir is None:
            data_dir = './data'

    if args.cifar or (cifar is not None and cifar):
        print('* Using CIFAR')
        cifar_train_set = datasets.CIFAR10(data_dir + '/cifar10/', train=True, download=True)
        cifar_test_set = datasets.CIFAR10(data_dir + '/cifar10/', train=False, download=True)

        train_input = torch.from_numpy(cifar_train_set.data)
        train_input = train_input.transpose(3, 1).transpose(2, 3).float()
        train_target = torch.tensor(cifar_train_set.targets, dtype=torch.int64)

        test_input = torch.from_numpy(cifar_test_set.data).float()
        test_input = test_input.transpose(3, 1).transpose(2, 3).float()
        test_target = torch.tensor(cifar_test_set.targets, dtype=torch.int64)

    else:
        print('* Using MNIST')

        ######################################################################
        # import torchvision
        # raw_folder = data_dir + '/mnist/raw/'
        # resources = [
        #     ("https://fleuret.org/dlc/data/train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
        #     ("https://fleuret.org/dlc/data/train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
        #     ("https://fleuret.org/dlc/data/t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
        #     ("https://fleuret.org/dlc/data/t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c")
        # ]
        # os.makedirs(raw_folder, exist_ok=True)
        # # download files
        # for url, md5 in resources:
        #     filename = url.rpartition('/')[2]
        #     torchvision.datasets.utils.download_and_extract_archive(url, download_root=raw_folder, filename=filename, md5=md5)
        ######################################################################

        mnist_train_set = datasets.MNIST(data_dir + '/mnist/', train=True, download=True)
        mnist_test_set = datasets.MNIST(data_dir + '/mnist/', train=False, download=True)

        train_input = mnist_train_set.data.view(-1, 1, 28, 28).float()
        train_target = mnist_train_set.targets
        test_input = mnist_test_set.data.view(-1, 1, 28, 28).float()
        test_target = mnist_test_set.targets

    if flatten:
        train_input = train_input.clone().reshape(train_input.size(0), -1)
        test_input = test_input.clone().reshape(test_input.size(0), -1)

    if args.full:
        if args.tiny:
            raise ValueError('Cannot have both --full and --tiny')
    else:
        if args.tiny:
            print('** Reduce the data-set to the tiny setup')
            train_input = train_input.narrow(0, 0, 500)
            train_target = train_target.narrow(0, 0, 500)
            test_input = test_input.narrow(0, 0, 100)
            test_target = test_target.narrow(0, 0, 100)
        else:
            print('** Reduce the data-set (use --full for the full thing)')
            train_input = train_input.narrow(0, 0, 1000)
            train_target = train_target.narrow(0, 0, 1000)
            test_input = test_input.narrow(0, 0, 1000)
            test_target = test_target.narrow(0, 0, 1000)

    print('** Use {:d} train and {:d} test samples'.format(train_input.size(0), test_input.size(0)))

    if one_hot_labels:
        train_target = convert_to_one_hot_labels(train_input, train_target)
        test_target = convert_to_one_hot_labels(test_input, test_target)

    if normalize:
        mu, std = train_input.mean(), train_input.std()
        train_input.sub_(mu).div_(std)
        test_input.sub_(mu).div_(std)

    return train_input, train_target, test_input, test_target

######################################################################

def mnist_to_pairs(nb, input, target):
    input = torch.functional.F.avg_pool2d(input, kernel_size=2)
    a = torch.randperm(input.size(0))
    a = a[:2 * nb].view(nb, 2)
    input = torch.cat((input[a[:, 0]], input[a[:, 1]]), 1)
    classes = target[a]
    target = (classes[:, 0] <= classes[:, 1]).long()
    return input, target, classes

######################################################################

def generate_pair_sets(nb):
    if args.data_dir is not None:
        data_dir = args.data_dir
    else:
        data_dir = os.environ.get('PYTORCH_DATA_DIR')
        if data_dir is None:
            data_dir = './data'

    train_set = datasets.MNIST(data_dir + '/mnist/', train=True, download=True)
    train_input = train_set.data.view(-1, 1, 28, 28).float()
    train_target = train_set.targets

    test_set = datasets.MNIST(data_dir + '/mnist/', train=False, download=True)
    test_input = test_set.data.view(-1, 1, 28, 28).float()
    test_target = test_set.targets

    return mnist_to_pairs(nb, train_input, train_target) + \
           mnist_to_pairs(nb, test_input, test_target)

######################################################################
Note that I have to add the following line of code to run it on Windows 10, while it is not necessary on Linux:
import os; os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
On Linux, too, I always get the same initial weights.
Please, can you help me?
Correct me if I'm wrong here, but only the weights of the first layer should be the same each time you run this. The thing is, when you import dlc_practical_prologue.py it contains this:

if args.seed >= 0:
    torch.manual_seed(args.seed)

which fires if the seed is >= 0 (the default is 0).

This should only initialize the first layer with the same weights for each run. Check if this is the case.
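As a quick check, here is a minimal standalone sketch (independent of the prologue file) showing that once torch.manual_seed has been called, both the default nn.Linear initialization and an explicit torch.nn.init.normal_ call draw from the same seeded global RNG, so they give identical values on every run:

import torch
import torch.nn as nn

torch.manual_seed(0)           # what the prologue does at import time
layer = nn.Linear(4, 2)        # default init uses the global RNG
print(layer.weight)

torch.nn.init.normal_(layer.weight, mean=0.0, std=0.01)  # also uses the global RNG
print(layer.weight)
# Running this script twice prints exactly the same numbers both times.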
The solution was to delete the following lines from "dlc_practical_prologue.py":

if args.seed >= 0:
    torch.manual_seed(args.seed)
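If you prefer not to edit the prologue file, two alternatives should work as well. This is only a sketch: the --seed flag and its "< 0 is no seeding" behaviour come from the argparse block quoted above, and torch.seed() is PyTorch's built-in way to re-seed the RNG non-deterministically.

# Option 1: disable seeding through the prologue's own command-line flag
#   python main.py --seed -1

# Option 2: re-seed the global RNG after the prologue has been imported,
# so that any weights created afterwards differ from run to run
import torch
from dlc_practical_prologue import *   # calls torch.manual_seed(0) at import time

torch.seed()                            # re-seed from a non-deterministic source
layer1 = torch.nn.Linear(2 * 14 * 14, 32)  # now initialized differently on every run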
I am trying to extract data with the structure shown in the code pasted below, and from it I would like to extract the X and y values for each collision energy (10 V, 20 V, ...) and keep them separated. After parsing, I ultimately plan to generate a plot for each of them from the resulting lists.
The Soup:
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-72511212-1', 'auto');
ga('send', 'pageview');
</script>
<!DOCTYPE html>
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Metabolite Chart</title>
<!-- 1. Add these JavaScript inclusions in the head of your page -->
<script src="lib/js/jquery-1.6.x/jquery-1.6.1.min.js" type="text/javascript"></script>
<script src="lib/js/misc/highcharts.src.delta.js" type="text/javascript"></script>
<!--<script type="text/javascript" src="/lib/highcharts.js"></script>-->
<script src="lib/js/misc/excanvas.compiled.js" type="text/javascript"></script>
<!--
<script src="https://code.highcharts.com/highcharts.js"></script>
<script src="https://code.highcharts.com/modules/exporting.js"></script>
-->
<!-- 2. Add the JavaScript to initialize the chart on document ready -->
<script type="text/javascript">
//alert("molI: " + "203");
//alert("molN: " + "Chenodeoxycholic acid glycine conjugate");
$(document).ready(function() {
//alert("molN: " + "Chenodeoxycholic acid glycine conjugate");
// function resetchart() {
// fireEvent(chart, 'selection', { resetSelection: true }, zoom);
// }
var count = 0;
//alert("molI: " + "203");
//var mid = 203; // Pass MID here!
var mid = "203"; // Pass
var mole = "Chenodeoxycholic acid glycine conjugate"; // Pass molecule name here!
var chart = new Highcharts.Chart({
chart: {
renderTo: 'container',
defaultSeriesType: 'column',
zoomType: 'xy',
margin: [50, 50, 200, 80]
},
title: {
text: '' + mole
},
subtitle: {
text: "MID: 203 <font color='blue'><b>Insilico predicted spectra<\/b><\/font>" },
credits: {
enabled: false
},
xAxis: {
min: 0,
// max: 200,
title: {
enabled: true,
text: 'Mass (m/z)'
},
maxZoom: 0.1,
tickPixelInterval: 100
},
yAxis: {
min: 0,
max: 100,
title: {
text: 'Intensity (%)'
}
},
legend: {
enabled: true,
showFragments: true,
showNeutrals: false,
showPeaks: false,
exclusiveSelect: true, // Turns on exclusive radio style buttons
dblClick: false,
startNumber: 0, // The default legend item when page loads
borderWidth: 1,
layout: 'vertical',
backgroundColor: '#FFFFFF',
style: {
left: '50px',
top: '300px',
bottom: 'auto'
}
},
// Tooltip HTML
tooltip: {
second: true,
neutral: false,
borderRadius: 0,
formatter: function() {
var namestr;
if (this.series.name.match(/\+/g) && !this.series.name.match("Cl"))
namestr = "Mode: <b><font size=\"4\">(+)</font></b> Collision Energy: ";
else if (this.series.name.match('-'))
namestr = "Mode: <b><font size=\"4\">(-)</font></b> Collision Energy: ";
if (!(this.series.name.match("10 V")||this.series.name.match("20 V")||this.series.name.match("40 V")))
namestr += "<b><font size=\"3\">0 V</font></b>";
else if (this.series.name.match("10 V"))
namestr += "<b><font size=\"3\">10 V</font></b>";
else if (this.series.name.match("20 V"))
namestr += "<b><font size=\"3\">20 V</font></b>";
else if (this.series.name.match("40 V"))
namestr += "<b><font size=\"3\">40 V</font></b>";
return '<center><br/> '+ namestr +'<br/>' + ' m/z: <b><font size="3">' + this.x.toFixed(4) + '</font></b> Intensity: <b><font size="3">' + parseInt(Math.abs(this.y)) + ' % </font></b></center><br/>';
},
formatter2: function() {
var namestr;
if (this.series.name.match(/\+/g))
namestr = "Mode: (+), Collision Energy: ";
else if (this.series.name.match('-'))
namestr = "Mode: (-), Collision Energy: ";
if (!(this.series.name.match("10 V")||this.series.name.match("20 V")||this.series.name.match("40 V")))
namestr += "0 V, Adduct: ";
else if (this.series.name.match("10 V"))
namestr += "10 V, Adduct: ";
else if(this.series.name.match("20 V"))
namestr += "20 V, Adduct: ";
else if (this.series.name.match("40 V"))
namestr += "40 V, Adduct: ";
return false;
}
},
plotOptions: {
column: {
pointPadding: 0.53,
// pointPadding: 0.99,
borderWidth: 0,
shadow: false
// borderColor: '#000000'
}
},
series: [{name: ' (+) 10 V [M+H]+ ',data:[{x:450.321,y:84,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:8,fragment: false},{x:432.311,y:100,fragment: false},{x:416.28,y:0,fragment: false},{x:414.3,y:24,fragment: false},{x:406.332,y:0,fragment: false},{x:404.316,y:16,fragment: false},{x:390.3,y:0,fragment: false},{x:390.264,y:0,fragment: false},{x:388.321,y:0,fragment: false},{x:386.305,y:8,fragment: false},{x:375.289,y:24,fragment: false},{x:372.29,y:0,fragment: false},{x:357.279,y:12,fragment: false},{x:347.294,y:4,fragment: false},{x:331.3,y:0,fragment: false},{x:329.284,y:4,fragment: false},{x:319.263,y:0,fragment: false},{x:301.253,y:0,fragment: false},{x:291.232,y:0,fragment: false},{x:273.221,y:0,fragment: false},{x:240.159,y:0,fragment: false},{x:158.081,y:0,fragment: false},{x:130.05,y:0,fragment: false},{x:76.0393,y:44,fragment: false},{x:74.0237,y:0,fragment: false},{x:59.0128,y:8,fragment: false},{x:58.0287,y:0,fragment: false},{x:30.0338,y:0,fragment: false},{x:28.0182,y:0,fragment: false} ]},{name: ' (+) 20 V [M+H]+ ',data:[{x:450.321,y:11.764705882353,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:35.294117647059,fragment: false},{x:432.311,y:5.8823529411765,fragment: false},{x:416.28,y:5.8823529411765,fragment: false},{x:414.3,y:58.823529411765,fragment: false},{x:404.316,y:11.764705882353,fragment: false},{x:388.321,y:5.8823529411765,fragment: false},{x:386.305,y:35.294117647059,fragment: false},{x:375.289,y:29.411764705882,fragment: false},{x:372.29,y:5.8823529411765,fragment: false},{x:357.279,y:35.294117647059,fragment: false},{x:347.294,y:17.647058823529,fragment: false},{x:333.279,y:5.8823529411765,fragment: false},{x:331.3,y:5.8823529411765,fragment: false},{x:329.284,y:17.647058823529,fragment: false},{x:327.268,y:5.8823529411765,fragment: false},{x:319.263,y:0,fragment: false},{x:315.268,y:5.8823529411765,fragment: false},{x:301.253,y:5.8823529411765,fragment: false},{x:273.221,y:5.8823529411765,fragment: false},{x:158.081,y:5.8823529411765,fragment: false},{x:130.05,y:5.8823529411765,fragment: false},{x:111.08,y:5.8823529411765,fragment: false},{x:102.019,y:0,fragment: false},{x:76.0393,y:100,fragment: false},{x:74.0237,y:5.8823529411765,fragment: false},{x:59.0128,y:23.529411764706,fragment: false},{x:58.0287,y:17.647058823529,fragment: false},{x:30.0338,y:11.764705882353,fragment: false},{x:28.0182,y:11.764705882353,fragment: false} ]},{name: ' (+) 40 V [M+H]+ ',data:[{x:416.28,y:18.181818181818,fragment: false},{x:414.3,y:100,fragment: false},{x:388.321,y:18.181818181818,fragment: false},{x:386.305,y:54.545454545455,fragment: false},{x:372.29,y:9.0909090909091,fragment: false},{x:359.294,y:9.0909090909091,fragment: false},{x:357.279,y:63.636363636364,fragment: false},{x:355.263,y:9.0909090909091,fragment: false},{x:333.279,y:0,fragment: false},{x:331.3,y:18.181818181818,fragment: false},{x:331.263,y:9.0909090909091,fragment: false},{x:329.284,y:36.363636363636,fragment: false},{x:327.268,y:9.0909090909091,fragment: false},{x:317.284,y:9.0909090909091,fragment: false},{x:315.268,y:18.181818181818,fragment: false},{x:303.268,y:9.0909090909091,fragment: false},{x:301.253,y:18.181818181818,fragment: false},{x:275.237,y:9.0909090909091,fragment: false},{x:273.221,y:18.181818181818,fragment: false},{x:261.221,y:0,fragment: false},{x:111.08,y:9.0909090909091,fragment: false},{x:97.0648,y:9.0909090909091,fragment: false},{x:76.0393,y:45.454545454545,fragment: false},{x:59.0128,y:63.636363636364,fragment: 
false},{x:58.0287,y:81.818181818182,fragment: false},{x:55.0542,y:9.0909090909091,fragment: false},{x:44.9971,y:9.0909090909091,fragment: false},{x:41.0022,y:18.181818181818,fragment: false},{x:32.0495,y:9.0909090909091,fragment: false},{x:30.0338,y:36.363636363636,fragment: false},{x:28.0182,y:54.545454545455,fragment: false} ]}]
});
});
</script>
</head>
<body style="border:0;overflow:visible">
<!-- 3. Add the container -->
<div id="container" style="width: 720px; height: 460px; margin: 0 auto">
</div>
<!-- <table align = "center" style="border-width:0; cellpadding:5; table-layout:fixed; bordercolor:'#00FF00'"> -->
<table align="center" style="border-width:0; cellpadding:5; table-layout:fixed; bordercolor:'#00FF00'">
<tr>
<td style="border-style: solid; border-color:#FFF8C6"><img align="top" alt="attention" src="img/attn.png" title="how to use spectrum"/>
<font color="red" face="helvetica,arial" size="2">
<b>Please mouse over the spectrum to view the detail information of each peak<br/>
Use left mouse button to zoom in (click and drag) and zoom out (double-click)</b></font>
</td>
</tr>
</table>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1907670-5']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</body>
</html>
The data I am trying to extract:
series: [{name: ' (+) 10 V [M+H]+ ',data:[{x:450.321,y:84,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:8,fragment: false},{x:432.311,y:100,fragment: false},{x:416.28,y:0,fragment: false},{x:414.3,y:24,fragment: false},{x:406.332,y:0,fragment: false},{x:404.316,y:16,fragment: false},{x:390.3,y:0,fragment: false},{x:390.264,y:0,fragment: false},{x:388.321,y:0,fragment: false},{x:386.305,y:8,fragment: false},{x:375.289,y:24,fragment: false},{x:372.29,y:0,fragment: false},{x:357.279,y:12,fragment: false},{x:347.294,y:4,fragment: false},{x:331.3,y:0,fragment: false},{x:329.284,y:4,fragment: false},{x:319.263,y:0,fragment: false},{x:301.253,y:0,fragment: false},{x:291.232,y:0,fragment: false},{x:273.221,y:0,fragment: false},{x:240.159,y:0,fragment: false},{x:158.081,y:0,fragment: false},{x:130.05,y:0,fragment: false},{x:76.0393,y:44,fragment: false},{x:74.0237,y:0,fragment: false},{x:59.0128,y:8,fragment: false},{x:58.0287,y:0,fragment: false},{x:30.0338,y:0,fragment: false},{x:28.0182,y:0,fragment: false} ]},{name: ' (+) 20 V [M+H]+ ',data:[{x:450.321,y:11.764705882353,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:35.294117647059,fragment: false},{x:432.311,y:5.8823529411765,fragment: false},{x:416.28,y:5.8823529411765,fragment: false},{x:414.3,y:58.823529411765,fragment: false},{x:404.316,y:11.764705882353,fragment: false},{x:388.321,y:5.8823529411765,fragment: false},{x:386.305,y:35.294117647059,fragment: false},{x:375.289,y:29.411764705882,fragment: false},{x:372.29,y:5.8823529411765,fragment: false},{x:357.279,y:35.294117647059,fragment: false},{x:347.294,y:17.647058823529,fragment: false},{x:333.279,y:5.8823529411765,fragment: false},{x:331.3,y:5.8823529411765,fragment: false},{x:329.284,y:17.647058823529,fragment: false},{x:327.268,y:5.8823529411765,fragment: false},{x:319.263,y:0,fragment: false},{x:315.268,y:5.8823529411765,fragment: false},{x:301.253,y:5.8823529411765,fragment: false},{x:273.221,y:5.8823529411765,fragment: false},{x:158.081,y:5.8823529411765,fragment: false},{x:130.05,y:5.8823529411765,fragment: false},{x:111.08,y:5.8823529411765,fragment: false},{x:102.019,y:0,fragment: false},{x:76.0393,y:100,fragment: false},{x:74.0237,y:5.8823529411765,fragment: false},{x:59.0128,y:23.529411764706,fragment: false},{x:58.0287,y:17.647058823529,fragment: false},{x:30.0338,y:11.764705882353,fragment: false},{x:28.0182,y:11.764705882353,fragment: false} ]},{name: ' (+) 40 V [M+H]+ ',data:[{x:416.28,y:18.181818181818,fragment: false},{x:414.3,y:100,fragment: false},{x:388.321,y:18.181818181818,fragment: false},{x:386.305,y:54.545454545455,fragment: false},{x:372.29,y:9.0909090909091,fragment: false},{x:359.294,y:9.0909090909091,fragment: false},{x:357.279,y:63.636363636364,fragment: false},{x:355.263,y:9.0909090909091,fragment: false},{x:333.279,y:0,fragment: false},{x:331.3,y:18.181818181818,fragment: false},{x:331.263,y:9.0909090909091,fragment: false},{x:329.284,y:36.363636363636,fragment: false},{x:327.268,y:9.0909090909091,fragment: false},{x:317.284,y:9.0909090909091,fragment: false},{x:315.268,y:18.181818181818,fragment: false},{x:303.268,y:9.0909090909091,fragment: false},{x:301.253,y:18.181818181818,fragment: false},{x:275.237,y:9.0909090909091,fragment: false},{x:273.221,y:18.181818181818,fragment: false},{x:261.221,y:0,fragment: false},{x:111.08,y:9.0909090909091,fragment: false},{x:97.0648,y:9.0909090909091,fragment: false},{x:76.0393,y:45.454545454545,fragment: false},{x:59.0128,y:63.636363636364,fragment: 
false},{x:58.0287,y:81.818181818182,fragment: false},{x:55.0542,y:9.0909090909091,fragment: false},{x:44.9971,y:9.0909090909091,fragment: false},{x:41.0022,y:18.181818181818,fragment: false},{x:32.0495,y:9.0909090909091,fragment: false},{x:30.0338,y:36.363636363636,fragment: false},{x:28.0182,y:54.545454545455,fragment: false} ]}]
I am having trouble getting to that segment of the data and extracting it in a way that is useful. I did notice it has a JSON-like structure, but I can't get at it from the soup to make use of that.
If I am not clear on what I am trying to do, please let me know.
Below is my Python script:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import xlwt
import xlrd
import requests
import re
import json

CASNUMBERS = xlrd.open_workbook("./OUTPUTFILE.xls")
CASNUMBERS_sheet = CASNUMBERS.sheet_by_index(0)

# import sqlite3
# conn = sqlite3.connect('CurationParsedData.db')
# c = conn.cursor()
#
# ##Create Table
# c.execute('''CREATE TABLE CurationParsedData(Exp_website TEXT,InSilico_website TEXT)''')
#
#
# def add_website(exp,insil):
#     c.execute("INSERT INTO CurationParsedData VALUES("+ exp ","+ insil")")

# MID = "154"
# NAMEID = "glucose"
# CASID = "492-62-6"
# KEGGID = "C00267"

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

def ESILINK(number):
    ESIlink = "https://metlin.scripps.edu/showChart.php?molid=" + number + "&h=240&collE=&Imode=p&etype=experimental"
    return ESIlink

def insilico(number):
    insilicolink = "https://metlin.scripps.edu/showChart.php?molid=" + number + "&h=240&collE=&Imode=p&etype=insilico"
    return insilicolink

def metlinsearch(NAME="", CAS="", KEGG=""):
    metlin_search = "https://metlin.scripps.edu/advanced_search_result.php?molid=&mass_min=&mass_max=&name=" + NAME + "&formula=&cas=" + CAS + "&kegg=" + KEGG + "&smilefile=&msmspeaks_min=&AminoAcid=add&drug=add&toxinEPA=add&smilesExactMatchCheckBox=false&nameExactMatchCheckBox=false"
    return metlin_search

def HMDBsearch(number):
    hmdb_search = "http://www.hmdb.ca/unearth/q?utf8=%E2%9C%93&query=" + number + "&searcher=metabolites&button="
    return hmdb_search

for items in CASNUMBERS_sheet.col_values(2, 1):
    print(items)
    hmdbsearch_link = HMDBsearch(items)
    print(hmdbsearch_link)
    metlinsearch_link = metlinsearch(CAS=items)
    metlinesearch_soup = make_soup(metlinsearch_link)
    firstMID = metlinesearch_soup.find("th", {"scope": "row"})
    allMID = metlinesearch_soup.findAll("th", {"scope": "row"})
    ESI = "NO LINK"
    INSILICO = "NO LINK"
    if firstMID != None:
        firstMID = firstMID.text
        ESI = ESILINK(firstMID)
        INSILICO = insilico(firstMID)
    if allMID != None:
        MIDlist = []
        for items in allMID:
            MIDlist.append(items.text)
    esi = ESILINK(ESI)
    sil = insilico(INSILICO)
    print(ESI)
    print(INSILICO)

    # sil_soup = make_soup(sil)
    sil_link = requests.get(INSILICO)
    sil_soup = BeautifulSoup(sil_link.text, "lxml")
    # print(sil_soup)
    series = sil_soup.findAll('script', {"type": "text/javascript"})
    series = series[3]
    info = []
    for x in series:
        info.append(str(x))
    for text in info:
        head, body, tail = text.partition('series:')
        tail = tail.replace(' ', '').replace(';', '').replace(' ', '')
        print(tail)
        json_string = tail
        parse_json = json.loads(json_string)
        print(parse_json['data'])
I was able to get the output I wanted in a few steps. It is definitely not the most optimal approach, but I first turned the tag into a string -> partitioned it to reach the part I needed -> stripped the characters that were not of interest -> ran it through a for loop to reorganize the data back into a list -> partitioned each entry into its x and y values -> converted them to float -> the values can then be used for graphing.
A very roundabout way to get the task done, but it might help someone with similar issues when trying to parse JavaScript data.
Below is the code:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import xlwt
import xlrd
import requests
import re
import json
import matplotlib.pyplot as plt

CASNUMBERS = xlrd.open_workbook("./OUTPUTFILE.xls")
CASNUMBERS_sheet = CASNUMBERS.sheet_by_index(0)

# import sqlite3
# conn = sqlite3.connect('CurationParsedData.db')
# c = conn.cursor()
#
# ##Create Table
# c.execute('''CREATE TABLE CurationParsedData(Exp_website TEXT,InSilico_website TEXT)''')
#
#
# def add_website(exp,insil):
#     c.execute("INSERT INTO CurationParsedData VALUES("+ exp ","+ insil")")

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

def ESILINK(number):
    ESIlink = "https://metlin.scripps.edu/showChart.php?molid=" + number + "&h=240&collE=&Imode=p&etype=experimental"
    return ESIlink

def insilico(number):
    insilicolink = "https://metlin.scripps.edu/showChart.php?molid=" + number + "&h=240&collE=&Imode=p&etype=insilico"
    return insilicolink

def metlinsearch(NAME="", CAS="", KEGG=""):
    metlin_search = "https://metlin.scripps.edu/advanced_search_result.php?molid=&mass_min=&mass_max=&name=" + NAME + "&formula=&cas=" + CAS + "&kegg=" + KEGG + "&smilefile=&msmspeaks_min=&AminoAcid=add&drug=add&toxinEPA=add&smilesExactMatchCheckBox=false&nameExactMatchCheckBox=false"
    return metlin_search

def HMDBsearch(number):
    hmdb_search = "http://www.hmdb.ca/unearth/q?utf8=%E2%9C%93&query=" + number + "&searcher=metabolites&button="
    return hmdb_search

for items in CASNUMBERS_sheet.col_values(2, 1):
    print(items)
    hmdbsearch_link = HMDBsearch(items)
    print(hmdbsearch_link)
    metlinsearch_link = metlinsearch(CAS=items)
    metlinesearch_soup = make_soup(metlinsearch_link)
    firstMID = metlinesearch_soup.find("th", {"scope": "row"})
    allMID = metlinesearch_soup.findAll("th", {"scope": "row"})
    ESI = "NO LINK"
    INSILICO = "NO LINK"
    if firstMID != None:
        firstMID = firstMID.text
        ESI = ESILINK(firstMID)
        INSILICO = insilico(firstMID)
    if allMID != None:
        MIDlist = []
        for items in allMID:
            MIDlist.append(items.text)
    esi = ESILINK(ESI)
    sil = insilico(INSILICO)
    print(ESI)
    print(INSILICO)

    if ESI != "NO LINK":
        sil_link = requests.get(ESI)
        sil_soup = BeautifulSoup(sil_link.text, "lxml")
        # print(sil_soup)
        series = sil_soup.findAll('script', {"type": "text/javascript"})
        series = series[3]
        info = []
        for x in series:
            info.append(str(x))
        for text in info:
            head, body, tail = text.partition('series:')
            tail = tail.replace(' ', '').replace(';', '').replace(' ', '').replace('fragment:false', '').replace('fragment:true', '').replace('\n', '').replace('[', '').replace(']', '').replace('name:', '').replace('data:', '')
            # print(tail)
            identifyers = ['{', '}']
            datalist = []
            temp = ''
            for data in tail:
                if data != identifyers:
                    temp = temp + data
                    # print(temp)
                    # print('active1')
                if data in identifyers:
                    datalist.append(temp)
                    temp = ''
                    # print("active2")
            # print(datalist)
            finallist = []
            for items in datalist:
                items = items.replace("}", '').replace('{', '').replace(',', '').replace(')', '').replace('(', '')
                if items != '':
                    finallist.append(items)
            print(finallist)
            for items in finallist:
                if items[0] == "x":
                    head, body, tail = items.partition("x:")
                    head, body, tail = tail.partition("y:")
                    xvalue = round(float(head), 3)
                    yvalue = round(float(tail), 3)
                    print("x:", xvalue, "y:", yvalue)
                else:
                    # items[1] == "-" or "+":
                    print("energy", items)