How to unpickle inferSent and load model? - pickle

I had working code that simply loads the InferSent model. Now it won't unpickle the model:
MODEL_PATH = "./encoder/infersent1.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
inferSent = InferSent(params_model)
print(MODEL_PATH)
inferSent.load_state_dict(torch.load(MODEL_PATH))
use_cuda = False
inferSent = inferSent.cuda() if use_cuda else inferSent
# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = './dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
inferSent.set_w2v_path(W2V_PATH)
UnpicklingError: invalid load key, '<'.

The reason for this problem is that your pickle file was not downloaded properly. The load key '<' is a telltale sign: the file begins with an HTML tag, i.e. you saved an error page instead of the model.
Check the size of your file; it should be around 160 MB. For some reason, the download links in the InferSent repo don't work. You can build your own NLI model using the train_nli.py script provided in the repository:
python train_nli.py --word_emb_path 'Your word embedding (for example GloVe/fastText)'
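As a quick sanity check before unpickling, you can verify the size and the first byte of the file; a minimal sketch (the 160 MB figure is approximate, and the path is the one from the question):
import os

MODEL_PATH = "./encoder/infersent1.pkl"
size_mb = os.path.getsize(MODEL_PATH) / 1e6
with open(MODEL_PATH, 'rb') as f:
    first_byte = f.read(1)
# A valid pickle/torch checkpoint never starts with '<'; an HTML error page does.
if first_byte == b'<' or size_mb < 100:
    raise RuntimeError("infersent1.pkl looks like a bad download (%.1f MB)" % size_mb)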


Useful way to convert a string to a dictionary using Python

I have the below string as input:
'name SP2, status Online, size 4764771 MB, free 2576353 MB, path /dev/sde, log 210 MB, port 5660, guid 7478a0141b7b9b0d005b30b0e60f3c4d, clusterUuid -8650609094877646407--116798096584060989, disks /dev/sde /dev/sdf /dev/sdg, dare 0'
I wrote a function which converts it to a dictionary using Python:
def str_2_json(string):
    str_arr = string.split(',')
    # str_arr[0] = 'name SP2'
    # str_arr[1] = ' status Online'
    json_data = {}
    for i in str_arr:
        # remove whitespace
        stripped_str = " ".join(i.split())  # i.strip()
        subarray = stripped_str.split(' ')
        # subarray[0] = 'name'
        # subarray[1] = 'SP2'
        key = subarray[0]    # key: 'name'
        value = subarray[1]  # value: 'SP2'
        json_data[key] = value
        # json_data = {'name': 'SP2'}
        # json_data = {'name': 'SP2', 'status': 'Online'}
    return json_data
The caller turns the returned dictionary into JSON (it uses jsonify).
Is there a simple/elegant way to do it better?
You can do this with a regex:
import re

def parseString(s):
    return dict(re.findall(r'(?:(\S+) ([^,]+)(?:, )?)', s))
sample = "name SP1, status Offline, size 4764771 MB, free 2406182 MB, path /dev/sdb, log 230 MB, port 5660, guid a48134c00cda2c37005b30b0e40e3ed6, clusterUuid -8650609094877646407--116798096584060989, disks /dev/sdb /dev/sdc /dev/sdd, dare 0"
parseString(sample)
Output:
{'name': 'SP1',
'status': 'Offline',
'size': '4764771 MB',
'free': '2406182 MB',
'path': '/dev/sdb',
'log': '230 MB',
'port': '5660',
'guid': 'a48134c00cda2c37005b30b0e40e3ed6',
'clusterUuid': '-8650609094877646407--116798096584060989',
'disks': '/dev/sdb /dev/sdc /dev/sdd',
'dare': '0'}
Your approach is good, except for a couple of issues:
You aren't creating JSON anything, so to avoid confusion I suggest you don't name your returned dictionary json_data or your function str_2_json. JSON, or JavaScript Object Notation, is just that -- a standard for denoting an object as text. The objects themselves have nothing to do with JSON.
You can use i.strip() instead of joining the split string (not sure why you did it this way, since you commented out i.strip()).
Some of your values contain multiple spaces (e.g. "size 4764771 MB" or "disks /dev/sde /dev/sdf /dev/sdg"). As written, your code drops everything after the second space in such strings. To avoid this, use stripped_str.split(' ', 1), which limits the number of splits.
Other than that, you could create a dictionary in one line using the dict() constructor and a generator expression:
def str_2_dict(string):
    data = dict(item.strip().split(' ', 1) for item in string.split(','))
    return data
print(str_2_dict('name SP2, status Online, size 4764771 MB, free 2576353 MB, path /dev/sde, log 210 MB, port 5660, guid 7478a0141b7b9b0d005b30b0e60f3c4d, clusterUuid -8650609094877646407--116798096584060989, disks /dev/sde /dev/sdf /dev/sdg, dare 0'))
Outputs:
{
'name': 'SP2',
'status': 'Online',
'size': '4764771 MB',
'free': '2576353 MB',
'path': '/dev/sde',
'log': '210 MB',
'port': '5660',
'guid': '7478a0141b7b9b0d005b30b0e60f3c4d',
'clusterUuid': '-8650609094877646407--116798096584060989',
'disks': '/dev/sde /dev/sdf /dev/sdg',
'dare': '0'
}
This is probably the same (practically, in terms of efficiency / time) as writing out the full loop:
def str_2_dict(string):
    data = dict()
    for item in string.split(','):
        key, value = item.strip().split(' ', 1)
        data[key] = value
    return data
Assuming these fields cannot contain internal commas, you can use re.split to both split and remove surrounding whitespace. It looks like you have different types of fields that should be handled differently. I've added a guess at a schema handler based on field names that can serve as a template for converting the various fields as needed.
And as noted elsewhere, there is no JSON here, so don't use that name.
import re

test = 'name SP2, status Online, size 4764771 MB, free 2576353 MB, path /dev/sde, log 210 MB, port 5660, guid 7478a0141b7b9b0d005b30b0e60f3c4d, clusterUuid -8650609094877646407--116798096584060989, disks /dev/sde /dev/sdf /dev/sdg, dare 0'

def decode_data(string):
    str_arr = re.split(r"\s*,\s*", string)
    data = {}
    for entry in str_arr:
        values = re.split(r"\s+", entry)
        key = values.pop(0)
        # schema processing
        if key in ("disks",):  # multivalue keys; note the comma -- ("disks") without it is just a string
            data[key] = values
        elif key in ("size", "free"):  # convert to int bytes using the 2nd value
            multiplier = {"MB": 10**6, "MiB": 2**20}  # todo: expand as needed
            data[key] = int(values[0]) * multiplier[values[1]]
        else:
            data[key] = " ".join(values)
    return data

decoded = decode_data(test)
for kv in sorted(decoded.items()):
    print(kv)
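For reference, running this on the test string prints the pairs below (sizes converted to integer bytes, disks kept as a list):
('clusterUuid', '-8650609094877646407--116798096584060989')
('dare', '0')
('disks', ['/dev/sde', '/dev/sdf', '/dev/sdg'])
('free', 2576353000000)
('guid', '7478a0141b7b9b0d005b30b0e60f3c4d')
('log', '210 MB')
('name', 'SP2')
('path', '/dev/sde')
('port', '5660')
('size', 4764771000000)
('status', 'Online')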
If the input were already a valid JSON string, you could of course just parse it directly -- but the space-delimited format in this question is not JSON, so this does not apply here:
import json
json_data = json.loads(string)

How to efficiently parse JSON data with multiple keys in Python 2.7?

I'm writing a script that will check the CVS COVID vaccine availability for cities in my state of VA. I have been successful getting the data I'm looking for, but my code is hard-coded in some areas. I'm specifically asking for help improving my code in areas 1 and 2 below:
The JSON file can be found here:
https://www.cvs.com//immunizations/covid-19-vaccine.vaccine-status.VA.json?vaccineinfo
I'm trying to access the data in the responsePayloadData key. The only way I could figure out how to do this is to make it the only key. For that reason, I deleted the other key responseMetaData:
#remove the key that we don't need
del obj['responseMetaData']
I'm also not sure how to dynamically loop through the VA items without hard-coding the number of cities I know are in the data:
for x, y in obj.items():
    for a in range(34):
Here's the full code:
import requests
import json
import time
from datetime import datetime
import urllib2

try:
    import indigo
except:
    pass

strAvail = "False"
strAvailCity = "None"

try:
    # download raw json object from CVS Virginia Website
    url = "https://www.cvs.com//immunizations/covid-19-vaccine.vaccine-status.VA.json?vaccineinfo"
    data = urllib2.urlopen(url).read().decode()
except urllib2.HTTPError, err:
    return {"error": err.reason, "error_code": err.code}

# parse json object
obj = json.loads(data)

# remove the key that we don't need
del obj['responseMetaData']

# loop through the JSON dictionary and check availability
# status options: {"Fully Booked", "Available"}
for x, y in obj.items():
    for a in range(34):
        # print('City: ' + y['data']['VA'][a]['city'])
        # print('Total Available: ' + y['data']['VA'][a]['totalAvailable'])
        # print('Percent Available: ' + y['data']['VA'][a]['pctAvailable'])
        # print('Status: ' + y['data']['VA'][a]['status'])
        # print("------------------------------")
        # If there is availability anywhere in the state, take some action.
        if y['data']['VA'][a]['status'] == "Available":
            strAvail = True
            strAvailCity = y['data']['VA'][a]['city']

# Log timestamp for this check to the JSON
now = datetime.now()
strDateTime = now.strftime("%m/%d/%Y %I:%M %p")
EDIT: Since the JSON is not available outside the US, I've pasted it below:
{"responsePayloadData":{"currentTime":"2021-02-11T14:55:00.470","data":{"VA":[{"totalAvailable":"1","city":"ABINGDON","state":"VA","pctAvailable":"0.19%","status":"Fully Booked"},{"totalAvailable":"0","city":"ALEXANDRIA","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"ARLINGTON","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"BEDFORD","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"BLACKSBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"CHARLOTTESVILLE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"CHATHAM","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"CHESAPEAKE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"1","city":"DANVILLE","state":"VA","pctAvailable":"0.19%","status":"Fully Booked"},{"totalAvailable":"2","city":"DUBLIN","state":"VA","pctAvailable":"0.39%","status":"Fully Booked"},{"totalAvailable":"0","city":"FAIRFAX","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"FREDERICKSBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"GAINESVILLE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"HAMPTON","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"HARRISONBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"LEESBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"LYNCHBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"MARTINSVILLE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"MECHANICSVILLE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"MIDLOTHIAN","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},
{"totalAvailable":"0","city":"NEWPORT NEWS","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"NORFOLK","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"PETERSBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"PORTSMOUTH","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"RICHMOND","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"ROANOKE","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},
{"totalAvailable":"0","city":"ROCKY MOUNT","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"STAFFORD","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"SUFFOLK","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},
{"totalAvailable":"0","city":"VIRGINIA BEACH","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"WARRENTON","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"WILLIAMSBURG","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"WINCHESTER","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"},{"totalAvailable":"0","city":"WOODSTOCK","state":"VA","pctAvailable":"0.00%","status":"Fully Booked"}]}},"responseMetaData":{"statusDesc":"Success","conversationId":"Id-beb5f68730b34e6aa3bbc1fd927ea12b","refId":"Id-b4a7256078789eb59b8912b4","operation":"getInventorybyCity","statusCode":"0000"}}
Regarding problem 1, you can just access the data by key. You don't need to delete the other key:
payload = obj['responsePayloadData']
For the second problem, you can just iterate over the items in the list at payload['data']['VA']:
for city in payload['data']['VA']:
    print(city)
{'city': 'ABINGDON',
'pctAvailable': '0.19%',
'state': 'VA',
'status': 'Fully Booked',
'totalAvailable': '1'}
{'city': 'ALEXANDRIA',
'pctAvailable': '0.00%',
'state': 'VA',
'status': 'Fully Booked',
'totalAvailable': '0'}
...
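Putting both fixes together, the availability check from the question becomes something like this (a sketch keeping the original variable names; no key deletion and no range(34) needed):
payload = obj['responsePayloadData']
for city in payload['data']['VA']:
    # If there is availability anywhere in the state, take some action.
    if city['status'] == "Available":
        strAvail = True
        strAvailCity = city['city']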

Fluctuating RAM in google colab while running a BERT model

I am running a simple comment-classification task on Google Colab, using DistilBERT for contextual embeddings. I use only 4000 training samples because the notebook keeps crashing.
When I run the cell that obtains the embeddings, I keep an eye on how the RAM utilisation changes. I see that it oscillates somewhere between 3 GB and 8 GB.
Shouldn't it just be increasing? Can anyone explain how this works at a lower level?
Here is my code; the last cell block is where I see the behaviour described above.
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

max_len = 80
tokenized = sample['comment_text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len)))
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
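For what it's worth, this final cell pushes all 4000 padded sequences through the model in a single forward pass, so the oscillation is likely the per-layer intermediate activations being allocated and then freed as the forward pass proceeds (with no_grad they are not retained for backprop). One way to cap the peak is to embed in mini-batches; this is a sketch, not from the original post, and the batch size of 64 is an arbitrary assumption:
with torch.no_grad():
    outputs = []
    batch_size = 64  # assumed; tune to fit the Colab instance
    for start in range(0, input_ids.size(0), batch_size):
        out = model(input_ids[start:start + batch_size],
                    attention_mask=attention_mask[start:start + batch_size])
        outputs.append(out[0])  # last hidden states for this batch
    last_hidden_states = torch.cat(outputs, dim=0)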

Python: KeyError: 'style_code' even when style_code is in my json data

I might just need fresh eyes on this, but I'm not sure why I'm getting the error KeyError: 'style_code'.
I know that style_code is in my JSON data, and when I looked up why this error occurs, it said the key cannot be found. Here is the code that writes the JSON data:
data1['task'].append({
    'profile': profiles_select.get(),
    'proxy pool': pool_combo.get(),
    'captcha': captcha,
    'task_id': num_id,
    'style_code': stylecode_entry.get(),
    'delay': int(delay_entry.get()),
    'size': size,
    'splash': splash,
    'browser': browser
})
num_id = num_id + 1
with open('tasks_ys.txt', 'w', encoding='utf-8') as outfile:
    json.dump(data1, outfile, indent=2)
As you can see, 'style_code' is clearly in there. Here is the code that gives me the error:
with open('tasks_ys.txt', 'r') as json_file:
    load_data = json.load(json_file)
    for task in load_data['task']:
        style_c = load_data['style_code']
        product_lbl = Label(ys_tasks_frame, text=style_c, bg='#1a2228', fg=fgcolor,
                            font=("Candara", 12))
        product_lbl.place(x=150, y=task_y)
        id_lbl = Label(ys_tasks_frame, text=num_t, bg='#1a2228', fg=fgcolor, font=("Candara", 12))
        id_lbl.place(x=25, y=task_y)
        num_t = num_t + 1
        task_y = task_y + 25
Can anyone point out what is wrong with the code?
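For readers hitting the same thing: the loop indexes the top-level dictionary instead of each task. load_data is {'task': [...]}, and 'style_code' lives inside each element of that list, so the lookup should read:
for task in load_data['task']:
    style_c = task['style_code']  # index the task dict, not load_data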

Is there a way to take a list of strings and create a JSON file, where both the key and value are list items?

I am creating a Python script that can read scanned and tabular PDFs, extract some important data, and insert it into a JSON file to later be implemented into a SQL database (I will also be developing the DB as a project for learning MongoDB).
Basically, my issue is that I have never worked with JSON files before, but that was the format I was recommended to output to. The scraping script works, and while the pre-processing could be a lot cleaner, for now it works. The issue I run into is that the keys and values end up in the same list, and some of the values, because they contained a decimal point, are split into two list items.
I don't really know where to start. I suppose, since I know the indexes of the list, I could easily assign keys and values, but then it may not be applicable to any other PDF; that is, the script can't be coded explicitly.
import re  # needed by cleanText below
import PyPDF2 as pdf2
import textract

filename = "TestSpec.pdf"
pdfFileObj = open(filename, 'rb')
pdfReader = pdf2.PdfFileReader(pdfFileObj)  # PdfFileReader, not pdfFileReader
num_pages = pdfReader.numPages              # numPages, not numpages
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)      # page `count`, not always page 0
    count += 1
    text += pageObj.extractText()
if text != "":
    text = text
else:
    text = textract.process(filename, method="tesseract", language="eng")

def cleanText(x):
    '''
    This function takes the byte data extracted from scanned PDFs and cleans it of all
    unnecessary data.
    Requires re
    '''
    stringedText = str(x)
    cleanText = stringedText.replace('\n', '')
    splitText = re.split(r'\W+', cleanText)
    caseingText = [word.lower() for word in splitText]
    cleanOne = [word for word in caseingText if word != 'n']
    dexStop = cleanOne.index("od260")
    dexStart = cleanOne.index("sheet")
    clean = cleanOne[dexStart + 1:dexStop]
    return clean

cleanText = cleanText(text)
This is the current output
['n21', 'feb', '2019', 'nsequence', 'lacz', 'rp', 'n5', 'gat', 'ctc', 'tac', 'cat', 'ggc', 'gca', 'cat', 'ttc', 'ccc', 'gaa', 'aag', 'tgc', '3', 'norder', 'no', '15775199', 'nref', 'no', '207335463', 'n25', 'nmole', 'dna', 'oligo', '36', 'bases', 'nproperties', 'amount', 'of', 'oligo', 'shipped', 'to', 'ntm', '50mm', 'nacl', '66', '8', 'xc2', 'xb0c', '11', '0', '32', '6', 'david', 'cook', 'ngc', 'content', '52', '8', 'd260', 'mmoles', 'kansas', 'state', 'university', 'biotechno', 'nmolecular', 'weight', '10', '965', '1', 'nnmoles']
and we want the output as JSON, set up like
{"Date | 21feb2019", "Sequence ID: | lacz-rp", "Sequence 5'-3' | gat..."}
and so on. I'm just not sure how to do that.
Here is a screenshot of the data from my sample PDF.
So, I have figured out some of this. I am still having issues grabbing the last third of the data I need without explicitly programming it in, but here is what I have so far. Once I have everything working, I will worry about optimizing and condensing it.
# for PDF reading
import PyPDF2 as pdf2
import textract
# for data preprocessing
import re
from dateutil.parser import parse
# For generating the JSON file array
import json

# This finds and opens the pdf file, reads the data, and extracts the data.
filename = "*.pdf"
pdfFileObj = open(filename, 'rb')
pdfReader = pdf2.PdfFileReader(pdfFileObj)
text = ""
pageObj = pdfReader.getPage(0)
text += pageObj.extractText()

# checks if extracted data is in string form or picture; if picture, textract reads the data.
# it then closes the pdf file
if text != "":
    text = text
else:
    text = textract.process(filename, method="tesseract", language="eng")
pdfFileObj.close()

# Converts text to string from byte data for preprocessing
stringedText = str(text)

# Removes escaped lines and replaces them with actual new lines.
formattedText = stringedText.replace('\\n', '\n').lower()

# Slices the long string into a workable piece (only contains useful data)
slice1 = formattedText[(formattedText.index("sheet") + 10): (formattedText.index("secondary") - 2)]
clean = re.sub('\n', " ", slice1)
clean2 = re.sub(' +', ' ', clean)

# Creating the PrimerData dictionary
with open("PrimerData.json", 'w') as file:
    primerDataSlice = clean[clean.index("molecular"): -1]
    primerData = re.split(": |\n", primerDataSlice)
    primerKeys = primerData[0::2]
    primerValues = primerData[1::2]
    primerDict = {"Primer Data": dict(zip(primerKeys, primerValues))}
    # Generating the JSON array "Primer Data"
    primerJSON = json.dumps(primerDict, ensure_ascii=False)
    file.write(primerJSON)

# Grabbing the date (this has just the date, so json will have to add date.)
date = re.findall(r'(\d{2}[/\- ](\d{2}|january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|august|aug|september|sep|october|oct|november|nov|december|dec)[/\- ]\d{2,4})', clean2)
Without input data it is difficult to give you working code; a minimal working example with input would help. As for JSON handling, Python dictionaries dump to JSON easily. See the examples here:
https://docs.python-guide.org/scenarios/json/
Get a JSON string from a dictionary and write it to a file; then figure out how to parse the text into a dictionary:
import json

d = {"Date": "21feb2019", "Sequence ID": "lacz-rp", "Sequence 5'-3'": "gat"}
json_data = json.dumps(d)
print(json_data)

# Write that data to a file ("output.json" is an arbitrary name)
with open("output.json", "w") as f:
    f.write(json_data)
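And as for the parsing step, a minimal sketch of the list-to-dict idea, assuming fixed token positions in the cleaned list from the question (the field map below is hypothetical and tied to that one sample, which is exactly the generality problem you mention):
# hypothetical field map: token slices read off the sample output list above
fields = {
    "Date": slice(0, 3),         # ['n21', 'feb', '2019'] (still carries the 'n' noise)
    "Sequence ID": slice(4, 6),  # ['lacz', 'rp']
}

def list_to_dict(tokens, fields):
    return {key: " ".join(tokens[sl]) for key, sl in fields.items()}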
So, I did figure this out. The problem was really that my pre-processing pulled all the data into a single list, which wasn't a great idea considering that the keys for the dictionary never change.
Here is the semi-finished result for making the dictionary and JSON file.
# Assumed imports for the names used below (the original post omits them):
# date, plus Biopython's melting-temperature, molecular-weight, and Seq helpers
# (the Alphabet/generic_dna API is from Biopython < 1.78).
from datetime import date
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import molecular_weight as mw

# Collect the sequence name
name = clean2[clean2.index("Sequence") + 11: clean2.index("Sequence") + 19]

# Collecting Shipment info
ordered = input("Who placed this order? ")
received = input("Who is receiving this order? ")
dateOrder = re.findall(
    r"(\d{2}[/\- ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[/\- ]\d{2,4})",
    clean2)
dateReceived = date.today()
refNo = clean2[clean2.index("ref.No. ") + 8: clean2.index("ref.No.") + 17]
orderNo = clean2[clean2.index("Order No.") +
                 10: clean2.index("Order No.") + 18]

# Finding and grabbing the sequence data. Storing it and then finding the
# GC content and melting temp or TM
bases = int(clean2[clean2.index("bases") - 3:clean2.index("bases") - 1])
seqList = [line for line in clean2 if re.match(r'^[AGCT]+$', line)]
sequence = "".join(i for i in seqList[:bases])

def gc_content(x):
    count = 0
    for i in x:
        if i == 'G' or i == 'C':
            count += 1
    return round((count / bases) * 100, 1)

gc = gc_content(sequence)
tm = mt.Tm_GC(sequence, Na=50)

moleWeight = round(mw(Seq(sequence, generic_dna)), 2)
dilWeight = float(clean2[clean2.index("ug/OD260:") +
                         10: clean2.index("ug/OD260:") + 14])
dilution = dilWeight * 10

primerDict = {"Primer Data": {
    "Sequence": sequence,
    "Bases": bases,
    "TM (50mM NaCl)": tm,
    "% GC content": gc,
    "Molecular weight": moleWeight,
    "ug/0D260": dilWeight,
    "Dilution volume (uL)": dilution
},
    "Shipment Info": {
        "Ref. No.": refNo,
        "Order No.": orderNo,
        "Ordered by": ordered,
        "Date of Order": dateOrder,
        "Received By": received,
        "Date Received": str(dateReceived.strftime("%d-%b-%Y"))
    }}

# Generating the JSON array "Primer Data"
with open("".join(name) + ".json", 'w') as file:
    primerJSON = json.dumps(primerDict, ensure_ascii=False)
    file.write(primerJSON)