Python Json creating dictionary from a text file, printing file issue - json

I was able to take a text file, read each line, create a dictionary per line, update(append) each line and store the json file. The issue is when reading the json file it will not read correctly. the error point to a storing file issue?
The text file looks like:
84.txt; Frankenstein, or the Modern Prometheus; Mary Wollstonecraft (Godwin) Shelley
98.txt; A Tale of Two Cities; Charles Dickens
...
import json
import re
path = "C:\\...\\data\\"
books = {}
books_json = {}
final_book_json ={}
file = open(path + 'books\\set_of_books.txt', 'r')
json_list = file.readlines()
open(path + 'books\\books_json.json', 'w').close() # used to clean each test
json_create = []
i = 0
for line in json_list:
line = line.replace('#', '')
line = line.replace('.txt','')
line = line.replace('\n','')
line = line.split(';', 4)
BookNumber = line[0]
BookTitle = line[1]
AuthorName = line[-1]
file
if BookNumber == ' 2701':
BookNumber = line[0]
BookTitle1 = line[1]
BookTitle2 = line[2]
AuthorName = line[3]
BookTitle = BookTitle1 + ';' + BookTitle2 # needed to combine title into one to fit dict format
books = json.dumps( {'AuthorName': AuthorName, 'BookNumber': BookNumber, 'BookTitle': BookTitle})
books_json = json.loads(books)
final_book_json.update(books_json)
with open(path + 'books\\books_json.json', 'a'
) as out_put:
json.dump(books_json, out_put)
with open(path + 'books\\books_json.json', 'r'
) as out_put:
'books\\books_json.json', 'r')]
print(json.load(out_put))
The reported error is: JSONDecodeError: Extra data: line 1 column 133
(char 132) - adding this is right between the first "}{". Not sure
how json should look in a flat-file format? The output file as seen on
an editor looks like: {"AuthorName": " Mary Wollstonecraft (Godwin)
Shelley", "BookNumber": " 84", "BookTitle": " Frankenstein, or the
Modern Prometheus"}{"AuthorName": " Charles Dickens", "BookNumber": "
98", "BookTitle": " A Tale of Two Cities"}...

I ended up changing the approach and used pandas to read the text and then spliting the single-cell input.
books = pd.read_csv(path + 'books\\set_of_books.txt', sep='\t', names =('r','t', 'a') )
#print(books.head(10))
# Function to clean the 'raw(r)' inoput data
def clean_line(cell):
...
return cell
books['r'] = books['r'].apply(clean_line)
books = books['r'].str.split(';', expand=True)

Related

Pandas DataFrame KeyError: 1

I am a beginner so cant figure a reason for the error in the following code when train.jsonl uses format like that
{"claim": "But he said if people really want to know if they have CHIP they can get a blood test that costs a few MONEYc1", "evidence": "sentenceID100037", "label": "0"}
{"claim": "This is rather a courtly formulation and would doubtless trigger further eyerolling if uttered in", "evidence": "sentenceID100038", "label": "0"}
The top part executes without problem and displays the data.
import pandas as pd
prefix = '/content/'
train_df = pd.read_json(prefix + 'train.jsonl', orient='records', lines=True)
train_df.head()
[See my Colab Notebook][https://colab.research.google.com/gist/lenyabloko/0e17ebe0f3a0e808779bc1fa95e9b24d/semeval2020-delex.ipynb]
I even tried this additional trick which explained comments about 0 column
prefix = '/content/'
train_df = pd.read_json(prefix + 'train_delex.jsonl', orient='columns')
train_df.to_csv(prefix+'train.tsv', sep='\t', index=False, header=False)
train_df = pd.read_csv(prefix + 'train.tsv', header=None)
train_df.head()
Now I see column labeled '0' instead of the original three columns {"claim": "...", "evidence": " ...", "label": "..."} from the above JSONL file (why is that?)
But when I add DataFrame code it results in error
train_df = pd.DataFrame({
'id': train_df[1],
'text': train_df[0],
'labels':train_df[2]
})
In light of the column named "0" this wouldn't work. But where did that column come from??
KeyError Traceback (most recent call last)
2 frames
<ipython-input-16-0537eda6b397> in <module>()
6
7 train_df = pd.DataFrame({
----> 8 'id': train_df[1],
9 'text': train_df[0],
10 'labels':train_df[2]
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2993 if self.columns.nlevels > 1:
2994 return self._getitem_multilevel(key)
-> 2995 indexer = self.columns.get_loc(key)
2996 if is_integer(indexer):
2997 indexer = [indexer]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
Here is the solution that worked for me:
import pandas as pd
prefix = '/content/'
test_df = pd.read_json(prefix + 'test_delex.jsonl', orient='records', lines=True)
test_df.rename(columns={'claim': 'text', 'evidence': 'id', 'label':'labels'}, inplace=True)
cols = test_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
cols = cols[-1:] + cols[:-1]
test_df = test_df[cols]
test_df.to_csv(prefix+'test.csv', sep=',', index=False, header=False)
test_df.head()
I updated my shared Colab Notebook linked in the question above

json.decoder.JSONDecodeError with Google Translate API in Python3

Using regex I extract a substring from a Sentence and then translate it multiple times but sometimes I getting an error.
My code:
from googletrans import Translator
try:
translator = Translator()
# substring Sentence: Direccion Referencia </tr>
# I want To Translate This Substring Sentence
# Address = re.search(r'(?<=Direccion).*?(?=</tr>)' ,
#new_get_htmlSource).group(0)
# Address Substring From Website
Address = re.search(r'(?<=<td>).*?(?=</td>)' , Address).group(0)
translated = translator.translate(str(Address))
Address = translated.text
except Exception as e :
exc_type , exc_obj , exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print("Error ON : ",sys._getframe().f_code.co_name + "--> " + str(e) ,
"\n" , exc_type , "\n" ,fname , "\n" , exc_tb.tb_lineno)
I expect the output of This: Reference # Translated Text
I get this error:
scraping_data--> Expecting value: line 1 column 1 (char 0)
class: json.decoder.JSONDecodeError
Scraping_Things.py
29

Is there a way to take a list of strings and create a JSON file, where both the key and value are list items?

I am creating a python script that can read scanned, and tabular .pdfs and extract some important data and insert it into a JSON to later be implemented into a SQL database (I will also be developing the DB as a project for learning MongoDB).
Basically, my issue is I have never worked with any JSON files before but that was the format I was recommended to output to. The scraping script works, the pre-processing could be a lot cleaner, but for now it works. The issue I run into is the keys, and values are in the same list, and some of the values because they had a decimal point are two different list items. Not really sure where to even start.
I don't really know where to start, I suppose since I know what the indexes of the list are I can easily assign keys and values, but then it may not be applicable to any .pdf, that is the script cannot be coded explicitly.
import PyPDF2 as pdf2
import textract
with "TestSpec.pdf" as filename:
pdfFileObj = open(filename, 'rb')
pdfReader = pdf2.pdfFileReader(pdfFileObj)
num_pages = pdfReader.numpages
count = 0
text = ""
while count < num_pages:
pageObj = pdfReader.getPage(0)
count += 1
text += pageObj.extractText()
if text != "":
text = text
else:
text = textract.process(filename, method="tesseract", language="eng")
def cleanText(x):
'''
This function takes the byte data extracted from scanned PDFs, and cleans it of all
unnessary data.
Requires re
'''
stringedText = str(x)
cleanText = stringedText.replace('\n','')
splitText = re.split(r'\W+', cleanText)
caseingText = [word.lower() for word in splitText]
cleanOne = [word for word in caseingText if word != 'n']
dexStop = cleanOne.index("od260")
dexStart = cleanOne.index("sheet")
clean = cleanOne[dexStart + 1:dexStop]
return clean
cleanText = cleanText(text)
This is the current output
['n21', 'feb', '2019', 'nsequence', 'lacz', 'rp', 'n5', 'gat', 'ctc', 'tac', 'cat', 'ggc', 'gca', 'cat', 'ttc', 'ccc', 'gaa', 'aag', 'tgc', '3', 'norder', 'no', '15775199', 'nref', 'no', '207335463', 'n25', 'nmole', 'dna', 'oligo', '36', 'bases', 'nproperties', 'amount', 'of', 'oligo', 'shipped', 'to', 'ntm', '50mm', 'nacl', '66', '8', 'xc2', 'xb0c', '11', '0', '32', '6', 'david', 'cook', 'ngc', 'content', '52', '8', 'd260', 'mmoles', 'kansas', 'state', 'university', 'biotechno', 'nmolecular', 'weight', '10', '965', '1', 'nnmoles']
and we want the output as a JSON setup like
{"Date | 21feb2019", "Sequence ID: | lacz-rp", "Sequence 5'-3' | gat..."}
and so on. Just not sure how to do that.
here is a screenshot of the data from my sample pdf
So, i have figured out some of this. I am still having issues with grabbing the last 3rd of the data i need without explicitly programming it in. but here is what i have so far. Once i have everything working then i will worry about optimizing it and condensing.
# for PDF reading
import PyPDF2 as pdf2
import textract
# for data preprocessing
import re
from dateutil.parser import parse
# For generating the JSON file array
import json
# This finds and opens the pdf file, reads the data, and extracts the data.
filename = "*.pdf"
pdfFileObj = open(filename, 'rb')
pdfReader = pdf2.PdfFileReader(pdfFileObj)
text = ""
pageObj = pdfReader.getPage(0)
text += pageObj.extractText()
# checks if extracted data is in string form or picture, if picture textract reads data.
# it then closes the pdf file
if text != "":
text = text
else:
text = textract.process(filename, method="tesseract", language="eng")
pdfFileObj.close()
# Converts text to string from byte data for preprocessing
stringedText = str(text)
# Removed escaped lines and replaced them with actual new lines.
formattedText = stringedText.replace('\\n', '\n').lower()
# Slices the long string into a workable piece (only contains useful data)
slice1 = formattedText[(formattedText.index("sheet") + 10): (formattedText.index("secondary") - 2)]
clean = re.sub('\n', " ", slice1)
clean2 = re.sub(' +', ' ', clean)
# Creating the PrimerData dictionary
with open("PrimerData.json",'w') as file:
primerDataSlice = clean[clean.index("molecular"): -1]
primerData = re.split(": |\n", primerDataSlice)
primerKeys = primerData[0::2]
primerValues = primerData[1::2]
primerDict = {"Primer Data": dict(zip(primerKeys,primerValues))}
# Generatring the JSON array "Primer Data"
primerJSON = json.dumps(primerDict, ensure_ascii=False)
file.write(primerJSON)
# Grabbing the date (this has just the date, so json will have to add date.)
date = re.findall('(\d{2}[\/\- ](\d{2}|january|jan|february|feb|march|mar|april|apr|may|may|june|jun|july|jul|august|aug|september|sep|october|oct|november|nov|december|dec)[\/\- ]\d{2,4})', clean2)
Without input data it is difficult to give you working code. A minimal working example with input would help. As for JSON handling, python dictionaries can dump to json easily. See examples here.
https://docs.python-guide.org/scenarios/json/
Get a json string from a dictionary and write to a file. Figure out how to parse the text into a dictionary.
import json
d = {"Date" : "21feb2019", "Sequence ID" : "lacz-rp", "Sequence 5'-3'" : "gat"}
json_data = json.dumps(d)
print(json_data)
# Write that data to a file
So, I did figure this out, the problem was really just that because of the way my pre-processing was pulling all the data into a single list wasn't really that great of an idea considering that the keys for the dictionary never changed.
Here is the semi-finished result for making the Dictionary and JSON file.
# Collect the sequence name
name = clean2[clean2.index("Sequence") + 11: clean2.index("Sequence") + 19]
# Collecting Shipment info
ordered = input("Who placed this order? ")
received = input("Who is receiving this order? ")
dateOrder = re.findall(
r"(\d{2}[/\- ](\d{2}|January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sep|October|Oct|November|Nov|December|Dec)[/\- ]\d{2,4})",
clean2)
dateReceived = date.today()
refNo = clean2[clean2.index("ref.No. ") + 8: clean2.index("ref.No.") + 17]
orderNo = clean2[clean2.index("Order No.") +
10: clean2.index("Order No.") + 18]
# Finding and grabbing the sequence data. Storing it and then finding the
# GC content and melting temp or TM
bases = int(clean2[clean2.index("bases") - 3:clean2.index("bases") - 1])
seqList = [line for line in clean2 if re.match(r'^[AGCT]+$', line)]
sequence = "".join(i for i in seqList[:bases])
def gc_content(x):
count = 0
for i in x:
if i == 'G' or i == 'C':
count += 1
else:
count = count
return round((count / bases) * 100, 1)
gc = gc_content(sequence)
tm = mt.Tm_GC(sequence, Na=50)
moleWeight = round(mw(Seq(sequence, generic_dna)), 2)
dilWeight = float(clean2[clean2.index("ug/OD260:") +
10: clean2.index("ug/OD260:") + 14])
dilution = dilWeight * 10
primerDict = {"Primer Data": {
"Sequence": sequence,
"Bases": bases,
"TM (50mM NaCl)": tm,
"% GC content": gc,
"Molecular weight": moleWeight,
"ug/0D260": dilWeight,
"Dilution volume (uL)": dilution
},
"Shipment Info": {
"Ref. No.": refNo,
"Order No.": orderNo,
"Ordered by": ordered,
"Date of Order": dateOrder,
"Received By": received,
"Date Received": str(dateReceived.strftime("%d-%b-%Y"))
}}
# Generating the JSON array "Primer Data"
with open("".join(name) + ".json", 'w') as file:
primerJSON = json.dumps(primerDict, ensure_ascii=False)
file.write(primerJSON)

Printing out python output to an html file

I am writing a script to print the output to an html file. I am stuck on the format of my output. Below is my code:
def printTohtml(Alist):
myfile = open('zip_files.html', 'w')
html = """<html>
<head></head>
<body><p></p>{htmlText}</body>
</html>"""
title = "Study - User - zip file - Last date modified"
myfile.write(html.format(htmlText = title))
for newL in Alist:
for j in newL:
if j == newL[-1]:
myfile.write(html.format(htmlText=j))
else:
message = j + ', '
myfile.write(html.format(htmlText = message))
myfile.close()
Alist = [['123', 'user1', 'New Compressed (zipped) Folder.zip', '05-24-17'],
['123', 'user2', 'Iam.zip', '05-19-17'], ['abcd', 'Letsee.zip', '05-22-17'],
['Here', 'whichTwo.zip', '06-01-17']]
printTohtml(Alist)
I want my output to be like this:
Study - User - zip file - Last date modified
123, user1, New Compressed (zipped) Folder.zip, 05-24-17
123, user2, Iam.zip, 05-19-17
abcd, Letsee.zip, 05-22-17
Here, whichTwo.zip, 06-01-17
But my code is giving me everything on its own line. Can anyone please help me?
Thanks in advance for your help!
My Output:
Study - User - zip file - Last date modified
123,
user1,
New Compressed (zipped) Folder.zip,
05-24-17
123,
user2,
Iam.zip,
05-19-17
abcd,
Letsee.zip,
05-22-17
Here,
whichTwo.zip,
06-01-17
You might want to try something like that. I haven't tested but this will create the string first and then write it to the file. Might be faster for avoiding multiple writes but I am not sure how python is handling that on the background.
def printTohtml(Alist):
myfile = open('zip_files.html', 'w')
html = """<html>
<head></head>
<body><p></p>{htmlText}</body>
</html>"""
title = "Study - User - zip file - Last date modified"
Alist = [title] + [", ".join(line) for line in Alist]
myfile.write(html.format(htmlText = "\n".join(Alist)))
myfile.close()
Alist = [['123', 'user1', 'New Compressed (zipped) Folder.zip', '05-24-17'],
['123', 'user2', 'Iam.zip', '05-19-17'], ['abcd', 'Letsee.zip', '05-22-17'],
['Here', 'whichTwo.zip', '06-01-17']]
printTohtml(Alist)
Your issue is that you're including the html, body and paragraph tags every time you write a line to your file.
Why don't you concatenate your string, separating the lines with <br> tags, and then load them into your file, like so:
def printTohtml(Alist):
myfile = open('zip_files.html', 'w')
html = """<html>
<head></head>
<body><p>{htmlText}</p></body>
</html>"""
complete_string = "Study - User - zip file - Last date modified"
for newL in Alist:
for j in newL:
if j == newL[-1]:
complete_string += j + "<br>"
else:
message = j + ', '
complete_string += message + "<br>"
myfile.write(html.format(htmlText = complete_string))
myfile.close()
Also, your template placeholder is in the wrong spot, it should be between your paragraph tags.

subprocess.popen returning empty string

There was an earlier question on this, but the asker was just overwriting their output and solved their own problem.
I'm using a subprocess.popen to read video information and write the output to a json. It works fine on MOST videos, but on others is returning an empty string on others - even though it runs fine from the command line. I tried it several times and am getting the data fine through the command line.
Here's the relevant part of the script:
out_prj.write('[')
for m, i in enumerate(files):
print i
out_prj.write('{"$type":"BatchProcessor.Job, BatchProcessor","Id":0,"Ver":1.02,"CurrentTask":0,"IsSelected":true,"TaskList":[')
f_name = os.path.basename(i[0])
f_json = out_folder + os.sep + "06_Output" + os.sep + os.path.basename(i[0]).split(".")[0] + ".json"
trans_f = out_folder + os.sep + "04_Video" + os.sep + os.path.basename(i[0]).split(".")[0] + "-tr.ts"
trans_f_out = out_folder + os.sep + "06_Output" + os.sep + os.path.basename(i[0]).split(".")[0] + "-tr-out.ts"
ffprobe = 'ffprobe.exe'
command = [ffprobe, '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', i[0]]
p = sp.Popen(command, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
out, err = p.communicate()
io = cStringIO.StringIO(out)
info = json.load(io)
print info
filea = open(f_json, 'w')
filea.write(json.dumps(info))
filea.close()
f = open(f_json)
b = json.load(f)
print b
#########################
###################
f_format = str(b['streams'][0]['codec_long_name'])
Your code ignores error messages (err variable). print err or don't redirect stderr to see them.
Unrelated: the json handling in your code is insane: most operations are redundant.
To save output of the subprocess to a file:
import os
from subprocess import check_call
f_json = os.path.join(out_folder, "06_Output",
os.path.splitext(f_name)[0] + ".json")
with open(f_json, 'wb', 0) as file:
check_call(command, stdout=file)
Note: shell=True is not necessary here. If subprocess can't find ffprobe.exe then specify the full path e.g. (use the path appropriate for your system):
ffprobe = r'C:\Program Files\Real\RealPlayer\RPDS\Tools\ffmpeg\ffprobe.exe'
Note: r'' -- a raw string literal is used to avoid doubling the backslashes.