Python open file not identifying break line - json

I'm trying to fix an JSON file that was badly generated, so I need to replace all the '}{' by '},{' and stuff. The problem is that Python stop recognizing '\n' as a break line so when read the files by readlines() it print all the file, the same with ASCII characteres, they are all printed in the file and not recognized as ASCII characteres.
My code
def strip_accents(text):
try:
text = unicode(text, 'utf-8')
except (TypeError, NameError): # unicode is a default on python 3
print('ai carai')
pass
text = unicodedata.normalize('NFD', text)
text = text.encode('ascii', 'ignore')
text = text.decode("utf-8")
return text
file_name = 'json/get_tweets.json'
with open(file_name, 'r') as f:
file_s = ''
for i in f.readlines():
print(i)
i = i.replace('}{','},{')
i = strip_accents(i)
file_s += i
file_s = '[' + file_s + ']'
My file is around 4GB almost impossible to print here, so there is a print.
I already tried different encondings, but no result.
Can someone help to find a solution?
EDIT: The print wasnt uploaded. Sorry.

Related

Output to CGI using Python?? Error keeps telling me missing parenthesis?

Attempting to generate using python to screen using cgi. However, when I run it from the command line, I keep getting an error stating that it is missing parenthesis on the line print "Content-type:text/html\r\n\r\n".
#!/usr/bin/python3
import urllib.request
import json
import os
link = "https://api.nasa.gov/planetary/apod?api_key....."
resp = urllib.request.urlopen(link)
data = resp.read()
print(str(data, 'utf-8'))
returnJson = json.loads(data)
img_url = returnJson['url']
title = returnJson['title']
current_date = returnJson['date']
(filename, headers) = urllib.request.urlretrieve(img_url)
img_file_name = img_url.split('/')[-1]
os.rename(filename, img_file_name)
html = """
<center>
<h1>Astronomy Picture of the Day</h1>
<img src="%s">
<p><b>%s</b></p>
</center>
""" % (img_file_name, title)
html_file_name = 'nasa_apod_%s.html' %current_date
print "Content-type:text/html\r\n\r\n" **Where it says parenthesis**
print '<html>'
print '<head>'
print '<title>Astronomy Picture of the Day</title>'
print '</head>'
print '<body>'
print '<h1>Astronomy Picture of the Day</h1>'
print '</body>'
print '</html>'
This is because you are using python 3. In python 3 print is a function not a statement. So this means you need to add parentheses around anything you print.
# this will fail in Python 3
print "Content-type:text/html\r\n\r\n"
# but this will work
print("Content-type:text/html\r\n\r\n")
As you do, earlier on with print(str(data, 'utf-8'))

Error trying to open json file [json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes]

I'm trying to open a json file using the json library in Python 3.8 but I have not succeeded.
This is my MWE:
with open(pbit_path + file_name, 'r') as f:
data = json.load(f)
print(data)
where pbit_path and file_name is the absolute path of the .json file. As an example, this is a sample of the .json file that i'm trying to open.
https://github.com/pwnaoj/desktop-tutorial/blob/master/DataModelSchema.json
Error returned
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
I have also tried using the functions loads(), dump(), dumps().
I appreciate any suggestions
Thanks in advance.
I found a solution to my problem. In principle, it is an encoding problem since the type of file I am trying to read is encoded with UCS-2, so in python
with open(file, mode='r', encoding='utf_16_le') as file:
data = file.read()
data = json.loads(data)
file.close()

How to compare a character at a specific location in a string to identify the processing path

I'm trying to identify if the first character in a .txt/string is either a "{" or a "<". Depending on which will determine how the .txt is handled.
I'm working with two systems where one takes xml and the other takes json. So, as a file comes from one system it's converted and sent to the other. I've worked out the conversion for the files if they have the correct file extension but now I'm needing to be able to identify if a file is json or xml based off the content of a .txt file. I don't know why this would occur but was asked to include it.
Best way, as far as I can tell, is based off the first character within the file. If it's "<" than it is xml, if it's "{" than it's json. I'm not aware of a character that is only in json or only in xml that I can search through and identify that way.
The code below the # txt to xml and json is searching the whole file for the string which can give false positives which is why I'm trying to look at just the first character.
start_path = 'fileLocation'
for path,dirs,files in os.walk(start_path):
for fileName in files:
filePath = os.path.join(path,fileName)
# xml2json
if re.match('.*\.xml',fileName):
with open(filePath) as x:
xStr = x.read()
jStr = json.dumps(xmltodict.parse(xStr), indent=4)
with open("jsonOutput.json", 'w') as j:
j.write(jStr)
# json2xml
elif re.match('.*\.json',fileName):
with open(filePath) as j:
jStr = j.read()
xStr = xmltodict.unparse(json.loads(jStr), pretty=True)
with open('xmlOutput.xml', 'w') as x:
x.write(xStr)
# **Where I'm Having Trouble**
# txt to xml and json
elif re.match('.*\.txt',fileName):
with open(filePath) as t:
tStr = t.read()
if 'xml' in tStr:
with open('xmlOutput.xml', 'w') as x:
x.write(tStr)
elif '{' in tStr:
with open('jsonOutput.xml', 'w') as j:
j.write(tStr)
The ideal solution would replace the 'xml' and '{' full txt search with '<' and '{' checking the first character.
Any help is greatly appreciated and thank you.
If anyone is interested, I found a solution using readline(). This reads only the first line and if '{' is found it will process as a json, if there's an '<' it will process as xml. Thanks everyone for the help.
# unk to json & xml
else:
with open(filePath) as u:
fLine = u.readline() #This is only reading the first line.
uStr = u.read()
if '<' in fLine:
time = strftime('%Y%b%d %H%M', gmtime())
fName = fileName + ' ' + time + ".xml"
with open(fName, 'w') as x:
x.write(uStr)
elif '{' in fLine:
time = strftime('%Y%b%d %H%M', gmtime())
fName = fileName + ' ' + time + ".json"
with open(fName, 'w') as j:
j.write(uStr)

Python Replace throwing errors when replacing "</html>"

I am very new to Python and I'm trying to understand and use the script from this link in Anaconda running on Python 3.5.2. I have had to change some things so that the script can run in this version of Python since it is from 2013. The script (as amended by inexperienced me) is as follows and my problem is in the try block in the line html = f.read().replace("</html>", "") + "</html>".
I simply cannot understand the reason of the + "</html>" that comes after the close parenthesis. From what I have found out on the replace() method is that it takes at least two parameters, the old character/s and the new ones. As it is, this script is jumping to the except Exception as e: and prints out a bytes-like object is required, not 'str'.
Now this is, as far as I can tell, because the reading is being done as bytes whereas the replace method takes strings. I tried to divide the line into:
html = f.read
html = str.replace("</html>", "") + "</html>"
but this throws replace() takes at least 2 arguments (1 given). I also tried changing the contents of html from bytes to str as follows
html = str(f.read(), 'utf-8')
html = str.replace("</html>", "")
but this also returns the error that replace() takes two arguments (1 given). When I removed the html = str.replace("</html>", "") + "</html>" altogether and so skipped to the soup = BeautifulSoup(html), I ended up with a warning that no parser was explicitly specified and later on an AttributeError that NoneType object has no attribute get_dictionary.
Any help about the need for the mentioned line and why it is used and how to use it would be greatly appreciated. Thank you.
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
fields = line.rstrip('\n').split('\t')
sid = fields[0]
uid = fields[1]
# url = 'http://twitter.com/%s/status/%s' % (uid, sid)
# print url
tweet = None
text = "Not Available"
if sid in cache:
text = cache[sid]
else:
try:
f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
print('URL: ', f.geturl())
# Thanks to Arturo!
# html = f.read()
html = f.read().replace("</html>", "") + "</html>"
soup = BeautifulSoup(html)
jstt = soup.find_all("p", "js-tweet-text")
tweets = list(set([x.get_text() for x in jstt]))
# print len(tweets)
# print tweets
if (len(tweets)) > 1:
continue
text = tweets[0]
cache[sid] = tweets[0]
for j in soup.find_all("input", "json-data", id="init-data"):
js = json.loads(j['value'])
if js.has_key("embedData"):
tweet = js["embedData"]["status"]
text = js["embedData"]["status"]["text"]
cache[sid] = text
break
except Exception as e:
print(e)
# except Exception as e:
continue
if tweet is not None and tweet["id_str"] != sid:
text = "Not Available"
cache[sid] = "Not Available"
text = text.replace('\n', ' ', )
text = re.sub(r'\s+', ' ', text)
# print json.dumps(tweet, indent=2)
print("\t".join(fields + [text]).encode('utf-8'))
str.replace is using replace in its static form (calling the method from the type-class str instead of an str object).
str.replace will actually need 3 arguments: the string to act on, the char or string to replace and the new char or string.
'abcd'.replace('d', 'z') is equivallent to str.replace('abcd', 'd', 'z'):
print('abcd'.replace('d', 'z'))
# abcz
print(str.replace('abcd', 'd', 'z'))
# abcz
I have accepted the solution kindly given by #DeepSpace as an answer as it helped me to realise how to overcome the problem I was facing. The code below can now execute under Python 3 if run from command prompt as follows (Please note that I executed this from Windows command prompt):
python download_tweets.py inpuot_file.tsv > output_file.tsv. The code follows:
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
fields = line.rstrip('\n').split('\t')
sid = fields[0]
uid = fields[1]
tweet = None
text = "Not Available"
if sid in cache:
text = cache[sid]
else:
try:
f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
# print('URL: ', f.geturl())
# Thanks to Arturo!
html = str.replace(str(f.read(), 'utf-8'), "</html>", "")
# html = f.read().replace("</html>", "") + "</html>" # original line
soup = BeautifulSoup(html, "lxml") # added "lxml" as it was giving warnings
jstt = soup.find_all("p", "js-tweet-text")
tweets = list(set([x.get_text() for x in jstt]))
# print(len(tweets))
if (len(tweets)) > 1:
continue
text = tweets[0]
cache[sid] = tweets[0]
for j in soup.find_all("input", "json-data", id="init-data"):
js = json.loads(j['value'])
if "embedData" in js:
# if js.has_key("embedData"): # original line
tweet = js["embedData"]["status"]
text = js["embedData"]["status"]["text"]
cache[sid] = text
break
except Exception as e:
print(e)
continue
if tweet is not None and tweet["id_str"] != sid:
text = "Not Available"
cache[sid] = "Not Available"
text = text.replace('\n', ' ', )
text = re.sub(r'\s+', ' ', text)
# print(json.dumps("dump: ", tweet, indent=2))
print(" \t ".join(fields + [text]).encode('utf-8'))

directing the output to .txt file and automatically naming the file

I'm new to python and writing code in general. I want to direct the output of a users input into a .txt file (if possible). And if possible name it after the input in line 3. Thank you for any help or advice
userName = raw_input("login = ")
print "Welcome,", userName
number = raw_input("ID number = ")
weight = raw_input("Weight = ")
Write inside a file is quite easy in Python:
f = open(number + '.txt', 'w') #create a file using the given input
f.write(userName + " " + weight)
f.close()
For further references : http://docs.python.org/2/tutorial/inputoutput.html