Output to CGI using Python? Error keeps telling me missing parentheses?

I'm attempting to generate output to the screen using Python and CGI. However, when I run it from the command line, I keep getting an error stating that parentheses are missing on the line print "Content-type:text/html\r\n\r\n".
#!/usr/bin/python3
import urllib.request
import json
import os
link = "https://api.nasa.gov/planetary/apod?api_key....."
resp = urllib.request.urlopen(link)
data = resp.read()
print(str(data, 'utf-8'))
returnJson = json.loads(data)
img_url = returnJson['url']
title = returnJson['title']
current_date = returnJson['date']
(filename, headers) = urllib.request.urlretrieve(img_url)
img_file_name = img_url.split('/')[-1]
os.rename(filename, img_file_name)
html = """
<center>
<h1>Astronomy Picture of the Day</h1>
<img src="%s">
<p><b>%s</b></p>
</center>
""" % (img_file_name, title)
html_file_name = 'nasa_apod_%s.html' %current_date
print "Content-type:text/html\r\n\r\n" **Where it says parenthesis**
print '<html>'
print '<head>'
print '<title>Astronomy Picture of the Day</title>'
print '</head>'
print '<body>'
print '<h1>Astronomy Picture of the Day</h1>'
print '</body>'
print '</html>'

This is because you are using Python 3. In Python 3, print is a function, not a statement, which means you need to add parentheses around anything you print.
# this will fail in Python 3
print "Content-type:text/html\r\n\r\n"
# but this will work
print("Content-type:text/html\r\n\r\n")
As you already do earlier with print(str(data, 'utf-8')).
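For reference, here is a minimal sketch of the CGI output section with every print converted to the Python 3 function form. Printing the html block built earlier is an assumption about what you intend the body to contain:

print("Content-type:text/html\r\n\r\n")
print('<html>')
print('<head>')
print('<title>Astronomy Picture of the Day</title>')
print('</head>')
print('<body>')
print(html)  # assumption: emit the block built earlier with the image and title
print('</body>')
print('</html>')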

Related

Python open file not identifying break line

I'm trying to fix a JSON file that was badly generated, so I need to replace all occurrences of '}{' with '},{' and similar things. The problem is that Python stops recognizing '\n' as a line break, so when I read the file with readlines() it prints the whole file at once; the same happens with escaped characters, which are all printed literally in the file instead of being recognized.
My code
import unicodedata  # needed by strip_accents below

def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):  # unicode is a default on python 3
        print('ai carai')
        pass
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return text

file_name = 'json/get_tweets.json'
with open(file_name, 'r') as f:
    file_s = ''
    for i in f.readlines():
        print(i)
        i = i.replace('}{', '},{')
        i = strip_accents(i)
        file_s += i

file_s = '[' + file_s + ']'
My file is around 4 GB, almost impossible to paste here, so I included a screenshot instead.
I already tried different encodings, but with no result.
Can someone help me find a solution?
EDIT: The screenshot wasn't uploaded. Sorry.

How to make a python program to convert a bunch of pdfs to html?

I was trying to write a program that would read all the PDFs in a folder and convert all of them to HTML, for example file1.pdf, file2.pdf, file3.pdf, then run the program and create something like file1.html, file2.html, file3.html, without losing the original PDFs of course. Until now I have only managed to do it for one file; I don't know how to do it for every file in the folder with a loop.
Here's my code:
import shlex
import subprocess
import os
import platform

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
#bin_path = 'C:\\Python27\\pdf2htmlEX\\pdf2htmlEX.exe'
#if not os.path.isfile(bin_path):
#    print "Could not find %s" % bin_path
#    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (file_name, file_path)
        success, output, errors = run("pdf2txt.py -o %s.html %s" % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
This is a complete solution that uses os.walk and pdf2htmlEX:
import shlex
import subprocess
import os
import platform

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\Users\\Admin\\Desktop\\learningpython\\PROJECT'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdf2htmlEX-master\\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, dir_path, file_path)
        success, output, errors = run("%s --dest-dir %s %s" % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
Compile the pdf2htmlEX project from https://github.com/coolwanglu/pdf2htmlEX, then call the pdf2htmlEX command from Python as a system call, as shown above.
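As a side note, a rough Python 3 sketch of the same loop that passes the arguments to subprocess as a list (avoiding the manual command-string formatting) could look like this; the executable and directory paths are assumptions and should be adjusted:

import os
import subprocess

bin_path = 'C:\\Python27\\pdf2htmlEX-master\\pdf2htmlEX.exe'  # assumed location, adjust as needed
base_directory = 'C:\\PROJECT\\pdfs'                          # assumed location, adjust as needed

for dir_path, dir_names, file_names in os.walk(base_directory):
    for file_name in file_names:
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # Passing a list avoids quoting problems with spaces in paths
        result = subprocess.run([bin_path, '--dest-dir', dir_path, file_path],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            print('Could not convert %s to HTML' % file_path)
            print(result.stderr.decode(errors='replace'))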

How to soup a browser response

I've got a program that sends a lot of requests to a website using RoboBrowser and gets the responses, but now I need to filter these responses down to only the ones that don't contain the string "Case Status Not Available". I tried to use BeautifulSoup for it, but it is returning an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                # To remove the names that appear, just remove the last (\S*); to add them, add the (\S*); before it there was a \s*
                match = re.findall("PA/(\S*)", text)
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]

                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    # Use BeautifulSoup to parse this data
                    print(browser.response.text)
                    souptwo = BeautifulSoup(browser.response.text)
                    texttwo = soup.get_text()
                    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
                    if not matchtwo:
                        soupthree = BeautifulSoup(browser.response.text)
                        print soupthree
The error it returns is:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable
Line 87 includes an attempt to call soup.findall(...). BeautifulSoup has no method named findall (the method is find_all, or findAll in older code), so the attribute lookup falls back to searching the document for a <findall> tag; no such tag exists, the lookup returns None, and calling that None raises TypeError: 'NoneType' object is not callable. Note also that find_all expects a tag name and attributes rather than a raw HTML string, so the check should be written as a tag search, not as the literal '<td class="fieldData">...' string.
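For illustration, a minimal sketch of how the filter could be written with find_all; the parser choice and the exact cell markup are assumptions based on the question:

from bs4 import BeautifulSoup

souptwo = BeautifulSoup(browser.response.text, 'html.parser')  # parser choice is an assumption
# find_all takes a tag name plus attributes, not a raw HTML string
cells = souptwo.find_all('td', class_='fieldData')
status_missing = any('Case Status Not Available' in td.get_text() for td in cells)
if not status_missing:
    # keep this response: it contains a real case status
    print(souptwo.get_text())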

Python Replace throwing errors when replacing "</html>"

I am very new to Python and I'm trying to understand and use the script from this link in Anaconda running on Python 3.5.2. I have had to change some things so that the script can run in this version of Python since it is from 2013. The script (as amended by inexperienced me) is as follows and my problem is in the try block in the line html = f.read().replace("</html>", "") + "</html>".
I simply cannot understand the reason for the + "</html>" that comes after the closing parenthesis. From what I have found out about the replace() method, it takes at least two parameters: the old character(s) and the new ones. As it is, this script jumps to the except Exception as e: block and prints a bytes-like object is required, not 'str'.
Now this is, as far as I can tell, because the reading is being done as bytes whereas the replace method takes strings. I tried to divide the line into:
html = f.read
html = str.replace("</html>", "") + "</html>"
but this throws replace() takes at least 2 arguments (1 given). I also tried changing the contents of html from bytes to str as follows
html = str(f.read(), 'utf-8')
html = str.replace("</html>", "")
but this also returns the error that replace() takes two arguments (1 given). When I removed the html = str.replace("</html>", "") + "</html>" altogether and so skipped to the soup = BeautifulSoup(html), I ended up with a warning that no parser was explicitly specified and later on an AttributeError that NoneType object has no attribute get_dictionary.
Any help about the need for the mentioned line and why it is used and how to use it would be greatly appreciated. Thank you.
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket

socket.setdefaulttimeout(10)
cache = {}

for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    # url = 'http://twitter.com/%s/status/%s' % (uid, sid)
    # print url
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            print('URL: ', f.geturl())
            # Thanks to Arturo!
            # html = f.read()
            html = f.read().replace("</html>", "") + "</html>"
            soup = BeautifulSoup(html)
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print len(tweets)
            # print tweets
            if (len(tweets)) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if js.has_key("embedData"):
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            # except Exception as e:
            continue
    if tweet is not None and tweet["id_str"] != sid:
        text = "Not Available"
        cache[sid] = "Not Available"
    text = text.replace('\n', ' ', )
    text = re.sub(r'\s+', ' ', text)
    # print json.dumps(tweet, indent=2)
    print("\t".join(fields + [text]).encode('utf-8'))
str.replace is using replace in its unbound form (calling the method on the str class itself instead of on a str instance).
Called that way, str.replace actually needs 3 arguments: the string to act on, the char or string to replace, and the new char or string.
'abcd'.replace('d', 'z') is equivalent to str.replace('abcd', 'd', 'z'):
print('abcd'.replace('d', 'z'))
# abcz
print(str.replace('abcd', 'd', 'z'))
# abcz
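Applied to the original line, a minimal sketch of the fix would be the following (decoding the bytes returned by f.read() before replacing; treating the page as UTF-8 is an assumption):

raw = f.read()  # bytes
html = raw.decode('utf-8').replace("</html>", "") + "</html>"
# or, equivalently, keep everything as bytes and replace with bytes patterns:
# html = raw.replace(b"</html>", b"") + b"</html>"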
I have accepted the solution kindly given by @DeepSpace as an answer, as it helped me realise how to overcome the problem I was facing. The code below can now execute under Python 3 if run from the command prompt as follows (please note that I executed this from the Windows command prompt):
python download_tweets.py input_file.tsv > output_file.tsv. The code follows:
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket

socket.setdefaulttimeout(10)
cache = {}

for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            # print('URL: ', f.geturl())
            # Thanks to Arturo!
            html = str.replace(str(f.read(), 'utf-8'), "</html>", "")
            # html = f.read().replace("</html>", "") + "</html>"  # original line
            soup = BeautifulSoup(html, "lxml")  # added "lxml" as it was giving warnings
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print(len(tweets))
            if (len(tweets)) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if "embedData" in js:
                    # if js.has_key("embedData"):  # original line
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            continue
    if tweet is not None and tweet["id_str"] != sid:
        text = "Not Available"
        cache[sid] = "Not Available"
    text = text.replace('\n', ' ', )
    text = re.sub(r'\s+', ' ', text)
    # print(json.dumps("dump: ", tweet, indent=2))
    print(" \t ".join(fields + [text]).encode('utf-8'))

Python crashes running assemble_data.py for Flickr Style dataset

I'm trying to download the Flickr Style dataset using the assemble_data.py provided in the examples folder. However, whenever I run it, Python crashes with the error 'python quit unexpectedly'.
It seems to be related to multiprocessing and urllib. When I replace pool.map with a single-threaded loop it works, but it is very slow. Also, if I run with multiprocessing but remove urlretrieve, it seems to work too.
Answering my own question here... I resolved this by using urllib3 instead.
http = urllib3.PoolManager(10)

def download_image(args_tuple):
    "For use with multiprocessing map. Returns filename on fail."
    url, filename = args_tuple
    try:
        if not os.path.exists(filename):
            print url + ' -> ' + filename
            # Dont redirect.
            response = http.request('GET', url, redirect=False)
            with open(filename, 'wb') as f:
                f.write(response.data)
        with open(filename) as f:
            assert hashlib.sha1(f.read()).hexdigest() != MISSING_IMAGE_SHA1
        test_read_image = io.imread(filename)
        return True
    except KeyboardInterrupt:
        raise Exception()  # multiprocessing doesn't catch keyboard exceptions
    except:
        os.remove(filename)
        return False
Gist here.
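For context, this function is meant to be fed to a multiprocessing pool. A rough sketch of that driver follows; the pool size and the args_list variable are assumptions, following the pattern used by assemble_data.py:

import multiprocessing

# args_list is assumed to be a list of (url, filename) tuples built elsewhere in assemble_data.py
pool = multiprocessing.Pool(processes=10)
results = pool.map(download_image, args_list)
pool.close()
pool.join()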