How to soup a browser response - html

I've got a program that sends a lot of requests to a website using RoboBrowser and gets the answers, but now I need to filter these answers to only the ones that don't have this string " Case Status Not Available " I tried to use beautifulsoup for it, but it is returning an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser
def rename_files():
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0] # Get the first form on the page
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
# Use BeautifulSoup to parse this data
print(browser.response.text)
souptwo = BeautifulSoup(browser.response.text)
texttwo = soup.get_text()
matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
if not matchtwo:
soupthree = BeautifulSoup(browser.response.text)
print soupthree
The error that returns is:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable

Line 87 includes an attempt to call the method findall of soup. soup was defined in line 65 where BeautifulSoup was called to parse the contents of a file. Since the error diagnostic says that soup is None this means that BeautifulSoup was unable to parse that file.

Related

Python3 save a json to a csv file from Coingeko API

I am struggling to convert a json file to a csv file. Any help would be appreciated. I am using Python3
Code
import json
import urllib.request
url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)
##parsing response
myfile=open("coingecko1.csv","w",encoding="utf8")
headers="Prices,MrkCap,TolVol \n"
myfile.write(headers)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))
print (cont)#Just to check json result
for market in cont:
prices =(cont["prices"])
market_caps = (cont["market_caps"])
total_volumes = (cont["total_volumes"])
content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
myfile.write(content)
print("job complete")
Python Result
{'prices': [[1629331200000, 45015.46554608543], [1629361933000, 44618.52978218442]], 'market_caps': [[1629331200000, 847143004614.999], [1629361933000, 837151985590.3453]], 'total_volumes': [[1629331200000, 34668999387.83819], [1629361933000, 33367392889.386738]]}
Traceback (most recent call last):
File "ma1.py", line 22, in <module>
content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
TypeError: can only concatenate list (not "str") to list
CSV Result
CSV Result
Thank You
Your JSON is nested which is list of lists. To read easily in CSV you must flatten it out
I've reformatted the code to dump to CSV. check below
import csv
import json
import urllib.request
url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))
# flatten the JSON data to read csv easily
flatten_data = {}
for key in cont:
for value in cont[key]:
if value[0] not in flatten_data:
flatten_data[value[0]] = {}
flatten_data[value[0]].update({key: value[1]})
# write csv with DictWriter
with open('coingecko1.csv', 'w', encoding='utf-8') as csvfile:
headers = ['Item', 'Prices', 'MrkCap', 'TolVol']
writer = csv.DictWriter(csvfile, fieldnames=headers)
writer.writeheader()
for k, v in flatten_data.items():
v.update({'Item': k})
# renamed the columns as required
v['Prices'] = v.pop('prices')
v['MrkCap'] = v.pop('market_caps')
v['TolVol'] = v.pop('total_volumes')
writer.writerow(v)
print("job complete")

ROS service failed to save files

I want to have a service 'save_readings' that automatically saves data from a rostopic into a file. But each time the service gets called, it doesn't save any file.
I've tried to run those saving-file code in python without using a rosservice and the code works fine.
I don't understand why this is happening.
#!/usr/bin/env python
# license removed for brevity
import rospy,numpy
from std_msgs.msg import String,Int32MultiArray,Float32MultiArray,Bool
from std_srvs.srv import Empty,EmptyResponse
import geometry_msgs.msg
from geometry_msgs.msg import WrenchStamped
import json
# import settings
pos_record = []
wrench_record = []
def ftmsg2listandflip(ftmsg):
return [ftmsg.wrench.force.x,ftmsg.wrench.force.y,ftmsg.wrench.force.z, ftmsg.wrench.torque.x,ftmsg.wrench.torque.y,ftmsg.wrench.torque.z]
def callback_pos(data):
global pos_record
pos_record.append(data.data)
def callback_wrench(data):
global wrench_record
ft = ftmsg2listandflip(data)
wrench_record.append([data.header.stamp.to_sec()] + ft)
def exp_listener():
stop_sign = False
rospy.Subscriber("stage_pos", Float32MultiArray, callback_pos)
rospy.Subscriber("netft_data", WrenchStamped, callback_wrench)
rospy.spin()
def start_read(req):
global pos_record
global wrench_record
pos_record = []
wrench_record = []
return EmptyResponse()
def save_readings(req):
global pos_record
global wrench_record
filename = rospy.get_param('save_file_name')
output_data = {'pos_list':pos_record, 'wrench_list': wrench_record }
rospy.loginfo("output_data %s",output_data)
with open(filename, 'w') as outfile: # write data to 'data.json'
print('dumping json file')
json.dump(output_data, outfile) #TODO: find out why failing to save the file.
outfile.close()
print("file saved")
rospy.sleep(2)
return EmptyResponse()
if __name__ == '__main__':
try:
rospy.init_node('lisener_node', log_level = rospy.INFO)
s_1 = rospy.Service('start_read', Empty, start_read)
s_1 = rospy.Service('save_readings', Empty, save_readings)
exp_listener()
print ('mylistener ready!')
except rospy.ROSInterruptException:
pass
Got it. I need to specify a path for the file to be saved.
save_path = '/home/user/catkin_ws/src/motionstage/'
filename = save_path + filename

How to make a python program to convert a bunch of pdfs to html?

I was trying to do a program that would read all the pdfs in a folder and convert all of them to htmls, for example file1.pdf, file2.pdf, file3.pdf then run the program and create something like file1.html, file2.html, file3.htm. Without losing the main pdf of course, untill now I only could do it to one file, I don't how to make for every file on the folder with a loop.
Here's my code:
import shlex
import subprocess
import os
import platform
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
#bin_path = 'C:\\Python27\\pdf2htmlEX\\pdf2htmlEX.exe'
#if not os.path.isfile(bin_path):
# print "Could not find %s" % bin_path
# exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (file_name, file_path)
success, output, errors = run("pdf2txt.py -o %s.html %s" %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
This is a complete solution that uses os.walk and pdf2htmlEX:
import shlex
import subprocess
import os
import platform
def run(command):
if platform.system() != 'Windows':
args = shlex.split(command)
else:
args = command
s = subprocess.Popen(args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, errors = s.communicate()
return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\Users\\Admin\\Desktop\\learningpython\\PROJECT'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdf2htmlEX-master\\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, dir_path, file_path)
success, output, errors = run("%s --dest-dir %s %s" % args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
to compile pdf2html project by https://github.com/coolwanglu/pdf2htmlEX, and system call cmd pdf2html by python

Python Replace throwing errors when replacing "</html>"

I am very new to Python and I'm trying to understand and use the script from this link in Anaconda running on Python 3.5.2. I have had to change some things so that the script can run in this version of Python since it is from 2013. The script (as amended by inexperienced me) is as follows and my problem is in the try block in the line html = f.read().replace("</html>", "") + "</html>".
I simply cannot understand the reason of the + "</html>" that comes after the close parenthesis. From what I have found out on the replace() method is that it takes at least two parameters, the old character/s and the new ones. As it is, this script is jumping to the except Exception as e: and prints out a bytes-like object is required, not 'str'.
Now this is, as far as I can tell, because the reading is being done as bytes whereas the replace method takes strings. I tried to divide the line into:
html = f.read
html = str.replace("</html>", "") + "</html>"
but this throws replace() takes at least 2 arguments (1 given). I also tried changing the contents of html from bytes to str as follows
html = str(f.read(), 'utf-8')
html = str.replace("</html>", "")
but this also returns the error that replace() takes two arguments (1 given). When I removed the html = str.replace("</html>", "") + "</html>" altogether and so skipped to the soup = BeautifulSoup(html), I ended up with a warning that no parser was explicitly specified and later on an AttributeError that NoneType object has no attribute get_dictionary.
Any help about the need for the mentioned line and why it is used and how to use it would be greatly appreciated. Thank you.
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
fields = line.rstrip('\n').split('\t')
sid = fields[0]
uid = fields[1]
# url = 'http://twitter.com/%s/status/%s' % (uid, sid)
# print url
tweet = None
text = "Not Available"
if sid in cache:
text = cache[sid]
else:
try:
f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
print('URL: ', f.geturl())
# Thanks to Arturo!
# html = f.read()
html = f.read().replace("</html>", "") + "</html>"
soup = BeautifulSoup(html)
jstt = soup.find_all("p", "js-tweet-text")
tweets = list(set([x.get_text() for x in jstt]))
# print len(tweets)
# print tweets
if (len(tweets)) > 1:
continue
text = tweets[0]
cache[sid] = tweets[0]
for j in soup.find_all("input", "json-data", id="init-data"):
js = json.loads(j['value'])
if js.has_key("embedData"):
tweet = js["embedData"]["status"]
text = js["embedData"]["status"]["text"]
cache[sid] = text
break
except Exception as e:
print(e)
# except Exception as e:
continue
if tweet is not None and tweet["id_str"] != sid:
text = "Not Available"
cache[sid] = "Not Available"
text = text.replace('\n', ' ', )
text = re.sub(r'\s+', ' ', text)
# print json.dumps(tweet, indent=2)
print("\t".join(fields + [text]).encode('utf-8'))
str.replace is using replace in its static form (calling the method from the type-class str instead of an str object).
str.replace will actually need 3 arguments: the string to act on, the char or string to replace and the new char or string.
'abcd'.replace('d', 'z') is equivallent to str.replace('abcd', 'd', 'z'):
print('abcd'.replace('d', 'z'))
# abcz
print(str.replace('abcd', 'd', 'z'))
# abcz
I have accepted the solution kindly given by #DeepSpace as an answer as it helped me to realise how to overcome the problem I was facing. The code below can now execute under Python 3 if run from command prompt as follows (Please note that I executed this from Windows command prompt):
python download_tweets.py inpuot_file.tsv > output_file.tsv. The code follows:
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
fields = line.rstrip('\n').split('\t')
sid = fields[0]
uid = fields[1]
tweet = None
text = "Not Available"
if sid in cache:
text = cache[sid]
else:
try:
f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
# print('URL: ', f.geturl())
# Thanks to Arturo!
html = str.replace(str(f.read(), 'utf-8'), "</html>", "")
# html = f.read().replace("</html>", "") + "</html>" # original line
soup = BeautifulSoup(html, "lxml") # added "lxml" as it was giving warnings
jstt = soup.find_all("p", "js-tweet-text")
tweets = list(set([x.get_text() for x in jstt]))
# print(len(tweets))
if (len(tweets)) > 1:
continue
text = tweets[0]
cache[sid] = tweets[0]
for j in soup.find_all("input", "json-data", id="init-data"):
js = json.loads(j['value'])
if "embedData" in js:
# if js.has_key("embedData"): # original line
tweet = js["embedData"]["status"]
text = js["embedData"]["status"]["text"]
cache[sid] = text
break
except Exception as e:
print(e)
continue
if tweet is not None and tweet["id_str"] != sid:
text = "Not Available"
cache[sid] = "Not Available"
text = text.replace('\n', ' ', )
text = re.sub(r'\s+', ' ', text)
# print(json.dumps("dump: ", tweet, indent=2))
print(" \t ".join(fields + [text]).encode('utf-8'))

I'm using TMDB api to show movie poster but I'm getting some errors

**here is my code**
import os
import requests
CONFIG_PATTERN = 'http://api.themoviedb.org/3/search/movie?query=Monsters+University&api_key=xxx'
IMG_PATTERN = 'http://api.themoviedb.org/3/movie?query=Monsters+University&api_key=xxx'
KEY = 'xxx'
def _get_json(url):
r = requests.get(url)
return r.json()
def _download_images(urls, path='.'):
"""download all images in list 'urls' to 'path' """
for nr, url in enumerate(urls):
r = requests.get(url)
filetype = r.headers['content-type'].split('/')[-1]
filename = 'poster_{0}.{1}'.format(nr+1,filetype)
filepath = os.path.join(path, filename)
with open(filepath,'wb') as w:
w.write(r.content)
def get_poster_urls(imdbid):
""" return image urls of posters for IMDB id
returns all poster images from 'themoviedb.org'. Uses the
maximum available size.
Args:
imdbid (str): IMDB id of the movie
Returns:
list: list of urls to the images
"""
config = _get_json(CONFIG_PATTERN.format(key=KEY))
base_url = config['images']['base_url']
sizes = config['images']['poster_sizes']
def size_str_to_int(x):
return float("inf") if x == 'original' else int(x[1:])
max_size = max(sizes, key=size_str_to_int)
posters = _get_json(IMG_PATTERN.format(key=KEY,imdbid=imdbid))['posters']
poster_urls = []
for poster in posters:
rel_path = poster['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
poster_urls.append(url)
return poster_urls
def tmdb_posters(imdbid, count=None, outpath='.'):
urls = get_poster_urls(imdbid)
if count is not None:
urls = urls[:count]
_download_images(urls, outpath)
if __name__=="__main__":
tmdb_posters('tt0095016')
please format the code accordingly iam fetching the json data using TMDB api but iam getting errors
here are errors
Traceback (most recent call last):
File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 57, in <module>
tmdb_posters('tt0095016')
File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 51, in tmdb_posters
urls = get_poster_urls(imdbid)
File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 33, in get_poster_urls
base_url = config['images']['base_url']
KeyError: 'images'
Process finished with exit code 1
But iam unable to figuring out the errors it seems like everything is right but urrghh these errors are not going i have tried everything
I guess it should work if you change CONFIG_PATTERN to http://api.themoviedb.org/3/configuration?api_key=<your_key>
BTW, edited your question since you posted your API key in it.