Python replace() throwing errors when replacing "</html>"

I am very new to Python and I'm trying to understand and use the script from this link in Anaconda running on Python 3.5.2. I have had to change some things so that the script can run in this version of Python, since it dates from 2013. The script (as amended by inexperienced me) is below, and my problem is in the try block, in the line html = f.read().replace("</html>", "") + "</html>".
I simply cannot understand the reason for the + "</html>" that comes after the closing parenthesis. From what I have found out about the replace() method, it takes at least two parameters: the old characters and the new ones. As it is, this script jumps to the except Exception as e: and prints a bytes-like object is required, not 'str'.
Now this is, as far as I can tell, because the reading is being done as bytes whereas the replace method takes strings. I tried to split the line into:
html = f.read
html = str.replace("</html>", "") + "</html>"
but this throws replace() takes at least 2 arguments (1 given). I also tried changing the contents of html from bytes to str as follows:
html = str(f.read(), 'utf-8')
html = str.replace("</html>", "")
but this also returns the error that replace() takes at least 2 arguments (1 given). When I removed the html = str.replace("</html>", "") + "</html>" line altogether and skipped straight to soup = BeautifulSoup(html), I ended up with a warning that no parser was explicitly specified, and later on an AttributeError that 'NoneType' object has no attribute get_dictionary.
Any help on why the mentioned line is needed, what it does, and how to use it would be greatly appreciated. Thank you.
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    # url = 'http://twitter.com/%s/status/%s' % (uid, sid)
    # print url
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            print('URL: ', f.geturl())
            # Thanks to Arturo!
            # html = f.read()
            html = f.read().replace("</html>", "") + "</html>"
            soup = BeautifulSoup(html)
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print len(tweets)
            # print tweets
            if (len(tweets)) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if js.has_key("embedData"):
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            # except Exception as e:
            continue
        if tweet is not None and tweet["id_str"] != sid:
            text = "Not Available"
            cache[sid] = "Not Available"
    text = text.replace('\n', ' ', )
    text = re.sub(r'\s+', ' ', text)
    # print json.dumps(tweet, indent=2)
    print("\t".join(fields + [text]).encode('utf-8'))

str.replace is using replace in its static form (calling the method from the type-class str instead of from a str object).
str.replace actually needs 3 arguments: the string to act on, the char or string to replace, and the new char or string.
'abcd'.replace('d', 'z') is equivalent to str.replace('abcd', 'd', 'z'):
print('abcd'.replace('d', 'z'))
# abcz
print(str.replace('abcd', 'd', 'z'))
# abcz
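In Python 3 this interacts with the bytes/str split from the question: urlopen(...).read() returns bytes, and bytes.replace() wants bytes arguments, which is exactly why the original line raises "a bytes-like object is required, not 'str'". As for the + "</html>" itself: the replace strips every occurrence of the closing tag, and the concatenation appends a single one at the end, presumably so the parser always sees exactly one closing tag. A minimal sketch of the two ways to make that line work (illustrative, not from the original script):
# Option 1: stay in bytes - give replace() bytes arguments (note the b prefixes)
html = f.read().replace(b"</html>", b"") + b"</html>"

# Option 2: decode to str first, then use str arguments
html = f.read().decode('utf-8').replace("</html>", "") + "</html>"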

I have accepted the solution kindly given by @DeepSpace as an answer, as it helped me realise how to overcome the problem I was facing. The code below can now execute under Python 3 if run from the command prompt as follows (please note that I executed this from the Windows command prompt):
python download_tweets.py input_file.tsv > output_file.tsv. The code follows:
#!/usr/bin/python
import sys
import urllib.request
import re
import json
from bs4 import BeautifulSoup
import socket
socket.setdefaulttimeout(10)
cache = {}
for line in open(sys.argv[1]):
    fields = line.rstrip('\n').split('\t')
    sid = fields[0]
    uid = fields[1]
    tweet = None
    text = "Not Available"
    if sid in cache:
        text = cache[sid]
    else:
        try:
            f = urllib.request.urlopen("http://twitter.com/%s/status/%s" % (uid, sid))
            # print('URL: ', f.geturl())
            # Thanks to Arturo!
            html = str.replace(str(f.read(), 'utf-8'), "</html>", "")
            # html = f.read().replace("</html>", "") + "</html>" # original line
            soup = BeautifulSoup(html, "lxml") # added "lxml" as it was giving warnings
            jstt = soup.find_all("p", "js-tweet-text")
            tweets = list(set([x.get_text() for x in jstt]))
            # print(len(tweets))
            if (len(tweets)) > 1:
                continue
            text = tweets[0]
            cache[sid] = tweets[0]
            for j in soup.find_all("input", "json-data", id="init-data"):
                js = json.loads(j['value'])
                if "embedData" in js:
                    # if js.has_key("embedData"): # original line
                    tweet = js["embedData"]["status"]
                    text = js["embedData"]["status"]["text"]
                    cache[sid] = text
                    break
        except Exception as e:
            print(e)
            continue
        if tweet is not None and tweet["id_str"] != sid:
            text = "Not Available"
            cache[sid] = "Not Available"
    text = text.replace('\n', ' ', )
    text = re.sub(r'\s+', ' ', text)
    # print(json.dumps("dump: ", tweet, indent=2))
    print(" \t ".join(fields + [text]).encode('utf-8'))

Related

CSV read into MySQLdb failing

I am having a problem reading my csv file into the MySQL database. I have tried a number of solutions, but the errors just keep changing and the code isn't working. The same code worked with another csv file, so I'm thinking I might be doing something wrong with this one?
Here is my code
from database_access import *
from builtins import bytes, int, str
import codecs
import csv
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
import cgi
import MySQLdb
import chardet
# from database_access import *
import MySQLdb
import simplejson
if __name__ == '__main__':
    with open("SIMRA.csv", 'r') as file:
        reader = csv.reader(file)
        #reader = csv.reader(text)
        next(reader, None)
        print("project running")
        #print (row[7])
        #rowlist = []
        all_links = []
        all_project_ids = []
        for row in reader:
            if row[7] != "" and row[16] != "":
                country = row[2]
                city = row[8]
                description = row[11] + '' + row[12]
                title = row[7].replace("'", "''")
                link = row[16]
                #date_start = row[9]
                #print a check here
                print(title, description, country, city, link)
                db = MySQLdb.connect(host, username, password, database, charset='utf8')
                cursor = db.cursor()
                new_project = True
                proj_check = "SELECT * from Projects where ProjectName like '%" + title + "%'"
                #proj_check = "SELECT * from Projects where ProjectName like %s",(title,)
                #cur.execute("SELECT * FROM records WHERE email LIKE %s", (search,))
                cursor.execute(proj_check)
                num_rows = cursor.rowcount
                if num_rows != 0:
                    new_project = False
                url_compare = "SELECT * from Projects where ProjectWebpage like '" + link + "'"
                #url_compare = "SELECT * from Projects where ProjectWebpage like %s",(link,)
                cursor.execute(url_compare)
                num_rows = cursor.rowcount
                if num_rows != 0:
                    new_project = False
                if new_project:
                    project_insert = "Insert into Projects (ProjectName,ProjectWebpage,FirstDataSource,DataSources_idDataSources) VALUES (%s,%s,%s,%s)"
                    cursor.execute(project_insert, (title, link, 'SIMRA', 5))
                    projectid = cursor.lastrowid
                    print(projectid)
                    #ashoka_projectids.append(projectid)
                    db.commit()
                    ins_desc = "Insert into AdditionalProjectData (FieldName,Value,Projects_idProjects,DateObtained) VALUES (%s,%s,%s,NOW())"
                    cursor.executemany(ins_desc, ("Description", description, str(projectid)))
                    db.commit()
                    ins_location = "Insert into ProjectLocation (Type,Country,City,Projects_idProjects) VALUES (%s,%s,%s,%s)"
                    cursor.execute(ins_location, ("Main", country, city, str(projectid)))
                    db.commit()
                else:
                    print('Project already exists!')
                    print(title)
                all_links.append(link)
        #print out SIMRA's links to a file for crawling later
        with open('simra_links', 'w', newline='') as f:
            write = csv.writer(f)
            for row in all_links:
                columns = [c.strip() for c in row.strip(', ').split(',')]
                write.writerow(columns)
When I ran this, I got the following error:
File "/usr/lib/python3.8/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa3 in position 898: invalid start byte
I did some research and tried handling the encoding error by adding different forms of encoding, as seen here - UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte, and Python MySQLdb TypeError: not all arguments converted during string formatting. I added this to the csv open parameters -
with open("SIMRA.csv", 'r', encoding="cp437", errors='ignore') as file:
Running the code with these different encoding options came up with a different error:
MySQLdb._exceptions.ProgrammingError: not all arguments converted during bytes formatting
Further research suggested using tuples or lists in order to address this problem, so I added them to the 'select' queries in the code, as suggested here - Python MySQLdb TypeError: not all arguments converted during string formatting and in the Python SQL documentation here - PythonMySqldb
So the select query became:
proj_check = "SELECT * from Projects where ProjectName like %s",(title,)
cursor.execute(proj_check)
num_rows = cursor.rowcount
if num_rows != 0:
new_project = False
url_compare = "SELECT * from Projects where ProjectWebpage like %s",(link,)
cursor.execute(url_compare)
num_rows = cursor.rowcount
if num_rows != 0:
new_project = False
When I ran the code, I got this AssertionError, and I have no idea what to do anymore.
File "/home/ros/.local/lib/python3.8/site-packages/MySQLdb/cursors.py", line 205, in execute
assert isinstance(query, (bytes, bytearray))
AssertionError
I have run out of ideas. It might be that I'm missing something small, but I can't figure it out, as I've been battling with this for two days now.
Can anyone help point out what I'm missing? It will be greatly appreciated. This code ran perfectly with another csv file. I am running this with Python 3.8, btw.
I have solved this now. I had to use a different encoding with the original code, and this solved the problem. So I changed the csv open parameters to:
with open("SIMRA.csv",'r', encoding="ISO-8859-1") as file:
reader = csv.reader(file)
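Since the question's code already imports chardet, another option would have been to sniff the encoding instead of guessing; a minimal sketch (assuming chardet's detection is reliable for this file):
import chardet
import csv

# Detect the file's encoding from its raw bytes first
with open("SIMRA.csv", 'rb') as f:
    guess = chardet.detect(f.read())
print(guess)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}

with open("SIMRA.csv", 'r', encoding=guess['encoding']) as file:
    reader = csv.reader(file)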
Were you expecting £? You need to specify what the encoding of the file is. It may be "latin1". See the syntax of LOAD DATA for how to specify CHARACTER SET latin1.
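As an aside, the AssertionError from the second attempt has a different cause: the query string and its parameters were packed into a single tuple, so cursor.execute() received a tuple where it expected the query. The DB-API takes them as two separate arguments; a minimal sketch (table and column names taken from the question):
# Pass the SQL and the parameter tuple as separate arguments;
# LIKE wildcards belong inside the parameter value, not the SQL string.
proj_check = "SELECT * from Projects where ProjectName like %s"
cursor.execute(proj_check, ("%" + title + "%",))

url_compare = "SELECT * from Projects where ProjectWebpage like %s"
cursor.execute(url_compare, (link,))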

Output to CGI using Python? Error keeps telling me missing parentheses

I'm attempting to generate output to the screen using Python and CGI. However, when I run it from the command line, I keep getting an error stating that parentheses are missing on the line print "Content-type:text/html\r\n\r\n".
#!/usr/bin/python3
import urllib.request
import json
import os
link = "https://api.nasa.gov/planetary/apod?api_key....."
resp = urllib.request.urlopen(link)
data = resp.read()
print(str(data, 'utf-8'))
returnJson = json.loads(data)
img_url = returnJson['url']
title = returnJson['title']
current_date = returnJson['date']
(filename, headers) = urllib.request.urlretrieve(img_url)
img_file_name = img_url.split('/')[-1]
os.rename(filename, img_file_name)
html = """
<center>
<h1>Astronomy Picture of the Day</h1>
<img src="%s">
<p><b>%s</b></p>
</center>
""" % (img_file_name, title)
html_file_name = 'nasa_apod_%s.html' %current_date
print "Content-type:text/html\r\n\r\n" **Where it says parenthesis**
print '<html>'
print '<head>'
print '<title>Astronomy Picture of the Day</title>'
print '</head>'
print '<body>'
print '<h1>Astronomy Picture of the Day</h1>'
print '</body>'
print '</html>'
This is because you are using Python 3. In Python 3, print is a function, not a statement, which means you need to add parentheses around anything you print.
# this will fail in Python 3
print "Content-type:text/html\r\n\r\n"
# but this will work
print("Content-type:text/html\r\n\r\n")
As you do earlier on, with print(str(data, 'utf-8')).
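Applied to the rest of the script, the trailing block becomes a mechanical conversion (nothing changed except the parentheses):
print("Content-type:text/html\r\n\r\n")
print('<html>')
print('<head>')
print('<title>Astronomy Picture of the Day</title>')
print('</head>')
print('<body>')
print('<h1>Astronomy Picture of the Day</h1>')
print('</body>')
print('</html>')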

How to copy HTML code to clipboard using Python?

I'm trying to make a simple script that copies a custom-text hyperlink (<a href="FakeSite.com">Example</a>) to the clipboard using Python on Windows 10, and still have it recognized as HTML when pasted, similar to copying a custom-text hyperlink in Microsoft Word.
I have tried the answers given in this response but to no avail, and I have also tried a package called jaraco.clipboard but it has also proven useless. Any help or direction would be appreciated.
I had the same question. After much research, I found a solution in Python 2.
I updated the code and hope to help other people by posting the final result here.
The solution below is adapted to Python 3.4+.
Example of use:
input:
PutHtml("<p>Writing to the clipboard is <strong>easy</strong> with this code.</p>")
Then try pasting the clipboard contents (Ctrl+V) into Microsoft Word or another application that supports HTML formatting.
The paste result will be the word 'easy' in bold: Writing to the clipboard is easy with this code.
I even tested it with table structures and images. Everything worked perfectly.
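For the hyperlink from the original question, the same call should presumably work:
PutHtml('<a href="FakeSite.com">Example</a>')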
The script:
"""
Edit on Jan 02, 2020
#author: the_RR
Adapted for python 3.4+
Requires pywin32
original: http://code.activestate.com/recipes/474121/
# HtmlClipboard
# An interface to the "HTML Format" clipboard data format
__author__ = "Phillip Piper (jppx1[at]bigfoot.com)"
__date__ = "2006-02-21"
__version__ = "0.1"
"""
import re
import time
import random
import win32clipboard
#---------------------------------------------------------------------------
# Convenience functions to do the most common operation
def HasHtml():
"""
Return True if there is a Html fragment in the clipboard..
"""
cb = HtmlClipboard()
return cb.HasHtmlFormat()
def GetHtml():
"""
Return the Html fragment from the clipboard or None if there is no Html in the clipboard.
"""
cb = HtmlClipboard()
if cb.HasHtmlFormat():
return cb.GetFragment()
else:
return None
def PutHtml(fragment):
"""
Put the given fragment into the clipboard.
Convenience function to do the most common operation
"""
cb = HtmlClipboard()
cb.PutFragment(fragment)
#---------------------------------------------------------------------------
class HtmlClipboard:
CF_HTML = None
MARKER_BLOCK_OUTPUT = \
"Version:1.0\r\n" \
"StartHTML:%09d\r\n" \
"EndHTML:%09d\r\n" \
"StartFragment:%09d\r\n" \
"EndFragment:%09d\r\n" \
"StartSelection:%09d\r\n" \
"EndSelection:%09d\r\n" \
"SourceURL:%s\r\n"
MARKER_BLOCK_EX = \
"Version:(\S+)\s+" \
"StartHTML:(\d+)\s+" \
"EndHTML:(\d+)\s+" \
"StartFragment:(\d+)\s+" \
"EndFragment:(\d+)\s+" \
"StartSelection:(\d+)\s+" \
"EndSelection:(\d+)\s+" \
"SourceURL:(\S+)"
MARKER_BLOCK_EX_RE = re.compile(MARKER_BLOCK_EX)
MARKER_BLOCK = \
"Version:(\S+)\s+" \
"StartHTML:(\d+)\s+" \
"EndHTML:(\d+)\s+" \
"StartFragment:(\d+)\s+" \
"EndFragment:(\d+)\s+" \
"SourceURL:(\S+)"
MARKER_BLOCK_RE = re.compile(MARKER_BLOCK)
DEFAULT_HTML_BODY = \
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">" \
"<HTML><HEAD></HEAD><BODY><!--StartFragment-->%s<!--EndFragment--></BODY></HTML>"
def __init__(self):
self.html = None
self.fragment = None
self.selection = None
self.source = None
self.htmlClipboardVersion = None
def GetCfHtml(self):
"""
Return the FORMATID of the HTML format
"""
if self.CF_HTML is None:
self.CF_HTML = win32clipboard.RegisterClipboardFormat("HTML Format")
return self.CF_HTML
def GetAvailableFormats(self):
"""
Return a possibly empty list of formats available on the clipboard
"""
formats = []
try:
win32clipboard.OpenClipboard(0)
cf = win32clipboard.EnumClipboardFormats(0)
while (cf != 0):
formats.append(cf)
cf = win32clipboard.EnumClipboardFormats(cf)
finally:
win32clipboard.CloseClipboard()
return formats
def HasHtmlFormat(self):
"""
Return a boolean indicating if the clipboard has data in HTML format
"""
return (self.GetCfHtml() in self.GetAvailableFormats())
def GetFromClipboard(self):
"""
Read and decode the HTML from the clipboard
"""
# implement fix from: http://teachthe.net/?p=1137
cbOpened = False
while not cbOpened:
try:
win32clipboard.OpenClipboard(0)
src = win32clipboard.GetClipboardData(self.GetCfHtml())
src = src.decode("UTF-8")
#print(src)
self.DecodeClipboardSource(src)
cbOpened = True
win32clipboard.CloseClipboard()
except Exception as err:
# If access is denied, that means that the clipboard is in use.
# Keep trying until it's available.
if err.winerror == 5: # Access Denied
pass
# wait on clipboard because something else has it. we're waiting a
# random amount of time before we try again so we don't collide again
time.sleep( random.random()/50 )
elif err.winerror == 1418: # doesn't have board open
pass
elif err.winerror == 0: # open failure
pass
else:
print( 'ERROR in Clipboard section of readcomments: %s' % err)
pass
def DecodeClipboardSource(self, src):
"""
Decode the given string to figure out the details of the HTML that's on the string
"""
# Try the extended format first (which has an explicit selection)
matches = self.MARKER_BLOCK_EX_RE.match(src)
if matches:
self.prefix = matches.group(0)
self.htmlClipboardVersion = matches.group(1)
self.html = src[int(matches.group(2)):int(matches.group(3))]
self.fragment = src[int(matches.group(4)):int(matches.group(5))]
self.selection = src[int(matches.group(6)):int(matches.group(7))]
self.source = matches.group(8)
else:
# Failing that, try the version without a selection
matches = self.MARKER_BLOCK_RE.match(src)
if matches:
self.prefix = matches.group(0)
self.htmlClipboardVersion = matches.group(1)
self.html = src[int(matches.group(2)):int(matches.group(3))]
self.fragment = src[int(matches.group(4)):int(matches.group(5))]
self.source = matches.group(6)
self.selection = self.fragment
def GetHtml(self, refresh=False):
"""
Return the entire Html document
"""
if not self.html or refresh:
self.GetFromClipboard()
return self.html
def GetFragment(self, refresh=False):
"""
Return the Html fragment. A fragment is well-formated HTML enclosing the selected text
"""
if not self.fragment or refresh:
self.GetFromClipboard()
return self.fragment
def GetSelection(self, refresh=False):
"""
Return the part of the HTML that was selected. It might not be well-formed.
"""
if not self.selection or refresh:
self.GetFromClipboard()
return self.selection
def GetSource(self, refresh=False):
"""
Return the URL of the source of this HTML
"""
if not self.selection or refresh:
self.GetFromClipboard()
return self.source
def PutFragment(self, fragment, selection=None, html=None, source=None):
"""
Put the given well-formed fragment of Html into the clipboard.
selection, if given, must be a literal string within fragment.
html, if given, must be a well-formed Html document that textually
contains fragment and its required markers.
"""
if selection is None:
selection = fragment
if html is None:
html = self.DEFAULT_HTML_BODY % fragment
if source is None:
source = "file://HtmlClipboard.py"
fragmentStart = html.index(fragment)
fragmentEnd = fragmentStart + len(fragment)
selectionStart = html.index(selection)
selectionEnd = selectionStart + len(selection)
self.PutToClipboard(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)
def PutToClipboard(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source="None"):
"""
Replace the Clipboard contents with the given html information.
"""
try:
win32clipboard.OpenClipboard(0)
win32clipboard.EmptyClipboard()
src = self.EncodeClipboardSource(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)
src = src.encode("UTF-8")
#print(src)
win32clipboard.SetClipboardData(self.GetCfHtml(), src)
finally:
win32clipboard.CloseClipboard()
def EncodeClipboardSource(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source):
"""
Join all our bits of information into a string formatted as per the HTML format specs.
"""
# How long is the prefix going to be?
dummyPrefix = self.MARKER_BLOCK_OUTPUT % (0, 0, 0, 0, 0, 0, source)
lenPrefix = len(dummyPrefix)
prefix = self.MARKER_BLOCK_OUTPUT % (lenPrefix, len(html)+lenPrefix,
fragmentStart+lenPrefix, fragmentEnd+lenPrefix,
selectionStart+lenPrefix, selectionEnd+lenPrefix,
source)
return (prefix + html)
def DumpHtml():
cb = HtmlClipboard()
print("GetAvailableFormats()=%s" % str(cb.GetAvailableFormats()))
print("HasHtmlFormat()=%s" % str(cb.HasHtmlFormat()))
if cb.HasHtmlFormat():
cb.GetFromClipboard()
print("prefix=>>>%s<<<END" % cb.prefix)
print("htmlClipboardVersion=>>>%s<<<END" % cb.htmlClipboardVersion)
print("GetSelection()=>>>%s<<<END" % cb.GetSelection())
print("GetFragment()=>>>%s<<<END" % cb.GetFragment())
print("GetHtml()=>>>%s<<<END" % cb.GetHtml())
print("GetSource()=>>>%s<<<END" % cb.GetSource())
if __name__ == '__main__':
def test_SimpleGetPutHtml():
data = "<p>Writing to the clipboard is <strong>easy</strong> with this code.</p>"
PutHtml(data)
if GetHtml() == data:
print("passed")
else:
print("failed")
test_SimpleGetPutHtml()
#DumpHtml()
You can do this using win32clipboard via the example listed here. It requires a little customisation, setting the encoding to UTF-8 (as explained in the comments), but the end result in the clipboard will carry HTML format properties.

How to soup a browser response

I've got a program that sends a lot of requests to a website using RoboBrowser and gets the answers. Now I need to filter these answers to only the ones that don't contain the string "Case Status Not Available". I tried to use BeautifulSoup for it, but it is returning an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)  # To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]

                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    # Use BeautifulSoup to parse this data
                    print(browser.response.text)
                    souptwo = BeautifulSoup(browser.response.text)
                    texttwo = soup.get_text()
                    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
                    if not matchtwo:
                        soupthree = BeautifulSoup(browser.response.text)
                        print soupthree
The error that returns is:
Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable
Line 87 attempts to call soup.findall, but BeautifulSoup has no method of that name; the method is find_all. Accessing an unknown attribute on a soup object falls back to a tag lookup (equivalent to soup.find("findall")), which returns None when no such tag exists, and calling None then raises TypeError: 'NoneType' object is not callable. Note also that the code searches soup (the file parsed on line 65) rather than souptwo (the browser response), and that find_all expects a tag name plus filters, not a raw HTML string.
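A sketch of how the check might look with the real method name (assuming the target page really marks the status in a td with class fieldData):
souptwo = BeautifulSoup(browser.response.text, "html.parser")
# find_all takes a tag name plus filters, not a raw HTML string
matchtwo = souptwo.find_all("td", class_="fieldData", string="Case Status Not Available")
if not matchtwo:
    soupthree = BeautifulSoup(browser.response.text, "html.parser")
    print(soupthree)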

Writing Encoded JSON data to a csv using tweepy

I've been stuck on this one for a while. Right now this function writes the date, latitude, longitude, userid, and text of a live tweet to a csv file.
The problem is that the text of the tweet often contains letters from other alphabets, e.g. Arabic. These letters show up in this form (\u0641\u064a).
Is it possible to encode the text as a utf-8 string and append it to the rest of the data, so that the csv file would correctly display all characters?
def on_data(self, data):
    try:
        tweets = json.loads(data)
        with open('Data.csv', 'a', encoding='utf-8') as f:
            if(tweets['coordinates'] is not None):
                coordinates_string = json.dumps(tweets["coordinates"]["coordinates"])
                val_lg = coordinates_string.split(',')[0].strip("[")
                val_lt = coordinates_string.split(',')[1].strip("]")
            else:
                val_lg = "None"
                val_lt = "None"
            text = json.dumps(tweets["text"])
            user_id = json.dumps(tweets["user"]["id_str"])
            time = json.dumps(tweets["created_at"])
            data_string = time + "," + val_lt + "," + val_lg + "," + user_id + "," + text + "\n"
            print(data_string)
            f.write(data_string)
    except:
        pass
You've got some overuse of json. Once you json.loads the tweet, group the data fields in a list and use the csv module to write them out nicely.
import json
import csv

# A guess on the data format of the tweet that was parsable by the OP's original code.
D = {'coordinates': {'coordinates': [45.6, 122.3]}, 'text': u'some text\u0641\u064a',
     'user': {'id_str': 'some id'}, 'created_at': 'some date'}
data = json.dumps(D)

tweets = json.loads(data)

# 'utf-8-sig' makes sure the output csv will open in Excel if that is a goal.
# newline='' is a requirement for csv.writer in Python 3.
with open('Data.csv', 'a', encoding='utf-8-sig', newline='') as f:
    # This forces quoting of strings like the OP got from json.dumps
    w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
    if tweets['coordinates'] is not None:
        val_lg = tweets['coordinates']['coordinates'][1]
        val_lt = tweets['coordinates']['coordinates'][0]
    else:
        val_lg = "None"
        val_lt = "None"
    text = tweets["text"]
    user_id = tweets["user"]["id_str"]
    time = tweets["created_at"]
    # group the fields in a list for writerow
    data = [time, val_lt, val_lg, user_id, text]
    print(data)
    w.writerow(data)
Output (UTF-8 terminal):
['some date', 45.6, 122.3, 'some id', 'some textفي']
Output (Data.csv):
"some date",45.6,122.3,"some id","some textفي"