How to parse the next page with Beautiful Soup? - html

I use the code below to parse a page that has a "next page" link:
def parseNextThemeUrl(url):
    ret = []
    ret1 = []
    html = urllib.request.urlopen(url)
    html = BeautifulSoup(html, PARSER)
    html = html.find('a', class_='pager_next')
    if html:
        html = urljoin(url, html.get('href'))
        ret1 = parseNextThemeUrl(html)
        for r in ret1:
            ret.append(r)
    else:
        ret.append(url)
    return ret
But I got the error below. How can I follow the next link if there is one?
Traceback (most recent call last):
html = urllib.request.urlopen(url)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 456, in open
req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'

I found my own answer, as below:
def parseNextThemeUrl(url):
    urls = []
    urls.append(url)
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    new_page = soup.find('a', class_='pager_next')
    if new_page:
        new_url = urljoin(url, new_page.get('href'))
        urls1 = parseNextThemeUrl(new_url)
        for url1 in urls1:
            urls.append(url1)
    return urls
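For completeness, the same pagination can also be followed iteratively instead of recursively. This is only a minimal sketch, assuming the same 'pager_next' anchor class and the lxml parser (the function name collect_theme_urls is just illustrative):

import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_theme_urls(url):
    # Follow 'next page' links iteratively and collect every page URL.
    urls = []
    while url:
        urls.append(url)
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        next_link = soup.find('a', class_='pager_next')
        # urljoin resolves relative hrefs against the current page URL.
        url = urljoin(url, next_link.get('href')) if next_link else None
    return urls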

Related

Flask Session does not save data across app.routes

I have been struggling with a problem in my code for a long time and I cannot figure out what is going wrong.
Basically, I am making a chat app with Flask and Socket.IO. I wanted to use Flask-Session to save data, but my data does not seem to be saved.
import os
from flask import Flask, render_template, request, session, redirect
from flask_socketio import SocketIO, emit
from flask_session import Session

app = Flask(__name__)
app.config["SECRET_KEY"] = os.getenv("SECRET_KEY")
app.config["SESSION_PERMANENT"] = False
app.config["SESSION_TYPE"] = "filesystem"
socketio = SocketIO(app, manage_session=False)
Session(app)

users = []
chatrooms = []

@app.route("/", methods=["GET", "POST"])
def index():
    if request.method == "POST":
        displayname = request.form.get("dname")
        for user in users:
            if displayname == user:
                return render_template("index.html", error_message="Displayname is already taken.")
        session['displayname'] = displayname
        session["logged_in"] = True
        return render_template("main.html", displayname=displayname, chatrooms=chatrooms, users=users)
    if request.method == "GET":
        if 'displayname' in session:
            displayname = session['displayname']
            return redirect("/main")
        return render_template("index.html")

@app.route("/main", methods=["GET", "POST"])
def main():
    if request.method == "GET":
        displayname = request.form.get("goback")
        session['displayname'] = displayname
        if displayname in users:
            users.remove(displayname)
        if 'chatroomname' in session:
            chatroomname = session['chatroomname']
            return redirect("/getchatroom")
        return render_template("main.html", displayname=displayname, chatrooms=chatrooms, users=users)
    if request.method == "POST":
        displayname = request.form.get("makechatroom")
        session['displayname'] = displayname
        chatroomname = request.form.get("chatroomname")
        for chatroom in chatrooms:
            if chatroom == chatroomname:
                error = "This chatroom already exists! Choose another name!"
                return render_template("main.html", displayname=displayname, chatrooms=chatrooms, users=users, error=error)
        chatrooms.append(chatroomname)
        session['chatroomname'] = chatroomname
        users.append(displayname)
        return render_template("chatroom.html", displayname=displayname, chatroomname=chatroomname, users=users)

@app.route("/getchatroom", methods=["POST"])
def getchatroom():
    if request.method == "POST":
        chatroomname = request.form.get("openchatroom")
        session['chatroomname'] = chatroomname
        displayname = request.form.get("disname")
        session['displayname'] = displayname
        users.append(displayname)
        return render_template("chatroom.html", displayname=displayname, chatroomname=chatroomname, users=users)

@app.route("/logout", methods=["GET"])
def logout():
    displayname = request.form.get("logout")
    if displayname in users:
        users.remove(displayname)
    session.pop('username', None)
    session["logged_in"] = False
    return redirect("/")
I don't really know if the HTML is relevant; the issue does not seem to be related to the HTML.
This is my error message:
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 2464, in __call__
return self.wsgi_app(environ, start_response)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask_socketio\__init__.py", line 45, in __call__
return super(_SocketIOMiddleware, self).__call__(environ,
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\engineio\middleware.py", line 74, in __call__
return self.wsgi_app(environ, start_response)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 2450, in wsgi_app
response = self.handle_exception(e)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 1867, in handle_exception
reraise(exc_type, exc_value, tb)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\_compat.py", line 39, in reraise
raise value
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 2447, in wsgi_app
response = self.full_dispatch_request()
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
rv = self.handle_user_exception(e)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 1821, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\_compat.py", line 39, in reraise
raise value
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 1950, in full_dispatch_request
rv = self.dispatch_request()
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\flask\app.py", line 1936, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "C:\Users\linda\OneDrive\Bureaublad\project2\application.py", line 47, in main
displayname = session['displayname']
File "C:\Users\linda\OneDrive\Bureaublad\project2\venv\Lib\site-packages\werkzeug\local.py", line 377, in <lambda>
__getitem__ = lambda x, i: x._get_current_object()[i]
KeyError: 'displayname'
Thanks in advance to everyone trying to help!
xx Linda
There is a nasty bug in cachelib on Python 2.7.
The exception is suppressed in lib\site-packages\cachelib\file.py, line 158, in set.
Someone tried to fix this problem (https://github.com/fengsp/flask-session/issues/119), but the "fix" broke the code even more (my interpreter raises 'ModuleException `os` has no attribute `replace`').
So my solution is simply to replace the broken functions like this (just paste it after your imports):
def _hotfix(old, new):
    try:
        os.rename(old, new)
    except WindowsError as e:
        if e.errno == 17:
            os.remove(new)
            os.rename(old, new)
        else:
            class UniverseDestructionException(Exception):
                pass
            raise UniverseDestructionException

os.replace = os.rename = _hotfix
Another reminder that Python 2.7 is no longer supported.
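Separately, it can help to confirm that Flask-Session itself persists values across routes before debugging the chat logic. Below is a minimal sketch, assuming only Flask and Flask-Session are installed; the /set and /get routes are illustrative and not part of the original app:

from flask import Flask, session
from flask_session import Session

app = Flask(__name__)
app.config["SECRET_KEY"] = "dev"            # any non-empty value; os.getenv may return None if the env var is unset
app.config["SESSION_PERMANENT"] = False
app.config["SESSION_TYPE"] = "filesystem"
Session(app)

@app.route("/set/<name>")
def set_name(name):
    session["displayname"] = name            # written to the filesystem session store
    return "stored"

@app.route("/get")
def get_name():
    # .get avoids the KeyError seen in the traceback when the key is missing
    return session.get("displayname", "no displayname in session")

if __name__ == "__main__":
    app.run(debug=True)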

Odoo V11 restapi - TypeError(repr(o) + “ is not JSON serializable”)

I am using Odoo v11 integrated with a REST API. CRUD operations work fine, but I get an error when fetching report data.
file.py
def generate_report(self, xml_id, ids):
    self_reports = {}
    self_reports = {'result': False, 'state': False, 'exception': None}
    try:
        result, format = request.env.ref(xml_id).sudo().render_qweb_pdf([ids])
        if not result:
            tb = sys.exc_info()
            self_reports['exception'] = odoo.exceptions.DeferredException('RML is not available at specified location or not enough data to print!', tb)
        self_reports['result'] = result
        self_reports['format'] = format
        self_reports['state'] = True
        self_reports.update({'id': ids})
    except Exception as exception:
        _logger.exception('Exception: %s\n', exception)
        if hasattr(exception, 'name') and hasattr(exception, 'value'):
            self_reports['exception'] = odoo.exceptions.DeferredException(tools.ustr(exception.name), tools.ustr(exception.value))
        else:
            tb = sys.exc_info()
            self_reports['exception'] = odoo.exceptions.DeferredException(tools.exception_to_unicode(exception), tb)
        self_reports['state'] = True
    exc = self_reports['exception']
    if exc:
        raise UserError('%s: %s' % (exc.message, exc.traceback))
    if self_reports['state']:
        if tools.config['reportgz']:
            import zlib
            res2 = zlib.compress(result)
        else:
            if isinstance(result, str):
                res2 = result.encode('latin1', 'replace')
            else:
                res2 = result
        if res2:
            self_reports['result'] = base64.encodestring(res2)
    return self_reports

def get_response(self, status_code, status, data=None):
    """Returns Response Object with given status code and status"""
    response = Response()
    response.status = status
    if data:
        response.data = isinstance(data, str) and data or json.dumps(data)
    response.status_code = status_code
    return response
This is the data format:
list: [{'exception': None, 'state': True, 'id': 3, 'format': 'pdf', 'result':
b'SlZCRVJpMHhMak1LTVNBd0lHOWlhZ284UEFvdlZIbHdaU0F2VUdGblpYTUtMME52ZFc1MElERUtM\nMHRwWkh
NZ1d5QXpJREFnVWlCZApDajQrQ21WdVpHOWlhZ295SURBZ2IySnFDanc4Q2k5UWNtOWtk\nV05sY2lBb1VIbFFSR
Vl5S1FvK1BncGxibVJ2WW1vS015QXdJRzlpCmFnbzhQQW92VW1WemIzVnlZ\nMlZ6SURVZ01DQlNDaTlCYm01dmR
ITWdNVGdnTUNCU0NpOVFZWEpsYm5RZ01TQXdJRklLTDFSNWNH\nVWcKTDFCaFoyVUtMMDFsWkdsaFFtOTRJR
nNnTUNBd0lEWXhNaUEzT1RJZ1hRb3ZRMjl1ZEdWdWRI\nTWdNVGtnTUNCU0NqNCtDbVZ1Wkc5aQphZ28wSUR
BZ2IySnFDanc4Q2k5VWVYQmxJQzlEWVhSaGJH\nOW5DaTlRWVdkbGN5QXhJREFnVWdvK1BncGxibVJ2WW1vS05T
QXdJRzlpCmFnbzhQQW92UTI5c2Iz\nSlRjR0ZqWlNBOFBBb3ZRMU53SUM5RVpYWnBZMlZTUjBJS0wwTlRjR2NnTD
BSbGRtbGpaVWR5WVhr\nS0wxQkQKVTNBZ05pQXdJRklLUGo0S0wwVjRkRWRUZEdGMFpTQThQQW92UjFOaElEY
2dNQ0JTQ2o0\nK0NpOUdiMjUwSUR3OENpOUdPQ0E0SURBZwpVZ292UmpjZ01UTWdNQ0JTQ2o0K0NpOVFZWFI ...
Error Traceback:
File "E:\Odoo\odoo11\addons\restapi\controllers\main.py", line 343, in call_report
return self.get_response(200, str(200), {'report': datas})
File "E:\Odoo\odoo11\addons\restapi\controllers\main.py", line 135, in get_response
response.data = isinstance(data, str) and data or json.dumps(data)
File "C:\Program Files\Python\Python35\lib\json\__init__.py", line 230, in dumps
return _default_encoder.encode(obj)
File "C:\Program Files\Python\Python35\lib\json\encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "C:\Program Files\Python\Python35\lib\json\encoder.py", line 257, in iterencode
return _iterencode(o, 0)
File "C:\Program Files\Python\Python35\lib\json\encoder.py", line 180, in default
raise TypeError(repr(o) + " is not JSON serializable")
I need the PDF report as binary data in the API response, but I get an error when building the response data. Can anyone help me resolve this?
This might be because you forgot to import the json library. Declare `import json` at the top of your file; this might help you solve the error.
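If json is already imported and the TypeError persists, the traceback suggests another possibility: json.dumps is being handed a bytes value (the base64-encoded PDF shown above), and the standard json encoder cannot serialize bytes. A small illustration, not the poster's code, with a shortened placeholder payload:

import json

report = {'id': 3, 'format': 'pdf', 'result': b'SlZCRVJpMHhMak1LTVNBZ...'}

# json.dumps(report) would raise:
#   TypeError: b'SlZCRVJpMHhMak1LTVNBZ...' is not JSON serializable

# Decoding the base64 bytes to a plain str first makes the dict serializable.
report['result'] = report['result'].decode('ascii')
print(json.dumps(report))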

How to add dictionary element from beautifulsoup to json file

Can you help me write a dictionary from BeautifulSoup to a JSON file? I already get all the tags from the web, but I am still confused about how to save all of them. This is my code:
array = []
data = {}
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                data['title'] = array[0]
                data['image'] = array[1]
                data['link'] = array[2]

with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
When I run the program, I only get one dictionary:
{"title": "......", "image": ".....", "link": "...."}
Put your output statement inside the loop where you are assigning data; you are overwriting the data on each iteration. If you change your code to:
array = []
data = {}
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                data['title'] = array[0]
                data['image'] = array[1]
                data['link'] = array[2]
                with open('data.json', 'a') as outfile:
                    json.dump(data, outfile)
It should give you what you want.
Alternatively you could do:
array = []
data = {}
data_list = []
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                data['title'] = array[0]
                data['image'] = array[1]
                data['link'] = array[2]
                data_list.append(data)

data = {'data_list': data_list}
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
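For reference, a variant that builds a fresh dictionary per image avoids re-reading the growing array and keeps every item distinct. This is only a sketch, assuming soup already holds the parsed page:

import json

items = []  # one dict per image found on the page
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                # build a new dict for each match instead of reusing one
                items.append({
                    'title': getimgtag['title'],
                    'image': getimgtag['src'],
                    'link': getatag['href'],
                })

# write the whole list once, as valid JSON
with open('data.json', 'w') as outfile:
    json.dump(items, outfile)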

Open Binary File in a URL

In the Documents module, when I click on a link to a binary file, it is downloaded as a JSON object.
How can I open it directly in the web browser?
I found the code that implements downloading an attachment file for the Documents module:
@http.route('/web/binary/saveas_ajax', type='http', auth="user")
@serialize_exception
def saveas_ajax(self, data, token):
    jdata = simplejson.loads(data)
    model = jdata['model']
    field = jdata['field']
    data = jdata['data']
    id = jdata.get('id', None)
    filename_field = jdata.get('filename_field', None)
    context = jdata.get('context', {})

    Model = request.session.model(model)
    fields = [field]
    if filename_field:
        fields.append(filename_field)
    if data:
        res = {field: data}
    elif id:
        res = Model.read([int(id)], fields, context)[0]
    else:
        res = Model.default_get(fields, context)
    filecontent = base64.b64decode(res.get(field, ''))
    if not filecontent:
        raise ValueError(_("No content found for field '%s' on '%s:%s'") %
                         (field, model, id))
    else:
        filename = '%s_%s' % (model.replace('.', '_'), id)
        if filename_field:
            filename = res.get(filename_field, '') or filename
        return request.make_response(filecontent,
                                     headers=[('Content-Type', 'application/octet-stream'),
                                              ('Content-Disposition', content_disposition(filename))],
                                     cookies={'fileToken': token})
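The forced download comes from the Content-Type: application/octet-stream and attachment-style Content-Disposition headers above. One possible direction, offered only as a sketch and not a confirmed Odoo solution (the MIME-type guess and the inline disposition are assumptions), is to return the file with its real content type and an inline disposition so the browser renders it:

import mimetypes

def make_inline_response(request, filecontent, filename, token):
    # guess the real content type from the filename; the PDF fallback is
    # only an illustration, adjust it to your data
    content_type = mimetypes.guess_type(filename)[0] or 'application/pdf'
    return request.make_response(
        filecontent,
        headers=[
            ('Content-Type', content_type),
            # 'inline' asks the browser to display the file instead of saving it
            ('Content-Disposition', 'inline; filename="%s"' % filename),
        ],
        cookies={'fileToken': token})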

scrapy unhandled exception

I am using Scrapy version 0.16.2 on Linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting the error below, which blocks Scrapy (it hangs and doesn't finish on its own; only ^C stops it):
2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW this worked in version 0.14
Here is the code:
class MySpider(CrawlSpider):
    name = 'alrroya'

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05

    # Stay within these domains when crawling
    allowed_domains = []
    all_domains = {}
    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
    ]

    # How many pages crawled
    crawl_count = 0
    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        self.load_allowed_domains_and_start_urls()

    def allowed_to_start(self):
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = self.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                reason = True
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    reason = True
                else:
                    reason = False
        else:
            reason = True

        return reason

    def _spider_opened(self, spider):
        if spider is not self:
            return

        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                f = open(checkfile, 'w')
                f.write('started\n')
                f.write(str(date.today()))
                f.close()
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    f = open(checkfile, 'w')
                    f.write('started\n')
                    f.write(str(date.today()))
                    f.close()
                else:
                    crawler.engine.close_spider(self, 'finished')
                    if jobdir and os.path.exists(jobdir):
                        shutil.rmtree(jobdir)
                    f = open(checkfile, 'w')
                    f.write('finished\n')
                    f.write(str(date.today()))
                    f.close()
                    os._exit(1)
        else:
            f = open(checkfile, 'w')
            f.write('started\n')
            f.write(str(date.today()))
            f.close()

    def _spider_closed(self, spider, reason):
        if spider is not self:
            return

        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        if 'shutdown' in reason:
            f = open(checkfile, 'w')
            f.write('shutdown\n')
            f.write(str(date.today()))
            f.close()
        else:
            if jobdir and os.path.exists(jobdir):
                shutil.rmtree(jobdir)
            f = open(checkfile, 'w')
            f.write('finished\n')
            f.write(str(date.today()))
            f.close()

    def _requests_to_follow(self, response):
        if getattr(response, 'encoding', None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []

    def make_requests_from_url(self, url):
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST)
            }
            response, content = http_client.request(url, method='HEAD', headers=headers)
            #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
            if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
                if self.allowed_to_start():
                    self.get_pdf_link(url)
            else:
                return CrawlSpider.make_requests_from_url(self, url)
        except Exception as ex:
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if url_domain:
            for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
                if url_domain.endswith(domain):
                    pre_and = False
                    pre_or = False
                    and_cond = True
                    or_cond = False
                    for path in paths:
                        if path[0:1] == '!':
                            pre_and = True
                            if path[1:] not in url_path:
                                and_cond = and_cond and True
                            else:
                                and_cond = and_cond and False
                        else:
                            pre_or = True
                            if path in url_path:
                                or_cond = or_cond or True
                            else:
                                or_cond = or_cond or False

                    if pre_and and pre_or:
                        if and_cond and or_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_and:
                        if and_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_or:
                        if or_cond:
                            self.pdf_process(source, url)
                            return
                    else:
                        self.pdf_process(source, url)
                        return

    def parse_crawled_page(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print 'Crawled %d pages' % crawl_count

        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)

        return Item()

    def load_allowed_domains_and_start_urls(self):
        day = timedelta(days=1)
        currdate = date.today()

        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)

        self.__class__.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                }
            }
        }

        for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
            self.__class__.allowed_domains.append(domain)

        self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])

    def pdf_process(self, source, url):
        print '!!! ' + source + ' ' + url
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py
Change:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
To:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
I expect that the official Scrapy people will fix this eventually.
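If you would rather not edit the installed egg, the same workaround can probably live in the spider itself. This is a sketch under the assumption that overriding start_requests in MySpider is acceptable; it also skips None results, which the make_requests_from_url above can return after handling a PDF directly:

from scrapy.contrib.spiders import CrawlSpider

class MySpider(CrawlSpider):
    # ... existing attributes and methods from the spider above ...

    def start_requests(self):
        for url in self.start_urls:
            requests = self.make_requests_from_url(url)
            if requests is None:
                # make_requests_from_url handled the URL itself (e.g. a PDF link)
                continue
            if isinstance(requests, list):
                for request in requests:
                    yield request
            else:
                yield requests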