How to add dictionary elements from BeautifulSoup to a JSON file

Can you help me write a dictionary to a JSON file? I already get all the tags from the web page, but I am still confused about how to save all of them. This is my code:
array = []
data = {}
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                data['title'] = array[0]
                data['image'] = array[1]
                data['link'] = array[2]
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
When I run the program, I only get one dictionary:
{"title": "......", "image": ".....", "link": "...."}

Put your output statement inside the loop where you assign data. You are overwriting data on each iteration. If you change your code to:
array = []
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                # build a fresh dict from the current tag instead of reusing
                # array[0..2], which always points at the first result
                data = {}
                data['title'] = getimgtag['title']
                data['image'] = getimgtag['src']
                data['link'] = getatag['href']
                # append mode, so each iteration adds its dictionary to the file
                with open('data.json', 'a') as outfile:
                    json.dump(data, outfile)
It should give you what you want.
Alternatively, you could collect each dictionary in a list and write the file once at the end:
array = []
data_list = []
for divdata in soup.findAll('div', {"class": "ratio9_8 box_img fl mr10"}):
    for div in divdata.findAll('div', {'class': 'img_con lqd'}):
        for getatag in div.findAll('a', {'data-category': 'WP Kanal Berita'}, href=True):
            for getimgtag in getatag.findAll('img', title=True, src=True):
                array.append(getimgtag['title'])
                array.append(getimgtag['src'])
                array.append(getatag['href'])
                # create a new dict for every tag; appending the same dict object
                # (or always reading array[0..2]) would give identical entries
                data = {}
                data['title'] = getimgtag['title']
                data['image'] = getimgtag['src']
                data['link'] = getatag['href']
                data_list.append(data)
data = {'data_list': data_list}
with open('data.json', 'w') as outfile:
    json.dump(data, outfile)
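To check the result, the file written by the alternative version can be read back in one go. A small verification sketch (it assumes data.json is the file produced above):

import json

with open('data.json') as infile:
    loaded = json.load(infile)

# each entry is one {'title': ..., 'image': ..., 'link': ...} dictionary
for entry in loaded['data_list']:
    print(entry['title'], entry['link'])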

Related

Converting COCO Format to LabelMe Format

I am trying to convert a COCO json file to a LabelMe json file, using a python script called "coco2labelme.py".
It converts the json file successfully; the only problem is that I get an error about the 'imageData' field every time I try to load the converted json file in LabelMe.
Does anyone have an idea how to convert from COCO to LabelMe format with the image data included?
Below is the code for coco2labelme.py.
[Source: https://gist.github.com/travishsu/6efa5c9fb92ece37b4748036026342f6]
import os
import json
import subprocess
import numpy as np
import pandas as pd
from skimage.measure import find_contours

class CocoDatasetHandler:
    def __init__(self, jsonpath, imgpath):
        with open(jsonpath, 'r') as jsonfile:
            ann = json.load(jsonfile)

        images = pd.DataFrame.from_dict(ann['images']).set_index('id')
        annotations = pd.DataFrame.from_dict(ann['annotations']).set_index('id')
        categories = pd.DataFrame.from_dict(ann['categories']).set_index('id')
        annotations = annotations.merge(images, left_on='image_id', right_index=True)
        annotations = annotations.merge(categories, left_on='category_id', right_index=True)
        annotations = annotations.assign(
            shapes=annotations.apply(self.coco2shape, axis=1))
        self.annotations = annotations

        self.labelme = {}
        self.imgpath = imgpath
        self.images = pd.DataFrame.from_dict(ann['images']).set_index('file_name')

    def coco2shape(self, row):
        if row.iscrowd == 1:
            shapes = self.rle2shape(row)
        elif row.iscrowd == 0:
            shapes = self.polygon2shape(row)
        return shapes

    def rle2shape(self, row):
        rle, shape = row['segmentation']['counts'], row['segmentation']['size']
        mask = self._rle_decode(rle, shape)
        padded_mask = np.zeros(
            (mask.shape[0]+2, mask.shape[1]+2),
            dtype=np.uint8,
        )
        padded_mask[1:-1, 1:-1] = mask
        points = find_contours(mask, 0.5)
        shapes = [
            [[int(point[1]), int(point[0])] for point in polygon]
            for polygon in points
        ]
        return shapes

    def _rle_decode(self, rle, shape):
        mask = np.zeros([shape[0] * shape[1]], np.bool)
        for idx, r in enumerate(rle):
            if idx < 1:
                s = 0
            else:
                s = sum(rle[:idx])
            e = s + r
            if e == s:
                continue
            assert 0 <= s < mask.shape[0]
            assert 1 <= e <= mask.shape[0], "shape: {} s {} e {} r {}".format(shape, s, e, r)
            if idx % 2 == 1:
                mask[s:e] = 1
        # Reshape and transpose
        mask = mask.reshape([shape[1], shape[0]]).T
        return mask

    def polygon2shape(self, row):
        # shapes: (n_polygons, n_points, 2)
        shapes = [
            [[int(points[2*i]), int(points[2*i+1])] for i in range(len(points)//2)]
            for points in row.segmentation
        ]
        return shapes

    def coco2labelme(self):
        fillColor = [255, 0, 0, 128]
        lineColor = [0, 255, 0, 128]
        groups = self.annotations.groupby('file_name')
        for file_idx, (filename, df) in enumerate(groups):
            record = {
                'imageData': None,
                'fillColor': fillColor,
                'lineColor': lineColor,
                'imagePath': filename,
                'imageHeight': int(self.images.loc[filename].height),
                'imageWidth': int(self.images.loc[filename].width),
            }
            record['shapes'] = []
            instance = {
                'line_color': None,
                'fill_color': None,
                'shape_type': "polygon",
            }
            for inst_idx, (_, row) in enumerate(df.iterrows()):
                for polygon in row.shapes:
                    copy_instance = instance.copy()
                    copy_instance.update({
                        'label': row['name'],
                        'group_id': inst_idx,
                        'points': polygon
                    })
                    record['shapes'].append(copy_instance)
            if filename not in self.labelme.keys():
                self.labelme[filename] = record

    def save_labelme(self, file_names, dirpath, save_json_only=False):
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        else:
            raise ValueError(f"{dirpath} has existed")
        for file in file_names:
            filename = os.path.basename(os.path.splitext(file)[0])
            with open(os.path.join(dirpath, filename+'.json'), 'w') as jsonfile:
                json.dump(self.labelme[file], jsonfile, ensure_ascii=True, indent=2)
            if not save_json_only:
                subprocess.call(['cp', os.path.join(self.imgpath, file), dirpath])

ds = CocoDatasetHandler('cocodataset/annotations/instances_train2014.json', 'cocodataset/train2014/')
ds.coco2labelme()
ds.save_labelme(ds.labelme.keys(), 'cocodataset/labelme/train2014')
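Not an answer, but a pointer on the 'imageData' error: LabelMe normally stores imageData as the base64-encoded bytes of the image file (recent LabelMe versions also accept "imageData": null as long as imagePath resolves to a file next to the JSON). A hedged sketch of how the record above could be filled in, assuming the images live under self.imgpath; encode_image_data is a hypothetical helper, not part of the gist:

import base64
import os

def encode_image_data(imgpath, filename):
    # read the image referenced by the record and return it as a base64 string
    with open(os.path.join(imgpath, filename), 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')

# e.g. inside coco2labelme(), instead of 'imageData': None:
# record['imageData'] = encode_image_data(self.imgpath, filename)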

Why doesn't json.dump work in my code?

I'm trying to write Python objects to a JSON file, using data fetched from a site's API, but when I run the code nothing ends up in the JSON file. The API works fine, and when I load and print the data I get the expected output, but I have no idea why json.dump doesn't write anything.
Here is my code:
from django.shortcuts import render
import requests
import json
import datetime
import re

def index(request):
    now = datetime.datetime.now()
    format = "{}-{}-{}".format(now.year, now.month, now.day)
    source = []
    author = []
    title = []
    date = []
    url = "http://newsapi.org/v2/everything"
    params = {
        'q': 'bitcoin',
        'from': format,
        'sortBy': 'publishedAt',
        'apiKey': '1186d3b0ccf24e6a91ab9816de603b90'
    }
    response = requests.request("GET", url, params=params)
    for news in response.json()['articles']:
        matching = re.match("\d+-\d+-\d+", news['publishedAt'])
        if format == matching.group():
            source.append(news['source'])
            author.append(news['author'])
            title.append(news['title'])
            date.append(news['publishedAt'])
    data = \
        {
            'source': source,
            'author': author,
            'title': title,
            'date': date
        }
    with open('data.json', "a+") as fp:
        x = json.dump(data, fp, indent=4)
    return render(request, 'news/news.html', {'response': response})
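Two things worth checking (assumptions, since the view itself runs): 'data.json' is a relative path, so the file is created in whatever working directory the Django process was started from, and the hand-built date string is not zero-padded (e.g. "2021-3-7") while publishedAt dates are ISO 8601 and zero-padded, so the comparison may never match. A minimal sketch that sidesteps both issues:

import datetime
import json
import os

# zero-padded date, e.g. "2021-03-07", to compare against publishedAt
today = datetime.date.today().strftime("%Y-%m-%d")

data = {'source': [], 'author': [], 'title': [], 'date': []}

# write next to this file instead of relying on the process working directory
out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.json')
with open(out_path, 'w') as fp:
    json.dump(data, fp, indent=4)
print(out_path)  # shows exactly where the file was written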

How to parse next page by Beautiful Soup?

I use the code below to parse a page that has a "next page" link:
def parseNextThemeUrl(url):
    ret = []
    ret1 = []
    html = urllib.request.urlopen(url)
    html = BeautifulSoup(html, PARSER)
    html = html.find('a', class_='pager_next')
    if html:
        html = urljoin(url, html.get('href'))
        ret1 = parseNextThemeUrl(html)
        for r in ret1:
            ret.append(r)
    else:
        ret.append(url)
    return ret
But I get the error below. How can I follow the next link when there is one?
Traceback (most recent call last):
    html = urllib.request.urlopen(url)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
    return opener.open(url, data, timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 456, in open
    req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'
I found my own answer, shown below:
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def parseNextThemeUrl(url):
    urls = []
    urls.append(url)
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    new_page = soup.find('a', class_='pager_next')
    if new_page:
        new_url = urljoin(url, new_page.get('href'))
        urls1 = parseNextThemeUrl(new_url)
        for url1 in urls1:
            urls.append(url1)
    return urls
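If the chain of "next" pages gets long, the same idea also works without recursion. A sketch of an iterative version under the same urllib/BeautifulSoup assumptions (collect_theme_urls is just an illustrative name):

import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collect_theme_urls(start_url):
    # follow 'pager_next' links iteratively instead of recursively
    urls = []
    url = start_url
    while url:
        urls.append(url)
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        next_link = soup.find('a', class_='pager_next')
        url = urljoin(url, next_link.get('href')) if next_link else None
    return urls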

Serialize Gtk TreeStore / ListStore using JSON

I made a new example which shows much better what I am trying to do. The new example gives the following output. Is there a way to get the data into the respective store key (the empty {} brackets)?
{
    "copy": [
        [
            [
                5.0,
                8.0,
                9.0
            ]
        ],
        [
            [
                4.0,
                0.0,
                1.0
            ]
        ]
    ],
    "name": "dataset1",
    "sets": [
        {
            "store": {},
            "type": "vector"
        },
        {
            "store": {},
            "type": "vector"
        }
    ]
}
New example
from gi.repository import Gtk
import json
import random

class Vector(object):
    def __init__(self, data):
        self.store = Gtk.ListStore(float, float, float)
        self.store.append([data[0], data[1], data[2]])
        self.type = "vector"

    def return_data(self):
        store_data = []

        def iterate_over_data(model, path, itr):
            row = model[path]
            store_data.append([row[0], row[1], row[2]])

        self.store.foreach(iterate_over_data)
        return store_data

class DataSet(object):
    def __init__(self, name):
        self.name = name
        self.sets = []

    def add_vector(self):
        data = [random.randint(0, 9) for x in range(3)]
        self.sets.append(Vector(data))

    def to_json(self):
        self.copy = []
        for s in self.sets:
            self.copy.append(s.return_data())
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)

obj1 = DataSet("dataset1")
for x in range(2):
    obj1.add_vector()
print(obj1.to_json())
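Regarding the empty "store" keys: json.dumps only sees what default=lambda o: o.__dict__ returns, and a Gtk.ListStore keeps its rows in C-side storage rather than in its Python __dict__, so it serializes as {}. One possible direction (a sketch, not the only way) is to let each Vector describe itself as a plain dict and serialize that instead:

class Vector(object):
    def __init__(self, data):
        self.store = Gtk.ListStore(float, float, float)
        self.store.append([data[0], data[1], data[2]])
        self.type = "vector"

    def to_dict(self):
        # pull the rows out of the ListStore into plain Python lists
        rows = [list(row) for row in self.store]
        return {"type": self.type, "store": rows}

# and in DataSet:
#     def to_json(self):
#         return json.dumps(
#             {"name": self.name, "sets": [s.to_dict() for s in self.sets]},
#             sort_keys=True, indent=4)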
Old example
I am currently figuring out how to serialize a Gtk ListStore that is nested in a Gtk TreeStore. I got a small example to work, but I am not sure whether this approach will scale for programs with more data attached (for example, the layer object could hold a color or a creation date). Is there maybe another way to do this?
My current approach is to gather the data in list and dictionary form myself and then just create the JSON dump. I have the feeling this would be rather difficult to maintain if I need to attach 25 values to each layer object.
from gi.repository import Gtk, Gdk
import json
import random

class LayerTreeView(Gtk.TreeView):
    def __init__(self, store):
        Gtk.TreeView.__init__(self, store)
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Name", renderer, text=0)
        self.append_column(column)

class DataTreeView(Gtk.TreeView):
    def __init__(self, store):
        Gtk.TreeView.__init__(self, store)
        self.store = store
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Data", renderer, text=0)
        self.append_column(column)

class MainWindow(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self, title="TreeView Serialize")
        self.connect("delete-event", Gtk.main_quit)
        self.set_border_width(10)
        self.set_default_size(400, 300)
        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6, expand=True)
        self.add(vbox)
        self.clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
        button = Gtk.Button("Cut")
        button.connect("clicked", self.on_cut_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_COPY)
        button.connect("clicked", self.on_copy_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_PASTE)
        button.connect("clicked", self.on_paste_clicked)
        hbox.pack_start(button, True, True, 0)
        vbox.add(hbox)
        self.layer_store = Gtk.TreeStore(str, object, object)
        self.layer_view = LayerTreeView(self.layer_store)
        self.layer_sw = Gtk.ScrolledWindow()
        self.data_sw = Gtk.ScrolledWindow()
        self.layer_sw.add(self.layer_view)
        treebox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6, expand=True)
        treebox.pack_start(self.layer_sw, True, True, 0)
        treebox.pack_start(self.data_sw, True, True, 0)
        vbox.add(treebox)
        self.select = self.layer_view.get_selection()
        self.select.connect("changed", self.on_selection_changed)
        self.add_test_data()

    def add_test_data(self):
        for x in range(3):
            data_store = Gtk.ListStore(str)
            data_view = DataTreeView(data_store)
            for y in range(5):
                data_store.append([str(y+x)])
            self.layer_store.append(None, ["Data {}".format(x), data_store, data_view])

    def on_selection_changed(self, selection):
        """
        When layer is switched load respective data
        """
        model, treeiter = selection.get_selected()
        if treeiter != None:
            data_view = model[treeiter][2]
            child = self.data_sw.get_child()
            if child != None:
                self.data_sw.remove(self.data_sw.get_child())
            self.data_sw.add(data_view)
            self.show_all()

    def on_cut_clicked(self, button):
        pass

    def on_copy_clicked(self, button):
        copy_list = ["safe-to-paste"]
        data_dict = {}
        for row in self.layer_store:
            name = row[0]
            data_obj = row[1]
            value_list = []
            for datarow in data_obj:
                value = datarow[0]
                value_list.append(value)
            data_dict[name] = value_list
        copy_list.append(data_dict)
        data = json.dumps(copy_list)
        self.clipboard.set_text(data, -1)

    def on_paste_clicked(self, button):
        paste_str = self.clipboard.wait_for_text()
        try:
            parse = json.loads(paste_str)
            json_str = True
        except:
            json_str = False
        if json_str is False:
            return
        keyword = parse[0]
        if keyword != "safe-to-paste":
            return
        data_dict = parse[1]
        for x in data_dict:
            data_list = data_dict[x]
            data_store = Gtk.ListStore(str)
            data_view = DataTreeView(data_store)
            for y in data_list:
                data_store.append([str(y)])
            self.layer_store.append(None, [x, data_store, data_view])

win = MainWindow()
win.show_all()
Gtk.main()
I have an improved version of your code that uses a dict comprehension and @staticmethod, which makes the signal callbacks shorter and more readable. Nevertheless, this does not really solve your problem, as it still builds the JSON manually. If the ListStore gets more complex, it would probably be better to let the DataListStore class generate its own JSON with a corresponding method.
from gi.repository import Gtk, Gdk
import json

class LayerTreeView(Gtk.TreeView):
    def __init__(self, store):
        Gtk.TreeView.__init__(self, store)
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Name", renderer, text=0)
        self.append_column(column)

class DataTreeView(Gtk.TreeView):
    def __init__(self):
        Gtk.TreeView.__init__(self)
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Data", renderer, text=0)
        self.append_column(column)

class DataListStore(Gtk.ListStore):
    @staticmethod
    def from_json(*args, values=[]):
        store = DataListStore(*args)
        for value in values:
            store.append((value,))
        return store

class MainWindow(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self, title="TreeView Serialize")
        self.connect("delete-event", Gtk.main_quit)
        self.set_border_width(10)
        self.set_default_size(400, 300)
        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6, expand=True)
        self.add(vbox)
        self.clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
        button = Gtk.Button("Cut")
        button.connect("clicked", self.on_cut_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_COPY)
        button.connect("clicked", self.on_copy_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_PASTE)
        button.connect("clicked", self.on_paste_clicked)
        hbox.pack_start(button, True, True, 0)
        vbox.add(hbox)
        self.layer_store = Gtk.TreeStore(str, object)
        self.layer_view = LayerTreeView(self.layer_store)
        self.data_view = DataTreeView()
        layer_sw = Gtk.ScrolledWindow()
        layer_sw.add(self.layer_view)
        data_sw = Gtk.ScrolledWindow()
        data_sw.add(self.data_view)
        treebox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6, expand=True)
        treebox.pack_start(layer_sw, True, True, 0)
        treebox.pack_start(data_sw, True, True, 0)
        vbox.add(treebox)
        select = self.layer_view.get_selection()
        select.connect("changed", self.on_selection_changed)
        self.add_test_data()

    def add_test_data(self):
        for x in range(3):
            data_list = [str(y+x) for y in range(5)]
            self.layer_store.append(None, ["Data {}".format(x), data_list])

    def on_selection_changed(self, selection):
        """
        When layer is switched load respective data
        """
        model, treeiter = selection.get_selected()
        if treeiter != None:
            self.data_view.set_model(
                DataListStore.from_json(str, values=model[treeiter][1])
            )

    def on_cut_clicked(self, button):
        pass

    def on_copy_clicked(self, button):
        copy_list = [
            'safe-to-paste',
            {row[0]: row[1] for row in self.layer_store},
        ]
        data = json.dumps(copy_list)
        self.clipboard.set_text(data, -1)

    def on_paste_clicked(self, button):
        paste_str = self.clipboard.wait_for_text()
        try:
            parse = json.loads(paste_str)
        except:
            return
        if parse[0] != "safe-to-paste":
            return
        data_dict = parse[1]
        for x in data_dict:
            self.layer_store.append(None, [x, data_dict[x]])

win = MainWindow()
win.show_all()
Gtk.main()
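Following up on that last point, a sketch of what such a method could look like (not tested; in the code above the layer rows currently hold plain Python lists, so this only becomes useful once DataListStore instances are stored directly):

class DataListStore(Gtk.ListStore):
    @staticmethod
    def from_json(*args, values=[]):
        store = DataListStore(*args)
        for value in values:
            store.append((value,))
        return store

    def to_values(self):
        # one plain list per row, so json.dumps can handle any number of columns
        return [list(row) for row in self]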

scrapy unhandled exception

I am using Scrapy version 0.16.2 on Linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting the error below, which blocks Scrapy (it hangs and doesn't finish on its own; only ^C stops it):
2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW this worked in version 0.14
Here is the code:
class MySpider(CrawlSpider):
    name = 'alrroya'

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05

    # Stay within these domains when crawling
    allowed_domains = []
    all_domains = {}
    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
    ]

    # How many pages crawled
    crawl_count = 0

    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        self.load_allowed_domains_and_start_urls()

    def allowed_to_start(self):
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = self.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None
            if reason_date and 'shutdown' in reason:
                reason = True
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    reason = True
                else:
                    reason = False
        else:
            reason = True
        return reason

    def _spider_opened(self, spider):
        if spider is not self:
            return
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None
            if reason_date and 'shutdown' in reason:
                f = open(checkfile, 'w')
                f.write('started\n')
                f.write(str(date.today()))
                f.close()
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    f = open(checkfile, 'w')
                    f.write('started\n')
                    f.write(str(date.today()))
                    f.close()
                else:
                    crawler.engine.close_spider(self, 'finished')
                    if jobdir and os.path.exists(jobdir):
                        shutil.rmtree(jobdir)
                    f = open(checkfile, 'w')
                    f.write('finished\n')
                    f.write(str(date.today()))
                    f.close()
                    os._exit(1)
        else:
            f = open(checkfile, 'w')
            f.write('started\n')
            f.write(str(date.today()))
            f.close()

    def _spider_closed(self, spider, reason):
        if spider is not self:
            return
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        if 'shutdown' in reason:
            f = open(checkfile, 'w')
            f.write('shutdown\n')
            f.write(str(date.today()))
            f.close()
        else:
            if jobdir and os.path.exists(jobdir):
                shutil.rmtree(jobdir)
            f = open(checkfile, 'w')
            f.write('finished\n')
            f.write(str(date.today()))
            f.close()

    def _requests_to_follow(self, response):
        if getattr(response, 'encoding', None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []

    def make_requests_from_url(self, url):
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST)
            }
            response, content = http_client.request(url, method='HEAD', headers=headers)
            #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
            if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
                if self.allowed_to_start():
                    self.get_pdf_link(url)
            else:
                return CrawlSpider.make_requests_from_url(self, url)
        except Exception as ex:
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if url_domain:
            for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
                if url_domain.endswith(domain):
                    pre_and = False
                    pre_or = False
                    and_cond = True
                    or_cond = False
                    for path in paths:
                        if path[0:1] == '!':
                            pre_and = True
                            if path[1:] not in url_path:
                                and_cond = and_cond and True
                            else:
                                and_cond = and_cond and False
                        else:
                            pre_or = True
                            if path in url_path:
                                or_cond = or_cond or True
                            else:
                                or_cond = or_cond or False
                    if pre_and and pre_or:
                        if and_cond and or_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_and:
                        if and_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_or:
                        if or_cond:
                            self.pdf_process(source, url)
                            return
                    else:
                        self.pdf_process(source, url)
                        return

    def parse_crawled_page(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print 'Crawled %d pages' % crawl_count
        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)
        return Item()

    def load_allowed_domains_and_start_urls(self):
        day = timedelta(days=1)
        currdate = date.today()
        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
        self.__class__.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                }
            }
        }
        for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
            self.__class__.allowed_domains.append(domain)
        self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])

    def pdf_process(self, source, url):
        print '!!! ' + source + ' ' + url
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py
Change:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
To:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
I expect that the official Scrapy people will fix this eventually.
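If you would rather not patch the installed Scrapy egg, the same flattening can be done inside the spider itself by overriding start_requests there. A sketch along the lines of the patch above (not tested against 0.16); note that make_requests_from_url in the spider can also fall through and return None in the PDF branch, which is worth skipping as well:

def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if requests is None:
            # the PDF branch above returns nothing; skip it
            continue
        if isinstance(requests, list):
            for request in requests:
                yield request
        else:
            yield requests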