The Scrapy code I have is shown below:
def parse(self, response):
    for quote in response.css('div.search-item '):
        f = quote.css('a.stack::attr(href)').extract_first()
        f = response.urljoin(f)
        # print(f)
        yield {
            'text': quote.css('span.tags::text').extract_first(),
            'title': quote.css('h3 em::text').extract_first(),
        }
        yield response.follow(f, self.parse_program)

def parse_program(self, response):
    def extract_with_css(query):
        return response.css(query).extract_first().strip()

    yield {
        'name': extract_with_css('div.headline h1::text'),
    }
The results turn out like this:
{'text':"sdada",'title':"12321q"}
{'name':"sdasdsa"}
I want it to look like this:
{'text':"sdada",'title':"12321q",'name':"sdasdsa"}
What should I do?
You need to use the request meta for this, and yield the item only when you have all the data:
def parse(self, response):
    for quote in response.css('div.search-item '):
        f = quote.css('a.stack::attr(href)').extract_first()
        f = response.urljoin(f)
        # print(f)
        data = {
            'text': quote.css('span.tags::text').extract_first(),
            'title': quote.css('h3 em::text').extract_first(),
        }
        yield response.follow(f, self.parse_program, meta={"data": data})

def parse_program(self, response):
    def extract_with_css(query):
        return response.css(query).extract_first().strip()

    data = response.meta["data"]
    data['name'] = extract_with_css('div.headline h1::text')
    yield data
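As a side note, on Scrapy 1.7 and later the partial item can also be handed to the callback through cb_kwargs instead of meta. A minimal sketch of that variant, using the same selectors as above:

def parse(self, response):
    for quote in response.css('div.search-item '):
        f = response.urljoin(quote.css('a.stack::attr(href)').extract_first())
        data = {
            'text': quote.css('span.tags::text').extract_first(),
            'title': quote.css('h3 em::text').extract_first(),
        }
        # Entries in cb_kwargs are passed as keyword arguments to the callback
        yield response.follow(f, self.parse_program, cb_kwargs={'data': data})

def parse_program(self, response, data):
    data['name'] = response.css('div.headline h1::text').get(default='').strip()
    yield data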
I am working with APIs based on the REST architecture. I know Django has a framework for building these APIs, but my homework is to do it from scratch. I have an API for a movie site where users can search for information about a bunch of movies, and I am trying to get the data into JSON format from the Movie model, which has a many-to-many relationship with the Actor model. I am using class-based views for this.
The code from my models.py and views.py files is shown below:
class Actor(models.Model):
    full_name = models.CharField(max_length=125)
    role = models.CharField(max_length=125)

    def __str__(self):
        return self.full_name


class Movie(models.Model):
    ACTION = 'AC'
    DRAMA = 'DR'
    COMEDY = 'CM'
    SCIENCE_FICTION = 'SF'
    THRILLER = 'TR'
    RELIGIOUS = 'RG'
    GENRE_CHOICES = [
        (ACTION, 'Accion'),
        (DRAMA, 'Drama'),
        (COMEDY, 'Comedy'),
        (SCIENCE_FICTION, 'Ciencia Ficcion'),
        (THRILLER, 'Triler'),
        (RELIGIOUS, 'Religioso')
    ]
    title = models.CharField(max_length=155, blank=False)
    synopsis = models.TextField(max_length=1000, blank=True)
    genre = models.CharField(max_length=100, choices=GENRE_CHOICES, default='', blank=False)
    tag = models.JSONField(default=dict, blank=True)
    actors = models.ManyToManyField(Actor, related_name='movies', blank=True)

    def __str__(self):
        return self.title
views.py
from django.views import View
from django.http.response import JsonResponse
from .models import Movie
from django.utils.decorators import method_decorator
from django.views.decorators.csrf import csrf_exempt
import json


class MovieView(View):

    @method_decorator(csrf_exempt)
    def dispatch(self, request, *args, **kwargs):
        return super().dispatch(request, *args, **kwargs)

    def get(self, request, pk=0):
        """
        Return the list of all movies, or a single movie.
        :param pk:
        :param request:
        :return:
        """
        if pk > 0:
            movies = list(Movie.objects.filter(pk=pk).values())
            if len(movies) > 0:
                movie = movies[0]
                data = {'message': "Success", 'movie': movie}
            else:
                data = {'message': "Movie not found... "}
            return JsonResponse(data)
        else:
            movies = list(Movie.objects.values('title', 'synopsis', 'genre', 'actors__full_name').order_by('pk'))
            if len(movies) > 0:
                data = {'message': "Success", 'movies': movies}
            else:
                data = {'message': "Movies not found ..."}
            return JsonResponse(data)

    def post(self, request):
        """
        Create a new movie.
        :param request:
        :return:
        """
        json_data = json.loads(request.body)
        Movie.objects.create(title=json_data['title'], synopsis=json_data['synopsis'], genre=json_data['genre'],
                             tag=json_data['tag'], actors=json_data['actors'])
        data = {'message': "Success"}
        return JsonResponse(data)

    def put(self, request, pk):
        """
        Update a single movie.
        :param request:
        :param pk:
        :return:
        """
        json_data = json.loads(request.body)
        movies = list(Movie.objects.filter(pk=pk).values())
        if len(movies) > 0:
            movie = Movie.objects.get(pk=pk)
            movie.title = json_data['title']
            movie.synopsis = json_data['synopsis']
            movie.genre = json_data['genre']
            movie.tag = json_data['tag']
            movie.save()
            data = {'message': "Success"}
        else:
            data = {'message': "Movie not found ..."}
        return JsonResponse(data)

    def delete(self, request, pk):
        movies = list(Movie.objects.filter(pk=pk).values())
        if len(movies) > 0:
            Movie.objects.filter(pk=pk).delete()
            data = {'message': "Success"}
        else:
            data = {'message': "Movie not found ..."}
        return JsonResponse(data)
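Since the goal is getting the many-to-many actors into the JSON output, here is a minimal sketch (my own, not from the post) of one way the "list all movies" branch of get() could nest each movie's actors by name, assuming the Movie and Actor models above; list_movies_with_actors is a hypothetical helper whose return value would be passed to JsonResponse:

def list_movies_with_actors():
    # Sketch of the "list all movies" branch: nest each movie's actors by name.
    movies = []
    for movie in Movie.objects.prefetch_related('actors').order_by('pk'):
        movies.append({
            'title': movie.title,
            'synopsis': movie.synopsis,
            'genre': movie.genre,
            # Flatten the related Actor rows into plain strings
            'actors': list(movie.actors.values_list('full_name', flat=True)),
        })
    if movies:
        return {'message': "Success", 'movies': movies}
    return {'message': "Movies not found ..."}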
This is the error I get while running this code: "expected str, bytes or os.PathLike object, not _io.BytesIO".
Here is my APIView:
class UploadViewSet(APIView):
    parser_classes = (MultiPartParser, FormParser)
    permission_classes = (AllowAny,)

    def post(self, request, *args, **kwargs):
        file = BytesIO(request.FILES['file'].read())
        with open(file, "r") as csv_file:
            reader = csv.reader(csv_file)
            for row in reader():
                new_company = Company(
                    name=row['name'],
                    hr_name=row['hr_name'],
                    hr_email=row['hr_email'],
                    hr_verified=row['hr_verified'],
                    user_id=row['user_id'],
                    primary_phone=row['primary_phone'],
                    comments=row['comments'],
                )
                new_company.save()
        return Response({"status": "success"}, status.HTTP_201_CREATED)
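For reference, the error comes from passing a BytesIO object to open(), which expects a file path. A minimal sketch of reading the uploaded CSV in memory instead, assuming the Company model from the snippet and a CSV header row with those column names:

import csv
import io

from rest_framework import status
from rest_framework.parsers import FormParser, MultiPartParser
from rest_framework.permissions import AllowAny
from rest_framework.response import Response
from rest_framework.views import APIView

from .models import Company  # assuming Company lives in this app's models


class UploadViewSet(APIView):
    parser_classes = (MultiPartParser, FormParser)
    permission_classes = (AllowAny,)

    def post(self, request, *args, **kwargs):
        # Decode the uploaded bytes in memory instead of open()-ing a path.
        decoded = request.FILES['file'].read().decode('utf-8')
        # DictReader yields dict-style rows keyed by the CSV header,
        # matching the row['name'] style lookups above.
        reader = csv.DictReader(io.StringIO(decoded))
        for row in reader:
            Company.objects.create(
                name=row['name'],
                hr_name=row['hr_name'],
                hr_email=row['hr_email'],
                hr_verified=row['hr_verified'],
                user_id=row['user_id'],
                primary_phone=row['primary_phone'],
                comments=row['comments'],
            )
        return Response({"status": "success"}, status=status.HTTP_201_CREATED)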
I'm trying to put Python objects into a JSON file by fetching data from an API on one of the sites, but somehow when I run the code nothing gets put into the JSON file. The API is working well, and when I print the data out via json.load I do get output, but I have no idea why json.dump doesn't work.
Here is my code:
from django.shortcuts import render
import requests
import json
import datetime
import re


def index(request):
    now = datetime.datetime.now()
    format = "{}-{}-{}".format(now.year, now.month, now.day)
    source = []
    author = []
    title = []
    date = []
    url = "http://newsapi.org/v2/everything"
    params = {
        'q': 'bitcoin',
        'from': format,
        'sortBy': 'publishedAt',
        'apiKey': '1186d3b0ccf24e6a91ab9816de603b90'
    }
    response = requests.request("GET", url, params=params)
    for news in response.json()['articles']:
        matching = re.match("\d+-\d+-\d+", news['publishedAt'])
        if format == matching.group():
            source.append(news['source'])
            author.append(news['author'])
            title.append(news['title'])
            date.append(news['publishedAt'])
    data = \
        {
            'source': source,
            'author': author,
            'title': title,
            'date': date
        }
    with open('data.json', "a+") as fp:
        x = json.dump(data, fp, indent=4)
    return render(request, 'news/news.html', {'response': response})
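One thing worth checking, as a guess rather than a definite diagnosis: the relative path 'data.json' is resolved against the working directory of the Django process, and "a+" keeps appending, so the file being written may not be the file being inspected, and repeated appends would not leave a single valid JSON document either. A small sketch that writes the data dict built above to an explicit path in "w" mode:

import json
import os

# data is the dict built above; write it somewhere unambiguous in "w" mode so
# each run leaves exactly one valid JSON document.
output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.json')
with open(output_path, 'w') as fp:
    json.dump(data, fp, indent=4)
print(output_path)  # confirm which file is actually being written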
I'm using json to output a bunch of classes.
The problem I am trying to solve is: what is the best way to output JSON for a nested class? The following simple example highlights the problem.
# python 3.x
import json


class Class1:
    def __init__(self):
        self.a = 'this'
        self.b = 33

    def toJSON(self):
        return json.dumps({'a': self.a,
                           'b': self.b},
                          sort_keys=False, indent=2, separators=(',', ': '))


class Class2:
    def __init__(self):
        self.x = 'hello'
        self.y = Class1()
        self.z = 'world'

    def toJSON(self):
        return json.dumps({'x': self.x,
                           'y': self.y.toJSON(),
                           'z': self.z},
                          sort_keys=False, indent=2, separators=(',', ': '))


if __name__ == '__main__':
    a = Class2()
    print(a.toJSON())
This is the output I'm getting, which contains escaped control characters:
{
  "x": "hello",
  "y": "{\n \"a\": \"this\",\n \"b\": 33\n}",
  "z": "world"
}
The output I want would look something like this:
{
  "x": "hello",
  "y": {
    "a": "this",
    "b": 33
  },
  "z": "world"
}
Any assistance would be greatly appreciated.
I think you could implement a custom encoder class and pass it as the cls argument to the json.dumps function, as in the following example:
import json


class Class1:
    def __init__(self):
        self.a = 'this'
        self.b = 33


class Class2:
    def __init__(self):
        self.x = 'hello'
        self.y = Class1()
        self.z = 'world'


class ComplexEncoder(json.JSONEncoder):
    def default(self, obj):
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        else:
            return json.JSONEncoder.default(self, obj)


if __name__ == '__main__':
    a = Class2()
    print(json.dumps(a, cls=ComplexEncoder, sort_keys=False, indent=2, separators=(',', ': ')))
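Since the encoder falls back to each object's __dict__ recursively, running this should print the nested structure from the question:

{
  "x": "hello",
  "y": {
    "a": "this",
    "b": 33
  },
  "z": "world"
}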
You are dumping the nested object as a JSON string. Loading it back as JSON and then returning that would help:
# python 3.x
import json


class Class1:
    def __init__(self):
        self.a = 'this'
        self.b = 33

    def toJSON(self):
        return json.loads(json.dumps({'a': self.a,
                                      'b': self.b},
                                     sort_keys=False, indent=4, separators=(',', ': ')))


class Class2:
    def __init__(self):
        self.x = 'hello'
        self.y = Class1()
        self.z = 'world'

    def toJSON(self):
        return json.loads(json.dumps({'x': self.x,
                                      'y': self.y.toJSON(),
                                      'z': self.z},
                                     sort_keys=False, indent=2, separators=(',', ': ')))


if __name__ == '__main__':
    a = Class2()
    print(a.toJSON())
Output:
{'x': 'hello', 'y': {'a': 'this', 'b': 33}, 'z': 'world'}
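Note that toJSON() now returns a plain dict rather than a JSON string, which is why the printed output uses Python's repr. If pretty-printed JSON text is the goal, one option (a small addition, not part of the original answer) is to serialize the final dict once at the end:

print(json.dumps(a.toJSON(), indent=2))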
I am using Scrapy version 0.16.2 on Linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting this error, which blocks Scrapy (it hangs and doesn't finish automatically; only ^C stops it):
2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW, this worked in version 0.14.
Here is the code:
class MySpider(CrawlSpider):
    name = 'alrroya'

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05

    # Stay within these domains when crawling
    allowed_domains = []
    all_domains = {}
    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
    ]

    # How many pages crawled
    crawl_count = 0
    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        self.load_allowed_domains_and_start_urls()

    def allowed_to_start(self):
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = self.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()

            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                reason = True
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    reason = True
                else:
                    reason = False
        else:
            reason = True

        return reason

    def _spider_opened(self, spider):
        if spider is not self:
            return

        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()

            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None

            if reason_date and 'shutdown' in reason:
                f = open(checkfile, 'w')
                f.write('started\n')
                f.write(str(date.today()))
                f.close()
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    f = open(checkfile, 'w')
                    f.write('started\n')
                    f.write(str(date.today()))
                    f.close()
                else:
                    crawler.engine.close_spider(self, 'finished')
                    if jobdir and os.path.exists(jobdir):
                        shutil.rmtree(jobdir)
                    f = open(checkfile, 'w')
                    f.write('finished\n')
                    f.write(str(date.today()))
                    f.close()
                    os._exit(1)
        else:
            f = open(checkfile, 'w')
            f.write('started\n')
            f.write(str(date.today()))
            f.close()

    def _spider_closed(self, spider, reason):
        if spider is not self:
            return

        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''

        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        if 'shutdown' in reason:
            f = open(checkfile, 'w')
            f.write('shutdown\n')
            f.write(str(date.today()))
            f.close()
        else:
            if jobdir and os.path.exists(jobdir):
                shutil.rmtree(jobdir)
            f = open(checkfile, 'w')
            f.write('finished\n')
            f.write(str(date.today()))
            f.close()

    def _requests_to_follow(self, response):
        if getattr(response, 'encoding', None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []

    def make_requests_from_url(self, url):
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST)
            }
            response, content = http_client.request(url, method='HEAD', headers=headers)
            #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
            if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
                if self.allowed_to_start():
                    self.get_pdf_link(url)
            else:
                return CrawlSpider.make_requests_from_url(self, url)
        except Exception as ex:
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if url_domain:
            for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
                if url_domain.endswith(domain):
                    pre_and = False
                    pre_or = False
                    and_cond = True
                    or_cond = False
                    for path in paths:
                        if path[0:1] == '!':
                            pre_and = True
                            if path[1:] not in url_path:
                                and_cond = and_cond and True
                            else:
                                and_cond = and_cond and False
                        else:
                            pre_or = True
                            if path in url_path:
                                or_cond = or_cond or True
                            else:
                                or_cond = or_cond or False

                    if pre_and and pre_or:
                        if and_cond and or_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_and:
                        if and_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_or:
                        if or_cond:
                            self.pdf_process(source, url)
                            return
                    else:
                        self.pdf_process(source, url)
                        return

    def parse_crawled_page(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print 'Crawled %d pages' % crawl_count

        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)

        return Item()

    def load_allowed_domains_and_start_urls(self):
        day = timedelta(days=1)
        currdate = date.today()

        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)

        self.__class__.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                }
            }
        }

        for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
            self.__class__.allowed_domains.append(domain)

        self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])

    def pdf_process(self, source, url):
        print '!!! ' + source + ' ' + url
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py, change:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
To:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
I expect that the official Scrapy people will fix this eventually.
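As an alternative to patching the installed Scrapy egg, the same guard can probably live in the spider itself by overriding start_requests there. A minimal sketch, assuming the MySpider class from the question:

class MySpider(CrawlSpider):
    # ... existing attributes and methods from the question ...

    def start_requests(self):
        # Only hand real Request objects to the scheduler; make_requests_from_url()
        # above can return None (the PDF branch) or a list, which triggers the
        # 'NoneType' object has no attribute 'dont_filter' error.
        for url in self.start_urls:
            requests = self.make_requests_from_url(url)
            if requests is None:
                continue
            if isinstance(requests, list):
                for request in requests:
                    if request is not None:
                        yield request
            else:
                yield requests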