I'm trying to run the spider found in this crawler, and for simplicity's sake I'm using this start_url because it is just a list of 320 movies (so the crawler won't run for 5 hours as stated on the GitHub page).
I crawl using scrapy crawl imdb -o output.json, but the output.json file contains nothing. It has just a [ in it.
import scrapy
from texteval.items import ImdbMovie, ImdbReview
import urlparse
import math
import re
class ImdbSpider(scrapy.Spider):
    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = [
        # "http://www.imdb.com/chart/top",
        # "http://www.imdb.com/chart/bottom"
        "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
    ]
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.robotstxt.ROBOTSTXT_OBEY': True,
    }
    base_url = "http://www.imdb.com"

    def parse(self, response):
        movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href")
        for i in xrange(len(movies)):
            l = self.base_url + movies[i].extract()
            print l
            request = scrapy.Request(l, callback=self.parse_movie)
            yield request
        next = response.xpath("//*[@id='right']/span/a")[-1]
        next_url = self.base_url + next.xpath(".//@href")[0].extract()
        next_text = next.xpath(".//text()").extract()[0][:4]
        if next_text == "Next":
            request = scrapy.Request(next_url, callback=self.parse)
            yield request
        '''
        for sel in response.xpath("//table[@class='chart']/tbody/tr"):
            url = urlparse.urljoin(response.url, sel.xpath("td[2]/a/@href").extract()[0].strip())
            request = scrapy.Request(url, callback=self.parse_movie)
            yield request
        '''

    def parse_movie(self, response):
        movie = ImdbMovie()
        i1 = response.url.find('/tt') + 1
        i2 = response.url.find('?')
        i2 = i2 - 1 if i2 > -1 else i2
        movie['id'] = response.url[i1:i2]
        movie['url'] = "http://www.imdb.com/title/" + movie['id']
        r_tmp = response.xpath("//div[@class='titlePageSprite star-box-giga-star']/text()")
        if r_tmp is None or r_tmp == "" or len(r_tmp) < 1:
            return
        movie['rating'] = int(float(r_tmp.extract()[0].strip()) * 10)
        movie['title'] = response.xpath("//span[@itemprop='name']/text()").extract()[0]
        movie['reviews_url'] = movie['url'] + "/reviews"
        # Number of reviews associated with this movie
        n = response.xpath("//*[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/text()")
        if n is None or n == "" or len(n) < 1:
            return
        n = n[0].extract().replace("See all ", "").replace(" user reviews", "")\
            .replace(" user review", "").replace(",", "").replace(".", "").replace("See ", "")
        if n == "one":
            n = 1
        else:
            n = int(n)
        movie['number_of_reviews'] = n
        r = int(math.ceil(n / 10))
        for x in xrange(1, r):
            start = x * 10 - 10
            url = movie['reviews_url'] + "?start=" + str(start)
            request = scrapy.Request(url, callback=self.parse_review)
            request.meta['movieObj'] = movie
            yield request

    def parse_review(self, response):
        ranks = response.xpath("//*[@id='tn15content']/div")[0::2]
        texts = response.xpath("//*[@id='tn15content']/p")
        del texts[-1]
        if len(ranks) != len(texts):
            return
        for i in xrange(0, len(ranks) - 1):
            review = ImdbReview()
            review['movieObj'] = response.meta['movieObj']
            review['text'] = texts[i].xpath("text()").extract()
            rating = ranks[i].xpath(".//img[2]/@src").re("-?\\d+")
            if rating is None or rating == "" or len(rating) < 1:
                return
            review['rating'] = int(rating[0])
            yield review
Can someone tell me where I am going wrong?
In my opinion, this web site loads the list of movies with JavaScript. First, I suggest you check the output of movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href"). If you need JavaScript-rendered content, you can use WebKit in Scrapy as a downloader middleware.
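For example, a quick way to run that check (assuming Scrapy is installed locally) is the interactive shell; if the selector comes back empty, parse() never yields anything and output.json stays at just [:

scrapy shell "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
>>> response.xpath("//*[@id='main']/table/tr/td[3]/a/@href").extract()
>>> # an empty list here points at the selector or JS-rendered markup, not at the export pipeline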
Related
I have code to retrieve data from autoscout24.com for my thesis, which will be on used-car analytics. However, I cannot retrieve any data and the loops do not end, and I do not understand why. Can anyone help me? Here is the code.
import requests
from bs4 import BeautifulSoup

brand = []
model = []
price = []
total = []
a = 101
k = 100
l = 20000  # calculates between € 0 - € 2.000.000
j = 1
while j <= l:
    i = 1
    website = 'https://www.autoscout24.com/lst?sort=price&desc=0&cy=NL&atype=C&ustate=N%2CU&damaged_listing=exclude&powertype=kw&pricefrom=' + str(a) + '&priceto=' + str(a+k-1) + '&search_id=2gfy6suaasl&page='
    a = a + k
    j = j + 1
    while i <= 20:
        website = website + str(i)
        response = requests.get(website)
        soup = BeautifulSoup(response.content, 'html.parser')
        results = soup.find_all('div', {'class': 'ListItem_wrapper__J_a_C'})
        i = i + 1
        for result in results:
            brand.append(result.find('h2').get_text())
            model.append(result.find('span', {'class': 'ListItem_version__jNjur'}).get_text())
            price.append(result.find('p', {'class': 'Price_price__WZayw'}).get_text().strip())
            total.append(result.find('div', {'class': 'VehicleDetailTable_container__mUUbY'}).get_text())
I tried it on my own computer and on Google Colab, but I could not retrieve anything at all. If you request a page manually you can reach the data, but there are no results when you do it in a loop.
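One thing worth checking in the loop above: website is reused across the inner loop, so the page number keeps accumulating in the URL (…page=1, then …page=12, then …page=123, and so on), and requests sent with the default requests User-Agent are often blocked by listing sites even though the same URL works in a browser. A minimal sketch of rebuilding the URL for each page with a browser-like header (the header value and the 101-200 price range are just example values; the CSS class name is taken from the question):

import requests
from bs4 import BeautifulSoup

base = ('https://www.autoscout24.com/lst?sort=price&desc=0&cy=NL&atype=C'
        '&ustate=N%2CU&damaged_listing=exclude&powertype=kw'
        '&pricefrom={}&priceto={}&search_id=2gfy6suaasl&page={}')
headers = {'User-Agent': 'Mozilla/5.0'}  # example value; the default one may be blocked

for page in range(1, 21):
    url = base.format(101, 200, page)   # rebuild the full URL on every iteration
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find_all('div', {'class': 'ListItem_wrapper__J_a_C'})
    print(page, len(results))           # quick check that listings are actually coming back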
Why am I getting this error when I try to use writerow?
db.writerow(loanList[i])
TypeError: writerows() argument must be iterable
import csv

header = ['Name', 'Results']
file = open("loanResults.csv", "w", newline="")
a = 10000
b = [0.032, 0.043, 0.037, 0.043, 0.044, 0.029, 0.028, 0.030]
c = [6, 7, 4, 3, 4, 6, 7, 9]
loanList = []

def monthlyRepayment(principalAmount, interest, year):
    for i in range(len(b)):
        finalAmount = principalAmount * ((1 + interest[i]) ** year[i])
        monthlyAmount = (finalAmount / (year[i] * 12))
        loanList.append(round(monthlyAmount))
        print(round(monthlyAmount))

testCase1 = monthlyRepayment(a, b, c)

db = csv.writer(file)
db.writerow(header)
for i in range(len(loanList)):
    db.writerows(loanList[i])

file.close()
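For reference, csv.writer.writerows() expects an iterable of rows (for example a list of lists), while writerow() writes a single row; passing a plain number like loanList[i] to writerows() is what raises that TypeError. A minimal sketch under that reading, reusing loanList from above (the 'Loan 1', 'Loan 2', … labels are just illustrative):

import csv

with open("loanResults.csv", "w", newline="") as f:
    db = csv.writer(f)
    db.writerow(['Name', 'Results'])                     # writerow: one row at a time
    for i, amount in enumerate(loanList):
        db.writerow(['Loan {}'.format(i + 1), amount])
    # equivalent single call: writerows takes an iterable of whole rows
    # db.writerows([['Loan {}'.format(i + 1), a] for i, a in enumerate(loanList)])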
Hi,
I was wondering if I can load a page completely with Python, for example a hashtag page from Instagram.
There is code I tried, but it wouldn't load completely.
Here's my code:
import json
import re
import requests
x = input("Enter your hashtag: ")
response = requests.get('https://www.instagram.com/explore/tags/' + x + '/?__a=1')
if response.status_code == 404:
    print('page not found')
    input()
    exit()
data = response.text
x = re.findall("\"shortcode\":\"[^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\,]", data)
y = [i.split('"')[3] for i in x]
x = 0
z = len(y)
print(str(z) + ' Posts found')
while x < z:
    print('\r' + str(x) + ' posts done', end="")
    data = requests.get('https://www.instagram.com/p/' + y[x] + '/?__a=1')
    y[x] = data.text
    x = x + 1
print()
print('post link finished')
Usernames = []
Posts = []
Followers = []
Following = []
x = 0
while x < z:
    print('\r' + str(x) + ' Usernames done', end="")
    data = json.loads(y[x])
    Usernames.append(data['graphql']['shortcode_media']['owner']['username'])
    x = x + 1
print()
print('Usernames finished')
print(len(Usernames))
I want to get more usernames, like 100k or more; if you can help me with other libraries, that is fine too.
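For getting more posts, a very rough sketch of cursor-based pagination, assuming the old ?__a=1 JSON shape (the keys 'graphql', 'hashtag', 'edge_hashtag_to_media', 'page_info', 'end_cursor' and the max_id parameter are assumptions taken from that old shape; Instagram has since restricted this endpoint, so it may require login or no longer work at all):

import requests

def collect_shortcodes(tag, pages=5):
    # Walk the hashtag feed page by page using the cursor from the previous response.
    shortcodes = []
    end_cursor = ''
    for _ in range(pages):
        url = 'https://www.instagram.com/explore/tags/' + tag + '/?__a=1'
        if end_cursor:
            url += '&max_id=' + end_cursor
        media = requests.get(url).json()['graphql']['hashtag']['edge_hashtag_to_media']
        shortcodes += [edge['node']['shortcode'] for edge in media['edges']]
        if not media['page_info']['has_next_page']:
            break
        end_cursor = media['page_info']['end_cursor']
    return shortcodes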
I am trying to get a search function working. I have a database in Excel and would like to be able to search it while on the go. I am stuck trying to get the search form working. Below is the Python code for the search views:
from forms import ComicSearchForm


@app.route('/make_search', methods=['GET', 'POST'])
def search():
    search = ComicSearchForm(request.form)
    if request.method == 'GET, POST':
        return search_results(result)
    return render_template('search.html', form=search)


@app.route('/search')
# @login_required
def search_results(result):
    bob = create_bob('*', '*', '*', '*', '*', '*')
    bobby = []
    current_page = request.args.get('page', 1, type=int)
    per_page = 10
    end = (current_page * per_page) + 1
    if end > len(bob):
        end = len(bob)
    start = ((current_page - 1) * per_page) + 1
    sort_bob = sorted(bob, key=lambda v: (v.issue_type, v.publisher, v.sort, v.character, v.volume, v.issues,
                                          v.publication_date))
    if datetime.strptime(sort_bob[0][7], '%B, %Y') >= datetime.now():
        sort_bob = sorted(sort_bob, key=lambda v: (v.publication_date, '%B, %Y'))
    for result in bob[start:end]:
        if result.bob.series_title == str.find(''):
            bobby.append(result)
    next = str(current_page + 1) if end < len(bob) else '0'
    prev = str(current_page - 1)
    if not result:
        flash('No results found!')
        return redirect('make_search')
    else:
        # display results
        return render_template('search.html', bobby=bobby, header=original_header, next=next, prev=prev)
Here is the form:
from wtforms import Form, StringField, SelectField


class ComicSearchForm(Form):
    choices = [('Series Title', 'Series Title'),
               ('Author', 'Author'),
               ('Artist', 'Artist'),
               ('Publisher', 'Publisher'),
               ('Publication Date', 'Publication Date')]
    select = SelectField('Search for comics:', choices=choices)
    search = StringField('')
I am stuck trying to figure out this part:
for result in bob[start:end]:
    if result.bob.series_title == str.find(''):
        bobby.append(result)
This is where I currently think it needs work. I would love some ideas on where I need to go to make this work.
Thanks,
Zach
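For what it's worth, a hedged sketch of the kind of filtering that loop seems to be aiming for, assuming each record in bob exposes attributes matching the form's choices (series_title, author, artist, publisher, publication_date are assumed names here) and that the view passes in form.select.data and form.search.data:

def filter_bob(bob, selected, term):
    # Map the SelectField labels to assumed attribute names on each record.
    field_map = {
        'Series Title': 'series_title',
        'Author': 'author',
        'Artist': 'artist',
        'Publisher': 'publisher',
        'Publication Date': 'publication_date',
    }
    attr = field_map.get(selected, 'series_title')
    term = (term or '').lower()
    # Keep every record whose chosen field contains the search text.
    return [record for record in bob if term in str(getattr(record, attr, '')).lower()]

Something like bobby = filter_bob(bob, form.select.data, form.search.data) would then take the place of the str.find('') comparison.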
Basically, I want to convert the query_set to JSON, but I also want to add one more field, something like size = some number, to each record, which is not present in the query_set attributes (it is a computed attribute). Can you tell me how to do it?
query_set = PotholeCluster.objects.all().values('bearing', 'center_lat', 'center_lon', 'grid_id')
return JsonResponse(list(query_set), safe=False)
I tried the code below. It works, but I would like to know if there is any cleaner way to do this.
query_set = PotholeCluster.objects.all()
response_list = []
for pc in query_set:
    d = {}
    d['bearing'] = pc.get_bearing()
    d['center_lat'] = pc.center_lat
    d['center_lon'] = pc.center_lon
    d['grid_id'] = pc.grid_id
    d['size'] = pc.pothole_set.all().count()
    response_list.append(d)
serialized = json.dumps(response_list)
return HttpResponse(serialized, content_type='application/json')
class PotholeCluster(models.Model):
    center_lat = models.FloatField(default=0)
    center_lon = models.FloatField(default=0)
    snapped_lat = models.FloatField(default=0)
    snapped_lon = models.FloatField(default=0)
    size = models.IntegerField(default=-1)
    # avgspeed in kmph
    speed = models.FloatField(default=-1)
    # in meters
    accuracy = models.FloatField(default=-1)
    # avg bearing in degree
    bearing = models.FloatField(default=-1)
    grid = models.ForeignKey(
        Grid,
        on_delete=models.SET_NULL,
        null=True,
        blank=True
    )

    def __str__(self):
        raw_data = serialize('python', [self])
        output = json.dumps(raw_data[0]['fields'])
        return "pk = {}|{}".format(self.id, output)

    def get_bearing(self):
        if self.bearing != -1:
            return self.bearing
        potholes = self.pothole_set.all()
        bearings = [pothole.location.bearing for pothole in potholes]
        bearings.sort()
        i = 0
        if bearings[-1] >= 350:
            while bearings[-1] - bearings[i] >= 340:
                if bearings[i] <= 10:
                    bearings[i] += 360
                i += 1
        self.bearing = sum(bearings) / len(bearings) % 360
        self.save()
        return self.bearing

    def get_size(self):
        if self.size != -1:
            return self.size
        self.size = len(self.pothole_set.all())
        self.save()
        return self.size
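For what it's worth, a hedged sketch of one cleaner option is to let the database compute the count with an annotation (the annotation is named num_potholes here because the model already has a size field, and 'pothole' is assumed to be the reverse relation behind pothole_set); bearing would still need the get_bearing() logic if it has to be computed on the fly:

from django.db.models import Count
from django.http import JsonResponse

# One query: each row carries the count of related potholes alongside the stored fields.
query_set = (PotholeCluster.objects
             .annotate(num_potholes=Count('pothole'))
             .values('bearing', 'center_lat', 'center_lon', 'grid_id', 'num_potholes'))
return JsonResponse(list(query_set), safe=False)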