Doing a search of the database with Python/Flask, no SQL - HTML

I am trying to get a search function working. I have a database using Excel and would like to be able to search it while on the go. I am stuck trying to get the search form working. Below is the Python code for the search views:
from datetime import datetime

from flask import flash, redirect, render_template, request

from forms import ComicSearchForm


# app, create_bob and original_header are defined elsewhere in the project.
@app.route('/make_search', methods=['GET', 'POST'])
def search():
    search = ComicSearchForm(request.form)
    if request.method == 'POST':  # was 'GET, POST', which never matches
        return search_results(search)
    return render_template('search.html', form=search)


@app.route('/search')
# @login_required
def search_results(search):
    bob = create_bob('*', '*', '*', '*', '*', '*')
    bobby = []
    current_page = request.args.get('page', 1, type=int)
    per_page = 10
    end = (current_page * per_page) + 1
    if end > len(bob):
        end = len(bob)
    start = ((current_page - 1) * per_page) + 1
    sort_bob = sorted(bob, key=lambda v: (v.issue_type, v.publisher, v.sort, v.character, v.volume, v.issues,
                                          v.publication_date))
    if datetime.strptime(sort_bob[0][7], '%B, %Y') >= datetime.now():
        sort_bob = sorted(sort_bob, key=lambda v: (v.publication_date, '%B, %Y'))
    for result in bob[start:end]:
        if result.bob.series_title == str.find(''):
            bobby.append(result)
    next = str(current_page + 1) if end < len(bob) else '0'
    prev = str(current_page - 1)
    if not result:
        flash('No results found!')
        return redirect('make_search')
    else:
        # display results
        return render_template('search.html', bobby=bobby, header=original_header, next=next, prev=prev)
Here is the form:
from wtforms import Form, StringField, SelectField


class ComicSearchForm(Form):
    choices = [('Series Title', 'Series Title'),
               ('Author', 'Author'),
               ('Artist', 'Artist'),
               ('Publisher', 'Publisher'),
               ('Publication Date', 'Publication Date')]
    select = SelectField('Search for comics:', choices=choices)
    search = StringField('')
I am stuck trying to figure out:
for result in bob[start:end]:
    if result.bob.series_title == str.find(''):
        bobby.append(result)
This is where I currently think it needs work. I would love some ideas on where I need to go to make this work.
Thanks,
Zach
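For the loop that is causing trouble, here is a minimal sketch of one way it could work, assuming the submitted ComicSearchForm is passed in and the records in bob expose attributes matching the dropdown choices (series_title appears in the question; the other attribute names are assumptions):

# Hypothetical mapping from the dropdown labels to record attributes.
FIELD_MAP = {
    'Series Title': 'series_title',
    'Author': 'author',
    'Artist': 'artist',
    'Publisher': 'publisher',
    'Publication Date': 'publication_date',
}


def matching_records(bob, form):
    """Return the records whose chosen field contains the search text."""
    attr = FIELD_MAP.get(form.select.data, 'series_title')
    term = (form.search.data or '').strip().lower()
    if not term:
        return list(bob)
    return [record for record in bob
            if term in str(getattr(record, attr, '')).lower()]

Inside search_results, bobby = matching_records(bob, search) would then replace the str.find('') comparison, and the pagination slice can be applied to bobby afterwards.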

Related

Programmatic access to SQLModel class attributes

I am trying to plug a SQLModel table into a Dash data table, handling pagination, filtering and page count in the backend, as explained here: https://dash.plotly.com/datatable/callbacks
Working code
from sqlmodel import SQLModel
from sqlmodel import select
from sqlmodel import col
from sqlmodel import Session
from typing import Dict
from typing import List
from typing import Tuple


class Foo(SQLModel, table=True):
    bara: str
    barb: int


filters = {'bara': ('contains ', 'toto'), 'barb': ('>', 1)}


def filter_foos(filters: Dict[str, Tuple[str, str]]):
    """
    try to filter foos
    """
    query = select(Foo)
    if values := filters.get('bara'):
        query = query.where(col(Foo.bara).contains(values[1]))
    if values := filters.get('barb'):
        if values[0] == '>=':
            query = query.where(col(Foo.barb) >= values[1])
        elif values[0] == '<=':
            query = query.where(col(Foo.barb) <= values[1])
        elif values[0] == '!=':
            query = query.where(col(Foo.barb) != values[1])
        elif values[0] == '=':
            query = query.where(col(Foo.barb) == values[1])
        elif values[0] == '>':
            query = query.where(col(Foo.barb) > values[1])
        elif values[0] == '<':
            query = query.where(col(Foo.barb) < values[1])
    return query


def select_relevant_db_lines(
    session: Session,
    limit: int,
    offset: int,
    filters: Dict[str, Tuple[str, str]]
) -> List:
    """
    Select relevant row lines from Foo.
    """
    if limit is not None and offset is not None:
        return list(session.exec(filter_foos(filters).offset(offset * limit).limit(limit)))
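For reference, a minimal usage sketch of the code above (reusing Foo and select_relevant_db_lines as defined there), assuming an in-memory SQLite engine and assuming Foo also declares a primary key, which SQLModel tables require and the snippet omits:

from sqlmodel import SQLModel, Session, create_engine

# Assumes the Foo model above also declares a primary key, e.g.
#   id: Optional[int] = Field(default=None, primary_key=True)

engine = create_engine('sqlite://')  # in-memory database, for the example only
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Foo(bara='toto et plus', barb=3))
    session.add(Foo(bara='other', barb=0))
    session.commit()

    # page 0, 10 rows per page, with the filters dict from the question
    rows = select_relevant_db_lines(
        session, limit=10, offset=0,
        filters={'bara': ('contains ', 'toto'), 'barb': ('>', 1)},
    )
    print(rows)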
My issue is that the filter function is awfully ugly and not modular at all. If I have a new class
class Fooo(SQLModel, table=True):
    toto: str
    titi: int
    tutu: int
I will end up redoing the same filter_foos boilerplate code.
What I would like is a dictionary to access the Foo class attributes, something like (pseudo-code, does not work):
foo_attributes: Dict = {
    'bara': Foo.bara,
    'barb': Foo.barb,
}
That way I can separate the generic str/int/datetime handling from the models and then map it onto class attributes. Something like (pseudo-code, not working):
def filter_ints(query, model_field: ???, operator: str, value: int):
    """
    try to filter ints
    """
    if not operator or not value:
        return query
    if operator == '>=':
        query = query.where(col(model_field) >= value)
    elif operator == '<=':
        query = query.where(col(model_field) <= value)
    elif operator == '!=':
        query = query.where(col(model_field) != value)
    elif operator == '=':
        query = query.where(col(model_field) == value)
    elif operator == '>':
        query = query.where(col(model_field) > value)
    elif operator == '<':
        query = query.where(col(model_field) < value)
    return query


def filter_strs(query, model_field: ???, value: int):
    """
    try to filter strs
    """
    if not value:
        return query
    query = query.where(col(model_field).contains(value))


def filter_models(model: Any, filters: Dict[str, Tuple[str, str]]):
    """
    try to filter any model
    """
    query = select(model)
    if not filters:
        return query
    for key, (operator, value) in filters:
        update_query(query, model, key, operator, value)
Is it possible to do such a thing, or will I have to implement one ugly method after another each time I add a new table to show in my dash app?
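One possible direction, sketched here as an assumption rather than a definitive answer: SQLModel (like SQLAlchemy) exposes columns as class attributes, so getattr(model, field_name) already gives you the column object, and the operator dispatch can be shared across all models:

import operator
from typing import Any, Dict, Optional, Tuple

from sqlmodel import col, select

# Map operator strings to callables; 'contains' is meant for string columns.
OPERATORS = {
    '>=': operator.ge,
    '<=': operator.le,
    '!=': operator.ne,
    '=': operator.eq,
    '>': operator.gt,
    '<': operator.lt,
    'contains': lambda column, value: column.contains(value),
}


def filter_models(model: Any, filters: Optional[Dict[str, Tuple[str, Any]]]):
    """Build a SELECT on any SQLModel table from a {field: (operator, value)} dict."""
    query = select(model)
    for field_name, (op_name, value) in (filters or {}).items():
        # Dynamic attribute lookup replaces the hand-written per-model dict.
        column = col(getattr(model, field_name))
        op = OPERATORS.get(op_name)
        if op is not None and value is not None:
            query = query.where(op(column, value))
    return query


# usage, reusing Foo from the question:
# query = filter_models(Foo, {'bara': ('contains', 'toto'), 'barb': ('>', 1)})

The same filter_models then works for Fooo or any other table without new boilerplate, since only the filters dict changes.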

Wanting to write to a txt file with a Tkinter entry box

I'm just trying to allow users to write in their name and email address so it can then be written into a text file. There are no error messages that pop up; it just isn't writing to the file. Also, the message box isn't coming up with (+ aname + '\n' + fullemail + '\n'), it just comes up with the message. Cheers
import tkinter as tk  # shortening tkinter
import tkinter.messagebox as box
import csv
from tkinter import *


def store_customers():
    aname = name.get()
    aemail = email.get()
    aemailaddress = emailaddress.get()
    fullemail = aemail + aemailaddress
    print(fullemail)
    if (name == "" or email == ""):
        print('Error')
        messagebox.showerror('error', "their was some issur with your information")
        email.set('')
        name.set('')
    else:
        result = messagebox.askquestion('question', 'Your about to enter yor information \n' + aname + '\n' + fullemail + '\n')
        if (result == 'yes'):
            print('here')
            with open('customersdata.txt', 'a') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([aname, fullemail])
                csvfile.close()
        else:
            name.set('')
            email.set('')


name = StringVar()
email = StringVar()
emailaddress = StringVar()
name = tk.Entry(frame4, text="", bg='#F0EAD6', font=('Arial', 24))
name.place(x=400, y=600)
email = tk.Entry(frame4, text="", bg='#F0EAD6', font=('Arial', 24))
email.place(x=400, y=660)
list1 = ['@yahoo.com', '@bing.com', '@jpc.vic.edu.au', '@gmail.com', '@hotmail.com']
emailaddres = OptionMenu(frame4, emailaddress, *list1)
emailaddres.config(height=2)
emailaddress.set('@***.***.edu.au')
emailaddres.place(x=685, y=660)
storebtn = tk.Button(frame4, text='complete', bg='#F0EAD6', font=('Arial', 24), command=store_customers)
storebtn.place(x=430, y=700)
tk.Label(frame4, text="Your name", bg='#F0EAD6', font=('Arial', 24)).place(x=260, y=600)
tk.Label(frame4, text="email address", bg='#F0EAD6', font=('Arial', 24)).place(x=240, y=660)
frame4.mainloop()
Code with Tkinter.
import csv
import tkinter
import tkinter.messagebox


def save():
    nome_to_save = namevalue.get()
    email_to_save = emailvalue.get()
    if len(nome_to_save) == 0 or len(email_to_save) == 0:
        tkinter.messagebox.showinfo('Error', 'You need to enter name and e-mail!')
    else:
        with open('customersdata.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([nome_to_save, email_to_save])
            csvfile.close()
        tkinter.messagebox.showinfo('Success', f'Information of {nome_to_save} saved!')


window = tkinter.Tk()
window.title("Write CSV")
namevalue = tkinter.StringVar()
emailvalue = tkinter.StringVar()
tkinter.Label(window, text="Name").grid(row=0)
name = tkinter.Entry(window, textvariable=namevalue).grid(row=0, column=1)
tkinter.Label(window, text="E-mail").grid(row=1)
email = tkinter.Entry(window, textvariable=emailvalue).grid(row=1, column=1)
tkinter.Button(window, text="Save", command=save).grid(row=2)
tkinter.mainloop()
You probably receive some error, right?
You have imported tkinter.messagebox as box.
So replace messagebox.showerror and messagebox.askquestion with box.showerror and box.askquestion
PS: I am new here, so I cannot comment. Sorry!
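For illustration only, a small self-contained sketch of those aliased calls (the name and email strings are placeholders, not the question's variables):

import tkinter as tk
import tkinter.messagebox as box

# A root window must exist before a messagebox can be shown.
root = tk.Tk()
root.withdraw()  # hide the empty root window

box.showerror('error', 'there was some issue with your information')
result = box.askquestion('question', 'You are about to enter your information\nJane Doe\njane@example.com\n')
print(result)  # 'yes' or 'no'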
This code can help you.
import csv


def store_customers(name, email):
    if name == "" or email == "":
        print('Error, i need name and email')
    else:
        with open('customersdata.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([name, email])
            csvfile.close()


# store_customers('', '')
store_customers('Diego', 'diego@false.io')
store_customers('Ana Maria', 'ana.maria@myemail.io')
store_customers('Julia Neau', 'jneau@cool.io')

Load a page completely with requests in Python (or other ways)

Hi
I was wondering if I can load a page completely with Python, for example a hashtag page from Instagram.
There is code I tried, but it wouldn't load completely.
Here's my code:
import json
import re
import requests

x = input("Enter your hashtag: ")
response = requests.get('https://www.instagram.com/explore/tags/' + x + '/?__a=1')
if response.status_code == 404:
    print('page not found')
    input()
    exit()
data = response.text
x = re.findall("\"shortcode\":\"[^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\"][^\,]", data)
y = [i.split('"')[3] for i in x]
x = 0
z = len(y)
print(str(z) + ' Posts found')
while x < z:
    print('\r' + str(x) + ' posts done', end="")
    data = requests.get('https://www.instagram.com/p/' + y[x] + '/?__a=1')
    y[x] = data.text
    x = x + 1
print()
print('post link finished')
Usernames = []
Posts = []
Followers = []
Following = []
x = 0
while x < z:
    print('\r' + str(x) + ' Usernames done', end="")
    data = json.loads(y[x])
    Usernames.append(data['graphql']['shortcode_media']['owner']['username'])
    x = x + 1
print()
print('Usernames finished')
print(len(Usernames))
I want to get more usernames, like 100k or more. If you can help me with other libraries, that is fine too; the library isn't important.
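If the goal is simply more posts than the first page returns, the usual approach is cursor pagination. The sketch below is based on an assumption about the legacy ?__a=1 JSON layout (graphql.hashtag.edge_hashtag_to_media with page_info/end_cursor, paged via a max_id query parameter); Instagram has changed and restricted this endpoint over time, so these field names may no longer be returned:

import requests


def hashtag_shortcodes(tag, pages=5):
    """Collect post shortcodes for a hashtag by following end_cursor.

    Assumes the legacy ?__a=1 JSON structure; the keys below are an
    assumption and may not match what Instagram currently returns.
    """
    shortcodes, cursor = [], ''
    for _ in range(pages):
        url = 'https://www.instagram.com/explore/tags/{}/?__a=1&max_id={}'.format(tag, cursor)
        data = requests.get(url).json()
        media = data['graphql']['hashtag']['edge_hashtag_to_media']
        shortcodes += [edge['node']['shortcode'] for edge in media['edges']]
        page_info = media['page_info']
        if not page_info.get('has_next_page'):
            break
        cursor = page_info.get('end_cursor', '')
    return shortcodes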

How do I create a generated scan report for PDF in Arachni Web Application Security Scanner Framework?

I have been working on a project where I have to modify the Arachni Web Application Security Scanner Framework and get it to generate and download a scan report in PDF format. The web application itself is powered by Ruby on Rails. Here is the link to the application that I am talking about.
https://www.arachni-scanner.com/
Now, the default application generates five file formats: HTML (or HTML.zip), JSON, Marshal, YAML, and XML. I am trying to include PDF, and the PDF report has to look exactly like the contents of the HTML report, charts and all. The application folder has its own libraries directory. There is one library called Arachni-1.5.1, and I have come to the conclusion that it is the heart of where the generated file formats are made. I can give one example, specifically the library file that supposedly creates the HTML.zip report. Its code lives in the Arachni-1.5.1 library under a folder called reporters, and the file is named html.rb.
module TemplateUtilities
def base64_encode( string )
Base64.encode64( string ).gsub( /\n/, '' )
end
def normalize( str )
str.to_s.recode
end
def md( markdown )
html = Kramdown::Document.new( markdown ).to_html.recode
Loofah.fragment( html ).scrub!(:prune).to_s
end
def key_to_words( k )
k.to_s.capitalize.gsub( '_', ' ' )
end
def code_highlight( code, language = :html, options = {} )
return if !code
lines = CodeRay.scan( code.recode, language ).
html( css: :style ).lines.to_a
if options[:from]
from = [0, options[:from]].max
else
from = 0
end
if options[:to]
to = [lines.size, options[:to]].min
else
to = lines.size - 1
end
code = '<div class="code-container"><table class="CodeRay"><tbody><tr><td class="line-numbers"><pre>'
from.upto(to) do |i|
if options[:anchor_id]
line = "<a href='#{id_to_location "#{options[:anchor_id]}-#{i}"}'>#{i}</a>"
else
line = "#{i}"
end
if options[:breakpoint] && options[:breakpoint] == i
code << "<span class='breakpoint'>#{line}</span>"
else
code << line
end
code << "\n"
end
code << '</pre></td><td class="code"><pre>'
from.upto(to) do |i|
line = "<span id='#{options[:anchor_id]}-#{i}'>#{lines[i]}</span>"
if options[:breakpoint] && options[:breakpoint] == i
code << "<span class='breakpoint'>#{line}</span>"
else
code << line.to_s
end
end
code + '</pre></td></tr></tbody></table></div>'
end
def highlight_proof( string, proof )
proof = proof.to_s.recode
string = string.to_s.recode
return escapeHTML( string ) if proof.to_s.empty?
return escapeHTML( string ) if !string.include?( proof )
escaped_proof = escapeHTML( proof )
escaped_response_body = escapeHTML( string )
escaped_response_body.gsub(
escaped_proof,
"<span class=\"issue-proof-highlight\">#{escaped_proof}</span>"
)
end
def data_dump( data )
ap = AwesomePrint::Inspector.new( plain: true, html: true )
"<pre class='data-dump'>#{ap.awesome( data )}</pre>"
end
# Carefully escapes HTML and converts to UTF-8 while removing
# invalid character sequences.
def escapeHTML( str )
CGI.escapeHTML( normalize( str ) )
end
def highlight_issue_page_body( issue, span_class )
return escapeHTML( issue.page.body ) if !issue.page.body.include?( issue.proof )
escaped_proof = escapeHTML( issue.proof )
escaped_response_body = escapeHTML( issue.page.body )
escaped_response_body.gsub(
escaped_proof,
"<span class=\"#{span_class}\">#{escaped_proof}</span>"
)
end
def issue_location( issue )
id_to_location( issue_id( issue ) )
end
def issue_id( issue )
issue = report.issue_by_digest( issue.digest )
"issues-#{'un' if issue.untrusted?}trusted-severity-" <<
"#{issue.severity}-#{issue.check[:shortname]}-#{issue.digest}"
end
def id_to_location( id )
"#!/#{id.gsub( '-', '/' )}"
end
def erb( tpl, params = {} )
scope = TemplateScope.new( params )
tpl = tpl.to_s + '.erb' if tpl.is_a?( Symbol )
path = File.exist?( tpl ) ? tpl : TEMPLATE_DIR + tpl
ERB.new( IO.read( path ).recode ).result( scope.get_binding )
rescue
ap tpl
raise
end
end
include TemplateUtilities
class TemplateScope
include TemplateUtilities
ISSUES_URL = 'https://github.com/Arachni/arachni/issues'
def initialize( params = {} )
update params
update self.class.global_data
end
def update( params )
params.each { |name, value| self[name] = value }
self
end
def []=( name, value )
self.class.send( :attr_accessor, name )
instance_variable_set( "@#{name.to_s}", value )
self
end
def prep_description( str )
escapeHTML Arachni::Reporters::HTML.prep_description( str )
end
def get_plugin_info( name )
report.plugins[name.to_sym]
end
def js_multiline( str )
"\"" + normalize( str ).gsub( "\n", '\n' ) + "\""
end
def get_binding
binding
end
def self.global_data=( data )
@global_data = data
end
def self.global_data
@global_data
end
end
def global_data
grouped_issues = {
trusted: {},
untrusted: {}
}
Arachni::Issue::Severity::ORDER.each do |severity|
by_severity = report.issues.select { |i| i.severity.to_sym == severity }
next if by_severity.empty?
by_name = {}
by_severity.each do |issue|
by_name[issue.name] ||= []
by_name[issue.name] << issue
end
next if by_name.empty?
grouped_issues[:trusted][by_severity.first.severity] =
by_name.inject({}) do |h, (name, issues)|
i = issues.select(&:trusted?)
next h if i.empty?
h[name] = i
h
end
grouped_issues[:untrusted][by_severity.first.severity] =
by_name.inject({}) do |h, (name, issues)|
i = issues.select(&:untrusted?)
next h if i.empty?
h[name] = i
h
end
[:trusted, :untrusted].each do |t|
if grouped_issues[t][by_severity.first.severity].empty?
grouped_issues[t].delete by_severity.first.severity
end
end
end
[:trusted, :untrusted].each do |t|
grouped_issues.delete( t ) if grouped_issues[t].empty?
end
prepare_data.merge(
report: report,
grouped_issues: grouped_issues,
plugins: format_plugin_results
)
end
# Runs the HTML report.
def run
FileUtils.rm_rf outfile
print_line
print_status 'Creating HTML report...'
TemplateScope.global_data = global_data
tmpdir = "#{Arachni::Options.paths.tmpdir}/#{generate_token}/"
FileUtils.rm_rf tmpdir
FileUtils.mkdir_p tmpdir
FileUtils.mkdir_p "#{tmpdir}/js/lib"
FileUtils.mkdir_p "#{tmpdir}/css/lib"
FileUtils.cp_r "#{TEMPLATE_DIR}/fonts", "#{tmpdir}/"
FileUtils.cp_r "#{TEMPLATE_DIR}/js/lib", "#{tmpdir}/js/"
FileUtils.cp_r "#{TEMPLATE_DIR}/css/lib", "#{tmpdir}/css/"
%w(js/helpers.js js/init.js.erb js/charts.js.erb js/configuration.js.erb
css/main.css).each do |f|
if f.end_with? '.erb'
IO.write( "#{tmpdir}/#{f.split('.erb').first}", erb( "#{TEMPLATE_DIR}/#{f}" ) )
else
FileUtils.cp( "#{TEMPLATE_DIR}/#{f}" , "#{tmpdir}/#{f}" )
end
end
IO.write( "#{tmpdir}/index.html", erb( TEMPLATE_FILE ) )
compress( tmpdir, outfile )
FileUtils.rm_rf tmpdir
print_status "Saved in '#{outfile}'."
end
def self.info
{
name: 'HTML',
description: %q{Exports the audit results as a compressed HTML report.},
content_type: 'application/zip',
author: 'Tasos "Zapotek" Laskos <tasos.laskos@arachni-scanner.com>',
version: '0.4.3',
options: [
Options.outfile( '.html.zip' ),
Options.skip_responses
]
}
end
private
def compress( directory, archive )
Zip::File.open( archive, Zip::File::CREATE ) do |zipfile|
Dir[File.join(directory, '**', '**')].each do |file|
zipfile.add( file.sub( directory, '' ), file )
end
end
archive
end
def self.prep_description( str )
placeholder = '--' + rand( 1000 ).to_s + '--'
cstr = str.gsub( /^\s*$/xm, placeholder )
cstr.gsub!( /^\s*/xm, '' )
cstr.gsub!( placeholder, "\n" )
cstr.chomp
end
def prepare_data
graph_data = {
severities: {
Severity::HIGH.to_sym => 0,
Severity::MEDIUM.to_sym => 0,
Severity::LOW.to_sym => 0,
Severity::INFORMATIONAL.to_sym => 0
},
severity_for_issue: {},
severity_index_for_issue: {},
severity_regions: {},
issues: {},
issues_shortnames: Set.new,
trusted_issues: {},
untrusted_issues: {},
elements: {
Element::Form.type => 0,
Element::Form::DOM.type => 0,
Element::Link.type => 0,
Element::Link::DOM.type => 0,
Element::Cookie.type => 0,
Element::Cookie::DOM.type => 0,
Element::LinkTemplate.type => 0,
Element::LinkTemplate::DOM.type => 0,
Element::Header.type => 0,
Element::Body.type => 0,
Element::Path.type => 0,
Element::Server.type => 0,
Element::GenericDOM.type => 0,
Element::JSON.type => 0,
Element::XML.type => 0,
Element::UIInput::DOM.type => 0,
Element::UIForm::DOM.type => 0
},
verification: {
'Yes' => 0,
'No' => 0
},
trust: {
'Trusted' => 0,
'Untrusted' => 0
}
}
total_severities = 0
total_elements = 0
has_trusted_issues = false
has_untrusted_issues = false
last_severity = nil
report.issues.each.with_index do |issue, i|
graph_data[:severities][issue.severity.to_sym] += 1
total_severities += 1
graph_data[:issues][issue.name] ||= 0
graph_data[:issues][issue.name] += 1
graph_data[:elements][issue.vector.class.type] += 1
total_elements += 1
verification = issue.untrusted? ? 'Yes' : 'No'
graph_data[:verification][verification] += 1
graph_data[:untrusted_severities] ||= {}
graph_data[:untrusted_severities][issue.severity.to_sym] ||= 0
graph_data[:trusted_severities] ||= {}
graph_data[:trusted_severities][issue.severity.to_sym] ||= 0
graph_data[:trusted_issues][issue.name] ||= 0
graph_data[:untrusted_issues][issue.name] ||= 0
graph_data[:issues_shortnames] << issue.check[:shortname]
graph_data[:severity_for_issue][issue.check[:shortname]] = issue.severity.to_s
new_region = !graph_data[:severity_regions].include?( issue.severity.to_sym )
graph_data[:severity_regions][issue.severity.to_sym] ||= {}
graph_data[:severity_regions][issue.severity.to_sym][:class] =
"severity-#{issue.severity.to_sym}"
graph_data[:severity_regions][issue.severity.to_sym][:start] ||=
graph_data[:issues].size - 1
if new_region && last_severity
graph_data[:severity_regions][last_severity][:end] =
graph_data[:issues].size - 2
end
last_severity = issue.severity.to_sym
graph_data[:severity_index_for_issue][issue.name] =
Issue::Severity::ORDER.reverse.index( issue.severity.to_sym ) + 1
if issue.trusted?
has_trusted_issues = true
graph_data[:trust]['Trusted'] += 1
graph_data[:trusted_severities][issue.severity.to_sym] += 1
graph_data[:trusted_issues][issue.name] += 1
else
has_untrusted_issues = true
graph_data[:trust]['Untrusted'] += 1
graph_data[:untrusted_severities][issue.severity.to_sym] += 1
graph_data[:untrusted_issues][issue.name] += 1
end
end
graph_data[:issues_shortnames] = graph_data[:issues_shortnames].to_a
graph_data[:severity_regions] = graph_data[:severity_regions].values
{
graph_data: graph_data,
total_severities: total_severities,
total_elements: total_elements,
has_trusted_issues: has_trusted_issues,
has_untrusted_issues: has_untrusted_issues
}
end`
Now, how would I turn something like that into PDF? I should also note that any editing I apply in this library can cause the application to crash whenever I visit a scan page. The web application also comes with library Kramdown-1.4.1, which has some sort of PDF converter; but I am honestly not sure if it's the most ideal tool to use. I have looked up countless sources, and this does not seem to be a known issue to many.
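One pragmatic option to explore, since the PDF has to mirror the HTML report: generate the HTML.zip report exactly as the reporter above already does, unpack it, and feed index.html to an HTML-to-PDF converter such as wkhtmltopdf. Below is a rough sketch in Python (the language used elsewhere in this post) rather than a Rails-integrated solution; the file names are assumptions, wkhtmltopdf has to be installed separately, and its JavaScript rendering may not reproduce the report's charts faithfully:

import subprocess
import zipfile

# Hypothetical file names; the real ones come from the Arachni scan output.
report_zip = 'report.html.zip'
workdir = 'report_html'
pdf_out = 'report.pdf'

# The HTML reporter shown above writes a zip archive, so unpack it first.
with zipfile.ZipFile(report_zip) as archive:
    archive.extractall(workdir)

# --enable-local-file-access lets wkhtmltopdf read the bundled css/js/fonts
# that sit next to index.html (required by recent wkhtmltopdf versions).
subprocess.run(
    ['wkhtmltopdf', '--enable-local-file-access', workdir + '/index.html', pdf_out],
    check=True,
)

If the charts do not survive the conversion, rendering the same index.html with a headless browser (for example Chrome's --headless --print-to-pdf) is another route worth testing before writing a full Ruby PDF reporter.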

Scrapy returns no output - just a [

I'm trying to run the spider found in this crawler, and for simplicity's sake I'm using this start_url because it is just a list of 320 movies. (So the crawler won't run for 5 hours as stated on the GitHub page.)
I crawl using scrapy crawl imdb -o output.json, but the output.json file contains nothing. It has just a [ in it.
import scrapy
from texteval.items import ImdbMovie, ImdbReview
import urlparse
import math
import re


class ImdbSpider(scrapy.Spider):
    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = [
        # "http://www.imdb.com/chart/top",
        # "http://www.imdb.com/chart/bottom"
        "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
    ]
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.robotstxt.ROBOTSTXT_OBEY': True,
    }
    base_url = "http://www.imdb.com"

    def parse(self, response):
        movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href")
        for i in xrange(len(movies)):
            l = self.base_url + movies[i].extract()
            print l
            request = scrapy.Request(l, callback=self.parse_movie)
            yield request
        next = response.xpath("//*[@id='right']/span/a")[-1]
        next_url = self.base_url + next.xpath(".//@href")[0].extract()
        next_text = next.xpath(".//text()").extract()[0][:4]
        if next_text == "Next":
            request = scrapy.Request(next_url, callback=self.parse)
            yield request
        '''
        for sel in response.xpath("//table[@class='chart']/tbody/tr"):
            url = urlparse.urljoin(response.url, sel.xpath("td[2]/a/@href").extract()[0].strip())
            request = scrapy.Request(url, callback=self.parse_movie)
            yield request
        '''

    def parse_movie(self, response):
        movie = ImdbMovie()
        i1 = response.url.find('/tt') + 1
        i2 = response.url.find('?')
        i2 = i2 - 1 if i2 > -1 else i2
        movie['id'] = response.url[i1:i2]
        movie['url'] = "http://www.imdb.com/title/" + movie['id']
        r_tmp = response.xpath("//div[@class='titlePageSprite star-box-giga-star']/text()")
        if r_tmp is None or r_tmp == "" or len(r_tmp) < 1:
            return
        movie['rating'] = int(float(r_tmp.extract()[0].strip()) * 10)
        movie['title'] = response.xpath("//span[@itemprop='name']/text()").extract()[0]
        movie['reviews_url'] = movie['url'] + "/reviews"
        # Number of reviews associated with this movie
        n = response.xpath("//*[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/text()")
        if n is None or n == "" or len(n) < 1:
            return
        n = n[0].extract().replace("See all ", "").replace(" user reviews", "")\
            .replace(" user review", "").replace(",", "").replace(".", "").replace("See ", "")
        if n == "one":
            n = 1
        else:
            n = int(n)
        movie['number_of_reviews'] = n
        r = int(math.ceil(n / 10))
        for x in xrange(1, r):
            start = x * 10 - 10
            url = movie['reviews_url'] + "?start=" + str(start)
            request = scrapy.Request(url, callback=self.parse_review)
            request.meta['movieObj'] = movie
            yield request

    def parse_review(self, response):
        ranks = response.xpath("//*[@id='tn15content']/div")[0::2]
        texts = response.xpath("//*[@id='tn15content']/p")
        del texts[-1]
        if len(ranks) != len(texts):
            return
        for i in xrange(0, len(ranks) - 1):
            review = ImdbReview()
            review['movieObj'] = response.meta['movieObj']
            review['text'] = texts[i].xpath("text()").extract()
            rating = ranks[i].xpath(".//img[2]/@src").re("-?\\d+")
            if rating is None or rating == "" or len(rating) < 1:
                return
            review['rating'] = int(rating[0])
            yield review
Can someone tell me where I am going wrong?
In my opinion, this web site probably loads the list of movies using JS. Firstly, I suggest you check the output of: movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href"). If you want to get JS content, you can use WebKit in Scrapy as a downloader middleware.
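To check that first suggestion, a quick sketch of what to run in scrapy shell (a standard Scrapy tool; the URL is the question's start URL). If the list prints empty, the selector finds nothing in the static HTML, parse yields no requests, and output.json stays at just the opening [ :

# Run:  scrapy shell "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
# then, at the shell prompt:
links = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href").extract()
print(len(links))   # 0 means the XPath matches nothing on the downloaded page
print(links[:5])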