I'm trying to build a small app for a university project with Scrapy.
The spider is scraping the items, but my pipeline is not inserting the data into my MySQL database. To test whether the pipeline or the pymysql implementation is at fault, I wrote a test script:
Code Start
#!/usr/bin/python3
import pymysql
str1 = "hey"
str2 = "there"
str3 = "little"
str4 = "script"
db = pymysql.connect("localhost", "root", "**********", "stromtarife")
cursor = db.cursor()
cursor.execute("SELECT * FROM vattenfall")
cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (str1, str2, str3, str4))
cursor.execute("SELECT * FROM vattenfall")
data = cursor.fetchone()
print(data)
db.commit()
cursor.close()
db.close()
Code End
After I run this script my database has a new record, so it's not my pymysql.connect() function that is broken.
Here is my Scrapy code:
vattenfall_form.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from stromtarife.items import StromtarifeItem
from scrapy.http import FormRequest
class VattenfallEasy24KemptenV1500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v1500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "1500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item

class VattenfallEasy24KemptenV2500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v2500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "2500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item
process = CrawlerProcess()
process.crawl(VattenfallEasy24KemptenV1500Spider)
process.crawl(VattenfallEasy24KemptenV2500Spider)
process.start()
pipelines.py
import pymysql
from stromtarife.items import StromtarifeItem
class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        self.cursor.close()
        self.connection.close()
settings.py (I changed only this line):
ITEM_PIPELINES = {
    'stromtarife.pipelines.StromtarifePipeline': 300,
}
So what is wrong with my code? I couldn't figure it out and would be really happy if someone spots something I'm missing. Thanks in advance!
You should not close your pymysql connection every time you process an item.
You should write a close_spider method in your pipeline like this, so the connection is closed just once, at the end of the execution:
def close_spider(self, spider):
    self.cursor.close()
    self.connection.close()
Moreover, you need to return your item at the end of process_item.
Your pipelines.py file should look like this:
import pymysql
from stromtarife.items import StromtarifeItem
class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
UPDATE:
I tried your code; the problem is in the pipeline. There are two issues:
You try to insert the euro symbol €, and I think MySQL does not like it.
Your query string is not well built.
I managed to get things done by writing the pipeline like this:
def process_item(self, item, spider):
    query = """INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)""" % ("1", "2", "3", "4")
    self.cursor.execute(query)
    self.connection.commit()
    return item
I think you should remove the € from the prices you try to insert.
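For example, a minimal sketch of that cleanup, assuming the scraped prices look like "123,45 €" (clean_price is a hypothetical helper, not part of the original code):

def clean_price(value):
    # Hypothetical helper: drop the euro sign, trim whitespace and
    # turn the German decimal comma into a dot.
    if value is None:
        return None
    return value.replace('€', '').replace(',', '.').strip()

# usage inside process_item, before building the query
jahrespreis = clean_price(item['jahrespreis'])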
Hope this helps, let me know.
There is another problem with your scraper besides the fact that your SQL pipeline closes the SQL connection after writing the first item (as Adrien pointed out).
The other problem is: your scraper only scrapes a single item per results page (and also visits only one results page). I checked Vattenfall, and there are usually multiple results displayed; I guess you want to scrape them all.
That means you'll also have to iterate over the results on the page and create multiple items while doing so, as in the sketch below. The Scrapy tutorial gives a good explanation of how to do this: https://doc.scrapy.org/en/latest/intro/tutorial.html#extracting-quotes-and-authors
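As a rough sketch of that loop (the selectors below are placeholders, since I don't have the real Vattenfall results markup in front of me):

def parse(self, response):
    # Placeholder selector: inspect the actual results page and adjust.
    for result in response.xpath('//div[@class="result"]'):
        item = StromtarifeItem()
        item['tarif'] = result.xpath('.//h2/span/text()').extract_first()
        item['jahrespreis'] = result.xpath('.//table/tbody/tr[3]/td[2]/text()').extract_first()
        yield item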
First of all, in Code Start, print(data) must come after db.commit(); otherwise the data that was just inserted into your database will not show up in the print.
Lastly, judging by the names of your columns, it's probably an encoding issue if the idea above doesn't work.
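If it is an encoding issue, one thing to try (an assumption on my part, not a verified fix) is opening the connection with an explicit charset so MySQL accepts characters like €; the table's columns must use a matching collation as well:

# utf8mb4 covers all of Unicode; the columns need a utf8/utf8mb4 collation too
self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife",
                                  charset="utf8mb4")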
Related
I have a command where the user can enter an answer. If it is correct, the user is credited with points in a JSON file. But my update function seems to be broken, because after another correct execution a new entry is made in the JSON for the same user. However, I just want the points to update. Also, the JSON stops after the second entry about the user. What is wrong?
Code:
import json

from discord.ext import commands

correct_answers = "A"

# Open the JSON after start
def json_open():
    with open('users.json', 'r', encoding='utf-8') as f:
        users = json.load(f)
    return users

class Questions(commands.Cog, name='Question'):
    """Question bot"""

    def __init__(self, bot):
        super().__init__()
        self.bot = bot

    @commands.command()
    async def question(self, ctx, answer):
        self.question.enabled = False
        global correct_answers
        if correct_answers != answer:
            await ctx.author.send(f"You guessed {answer} which is **wrong**. Good luck next time!")
            await ctx.message.delete()
            return
        # OPEN JSON FILE, LOAD DATA
        with open('users.json', 'r') as f:
            users = json.load(f)
        await self.update_data(users, ctx.message.author)
        await self.add_experience(users, ctx.message.author, 10)
        with open('users.json', 'w') as f:
            json.dump(users, f)
        await ctx.message.delete()

    # UPDATE DATA
    async def update_data(self, users, user):
        if not user.id in users:
            users[user.id] = {}
            users[user.id]['Points'] = 0
            #users[user.id]['level'] = 1

    async def add_experience(self, users, user, exp):
        users[user.id]['Points'] += exp
It looks like the last two functions do not work. Or is the add_experience function not needed?
The JSON looks like this after the second execution:
{"MYID": {"Points": 10}, "MYIDAGAIN": {"Points": 10}}
The keys are converted into str when the file is saved (JSON object keys are always strings), so you have to update the functions a bit. To explain it better:
Turn user.id into a str.
async def update_data(self, users, user):
    key = str(user.id)
    if key not in users:
        users[key] = {}
        users[key]['Points'] = 0

async def add_experience(self, users, user, exp):
    users[str(user.id)]['Points'] += exp
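For illustration, a minimal round trip showing why an int key no longer matches after saving and reloading:

import json

users = {123: {"Points": 10}}            # int key in memory
restored = json.loads(json.dumps(users))
print(restored)                          # {'123': {'Points': 10}} -- the key is now a str
print(123 in restored)                   # False
print(str(123) in restored)              # True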
Maybe also have a look at the page where the problem is explained.
I think you have to convert the user ID to a string, like this:
users[str(user.id)]['Points'] += exp
I have code that fetches data from a MySQL database. It returns the values but not the keys. I need the keys so I can reference the fields later in JavaScript.
Instead of the keys, which are firstname, lastname, username and mail, it returns the positions 0, 1, 2, 3:
0: "Luka"
1: "Tubic"
2: "Tubex"
3: "test#mail.com"
It should return:
"firstname": "Luka"
"lastname": "Tubic"
"username": "Tubex"
"mail": "test@mail.com"
This is the Python script:
from flask import Flask
from flask import jsonify, request, redirect, url_for
import mysql.connector
import random
import string
import smtplib
app = Flask(__name__, static_folder='www')
def _connect():
    connection = mysql.connector.connect(host='localhost',
                                         user='root',
                                         password='root',
                                         auth_plugin='mysql_native_password',
                                         database='User')
    connection.commit()
    return connection

@app.route('/admin/users', methods=['GET', 'POST', 'OPTION'])
def dummyadmin():
    if request.method == 'GET':
        connection = _connect()
        c = connection.cursor(buffered=True)
        query = 'SELECT first_name,last_name,username,email from user.user'
        c.execute(query)
        users = c.fetchall()
        if users is None:
            return None
        connection.commit()
        c.close()
        print(users)
        return jsonify(users)
    if request.method == 'POST':
        connection = _connect()
        c = connection.cursor(buffered=True)

@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
    response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS')
    return response

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0')
The solution is documented: use a DictCursor.
c = connection.cursor(dictionary=True, buffered=True)
NB: by default a DB-API cursor yields tuples, which are indeed indexed by position, not by column name. This is a pretty standard and natural representation of relational data, and it is less costly than dicts.
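With that one change, the GET branch returns what the question expects. A sketch based on the code above (the route path is hypothetical, added just for illustration):

@app.route('/admin/users/dict', methods=['GET'])  # hypothetical extra route
def dummyadmin_dict():
    connection = _connect()
    c = connection.cursor(dictionary=True, buffered=True)
    c.execute('SELECT first_name, last_name, username, email FROM user.user')
    users = c.fetchall()    # now a list of dicts keyed by column name
    c.close()
    return jsonify(users)   # [{"first_name": "Luka", "last_name": "Tubic", ...}]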
MySQL has a function called json_object which takes a list of key/value arguments and returns a JSON object, as below. So in your case, instead of fetching the columns separately, you can consider getting a JSON object from the DB itself.
SELECT json_object('first_name',first_name,'last_name',last_name,'username',username,'email',email) as jsonString from user.user;
Output format:
{"first_name": "John", "last_name": "Edial", "username": "john.edial", "email": "john@wddil.com"}
I've tried to implement this pipeline in my spider.
After installing the necessary dependencies I am able to run the spider without any errors, but for some reason it doesn't write to my database.
I'm pretty sure something is going wrong with the database connection: when I enter a wrong password, I still don't get any error.
When the spider has scraped all the data, it needs a few minutes before it starts dumping the stats.
2017-08-31 13:17:12 [scrapy] INFO: Closing spider (finished)
2017-08-31 13:17:12 [scrapy] INFO: Stored csv feed (27 items) in: test.csv
2017-08-31 13:24:46 [scrapy] INFO: Dumping Scrapy stats:
Pipeline:
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log
SETTINGS = {}
SETTINGS['DB_HOST'] = 'mysql.domain.com'
SETTINGS['DB_USER'] = 'username'
SETTINGS['DB_PASSWD'] = 'password'
SETTINGS['DB_PORT'] = 3306
SETTINGS['DB_DB'] = 'database_name'
class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        print "init"
        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """ Cleanup function, called after crawling has finished to close open
        objects.
        Close ConnectionPool. """
        print "close"
        self.dbpool.close()

    def process_item(self, item, spider):
        print "process"
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        print "insert"
        result = tx.execute(
            " INSERT INTO matches(type,home,away,home_score,away_score) VALUES (soccer,"+item["home"]+","+item["away"]+","+item["score"].explode("-")[0]+","+item["score"].explode("-")[1]+")"
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        print "error"
        log.err(e)
Spider:
import scrapy
import dateparser
from crawling.items import KNVBItem
class KNVBspider(scrapy.Spider):
    name = "knvb"
    start_urls = [
        'http://www.knvb.nl/competities/eredivisie/uitslagen',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawling.pipelines.MySQLPipeline': 301,
        }
    }

    def parse(self, response):
        # www.knvb.nl/competities/eredivisie/uitslagen
        for row in response.xpath('//div[@class="table"]'):
            for div in row.xpath('./div[@class="row"]'):
                match = KNVBItem()
                match['home'] = div.xpath('./div[@class="value home"]/div[@class="team"]/text()').extract_first()
                match['away'] = div.xpath('./div[@class="value away"]/div[@class="team"]/text()').extract_first()
                match['score'] = div.xpath('./div[@class="value center"]/text()').extract_first()
                match['date'] = dateparser.parse(div.xpath('./preceding-sibling::div[@class="header"]/span/span/text()').extract_first(), languages=['nl']).strftime("%d-%m-%Y")
                yield match
If there are better pipelines available for what I'm trying to achieve, those would be welcome as well. Thanks!
Update:
With the link provided in the accepted answer I eventually got to this function that's working (and thus solved my problem):
def process_item(self, item, spider):
    print "process"
    query = self.dbpool.runInteraction(self._insert_record, item)
    query.addErrback(self._handle_error)
    query.addBoth(lambda _: item)
    return query
Take a look at this for how to use adbapi with MySQL to save scraped items. Note the difference between your process_item and their process_item method implementation. While you return the item immediately, they return the Deferred object, which is the result of the runInteraction method and which returns the item upon its completion. I think this is the reason your _insert_record never gets called.
If you can see the insert in your output, that's already a good sign.
I'd rewrite the insert function this way:
def _insert_record(self, tx, item):
    print "insert"
    raw_sql = "INSERT INTO matches(type,home,away,home_score,away_score) VALUES ('%s', '%s', '%s', '%s', '%s')"
    # note: Python's string method is split, not explode
    sql = raw_sql % ('soccer', item['home'], item['away'], item['score'].split('-')[0], item['score'].split('-')[1])
    print sql
    result = tx.execute(sql)
    if result > 0:
        self.stats.inc_value('database/items_added')
It allows you to debug the SQL you're using. In your version you're not wrapping the strings in ', which is a syntax error in MySQL.
I'm not sure about your last values (the score), so I treated them as strings.
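A safer variant (a sketch, untested against this schema) is to pass the values as query parameters, so the driver does the quoting and escaping itself:

def _insert_record(self, tx, item):
    # Parameterized query: the driver quotes and escapes each value.
    home_score, away_score = item['score'].split('-')
    tx.execute(
        "INSERT INTO matches (type, home, away, home_score, away_score) "
        "VALUES (%s, %s, %s, %s, %s)",
        ('soccer', item['home'], item['away'], home_score, away_score))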
I am trying to get Scrapy to insert crawled data into MySQL. My code crawls fine and collects the data in the buffer, and it does not error, but the database is never updated. No luck, no error.
pipeline.py
from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors
from scrapy import log  # log is used below but was missing from the imports

class SQLStorePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='craigs',
            user='bra', passwd='boobs', cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8', use_unicode=True)

    def process_item(self, items, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, items)
        query.addErrback(self.handle_error)
        return items

    def _conditional_insert(self, tx, items):
        # create record if doesn't exist.
        # all this block runs in its own thread
        tx.execute("select * from scraped where link = %s", (items['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % items, level=log.DEBUG)
        else:
            tx.execute(
                "insert into scraped (posting_id, email, location, text, title) "
                "values (%s, %s, %s, %s, %s)",
                (items['posting_id'][0],
                 items['email'][1],
                 items['location'][2],
                 items['text'][3],
                 items['title'][4],
                )
            )
            log.msg("Item stored in db: %s" % items, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
crawl code
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigs.items import CraigsItem
class MySpider(CrawlSpider):
    name = "craigs"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    rules = [Rule(SgmlLinkExtractor(restrict_xpaths=('/html/body/blockquote[3]/p/a',)), follow=True, callback='parse_profile')]

    def parse_profile(self, response):
        items = []
        img = CraigsItem()
        hxs = HtmlXPathSelector(response)
        img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
        img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
        items.append(img)
        return items[0]
        return img[0]
settings.py
BOT_NAME = 'craigs'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['craigs.spiders']
NEWSPIDER_MODULE = 'craigs.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
The reason why the pipeline code is not being called at all is that it hasn't been activated. This activation is done by adding a new section to settings.py, as per the Item Pipelines page in the documentation, e.g.:
ITEM_PIPELINES = [
    'craigs.pipeline.SQLStorePipeline',
]
Additionally, your parse_profile function should just return img. You'd only build up an items list to return if a single response page resulted in multiple items.
Activate the pipeline in settings and use yield instead of return, as in the sketch below.
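For example (a sketch built from the question's own selectors), parse_profile could yield the item directly:

def parse_profile(self, response):
    hxs = HtmlXPathSelector(response)
    img = CraigsItem()
    img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
    img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
    yield img  # yield lets one callback emit any number of items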
You should COMMIT the current transaction, which makes the changes permanent.
So after
tx.execute(
    "insert into scraped (posting_id, email, location, text, title) "
    "values (%s, %s, %s, %s, %s)",
    (items['posting_id'][0],
     items['email'][1],
     items['location'][2],
     items['text'][3],
     items['title'][4],
    )
)
you have to
db.commit()
db here is something like
db = MySQLdb.connect(host="localhost",user = "root", passwd = "1234", db="database_name")
Please try it.
I am new to Scrapy; I have this spider code:
class Example_spider(BaseSpider):
    name = "example"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        yield self.make_requests_from_url("http://www.example.com/bookstore/new")

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
        for i in urls:
            yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)

    def parse_url(self, response):
        hxs = HtmlXPathSelector(response)
        main = hxs.select('//div[@id="bookshelf-bg"]')
        items = []
        for i in main:
            item = Exampleitem()
            item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
            item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
            items.append(item)
        return items
And the pipeline code is:
class examplePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='blurb',
            user='root',
            passwd='redhat',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        # run db query in thread pool
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        print "db connected-=========>"
        # create record if doesn't exist.
        tx.execute("select * from example_book_store where book_name = %s", (item['book_name']) )
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""INSERT INTO example_book_store (book_name,price)
                          VALUES (%s,%s)""",
                       (item['book_name'], item['price'])
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
After running this I get the following error:
exceptions.NameError: global name 'Exampleitem' is not defined
I get the above error when I add the line below to the process_item method:
assert isinstance(item, Exampleitem)
and without adding this line I get:
exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone make this code run and make sure all the items are saved into the database?
Try the following code in your pipeline:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('host', 'user', 'passwd',
                                    'dbname', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
                                   VALUES (%s, %s)""",
                                (item['book_name'].encode('utf-8'),
                                 item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
Your process_item method should be declared as def process_item(self, item, spider): instead of def process_item(self, spider, item). You switched the arguments around.
This exception, exceptions.NameError: global name 'Exampleitem' is not defined, indicates you didn't import Exampleitem in your pipeline.
Try adding from myspiders.myitems import Exampleitem (with the correct names/paths, of course).
I think this way is better and more concise:
# Item
class pictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

# SQL
self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(id)s);"

# usage
cur.execute(self.save_picture, dict(item))

It's just like:

cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)" % {"url": someurl, "id": 1})
Because (you can read more about Items in Scrapy):
The Field class is just an alias to the built-in dict class and doesn’t provide any extra functionality or attributes. In other words, Field objects are plain-old Python dicts.
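For illustration, since an Item behaves like a dict, dict(item) hands the named-placeholder query exactly the mapping it expects (example values, not from the original post):

item = pictureItem(topic_id=1, url='http://example.com/x.jpg')
print(dict(item))   # {'topic_id': 1, 'url': 'http://example.com/x.jpg'}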