scrapy and mysql

I am trying to get Scrapy to insert crawled data into MySQL. My code crawls fine and collects the data in the buffer, and it does not error, but the database is never updated. No luck, no error.
pipeline.py
from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors

class SQLStorePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='craigs',
            user='bra', passwd='boobs', cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8', use_unicode=True)

    def process_item(self, items, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, items)
        query.addErrback(self.handle_error)
        return items

    def _conditional_insert(self, tx, items):
        # create record if doesn't exist.
        # all this block run on it's own thread
        tx.execute("select * from scraped where link = %s", (items['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % items, level=log.DEBUG)
        else:
            tx.execute(\
                "insert into scraped (posting_id, email, location, text, title) "
                "values (%s, %s, %s, %s, %s)",
                (items['posting_id'][0],
                 items['email'][1],
                 items['location'][2],
                 items['text'][3],
                 items['title'][4],
                )
            )
            log.msg("Item stored in db: %s" % items, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
crawl code
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigs.items import CraigsItem

class MySpider(CrawlSpider):
    name = "craigs"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    rules = [Rule(SgmlLinkExtractor(restrict_xpaths=('/html/body/blockquote[3]/p/a',)), follow=True, callback='parse_profile')]

    def parse_profile(self, response):
        items = []
        img = CraigsItem()
        hxs = HtmlXPathSelector(response)
        img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
        img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
        items.append(img)
        return items[0]
        return img[0]
settings.py
BOT_NAME = 'craigs'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['craigs.spiders']
NEWSPIDER_MODULE = 'craigs.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

The reason why the pipeline code is not being called at all is that it hasn't been activated. This activation is done by adding a new section to settings.py, as per the Item Pipelines page in the documentation, e.g.:
ITEM_PIPELINES = [
    'craigs.pipeline.SQLStorePipeline',
]
Additionally, your parse_profile function should just return img. You'd only add an items list to return if a single response page would result in multiple items.
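As a rough sketch of those two changes together (the pipeline path is taken from the ITEM_PIPELINES entry above; everything else stays as in the question):
# settings.py -- activate the pipeline
ITEM_PIPELINES = [
    'craigs.pipeline.SQLStorePipeline',
]

# spider -- return (or yield) the single item instead of wrapping it in a list
def parse_profile(self, response):
    img = CraigsItem()
    hxs = HtmlXPathSelector(response)
    img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
    img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
    return img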

Activate the pipeline in settings.py and use yield instead of return.

You should COMMIT the current transaction, which makes the changes permanent.
So after
tx.execute(\
    "insert into scraped (posting_id, email, location, text, title) "
    "values (%s, %s, %s, %s, %s)",
    (items['posting_id'][0],
     items['email'][1],
     items['location'][2],
     items['text'][3],
     items['title'][4],
    )
)
you have to call
db.commit()
where db is something like
db = MySQLdb.connect(host="localhost", user="root", passwd="1234", db="database_name")
Please try it.
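A minimal sketch of that approach (a plain blocking MySQLdb connection with an explicit commit instead of the adbapi pool; credentials are placeholders, table and column names are taken from the question):
import MySQLdb

class SQLStorePipeline(object):
    def __init__(self):
        # plain blocking connection; replace the credentials with your own
        self.db = MySQLdb.connect(host="localhost", user="root", passwd="1234",
                                  db="craigs", charset="utf8", use_unicode=True)
        self.cursor = self.db.cursor()

    def process_item(self, items, spider):
        self.cursor.execute(
            "insert into scraped (posting_id, email, location, text, title) "
            "values (%s, %s, %s, %s, %s)",
            (items['posting_id'][0], items['email'][1], items['location'][2],
             items['text'][3], items['title'][4]))
        self.db.commit()  # make the insert permanent
        return items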

Related

FastAPI not running all the functions to return the right values from database

I am trying to make a Twitter points program. Basically, you get points based on the number of likes, retweets and replies your post with a specified hashtag gets. I made an API to get these points from a database, but FastAPI is not running all the functions specified to return the correct values.
API code:
DATABASE_URL = "mysql+mysqlconnector://root:password@localhost:3306/twitterdb"
database = Database(DATABASE_URL)
metadata_obj = MetaData()
engine = create_engine(
    DATABASE_URL, connect_args={"check_same_thread": False}
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
metadata = sqlalchemy.MetaData()
Base = declarative_base()
user_points = sqlalchemy.Table(
    "points",
    metadata_obj,
    sqlalchemy.Column("username", sqlalchemy.String,),
    sqlalchemy.Column("rt_points", sqlalchemy.Integer,),
    sqlalchemy.Column("reply_points", sqlalchemy.Integer),
    sqlalchemy.Column("like_points", sqlalchemy.Integer),
    sqlalchemy.Column("total_points", sqlalchemy.Integer)
)
engine = sqlalchemy.create_engine(
    DATABASE_URL
)
metadata.create_all(engine)
app = FastAPI()

@app.on_event("startup")
async def connect():
    await database.connect()

@app.on_event("shutdown")
async def shutdown():
    await database.disconnect()

class UserName(BaseModel):
    rt_points: int
    reply_points: int
    like_points: int
    total_points: int

@app.get('/userdata/', response_model=UserName)
async def get_points(user: str):
    username = user
    metrics.clear()
    tweets_list = tweet_id(username)
    tweets_list.get_tweet_ids(str(username))
    metrics.main()
    summing = summer(username)
    summing.sum_fun(str(username))
    query = user_points.select().where(user_points.c.username == username)
    user = await database.fetch_one(query)
    return {**user}

if __name__ == "__main__":
    uvicorn.run("main:app", reload=True, host="127.0.0.1", port=5000, log_level="info")
code for metrics.py:
ids = []

class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='password')
            cursor = connection.cursor()
            query = "truncate table twitterdb.points"
            query1 = "truncate table twitterdb.Metrics"
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(query)
            cursor.execute(query1)
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        finally:
            if connection.is_connected():
                cursor.close()
                connection.close()

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    converted_list = [str(element) for element in ids]
    id_list = ",".join(converted_list)
    url = "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)
    return url

# curl 'https://api.twitter.com/2/tweets?ids=1459764778088337413&tweet.fields=public_metrics&expansions=attachments.media_keys&media.fields=public_metrics' --header 'Authorization: Bearer $Bearer

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {} {}".format(
                response.status_code, response.text, ids
            )
        )
        return url
    return response.json()

def main():
    def append_to_database(json_response):
        # Loop through each tweet
        for tweet in json_response['data']:
            # Tweet ID
            tweetid = tweet['id']
            # Tweet metrics
            retweet_count = tweet['public_metrics']['retweet_count']
            reply_count = tweet['public_metrics']['reply_count']
            like_count = tweet['public_metrics']['like_count']
            quote_count = tweet['public_metrics']['quote_count']
            connect(tweetid, retweet_count, reply_count, like_count, quote_count)

    def connect(tweetid, retweet_count, reply_count, like_count, quote_count):
        """
        connect to MySQL database and insert twitter data
        """
        try:
            con = mysql.connector.connect(host='localhost',
                database='twitterdb', user='root', password='passsword', charset='utf8')
            if con.is_connected():
                """
                Insert twitter data
                """
                cursor = con.cursor(buffered=True)
                # twitter, golf
                delete_previous_data_query = "truncate table Metrics"
                query = "INSERT INTO Metrics (tweetid,retweet_count,reply_count,like_count,quote_count) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(delete_previous_data_query)
                cursor.execute(query, (tweetid, retweet_count, reply_count, like_count, quote_count))
                con.commit()
        except Error as e:
            print(e)
        cursor.close()
        con.close()
        return

    url = create_url()
    json_response = connect_to_endpoint(url)
    append_to_database(json_response)

# Function to calculate sum of points and display it
class summer:
    def __init__(self, name):
        self.name = name

    def sum_fun(self, name):
        try:
            con = mysql.connector.connect(host='localhost',
                database='twitterdb', user='root', password='password', charset='utf8')
            if con.is_connected():
                cursor = con.cursor(buffered=True)

                def create_points_table():
                    query = ("INSERT INTO twitterdb.points(username, rt_points,reply_points,like_points,total_points) (SELECT %s, SUM(quote_count + retweet_count) * 150, SUM(reply_count) * 50, SUM(like_count) * 10, SUM(quote_count + retweet_count) * 150 + SUM(reply_count) * 50 + SUM(like_count) * 10 FROM twitterdb.Metrics)")
                    cursor.execute(query, (name,))
                    con.commit()

                create_points_table()
        except Error as e:
            print(e)
        cursor.close()
        con.close()

def clear():
    """
    connect to MySQL database and insert twitter data
    """
    try:
        con = mysql.connector.connect(host='localhost',
            database='twitterdb', user='root', password='password', charset='utf8')
        if con.is_connected():
            cursor = con.cursor(buffered=True)
            clear_points = ("truncate table twitterdb.points")
            cursor.execute(clear_points)
    except Error as e:
        print(e)
    cursor.close()
    con.close()
    return
What happens here is that there's a database named twitterdb with the tables StreamData, Metrics, and points.
StreamData contains the tweet ids and usernames of the posts that were tweeted with the specified hashtag, and it is built with the Streaming API.
The issue is this: suppose I have the usernames mark and ramon in the StreamData table. When I input the username mark via the API, no issues happen and it returns the correct points for mark, but if I then enter something like mark1 or any random value, it returns the points for mark again. If I then enter ramon it gives the right points for ramon, but if I enter random values again, I get the same points as ramon.
Furthermore, the first time the API is started, if I enter a random value it returns the error specified in the exception defined in the connect_to_endpoint function.
The code logic is:
I enter a username via the API, and the get_tweet_ids function looks for that username in the StreamData table, selects all the tweet ids corresponding to that username and saves them to a list, ids. This list of ids is given to the Twitter metrics API endpoint and the required values from the response are saved to the table Metrics.
Then, sum_fun is called to select the sums of likes, RTs and replies from the Metrics table, multiply them by the specified points and save the result to the table points along with the username.
Finally, the API returns the values in the table points matching the username.
How can I get it to stop returning values for random data? If invalid data is given, it should raise the exception in the connect_to_endpoint function, but instead it just returns whatever value was previously in the table points.
I tried multiple approaches, like clearing the values of points before all the other functions run, and checking to return only the values corresponding to the username in the points table, but neither of them worked. When the points table was checked after running with random values, it contained the random value but with the points of the previous valid username.
NOTE: The table points is a temporary table and values are assigned only when an API call is made.
I am a complete beginner to all this and this is more of a pet project I have been working on, so please help out. Any and all help and guidance regarding my logic and design, and a fix for this, will be of much use. Thanks.
If the code you have provided for metrics.py is correct, your problem most likely comes from how you declare the variable ids.
In your code you have declared it as a global, so it will not be reset on every function call or class instance creation.
What you should do is declare it inside get_tweet_ids():
class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        ids = []  # modification here
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='password')
            cursor = connection.cursor()
            query = "truncate table twitterdb.points"
            query1 = "truncate table twitterdb.Metrics"
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(query)
            cursor.execute(query1)
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
            return ids  # modification here
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        finally:
            if connection.is_connected():
                cursor.close()
                connection.close()
With this you will have a new instance of ids on every get_tweet_ids call.
You will have to change the rest of your code to work with this return statement, for example as sketched below.
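A rough sketch of that follow-up change (my own adaptation of the question's create_url and endpoint; the helper names are the question's, the new ids parameter is mine):
def create_url(ids):
    # build the URL from the ids returned by get_tweet_ids instead of a module-level global
    tweet_fields = "tweet.fields=public_metrics"
    id_list = ",".join(str(element) for element in ids)
    return "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)

# in the FastAPI endpoint (sketch):
# ids = tweet_id(username).get_tweet_ids(str(username))
# metrics.main(ids)   # main() would also need to accept the ids and pass them to create_url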

Why isn't fastAPI making the database and returning the result here?

Long code ahead, kindly help out.
I am trying to create a point system for tweets. I have streamed tweets with #Python to a MySQL database and I am trying to create a points system for the same.
from typing_extensions import Self
import requests
import os
import json
import mysql.connector
from mysql.connector import Error

bearer_token = "$Bearer"  # Getting tweet ids of specified user from database
ids = []

class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='pasword#123')
            cursor = connection.cursor()
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        """finally:
            if connection.is_connected():
                cursor.close()
                connection.close()"""

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    converted_list = [str(element) for element in ids]
    id_list = ",".join(converted_list)
    url = "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)
    return url

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {} {}".format(
                response.status_code, response.text, ids
            )
        )
        return url
    return response.json()

def main():
    # def __init__(connect, append_to_database):
    #     Self.connect = connect
    #     Self.append_to_database = append_to_database

    def connect(tweetid, retweet_count, reply_count, like_count, quote_count):
        """
        connect to MySQL database and insert twitter data
        """
        try:
            con = mysql.connector.connect(host='localhost',
                database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                """
                Insert twitter data
                """
                cursor = con.cursor(buffered=True)
                # twitter, golf
                query = "INSERT INTO Metrics (tweetid,retweet_count,reply_count,like_count,quote_count) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(query, (tweetid, retweet_count, reply_count, like_count, quote_count))
                con.commit()
        except Error as e:
            print(e)
        cursor.close()
        con.close()
        return

    def append_to_database(json_response):
        # Loop through each tweet
        for tweet in json_response['data']:
            # Tweet ID
            tweetid = tweet['id']
            # Tweet metrics
            retweet_count = tweet['public_metrics']['retweet_count']
            reply_count = tweet['public_metrics']['reply_count']
            like_count = tweet['public_metrics']['like_count']
            quote_count = tweet['public_metrics']['quote_count']
            connect(tweetid, retweet_count, reply_count, like_count, quote_count)

    url = create_url()
    json_response = connect_to_endpoint(url)
    append_to_database(json_response)

# function for connecting and inserting to database
# Function to calculate sum of points and display it
class summer:
    like_points = 0
    reply_points = 0
    total_rts = 0
    rt_points = 0
    total = 0

    def sum_fun():
        try:
            con = mysql.connector.connect(host='localhost',
                database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                cursor = con.cursor(buffered=True)

                def sum_rts():
                    cursor.execute("SELECT SUM(retweet_count) FROM twitterdb.Metrics")
                    sum1 = cursor.fetchall()[0][0]
                    if sum1 is None:
                        return 0
                    else:
                        return int(sum1)

                def sum_replies():
                    cursor.execute("SELECT SUM(reply_count) FROM twitterdb.Metrics")
                    sum2 = cursor.fetchall()[0][0]
                    if sum2 is None:
                        return 0
                    else:
                        return int(sum2)

                def sum_likes():
                    cursor.execute("SELECT SUM(like_count) FROM twitterdb.Metrics")
                    sum3 = cursor.fetchall()[0][0]
                    if sum3 is None:
                        return 0
                    else:
                        return int(sum3)

                def sum_qts():
                    cursor.execute("SELECT SUM(quote_count) FROM twitterdb.Metrics")
                    sum4 = cursor.fetchall()[0][0]
                    if sum4 is None:
                        return 0
                    else:
                        return int(sum4)

                like_points = (20 * (sum_likes()))
                reply_points = (100 * (sum_replies()))
                total_rts = (sum_rts() + sum_qts())
                rt_points = (300 * total_rts)
                total = (like_points + reply_points + rt_points)
                return total
                # print("Like Points:", like_points)
                # print("Reply Points:", reply_points)
                # print("Retweet Points:", rt_points)
                # print("Total Points:", total)
                # print(points)
        except Error as e:
            print(e)
        cursor.close()
        con.close()

def clear():
    """
    connect to MySQL database and insert twitter data
    """
    try:
        con = mysql.connector.connect(host='localhost',
            database='twitterdb', user='root', password='Mysql#123', charset='utf8')
        if con.is_connected():
            cursor = con.cursor(buffered=True)
            cursor.execute("truncate table twitterdb.Metrics")
    except Error as e:
        print(e)
    # cursor.close()
    # con.close()
    return
Furthermore, I have created an API with FastAPI to trigger all the functionality in the above script and get the outputs (like_points, reply_points, rt_points and total) sent via the API. The API accepts the value username via a POST request and triggers the script.
API code:
from fastapi import FastAPI
from pydantic import BaseModel
from metrics import tweet_id
from metrics import create_urls
from metrics import summer
import metrics
import uvicorn
from typing_extensions import Self

app = FastAPI()

class Username(BaseModel):
    username: str

@app.post('/Username')
def Username(Username: Username):
    username = Username.username
    tweets_list = tweet_id(username)
    tweets_list.get_tweet_ids(str(username))
    metrics.clear()
    metrics.main()
    points = summer.sum_fun()
    return {points.total}

if __name__ == "__main__":
    uvicorn.run("api:app", host="127.0.0.1", port=5000, log_level="info")
I am unable to get the output, and even though the request completes I get null as the result. Why is that happening? Also, I am very new to a lot of this, so code improvement suggestions and modifications are very welcome. Thank you.
You have commented out the return in your sum_fun() function:
total = (like_points + reply_points + rt_points)
#return total
#print("Like Points:", like_points)
That's the reason None is returned when sum_fun() is invoked.
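A minimal sketch of the fix (the endpoint function name and the dict-shaped response are my own choices, not from the original code): restore the return inside sum_fun() and have the endpoint return that value instead of reading a .total attribute off a plain number:
# inside summer.sum_fun(), after the points are computed (sketch):
        total = (like_points + reply_points + rt_points)
        return total          # restore this line so the caller gets a value

# in the FastAPI endpoint (sketch):
@app.post('/Username')
def username_points(body: Username):
    tweets_list = tweet_id(body.username)
    tweets_list.get_tweet_ids(str(body.username))
    metrics.clear()
    metrics.main()
    total = summer.sum_fun()          # now an int, not an object with a .total attribute
    return {"total_points": total}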

Pipeline doesn't write to MySQL but also gives no error

I've tried to implement this pipeline in my spider.
After installing the necessary dependencies I am able to run the spider without any errors but for some reason it doesn't write to my database.
I'm pretty sure something is going wrong when connecting to the database. When I enter a wrong password, I still don't get any error.
When the spider has scraped all the data, it takes a few minutes before it starts dumping the stats.
2017-08-31 13:17:12 [scrapy] INFO: Closing spider (finished)
2017-08-31 13:17:12 [scrapy] INFO: Stored csv feed (27 items) in: test.csv
2017-08-31 13:24:46 [scrapy] INFO: Dumping Scrapy stats:
Pipeline:
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = {}
SETTINGS['DB_HOST'] = 'mysql.domain.com'
SETTINGS['DB_USER'] = 'username'
SETTINGS['DB_PASSWD'] = 'password'
SETTINGS['DB_PORT'] = 3306
SETTINGS['DB_DB'] = 'database_name'

class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        print "init"
        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print "close"
        """ Cleanup function, called after crawling has finished to close open
        objects.
        Close ConnectionPool. """
        self.dbpool.close()

    def process_item(self, item, spider):
        print "process"
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        print "insert"
        result = tx.execute(
            " INSERT INTO matches(type,home,away,home_score,away_score) VALUES (soccer,"+item["home"]+","+item["away"]+","+item["score"].explode("-")[0]+","+item["score"].explode("-")[1]+")"
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        print "error"
        log.err(e)
Spider:
import scrapy
import dateparser
from crawling.items import KNVBItem

class KNVBspider(scrapy.Spider):
    name = "knvb"
    start_urls = [
        'http://www.knvb.nl/competities/eredivisie/uitslagen',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawling.pipelines.MySQLPipeline': 301,
        }
    }

    def parse(self, response):
        # www.knvb.nl/competities/eredivisie/uitslagen
        for row in response.xpath('//div[@class="table"]'):
            for div in row.xpath('./div[@class="row"]'):
                match = KNVBItem()
                match['home'] = div.xpath('./div[@class="value home"]/div[@class="team"]/text()').extract_first()
                match['away'] = div.xpath('./div[@class="value away"]/div[@class="team"]/text()').extract_first()
                match['score'] = div.xpath('./div[@class="value center"]/text()').extract_first()
                match['date'] = dateparser.parse(div.xpath('./preceding-sibling::div[@class="header"]/span/span/text()').extract_first(), languages=['nl']).strftime("%d-%m-%Y")
                yield match
If there are better pipelines available to do what I'm trying to achieve that'd be welcome as well. Thanks!
Update:
With the link provided in the accepted answer I eventually got to this function that's working (and thus solved my problem):
def process_item(self, item, spider):
    print "process"
    query = self.dbpool.runInteraction(self._insert_record, item)
    query.addErrback(self._handle_error)
    query.addBoth(lambda _: item)
    return query
Take a look at this for how to use adbapi with MySQL for saving scraped items. Note the difference between your process_item and their process_item method implementation. While you return the item immediately, they return the Deferred object which is the result of the runInteraction method and which returns the item upon its completion. I think this is the reason your _insert_record never gets called.
If you can see the insert in your output that's already a good sign.
I'd rewrite the insert function this way:
def _insert_record(self, tx, item):
    print "insert"
    raw_sql = "INSERT INTO matches(type,home,away,home_score,away_score) VALUES ('%s', '%s', '%s', '%s', '%s')"
    sql = raw_sql % ('soccer', item['home'], item['away'], item['score'].explode('-')[0], item['score'].explode('-')[1])
    print sql
    result = tx.execute(sql)
    if result > 0:
        self.stats.inc_value('database/items_added')
It allows you to debug the SQL you're using. In your version you're not wrapping the string values in ', which is a syntax error in MySQL.
I'm not sure about your last values (score) so I treated them as strings.
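As a side note (my own suggestion, not part of the answer above), the same insert can also be written with %s placeholder parameters so the MySQLdb driver handles quoting and escaping itself; a sketch assuming the same item fields:
def _insert_record(self, tx, item):
    # let the driver quote/escape the values via placeholders
    home_score, away_score = item['score'].split('-')   # str.split, since Python strings have no explode()
    tx.execute(
        "INSERT INTO matches (type, home, away, home_score, away_score) "
        "VALUES (%s, %s, %s, %s, %s)",
        ('soccer', item['home'], item['away'], home_score.strip(), away_score.strip()))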

Scrapy Pipeline doesn't insert into MySQL

I'm trying to build a small app for a university project with Scrapy.
The spider is scraping the items, but my pipeline is not inserting data into the MySQL database. To test whether the problem is the pipeline or the pymysql implementation, I wrote a test script:
Code Start
#!/usr/bin/python3
import pymysql
str1 = "hey"
str2 = "there"
str3 = "little"
str4 = "script"
db = pymysql.connect("localhost","root","**********","stromtarife" )
cursor = db.cursor()
cursor.execute("SELECT * FROM vattenfall")
cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (str1, str2, str3, str4))
cursor.execute("SELECT * FROM vattenfall")
data = cursor.fetchone()
print(data)
db.commit()
cursor.close()
db.close()
Code End
After I run this script my database has a new record, so it's not my pymysql.connect() function that is broken.
I'll provide my Scrapy code:
vattenfall_form.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from stromtarife.items import StromtarifeItem
from scrapy.http import FormRequest

class VattenfallEasy24KemptenV1500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v1500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "1500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item

class VattenfallEasy24KemptenV2500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v2500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "2500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item

process = CrawlerProcess()
process.crawl(VattenfallEasy24KemptenV1500Spider)
process.crawl(VattenfallEasy24KemptenV2500Spider)
process.start()
pipelines.py
import pymysql
from stromtarife.items import StromtarifeItem

class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        self.cursor.close()
        self.connection.close()
settings.py (I changed only this line):
ITEM_PIPELINES = {
    'stromtarife.pipelines.StromtarifePipeline': 300,
}
So what is wrong with my code? I couldn't figure it out and would be really happy if someone sees something I'm missing. Thanks in advance!
You should not close your pymysql connection every time you process an item.
You should write a close_spider function in your pipeline like this, so the connection is closed just once, at the end of the execution:
def close_spider(self, spider):
    self.cursor.close()
    self.connection.close()
Moreover, you need to return your item at the end of process_item.
Your file pipelines.py should look like this:
import pymysql
from stromtarife.items import StromtarifeItem

class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
UPDATE:
I tried your code; the problem is in the pipeline, and there are two issues:
You try to insert the euro symbol € and I think MySQL does not like it.
Your query string is not well built.
I managed to get things done by writing the pipeline like this:
def process_item(self, item, spider):
    query = """INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)""" % ("1", "2", "3", "4")
    self.cursor.execute(query)
    self.connection.commit()
    return item
I think you should remove the € from the prices you try to insert.
Hope this helps, let me know.
There is another problem with your scraper besides the fact that your SQL Pipeline closes the SQL connection after writing the first item (as Adrien pointed out).
The other problem is: your scraper only scrapes one single item per results page (and also visits only one results page). I checked Vattenfall and there are usually multiple results displayed and I guess you want to scrape them all.
Means you'll also have to iterate over the results on the page and create multiple items while doing so. The scrapy tutorial here gives a good explanation how to do this: https://doc.scrapy.org/en/latest/intro/tutorial.html#extracting-quotes-and-authors
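A rough sketch of that pattern (the selectors here are invented placeholders, not the real Vattenfall markup; the point is only the per-result loop with yield):
def parse(self, response):
    # one item per result block instead of one item per page
    for result in response.xpath('//div[@class="result"]'):                      # placeholder selector
        item = StromtarifeItem()
        item['tarif'] = result.xpath('.//h2/span/text()').extract_first()        # placeholder
        item['jahrespreis'] = result.xpath('.//td[2]/text()').extract_first()    # placeholder
        yield item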
First of all, in Code Start print(data) must come after db.commit(), otherwise the data which was just inserted into your database will not show up in the print.
Lastly, judging by the names of your columns, it's probably an issue of encoding if the idea above doesn't work.
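If encoding really is the problem (for example the € sign in the scraped prices), one option worth trying (my own suggestion, with placeholder credentials and example values) is to make sure both the connection and the table use a Unicode character set:
import pymysql

# open the connection with an explicit Unicode charset so characters like €
# can be stored; the table must use a matching charset, e.g.
#   ALTER TABLE vattenfall CONVERT TO CHARACTER SET utf8mb4;
connection = pymysql.connect(host="localhost", user="root", password="**********",
                             database="stromtarife", charset="utf8mb4")
with connection.cursor() as cursor:
    cursor.execute(
        "INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) "
        "VALUES (%s, %s, %s, %s)",
        ("Easy24", "50 €", "25 €", "1.000 €"))   # example values only
connection.commit()
connection.close()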

Writing items to a MySQL database in Scrapy

I am new to Scrapy. I have the following spider code:
class Example_spider(BaseSpider):
    name = "example"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        yield self.make_requests_from_url("http://www.example.com/bookstore/new")

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
        for i in urls:
            yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)

    def parse_url(self, response):
        hxs = HtmlXPathSelector(response)
        main = hxs.select('//div[@id="bookshelf-bg"]')
        items = []
        for i in main:
            item = Exampleitem()
            item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
            item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
            items.append(item)
        return items
And pipeline code is:
class examplePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='blurb',
            user='root',
            passwd='redhat',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        # run db query in thread pool
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        print "db connected-=========>"
        # create record if doesn't exist.
        tx.execute("select * from example_book_store where book_name = %s", (item['book_name']) )
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""INSERT INTO example_book_store (book_name,price)
                          VALUES (%s,%s)""",
                       (item['book_name'], item['price'])
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
After running this I am getting the following error:
exceptions.NameError: global name 'Exampleitem' is not defined
I got the above error when I added the line below to the process_item method:
assert isinstance(item, Exampleitem)
and without adding this line I am getting
exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone make this code run and make sure that all the items are saved into the database?
Try the following code in your pipeline
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('host', 'user', 'passwd',
                                    'dbname', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
                                   VALUES (%s, %s)""",
                                (item['book_name'].encode('utf-8'),
                                 item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
Your process_item method should be declared as def process_item(self, item, spider): instead of def process_item(self, spider, item): ; you switched the arguments around.
This exception: exceptions.NameError: global name 'Exampleitem' is not defined indicates you didn't import Exampleitem in your pipeline.
Try adding: from myspiders.myitems import Exampleitem (with correct names/paths of course).
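Putting both fixes together, the relevant part of the pipeline could look roughly like this (the import path myspiders.myitems is a placeholder, as noted above, and the rest of the class stays as in the question):
from myspiders.myitems import Exampleitem   # adjust the module path to your project

class examplePipeline(object):
    # __init__ with the adbapi ConnectionPool stays exactly as in the question

    def process_item(self, item, spider):    # item first, then spider
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item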
I think this way is better and more concise:
# Item
class pictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

# SQL
self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(id)s);"

# usage
cur.execute(self.save_picture, dict(item))
It's just like
cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)" % {"url":someurl,"id":1})
Because (you can read more about Items in Scrapy):
The Field class is just an alias to the built-in dict class and doesn't provide any extra functionality or attributes. In other words, Field objects are plain old Python dicts.
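As a hedged illustration of that idea, a complete pipeline built around dict(item) and named placeholders might look like this sketch (table, column and credential values are examples; pymysql is used for the connection):
import pymysql
import scrapy

class PictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

class PicturePipeline(object):
    # named placeholders let the driver fill values straight from dict(item)
    SAVE_PICTURE = "insert into picture(`url`, `id`) values (%(url)s, %(topic_id)s)"

    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", user="root", password="secret",
                                    database="pictures", charset="utf8mb4")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(self.SAVE_PICTURE, dict(item))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()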