Pipeline doesn't write to MySQL but also gives no error

I've tried to implement this pipeline in my spider.
After installing the necessary dependencies I am able to run the spider without any errors, but for some reason it doesn't write to my database.
I'm pretty sure there is something going wrong with connecting to the database. When I enter a wrong password, I still don't get any error.
Once the spider has scraped all the data, it takes a few minutes before it starts dumping the stats.
2017-08-31 13:17:12 [scrapy] INFO: Closing spider (finished)
2017-08-31 13:17:12 [scrapy] INFO: Stored csv feed (27 items) in: test.csv
2017-08-31 13:24:46 [scrapy] INFO: Dumping Scrapy stats:
Pipeline:
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = {}
SETTINGS['DB_HOST'] = 'mysql.domain.com'
SETTINGS['DB_USER'] = 'username'
SETTINGS['DB_PASSWD'] = 'password'
SETTINGS['DB_PORT'] = 3306
SETTINGS['DB_DB'] = 'database_name'

class MySQLPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        print "init"
        # Instantiate DB connection pool
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """ Cleanup function, called after crawling has finished to close open
        objects.
        Close ConnectionPool. """
        print "close"
        self.dbpool.close()

    def process_item(self, item, spider):
        print "process"
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        print "insert"
        result = tx.execute(
            "INSERT INTO matches(type,home,away,home_score,away_score) VALUES (soccer," + item["home"] + "," + item["away"] + "," + item["score"].split("-")[0] + "," + item["score"].split("-")[1] + ")"
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        print "error"
        log.err(e)
Spider:
import scrapy
import dateparser
from crawling.items import KNVBItem

class KNVBspider(scrapy.Spider):
    name = "knvb"
    start_urls = [
        'http://www.knvb.nl/competities/eredivisie/uitslagen',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawling.pipelines.MySQLPipeline': 301,
        }
    }

    def parse(self, response):
        # www.knvb.nl/competities/eredivisie/uitslagen
        for row in response.xpath('//div[@class="table"]'):
            for div in row.xpath('./div[@class="row"]'):
                match = KNVBItem()
                match['home'] = div.xpath('./div[@class="value home"]/div[@class="team"]/text()').extract_first()
                match['away'] = div.xpath('./div[@class="value away"]/div[@class="team"]/text()').extract_first()
                match['score'] = div.xpath('./div[@class="value center"]/text()').extract_first()
                match['date'] = dateparser.parse(div.xpath('./preceding-sibling::div[@class="header"]/span/span/text()').extract_first(), languages=['nl']).strftime("%d-%m-%Y")
                yield match
If there are better pipelines available to do what I'm trying to achieve that'd be welcome as well. Thanks!
Update:
With the link provided in the accepted answer I eventually got to this function that's working (and thus solved my problem):
def process_item(self, item, spider):
    print "process"
    query = self.dbpool.runInteraction(self._insert_record, item)
    query.addErrback(self._handle_error)
    query.addBoth(lambda _: item)
    return query

Take a look at this for how to use adbapi with MySQL for saving scraped items. Note the difference between your process_item and their process_item method implementation. While you return the item immediately, they return the Deferred object produced by runInteraction, which in turn yields the item upon its completion. I think this is the reason your _insert_record never gets called.

If you can see the insert in your output, that's already a good sign.
I'd rewrite the insert function this way:
def _insert_record(self, tx, item):
    print "insert"
    raw_sql = "INSERT INTO matches(type,home,away,home_score,away_score) VALUES ('%s', '%s', '%s', '%s', '%s')"
    sql = raw_sql % ('soccer', item['home'], item['away'], item['score'].split('-')[0], item['score'].split('-')[1])
    print sql
    result = tx.execute(sql)
    if result > 0:
        self.stats.inc_value('database/items_added')
It allows you to debug the SQL you're using. In your version you're not wrapping the string values in quotes ('), which is a syntax error in MySQL.
I'm not sure about your last values (score) so I treated them as strings.
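A safer variant is to pass the values as query parameters and let the driver handle the quoting. This is a minimal sketch, assuming item['score'] is always a string like '2-1':
def _insert_record(self, tx, item):
    print "insert"
    home_score, away_score = item['score'].split('-')
    sql = ("INSERT INTO matches(type,home,away,home_score,away_score) "
           "VALUES (%s, %s, %s, %s, %s)")
    # the driver quotes and escapes each parameter, so team names containing
    # quotes or dashes can no longer break the statement
    result = tx.execute(sql, ('soccer', item['home'], item['away'],
                              home_score.strip(), away_score.strip()))
    if result > 0:
        self.stats.inc_value('database/items_added')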


Problem with PettingZoo and Stable-Baselines3 with a ParallelEnv

I am having trouble making things work with a custom ParallelEnv I wrote using PettingZoo. I am using SuperSuit's ss.pettingzoo_env_to_vec_env_v1(env) as a wrapper to vectorize the environment and make it work with Stable-Baselines3, as documented here.
You can find attached a summary of the most relevant part of the code:
from typing import Optional
from gym import spaces
import random
import numpy as np
from pettingzoo import ParallelEnv
from pettingzoo.utils.conversions import parallel_wrapper_fn
import supersuit as ss
from gym.utils import EzPickle, seeding

def env(**kwargs):
    env_ = parallel_env(**kwargs)
    env_ = ss.pettingzoo_env_to_vec_env_v1(env_)
    #env_ = ss.concat_vec_envs_v1(env_, 1)
    return env_

petting_zoo = env

class parallel_env(ParallelEnv, EzPickle):
    metadata = {'render_modes': ['ansi'], "name": "PlayerEnv-Multi-v0"}

    def __init__(self, n_agents: int = 20, new_step_api: bool = True) -> None:
        EzPickle.__init__(
            self,
            n_agents,
            new_step_api
        )
        self._episode_ended = False
        self.n_agents = n_agents
        self.possible_agents = [
            f"player_{idx}" for idx in range(n_agents)]
        self.agents = self.possible_agents[:]
        self.agent_name_mapping = dict(
            zip(self.possible_agents, list(range(len(self.possible_agents))))
        )
        self.observation_spaces = spaces.Dict(
            {agent: spaces.Box(shape=(len(self.agents),),
                               dtype=np.float64, low=0.0, high=1.0) for agent in self.possible_agents}
        )
        self.action_spaces = spaces.Dict(
            {agent: spaces.Discrete(4) for agent in self.possible_agents}
        )
        self.current_step = 0

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)

    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def action_space(self, agent):
        return self.action_spaces[agent]

    def __calculate_observation(self, agent_id: int) -> np.ndarray:
        return self.observation_space(agent_id).sample()

    def __calculate_observations(self) -> np.ndarray:
        observations = {
            agent: self.__calculate_observation(
                agent_id=agent)
            for agent in self.agents
        }
        return observations

    def observe(self, agent):
        return self.__calculate_observation(agent_id=agent)

    def step(self, actions):
        if self._episode_ended:
            return self.reset()
        observations = self.__calculate_observations()
        rewards = random.sample(range(100), self.n_agents)
        self.current_step += 1
        self._episode_ended = self.current_step >= 100
        infos = {agent: {} for agent in self.agents}
        dones = {agent: self._episode_ended for agent in self.agents}
        rewards = {
            self.agents[i]: rewards[i]
            for i in range(len(self.agents))
        }
        if self._episode_ended:
            self.agents = {}  # To satisfy `set(par_env.agents) == live_agents`
        return observations, rewards, dones, infos

    def reset(self,
              seed: Optional[int] = None,
              return_info: bool = False,
              options: Optional[dict] = None,):
        self.agents = self.possible_agents[:]
        self._episode_ended = False
        self.current_step = 0
        observations = self.__calculate_observations()
        return observations

    def render(self, mode="human"):
        # TODO: IMPLEMENT
        print("TO BE IMPLEMENTED")

    def close(self):
        pass
Unfortunately when I try to test with the following main procedure:
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_checker import check_env
from dummy_env import dummy
from pettingzoo.test import parallel_api_test

if __name__ == '__main__':
    # Testing the parallel algorithm alone
    env_parallel = dummy.parallel_env()
    parallel_api_test(env_parallel)  # This works!

    # Testing the environment with the wrapper
    env = dummy.petting_zoo()
    # ERROR: AssertionError: The observation returned by the `reset()` method does not match the given observation space
    check_env(env)

    # Model initialization
    model = PPO("MlpPolicy", env, verbose=1)
    # ERROR: ValueError: could not broadcast input array from shape (20,20) into shape (20,)
    model.learn(total_timesteps=10_000)
I get the following error:
AssertionError: The observation returned by the `reset()` method does not match the given observation space
If I skip check_env() I get the following one:
ValueError: could not broadcast input array from shape (20,20) into shape (20,)
It seems like ss.pettingzoo_env_to_vec_env_v1(env) is capable of splitting the parallel environment into multiple vectorized ones, but not for the reset() function.
Does anyone know how to fix this problem?
Please find the GitHub repository to reproduce the problem.
You should double-check the reset() function in PettingZoo. It will return None instead of an observation, unlike Gym.
Thanks to a discussion I had in the issue section of the SuperSuit repository, I am able to post the solution to the problem. Thanks to jjshoots!
First of all, it is necessary to have the latest SuperSuit version. In order to get that, I needed to install Stable-Baselines3 using the instructions here to make it work with gym 0.24+.
After that, taking the code in the question as an example, it is necessary to substitute
def env(**kwargs):
    env_ = parallel_env(**kwargs)
    env_ = ss.pettingzoo_env_to_vec_env_v1(env_)
    #env_ = ss.concat_vec_envs_v1(env_, 1)
    return env_
with
def env(**kwargs):
    env_ = parallel_env(**kwargs)
    env_ = ss.pettingzoo_env_to_vec_env_v1(env_)
    env_ = ss.concat_vec_envs_v1(env_, 1, base_class="stable_baselines3")
    return env_
The outcomes are:
Outcome 1: leaving the line with check_env(env) in, I got the error AssertionError: Your environment must inherit from the gym.Env class cf https://github.com/openai/gym/blob/master/gym/core.py
Outcome 2: removing the line with check_env(env), the agent starts training successfully!
In the end, I think the argument base_class="stable_baselines3" made the difference.
Only the small problem with check_env remains to be reported, but I think it can be considered trivial as long as the training works.
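For completeness, here is a minimal sketch of the resulting training entry point, assuming the dummy module layout from the question; it combines the question's main procedure with the fixed factory above and drops the failing check_env call:
from stable_baselines3 import PPO
from dummy_env import dummy

if __name__ == '__main__':
    # petting_zoo() now applies concat_vec_envs_v1(..., base_class="stable_baselines3")
    env = dummy.petting_zoo()
    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=10_000)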

Inserting to MySQL with mysql.connector - good practice/efficiency

I am working on a personal project and was wondering if my solution for inserting data to a MySQL database would be considered "pythonic" and efficient.
I have written a separate class for that, which is called from an object that holds a dataframe. From there I am calling my save() function to write the dataframe to the database.
The script will be running once a day: I scrape some data from some websites and save it to my database. So it is important that it really runs through completely, even when I have bad data or temporary connection issues (the script and database run on different machines).
import mysql.connector
# custom logger
from myLog import logger
# custom class for formatting the data, a lot of potential errors are handled here
from myFormat import myFormat
# insert strings to mysql are stored and referenced here
import sqlStrings

class saveSQL:
    def __init__(self):
        self.frmt = myFormat()
        self.host = 'XXX.XXX.XXX.XXX'
        self.user = 'XXXXXXXX'
        self.password = 'XXXXXXXX'
        self.database = 'XXXXXXXX'

    def save(self, payload, type):
        match type:
            case 'First':
                return self.__first(payload)
            case 'Second':
                ...
            case _:
                logger.error('Undefined Input for Type!')

    def __first(self, payload):
        try:
            self.mydb = mysql.connector.connect(host=self.host, user=self.user, password=self.password, database=self.database)
            mycursor = self.mydb.cursor()
        except mysql.connector.Error as err:
            logger.error('Couldn\'t establish connection to DB!')
        try:
            tmpList = payload.values.tolist()
        except ValueError:
            logger.error('Value error in converting dataframe to list: %s' % payload)
        try:
            mycursor.executemany(sqlStrings.First, tmpList)
            self.mydb.commit()
            dbWrite = mycursor.rowcount
        except mysql.connector.Error as err:
            logger.error('Error in writing to database: %s' % err)
            dbWrite = 0
            for ele in tmpList:
                try:
                    mycursor.execute(sqlStrings.First, ele)
                    self.mydb.commit()
                    dbWrite = dbWrite + mycursor.rowcount
                except mysql.connector.Error as err:
                    logger.error('Error in writing to database: %s \n ele: %s' % (err, ele))
                    continue
                pass
        mycursor.close()
        return dbWrite
Things I am wondering about:
Is match/case a good option to distinguish between writing to different tables depending on the data?
Are the different try/except blocks really necessary, or are there easier ways of handling potential errors?
Do I really need the pass statement at the end of the for-loop?
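For comparison, here is a minimal sketch of the single-batch insert path with the connection and cursor scoped by contextlib.closing, under the assumption that the same parameterized sqlStrings.First statement is used; it guarantees clean-up on failure without additional try blocks:
import contextlib
import logging
import mysql.connector

def save_first(cfg, rows, insert_sql):
    # cfg: dict with host/user/password/database; rows: list of tuples;
    # insert_sql: a parameterized INSERT such as sqlStrings.First
    written = 0
    with contextlib.closing(mysql.connector.connect(**cfg)) as conn:
        with contextlib.closing(conn.cursor()) as cur:
            try:
                cur.executemany(insert_sql, rows)
                conn.commit()
                written = cur.rowcount
            except mysql.connector.Error as err:
                logging.error('Error in writing to database: %s', err)
    return written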

Python constructor confusion

I'm playing with Python trying to create a basic repository class (I'm normally a C++/C# developer for work) and am having an issue.
The following code bombs out on a = officesRepo(conn), saying "Too many positional arguments for constructor call", but it's being given the only argument specified, the MySQL connection object.
I'm coding in VS Code on Linux using Python 3.8. I'm wondering if pylint is expecting me to pass in "self", when I don't think it's needed.
Any help/advice/tips greatly received. Flame away if you like, as long as it teaches me something! ;-)
import pymysql.cursors
import Pocos

class officesRepo:
    def __init__(conn):
        self.conn = conn

    def create(office):
        pass

    def getAll():
        cursor = conn.cursor()
        SQL = "SELECT `officeCode`, `city`, `phone`, `addressLine1`, `addressLine2`, `state`, `country`, `postalCode`, `territory` "
        SQL += "FROM `offices`"
        cursor.execute(SQL)
        #result = cursor.fetchone()
        ret = []
        for val in cursor:
            ret.append(ret.append(val["officeCode"], val["city"], val["phone"], val["addressLine1"], val["addressLine2"], val["state"], val["country"], val["postalCode"], val["territory"]))
        return ret

    def getById(id):
        pass

conn = pymysql.connect(host='localhost',
                       user='user',
                       password='password',
                       db='classicmodel',
                       charset='utf8mb4',
                       cursorclass=pymysql.cursors.DictCursor)
a = officesRepo(conn)
b = a.getAll()
print(b)
The first parameter of an instance method is self. You don't need to pass it explicitly, but you do need to include it in the parameter list. Right now the conn parameter is acting as self, and there are no other parameters after it (thus the error).
You'd need
def __init__(self, conn):
. . .
then similarly for the other methods. All instance methods require an explicit self parameter.
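Applied to the question's class, a minimal sketch of the corrected repository could look like this; returning the DictCursor rows directly is an assumption here, in place of whatever Pocos objects the original getAll was meant to build:
class officesRepo:
    def __init__(self, conn):
        self.conn = conn

    def getAll(self):
        # use the connection stored on the instance, not a global
        cursor = self.conn.cursor()
        cursor.execute(
            "SELECT `officeCode`, `city`, `phone`, `addressLine1`, "
            "`addressLine2`, `state`, `country`, `postalCode`, `territory` "
            "FROM `offices`")
        # with DictCursor each row is already a dict, so collect them as-is
        return list(cursor)

    def getById(self, id):
        pass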

Scrapy Pipeline doesn't insert into MySQL

I'm trying to build a small app for a university project with Scrapy.
The spider is scraping the items, but my pipeline is not inserting data into the MySQL database. In order to test whether the pipeline or the pymysql implementation is not working, I wrote a test script:
Code Start
#!/usr/bin/python3
import pymysql
str1 = "hey"
str2 = "there"
str3 = "little"
str4 = "script"
db = pymysql.connect("localhost","root","**********","stromtarife" )
cursor = db.cursor()
cursor.execute("SELECT * FROM vattenfall")
cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (str1, str2, str3, str4))
cursor.execute("SELECT * FROM vattenfall")
data = cursor.fetchone()
print(data)
db.commit()
cursor.close()
db.close()
Code End
After I run this script my database has a new record, so it's not my pymysql.connect() call that's broken.
I'll provide my scrapy code:
vattenfall_form.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from stromtarife.items import StromtarifeItem
from scrapy.http import FormRequest

class VattenfallEasy24KemptenV1500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v1500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "1500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item

class VattenfallEasy24KemptenV2500Spider(scrapy.Spider):
    name = 'vattenfall-easy24-v2500-p87435'

    def start_requests(self):
        return [
            FormRequest(
                "https://www.vattenfall.de/de/stromtarife.htm",
                formdata={"place": "87435", "zipCode": "87435", "cityName": "Kempten",
                          "electricity_consumptionprivate": "2500", "street": "", "hno": ""},
                callback=self.parse
            ),
        ]

    def parse(self, response):
        item = StromtarifeItem()
        item['jahrespreis'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[3]/td[2]/text()').extract_first()
        item['treuebonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[2]/td/strong/text()').extract_first()
        item['sofortbonus'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[2]/form[1]/div/div[2]/table/tbody/tr[1]/td/strong/text()').extract_first()
        item['tarif'] = response.xpath('/html/body/main/div[1]/div[2]/div/div[3]/div[2]/div/div[1]/h2/span/text()').extract_first()
        yield item

process = CrawlerProcess()
process.crawl(VattenfallEasy24KemptenV1500Spider)
process.crawl(VattenfallEasy24KemptenV2500Spider)
process.start()
pipelines.py
import pymysql
from stromtarife.items import StromtarifeItem

class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        self.cursor.close()
        self.connection.close()
settings.py (I changed only that line)
ITEM_PIPELINES = {
    'stromtarife.pipelines.StromtarifePipeline': 300,
}
So what is wrong with my code? I couldn't figure it out and would be really happy if someone sees something I'm missing. Thanks in advance!
You should not close your pymysql connection every time you process an item.
You should write a close_spider function in your pipeline like this, so the connection is closed just once, at the end of the execution:
def close_spider(self, spider):
    self.cursor.close()
    self.connection.close()
Moreover, you need to return your item at the end of process_item.
Your pipelines.py file should look like this:
import pymysql
from stromtarife.items import StromtarifeItem

class StromtarifePipeline(object):
    def __init__(self):
        self.connection = pymysql.connect("localhost", "root", "**********", "stromtarife")
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)", (item['tarif'], item['sofortbonus'], item['treuebonus'], item['jahrespreis']))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
UPDATE:
I tried your code; the problem is in the pipeline, and there are two issues:
You try to insert the euro symbol €, and I think MySQL does not like it.
Your query string is not well built.
I managed to get things done by writing the pipeline like this:
def process_item(self, item, spider):
    query = """INSERT INTO vattenfall (tarif, sofortbonus, treuebonus, jahrespreis) VALUES (%s, %s, %s, %s)""" % ("1", "2", "3", "4")
    self.cursor.execute(query)
    self.connection.commit()
    return item
I think you should remove the € from the prices you try to insert.
Hope this helps, let me know.
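A minimal sketch of that clean-up step, assuming the scraped prices arrive as strings such as '123,45 €' (a hypothetical format):
def clean_price(value):
    # drop the euro sign and surrounding whitespace before the insert;
    # leaves None untouched so missing fields don't crash the pipeline
    if value is None:
        return None
    return value.replace('€', '').strip()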
There is another problem with your scraper besides the fact that your SQL pipeline closes the SQL connection after writing the first item (as Adrien pointed out).
The other problem is that your scraper only scrapes one single item per results page (and also visits only one results page). I checked Vattenfall and there are usually multiple results displayed, and I guess you want to scrape them all.
That means you'll also have to iterate over the results on the page and create multiple items while doing so. The Scrapy tutorial gives a good explanation of how to do this: https://doc.scrapy.org/en/latest/intro/tutorial.html#extracting-quotes-and-authors
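A minimal sketch of such a parse method; the result-row selector and the field XPaths are hypothetical placeholders, since the real markup on vattenfall.de will differ:
def parse(self, response):
    # '//div[@class="result"]' is a placeholder, not the site's real class
    for result in response.xpath('//div[@class="result"]'):
        item = StromtarifeItem()
        item['tarif'] = result.xpath('.//h2/span/text()').extract_first()
        item['sofortbonus'] = result.xpath('.//tr[1]/td/strong/text()').extract_first()
        item['treuebonus'] = result.xpath('.//tr[2]/td/strong/text()').extract_first()
        item['jahrespreis'] = result.xpath('.//tr[3]/td[2]/text()').extract_first()
        yield item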
First of all, in Code Start, print(data) must come after db.commit(); otherwise the data that was just inserted into your database will not show up in the print.
Lastly, judging by the names of your columns, it's probably an encoding issue if the idea above doesn't work.

Writing items to a MySQL database in Scrapy

I am new to Scrapy. I have this spider code:
class Example_spider(BaseSpider):
    name = "example"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        yield self.make_requests_from_url("http://www.example.com/bookstore/new")

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
        for i in urls:
            yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)

    def parse_url(self, response):
        hxs = HtmlXPathSelector(response)
        main = hxs.select('//div[@id="bookshelf-bg"]')
        items = []
        for i in main:
            item = Exampleitem()
            item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
            item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
            items.append(item)
        return items
And the pipeline code is:
class examplePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='blurb',
            user='root',
            passwd='redhat',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        # run db query in thread pool
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        print "db connected-=========>"
        # create record if it doesn't exist
        tx.execute("select * from example_book_store where book_name = %s", (item['book_name'],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""INSERT INTO example_book_store (book_name, price)
                          VALUES (%s, %s)""",
                       (item['book_name'], item['price'])
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
After running this I am getting the following error:
exceptions.NameError: global name 'Exampleitem' is not defined
I got the above error when I added the below line to the process_item method:
assert isinstance(item, Exampleitem)
and without adding this line I am getting:
exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone make this code run and make sure that all the items are saved into the database?
Try the following code in your pipeline
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('host', 'user', 'passwd',
                                    'dbname', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
                                   VALUES (%s, %s)""",
                                (item['book_name'].encode('utf-8'),
                                 item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
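Also make sure the pipeline is enabled in settings.py, or process_item never runs; the project name below is a placeholder:
ITEM_PIPELINES = {
    'myproject.pipelines.MySQLStorePipeline': 300,
}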
Your process_item method should be declared as: def process_item(self, item, spider): instead of def process_item(self, spider, item): -> you switched the arguments around.
This exception, exceptions.NameError: global name 'Exampleitem' is not defined, indicates you didn't import Exampleitem in your pipeline.
Try adding: from myspiders.myitems import Exampleitem (with correct names/paths, of course).
I think this way is better and more concise:
# Item
class pictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

# SQL
self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(id)s);"

# usage
cur.execute(self.save_picture, dict(item))
It's just like
cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)" % {"url":someurl,"id":1})
The reason this works (you can read more about Items in Scrapy):
The Field class is just an alias to the built-in dict class and doesn’t provide any extra functionality or attributes. In other words, Field objects are plain-old Python dicts.
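A quick standalone sketch of that equivalence:
import scrapy

class PictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

item = PictureItem(topic_id=1, url='http://example.com/a.png')
# an Item behaves like a dict, so dict(item) can feed named query parameters
print(dict(item))  # {'topic_id': 1, 'url': 'http://example.com/a.png'}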