FastAPI not running all the functions to return the right values from the MySQL database

I am trying to make a Twitter points program. Basically, you get points based on the number of likes, retweets, and replies a post with a specified hashtag gets. I made an API to fetch these points from a database, but FastAPI is not running all of the functions needed to return the correct values.
API code:
DATABASE_URL = "mysql+mysqlconnector://root:password#localhost:3306/twitterdb"
database = Database(DATABASE_URL)
metadata_obj = MetaData()
engine = create_engine(
DATABASE_URL, connect_args={"check_same_thread": False}
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
metadata = sqlalchemy.MetaData()
Base = declarative_base()
user_points = sqlalchemy.Table(
"points",
metadata_obj,
sqlalchemy.Column("username", sqlalchemy.String,),
sqlalchemy.Column("rt_points", sqlalchemy.Integer,),
sqlalchemy.Column("reply_points", sqlalchemy.Integer),
sqlalchemy.Column("like_points", sqlalchemy.Integer),
sqlalchemy.Column("total_points", sqlalchemy.Integer)
)
engine = sqlalchemy.create_engine(
DATABASE_URL
)
metadata.create_all(engine)
app = FastAPI()
#app.on_event("startup")
async def connect():
await database.connect()
#app.on_event("shutdown")
async def shutdown():
await database.disconnect()
class UserName(BaseModel):
rt_points: int
reply_points: int
like_points: int
total_points : int
#app.get('/userdata/', response_model=UserName)
async def get_points(user: str):
username=user
metrics.clear()
tweets_list = tweet_id(username)
tweets_list.get_tweet_ids(str(username))
metrics.main()
summing=summer(username)
summing.sum_fun(str(username))
query = user_points.select().where(user_points.c.username == username)
user = await database.fetch_one(query)
return {**user}
if __name__ == "__main__":
uvicorn.run("main:app", reload= True, host="127.0.0.1", port=5000, log_level="info")
Code for metrics.py:
ids = []

class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='password')
            cursor = connection.cursor()
            query = "truncate table twitterdb.points"
            query1 = "truncate table twitterdb.Metrics"
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(query)
            cursor.execute(query1)
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        finally:
            if connection.is_connected():
                cursor.close()
                connection.close()

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    converted_list = [str(element) for element in ids]
    id_list = ",".join(converted_list)
    url = "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)
    return url

# curl 'https://api.twitter.com/2/tweets?ids=1459764778088337413&tweet.fields=public_metrics&expansions=attachments.media_keys&media.fields=public_metrics' --header 'Authorization: Bearer $Bearer'

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {} {}".format(
                response.status_code, response.text, ids
            )
        )
    return response.json()

def main():
    def append_to_database(json_response):
        # Loop through each tweet
        for tweet in json_response['data']:
            # Tweet ID
            tweetid = tweet['id']
            # Tweet metrics
            retweet_count = tweet['public_metrics']['retweet_count']
            reply_count = tweet['public_metrics']['reply_count']
            like_count = tweet['public_metrics']['like_count']
            quote_count = tweet['public_metrics']['quote_count']
            connect(tweetid, retweet_count, reply_count, like_count, quote_count)

    def connect(tweetid, retweet_count, reply_count, like_count, quote_count):
        """
        connect to MySQL database and insert twitter data
        """
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='password', charset='utf8')
            if con.is_connected():
                """
                Insert twitter data
                """
                cursor = con.cursor(buffered=True)
                # twitter, golf
                delete_previous_data_query = "truncate table Metrics"
                query = "INSERT INTO Metrics (tweetid,retweet_count,reply_count,like_count,quote_count) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(delete_previous_data_query)
                cursor.execute(query, (tweetid, retweet_count, reply_count, like_count, quote_count))
                con.commit()
        except Error as e:
            print(e)
        cursor.close()
        con.close()
        return

    url = create_url()
    json_response = connect_to_endpoint(url)
    append_to_database(json_response)

# Function to calculate sum of points and display it
class summer:
    def __init__(self, name):
        self.name = name

    def sum_fun(self, name):
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='password', charset='utf8')
            if con.is_connected():
                cursor = con.cursor(buffered=True)

                def create_points_table():
                    query = ("INSERT INTO twitterdb.points(username, rt_points, reply_points, like_points, total_points) "
                             "(SELECT %s, SUM(quote_count + retweet_count) * 150, SUM(reply_count) * 50, SUM(like_count) * 10, "
                             "SUM(quote_count + retweet_count) * 150 + SUM(reply_count) * 50 + SUM(like_count) * 10 "
                             "FROM twitterdb.Metrics)")
                    cursor.execute(query, (name,))
                    con.commit()

                create_points_table()
        except Error as e:
            print(e)
        cursor.close()
        con.close()

def clear():
    """
    connect to MySQL database and truncate the points table
    """
    try:
        con = mysql.connector.connect(host='localhost',
                                      database='twitterdb', user='root', password='password', charset='utf8')
        if con.is_connected():
            cursor = con.cursor(buffered=True)
            clear_points = ("truncate table twitterdb.points")
            cursor.execute(clear_points)
    except Error as e:
        print(e)
    cursor.close()
    con.close()
    return
What happens here is that there's a database named twitterdb with the tables StreamData, Metrics, and points.
StreamData contains the tweet IDs and usernames of the posts that were tweeted with the specified hashtag, and it is built with the Streaming API.
The issue is this: suppose I have the usernames mark and ramon in the StreamData table. When I input the username mark via the API, no issues happen and it returns the correct points for mark. But if I then enter something like mark1, or any random value, it returns the points for mark again. If I then enter ramon it gives the right points for ramon, but entering random values again returns the same points as ramon.
Furthermore, the first time the API starts, entering a random value returns the error specified in the exception defined in the connect_to_endpoint function.
The code logic here is as follows:
We enter a username via the API, and the get_tweet_ids function looks up that username in the StreamData table, selects all the tweet IDs corresponding to it, and saves them to a list, ids. This list of IDs is given to the Twitter metrics API endpoint, and the required values from the response are saved to the Metrics table.
Then sum_fun is called to select the sums of likes, retweets, and replies from the Metrics table, multiply them by the specified points, and save the result to the points table along with the username.
The API finally returns the values in the points table matching the username.
How can I get it to stop returning values for random data? If invalid data is given, it should raise the exception in the connect_to_endpoint function, but instead it just returns whatever values were previously in the points table.
I tried multiple approaches, such as clearing the points table before all the other functions run and returning only the values corresponding to the username in the points table, but neither worked. When I checked the points table after running with a random value, it contained the random username, but with the points of the previous valid username.
NOTE: The points table is temporary and values are assigned to it only when an API call is made.
I am a complete beginner to all this and this is more of a pet project I have been working on, so please help out. Any guidance regarding my logic and design, and a fix for this, would be of much use. Thanks.

If the code you have provided for metrics.py is accurate, your problem likely comes from how you declare the variable ids.
In your code you declared it as a global, so it is not reset on every function call or class instantiation.
What you should do instead is declare it inside get_tweet_ids():
class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        ids = []  # modification here
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='password')
            cursor = connection.cursor()
            query = "truncate table twitterdb.points"
            query1 = "truncate table twitterdb.Metrics"
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(query)
            cursor.execute(query1)
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
            return ids  # modification here
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        finally:
            if connection.is_connected():
                cursor.close()
                connection.close()
With this you will get a fresh ids list on every get_tweet_ids call.
You will have to change the rest of your code to make use of this return statement.
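For illustration, the endpoint could consume that return value as in the sketch below. This is a hedged illustration, not the original code: the HTTPException guard and passing ids into metrics.main() are assumptions about how you might rewire it.

from fastapi import HTTPException

@app.get('/userdata/', response_model=UserName)
async def get_points(user: str):
    metrics.clear()
    ids = tweet_id(user).get_tweet_ids(str(user))  # fresh list per call
    if not ids:
        # Unknown username: fail fast instead of serving leftover points rows.
        raise HTTPException(status_code=404, detail=f"No tweets found for user {user}")
    metrics.main(ids)  # assumes main() is adapted to take the ids list
    summer(user).sum_fun(str(user))
    query = user_points.select().where(user_points.c.username == user)
    row = await database.fetch_one(query)
    return {**row}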

Related

Why isn't FastAPI querying the database and returning the result here?

Long code ahead, kindly help out.
I am trying to create a points system for tweets. I have streamed tweets with #Python to a MySQL database, and I am trying to build the points system on top of that.
from typing_extensions import Self
import requests
import os
import json
import mysql.connector
from mysql.connector import Error

bearer_token = "$Bearer"

# Getting tweet ids of specified user from database
ids = []

class tweet_id:
    def __init__(self, name):
        self.name = name

    def get_tweet_ids(self, name):
        try:
            connection = mysql.connector.connect(host='localhost',
                                                 database='twitterdb',
                                                 user='root',
                                                 password='pasword#123')
            cursor = connection.cursor()
            sql_select_query = """SELECT tweetid FROM twitterdb.StreamData WHERE username = %s"""
            # set variable in query
            cursor.execute(sql_select_query, (name,))
            # fetch result
            record = cursor.fetchall()
            for row in record:
                ids.append(int(row[0]))
        except mysql.connector.Error as error:
            print("Failed to get record from MySQL table: {}".format(error))
        """finally:
        if connection.is_connected():
            cursor.close()
            connection.close()"""

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    converted_list = [str(element) for element in ids]
    id_list = ",".join(converted_list)
    url = "https://api.twitter.com/2/tweets?ids={}&{}".format(id_list, tweet_fields)
    return url

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {} {}".format(
                response.status_code, response.text, ids
            )
        )
    return response.json()

def main():
    #def __init__(connect, append_to_database):
    #    Self.connect = connect
    #    Self.append_to_database = append_to_database

    def connect(tweetid, retweet_count, reply_count, like_count, quote_count):
        """
        connect to MySQL database and insert twitter data
        """
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                """
                Insert twitter data
                """
                cursor = con.cursor(buffered=True)
                # twitter, golf
                query = "INSERT INTO Metrics (tweetid,retweet_count,reply_count,like_count,quote_count) VALUES (%s, %s, %s, %s, %s)"
                cursor.execute(query, (tweetid, retweet_count, reply_count, like_count, quote_count))
                con.commit()
        except Error as e:
            print(e)
        cursor.close()
        con.close()
        return

    def append_to_database(json_response):
        # Loop through each tweet
        for tweet in json_response['data']:
            # Tweet ID
            tweetid = tweet['id']
            # Tweet metrics
            retweet_count = tweet['public_metrics']['retweet_count']
            reply_count = tweet['public_metrics']['reply_count']
            like_count = tweet['public_metrics']['like_count']
            quote_count = tweet['public_metrics']['quote_count']
            connect(tweetid, retweet_count, reply_count, like_count, quote_count)

    url = create_url()
    json_response = connect_to_endpoint(url)
    append_to_database(json_response)
# Function for connecting and inserting to the database
# Function to calculate the sum of points and display it
class summer:
    like_points = 0
    reply_points = 0
    total_rts = 0
    rt_points = 0
    total = 0

    def sum_fun():
        try:
            con = mysql.connector.connect(host='localhost',
                                          database='twitterdb', user='root', password='pasword#123', charset='utf8')
            if con.is_connected():
                cursor = con.cursor(buffered=True)

                def sum_rts():
                    cursor.execute("SELECT SUM(retweet_count) FROM twitterdb.Metrics")
                    sum1 = cursor.fetchall()[0][0]
                    if sum1 is None:
                        return 0
                    else:
                        return int(sum1)

                def sum_replies():
                    cursor.execute("SELECT SUM(reply_count) FROM twitterdb.Metrics")
                    sum2 = cursor.fetchall()[0][0]
                    if sum2 is None:
                        return 0
                    else:
                        return int(sum2)

                def sum_likes():
                    cursor.execute("SELECT SUM(like_count) FROM twitterdb.Metrics")
                    sum3 = cursor.fetchall()[0][0]
                    if sum3 is None:
                        return 0
                    else:
                        return int(sum3)

                def sum_qts():
                    cursor.execute("SELECT SUM(quote_count) FROM twitterdb.Metrics")
                    sum4 = cursor.fetchall()[0][0]
                    if sum4 is None:
                        return 0
                    else:
                        return int(sum4)

                like_points = (20 * (sum_likes()))
                reply_points = (100 * (sum_replies()))
                total_rts = (sum_rts() + sum_qts())
                rt_points = (300 * total_rts)
                total = (like_points + reply_points + rt_points)
                #return total
                #print("Like Points:", like_points)
                #print("Reply Points:", reply_points)
                #print("Retweet Points:", rt_points)
                #print("Total Points:", total)
                # print(points)
        except Error as e:
            print(e)
        cursor.close()
        con.close()

def clear():
    """
    connect to MySQL database and truncate the Metrics table
    """
    try:
        con = mysql.connector.connect(host='localhost',
                                      database='twitterdb', user='root', password='Mysql#123', charset='utf8')
        if con.is_connected():
            cursor = con.cursor(buffered=True)
            cursor.execute("truncate table twitterdb.Metrics")
    except Error as e:
        print(e)
    #cursor.close()
    #con.close()
    return
Furthermore, I have created an API with FastAPI to trigger all the functionality in the above script and send the outputs (like_points, reply_points, rt_points, and total) via the API. The API accepts the value username via a POST request and triggers the script.
API code:
from fastapi import FastAPI
from pydantic import BaseModel
from metrics import tweet_id
from metrics import create_url
from metrics import summer
import metrics
import uvicorn
from typing_extensions import Self

app = FastAPI()

class Username(BaseModel):
    username: str

@app.post('/Username')
def Username(Username: Username):
    username = Username.username
    tweets_list = tweet_id(username)
    tweets_list.get_tweet_ids(str(username))
    metrics.clear()
    metrics.main()
    points = summer.sum_fun()
    return {points.total}

if __name__ == "__main__":
    uvicorn.run("api:app", host="127.0.0.1", port=5000, log_level="info")
I am unable to get the output, and even though the request completes I get null as the result. Why is that happening? Also, I am very new to a lot of this, so code improvement suggestions and modifications are very welcome. Thank you.
You have commented out the return in your sum_fun() function:
total = (like_points + reply_points + rt_points)
#return total
#print("Like Points:", like_points)
That's the reason None is returned when sum_fun() is invoked.
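With the return restored, sum_fun() hands back a plain integer, so the endpoint should return that value directly rather than points.total (an int has no .total attribute). A minimal sketch, where the response key name is my own choice:

@app.post('/Username')
def get_user_points(payload: Username):
    username = payload.username
    tweet_id(username).get_tweet_ids(str(username))
    metrics.clear()
    metrics.main()
    total = summer.sum_fun()        # now returns an int instead of None
    return {"total_points": total}  # plain value, not points.total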

Why does my JSON create multiple entries instead of updating them?

I have a command where you can enter a correct answer. If it is correct, the user is credited with points in a JSON file. But my update function seems to be broken, because after another correct execution a new entry is made in the JSON for the same user. However, I just want the points to update. Also, the JSON stops after the second entry for that user. What went wrong?
Code:
correct_answers = "A"
# Open the JSON after start
def json_open():
with open('users.json', 'r', encoding='utf-8') as f:
users = json.load(f)
return users
class Questions(commands.Cog, name='Question'):
"""Question bot"""
def __init__(self, bot):
super().__init__()
self.bot = bot
#commands.command()
async def question(self, ctx, answer):
self.question.enabled = False
global correct_answers
if correct_answers != answer:
await ctx.author.send(f"You guessed {answer} which is **wrong**. Good luck next time!")
await ctx.message.delete()
return
# OPEN JSON FILE, LOAD DATA
with open('users.json', 'r') as f:
users = json.load(f)
await self.update_data(users, ctx.message.author)
await self.add_experience(users, ctx.message.author, 10)
with open('users.json', 'w') as f:
json.dump(users, f)
await ctx.message.delete()
# UPDATE DATA
async def update_data(self, users, user):
if not user.id in users:
users[user.id] = {}
users[user.id]['Points'] = 0
#users[user.id]['level'] = 1
async def add_experience(self, users, user, exp):
users[user.id]['Points'] += exp
It looks like the last two functions do not work, or is the add_experience function not needed?
The JSON looks like this after the second execution:
{"MYID": {"Points": 10}, "MYIDAGAIN": {"Points": 10}}
The ID is converted into a str when it passes through JSON, since JSON object keys are always strings, so you have to update the functions a bit. To explain it better:
Turn the user.id into a str.
async def update_data(self, users, user):
    key = str(user.id)
    if key not in users:
        users[key] = {}
        users[key]['Points'] = 0

async def add_experience(self, users, user, exp):
    users[str(user.id)]['Points'] += exp
Maybe also have a look at the page where the problem is explained.
I think that you have to convert the user ID to a string, like this:
users[str(user.id)]['Points'] += exp
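The root cause is easy to demonstrate: JSON object keys are always strings, so integer keys go in as ints but come back out as strings after a round trip. A minimal, self-contained example:

import json

users = {123456789: {"Points": 10}}       # int key, as user.id provides
restored = json.loads(json.dumps(users))  # simulate saving and reloading the file

print(restored)                 # {'123456789': {'Points': 10}}
print(123456789 in restored)    # False -- the key is now a string
print('123456789' in restored)  # True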

Django rounds the last 3 digits while displaying a bigint from MySQL

I am using Django to display rows from MySQL.
The table in MySQL has a bigint primary key; one of the values is 871195445245063168, 18 digits.
But on my page I see 871195445245063200 displayed: the last 3 digits are rounded off. I am wondering where I went wrong.
1. I define a class with a method named data_query to query MySQL:
class MyQuery:
    def __init__(self):
        self.conn = MySQLdb.connect(host=self.DBHOST, user=self.DBUSER,
                                    passwd=self.DBPWD, port=self.DBPORT,
                                    charset=self.CHARSET, connect_timeout=3)

    def data_query(self, sql):
        cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
        start = time.time()
        cursor.execute(sql)
        end = time.time()
        sql_time = end - start
        column_description = cursor.description
        column_name = [column[0] for column in column_description]
        res = cursor.fetchall()
        cursor.close()
        self.conn.close()
        return res, column_name, sql_time
2. I define a JSON encoder as follows:
class CJsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            try:
                return obj.strftime('%Y-%m-%d %H:%M:%S')
            except ValueError:
                return str(obj)
        elif isinstance(obj, datetime.date):
            try:
                return obj.strftime('%Y-%m-%d')
            except ValueError:
                return str(obj)
        elif isinstance(obj, datetime.timedelta):
            return str(obj)
        elif isinstance(obj, decimal.Decimal):
            return float(obj)
        elif isinstance(obj, ObjectId):
            return str(obj)
        else:
            return json.JSONEncoder.default(self, obj)
3. I build my response like this, with sensitive info replaced:
db = MyQuery(host, user, pwd, port)
sql_statement = 'select * from mytable where Findex=871195445245063168 limit 10'
sql_result, table_column_name, sql_time = db.data_query(sql_statement)
query_result = {}
column_name = column_format(table_column_name)
query_result['column'] = column_name
query_result['data'] = list(sql_result)
return HttpResponse(json.dumps(query_result, cls=CJsonEncoder), content_type='application/json')
So, where did I go wrong here? Thanks.
This is a JavaScript issue. Your number is bigger than the largest safe integer in JavaScript (Number.MAX_SAFE_INTEGER), so it is rounded.
You can verify this in your browser console or in node.js
$ node
> x = 871195445245063168
871195445245063200
I assume you are either using the response in some kind of JavaScript frontend, or you have a browser extension that renders the JSON, which is written in JavaScript.
If you request that URL with a client like curl, you will see that the number is returned correctly by the server.
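If the frontend can treat the ID as an opaque value, one hedged workaround (my own sketch, not part of the question's code) is to stringify integers that exceed JavaScript's safe range before serializing:

import json

JS_MAX_SAFE_INT = 2**53 - 1  # Number.MAX_SAFE_INTEGER

def stringify_big_ints(obj):
    # Walk the structure and turn out-of-range ints into strings.
    if isinstance(obj, dict):
        return {k: stringify_big_ints(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [stringify_big_ints(v) for v in obj]
    if isinstance(obj, int) and abs(obj) > JS_MAX_SAFE_INT:
        return str(obj)
    return obj

payload = {"Findex": 871195445245063168}
print(json.dumps(stringify_big_ints(payload)))  # {"Findex": "871195445245063168"}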

Lambda (Python 3.6) PyMySql Query EXISTS query always returns 1

I am trying to get a PyMySQL query in Lambda (Python 3.6) to return whether a user exists or not. I pass my Slack user ID into the query; this is what I want to check in MySQL. I can run the same query directly in MySQL and it returns 0, but for some reason, every time I call this query through Lambda, it tells me the user exists (my database is empty). My query function is this:
def userExists(user):
    statement = f"SELECT EXISTS(SELECT 1 FROM slackDB.Assets WHERE userID LIKE '%{user}%')Assets"
    tempBool = cursor.execute(statement, args=None)
    conn.commit()
    return tempBool
Here is the full code I am working with:
################################
# Slack Lambda handler.
################################
import sys
import logging
import os
import pymysql
import urllib

# Grab data from the environment.
BOT_TOKEN = os.environ["BOT_TOKEN"]
ASSET_TABLE = os.environ["ASSET_TABLE"]
REGION_NAME = os.getenv('REGION_NAME', 'us-east-2')
DB_NAME = "admin"
DB_PASSWORD = "somepassword"
DB_DATABASE = "someDB"
RDS_HOST = "myslackdb.somepseudourl.us-east-2.rds.amazonaws.com"
port = 3306

logger = logging.getLogger()
logger.setLevel(logging.INFO)

try:
    conn = pymysql.connect(RDS_HOST, user=DB_NAME, passwd=DB_PASSWORD, db=DB_DATABASE, connect_timeout=5)
    cursor = conn.cursor()
except:
    logger.error("ERROR: Unexpected error: Could not connect to MySql instance.")
    sys.exit()

# Define the URL of the targeted Slack API resource.
SLACK_URL = "https://slack.com/api/chat.postMessage"

def userExists(user):
    statement = f"SELECT EXISTS(SELECT 1 FROM slackDB.Assets WHERE userID LIKE '%{user}%')Assets"
    tempBool = cursor.execute(statement, args=None)
    conn.commit()
    return tempBool

def addUser(user):
    statement = f"INSERT INTO `slackDB`.`Assets` (`userID`, `money`) VALUES ('{user}', '1000')"
    tempBool = cursor.execute(statement, args=None)
    conn.commit()
    return tempBool

def lambda_handler(data, context):
    # Slack challenge answer.
    if "challenge" in data:
        return data["challenge"]

    # Grab the Slack channel data.
    slack_event = data['event']
    slack_userID = slack_event["user"]
    slack_text = slack_event["text"]
    channel_id = slack_event["channel"]
    slack_reply = ""

    # Ignore bot messages.
    if "bot_id" in slack_event:
        slack_reply = ""
    else:
        # Start data sift.
        if slack_text.startswith("!networth"):
            slack_reply = "Your networth is: "
        elif slack_text.startswith("!price"):
            command, asset = slack_text.split()
            slack_reply = f"The price of a(n) {asset} is: "
        elif slack_text.startswith("!addme"):
            if userExists(slack_userID):
                slack_reply = f"User {slack_userID} already exists"
            else:
                slack_reply = f"Adding user {slack_userID}"
                addUser(slack_userID)

    # We need to send back three pieces of information:
    data = urllib.parse.urlencode(
        (
            ("token", BOT_TOKEN),
            ("channel", channel_id),
            ("text", slack_reply)
        )
    )
    data = data.encode("ascii")

    # Construct the HTTP request that will be sent to the Slack API.
    request = urllib.request.Request(
        SLACK_URL,
        data=data,
        method="POST"
    )

    # Add a header mentioning that the text is URL-encoded.
    request.add_header(
        "Content-Type",
        "application/x-www-form-urlencoded"
    )

    # Fire off the request!
    urllib.request.urlopen(request).read()

    # Everything went fine.
    return "200 OK"
I am typing '!addme' in Slack and it always tells me the user exists. I have printed out my query statement, and it is inputting my Slack ID correctly. I have checked my table, and it is completely empty. I have run the query in MySQL and it returns 0.
Does anyone have any ideas? Am I just derping this up on something easy? Any help or hints are much appreciated.
Thanks,
I don't see a fetch from the cursor, just the execute.
The return value from execute is the number of rows affected. For DML operations (INSERT/UPDATE/DELETE) that makes sense, but I wouldn't rely on the rows-affected count for a SELECT.
In this case, the SELECT EXISTS query is going to either return a row or throw an error. But the fact that the query returns a row doesn't tell us anything about the value of the Assets column.
From the query, it looks like we want to fetch a row, and then determine whether the Assets column contains a 0 or 1 (or NULL).
After the query execution, try cursor.fetchone() to retrieve the row.
We could also execute a simpler query, and then use a fetch to determine whether a row is returned or not.
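For example, a hedged rewrite of userExists() along those lines, fetching the row and reading the 0/1 flag, and passing the pattern as a bound parameter instead of interpolating it into the SQL:

def userExists(user):
    # EXISTS(...) always yields exactly one row whose single column is 0 or 1.
    statement = "SELECT EXISTS(SELECT 1 FROM slackDB.Assets WHERE userID LIKE %s)"
    cursor.execute(statement, ('%' + user + '%',))
    row = cursor.fetchone()
    return bool(row[0])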

scrapy and mysql

I am trying to get Scrapy to insert crawled data into MySQL. My code crawls fine and collects the data in the buffer, and does not error, but the database is never updated.
No luck, no error.
pipeline.py
from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors

class SQLStorePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='craigs',
                user='bra', passwd='boobs', cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8', use_unicode=True)

    def process_item(self, items, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, items)
        query.addErrback(self.handle_error)
        return items

    def _conditional_insert(self, tx, items):
        # create record if it doesn't exist.
        # this whole block runs on its own thread
        tx.execute("select * from scraped where link = %s", (items['link'][0], ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % items, level=log.DEBUG)
        else:
            tx.execute(
                "insert into scraped (posting_id, email, location, text, title) "
                "values (%s, %s, %s, %s, %s)",
                (items['posting_id'][0],
                 items['email'][1],
                 items['location'][2],
                 items['text'][3],
                 items['title'][4],
                )
            )
            log.msg("Item stored in db: %s" % items, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
Crawl code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigs.items import CraigsItem

class MySpider(CrawlSpider):
    name = "craigs"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    rules = [Rule(SgmlLinkExtractor(restrict_xpaths=('/html/body/blockquote[3]/p/a',)), follow=True, callback='parse_profile')]

    def parse_profile(self, response):
        items = []
        img = CraigsItem()
        hxs = HtmlXPathSelector(response)
        img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
        img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
        items.append(img)
        return items[0]
        return img[0]
settings.py
BOT_NAME = 'craigs'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['craigs.spiders']
NEWSPIDER_MODULE = 'craigs.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
The reason the pipeline code is not being called at all is that it hasn't been activated. Activation is done by adding a new section to settings.py, as per the Item Pipelines page in the documentation, e.g.:
ITEM_PIPELINES = [
    'craigs.pipeline.SQLStorePipeline',
]
Additionally, your parse_profile function should just return img. You would only build up and return a list of items if a single response page resulted in multiple items.
Activate the pipeline in settings and use yield instead of return, as in the sketch below.
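For illustration, parse_profile rewritten with yield might look like this (a sketch based on the fields in the question):

def parse_profile(self, response):
    img = CraigsItem()
    hxs = HtmlXPathSelector(response)
    img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
    img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
    yield img  # Scrapy feeds each yielded item through the activated pipeline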
You should COMMIT the current transaction, which makes the changes permanent.
So after
tx.execute(
    "insert into scraped (posting_id, email, location, text, title) "
    "values (%s, %s, %s, %s, %s)",
    (items['posting_id'][0],
     items['email'][1],
     items['location'][2],
     items['text'][3],
     items['title'][4],
    )
)
you have to
db.commit()
db here is something like
db = MySQLdb.connect(host="localhost",user = "root", passwd = "1234", db="database_name")
Please try it.