Storing scraped data with SCRAPY in MySQL database

Storing scraped data with SCRAPY in MySQL database - mysql

I'm new here; this is the first time I'm using Scrapy and I really need help. I know this has been asked before, and I tried a lot of the suggested solutions, but none of them work.
My pipelines file:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import DropItem
from scrapy.http import Request
from projetpfe.items import ProjetpfeItem
class MySQLStorePipeline(object):
    """Scrapy item pipeline that writes each scraped article into MySQL.

    The connection is opened, and the ``scrapeddata2`` table created if it is
    missing, when the pipeline is instantiated.
    """

    def __init__(self):
        # Connection/DDL errors simply propagate: the previous handler only
        # re-raised them, so removing it keeps the behaviour (and, unlike
        # ``raise e`` on Python 2, the original traceback).
        self.conn = MySQLdb.connect(user='root', passwd='root123', host='localhost', db='pressebam', use_unicode=True, charset='utf8')
        self.cursor = self.conn.cursor()
        self.cursor.execute("CREATE TABLE IF NOT EXISTS scrapeddata2( idscrapedData INT NOT NULL AUTO_INCREMENT PRIMARY KEY, nomOrganePresse VARCHAR(200), titreArticle VARCHAR(200), url VARCHAR(200), nomJournaliste VARCHAR(200), jour VARCHAR(100), annee VARCHAR(100), categorie VARCHAR(100), contenuArticle VARCHAR(5000), lienImage VARCHAR(200)) ")
        self.conn.commit()

    @staticmethod
    def _as_text(value):
        """Collapse a list/tuple of extracted strings into one string.

        The spider fills every field with the list returned by ``extract()``;
        a list is not a valid value for a single ``%s`` SQL placeholder.
        Scalar values are returned unchanged.
        """
        if isinstance(value, (list, tuple)):
            return ''.join(value)
        return value

    def process_item(self, item, spider):
        """Insert ``item`` into ``scrapeddata2`` and return it.

        Database errors are reported and swallowed (best effort) so the item
        still reaches any later pipeline.
        """
        row = tuple(
            self._as_text(item[key])
            for key in ('OrganePresse', 'Titre', 'URL', 'Jour', 'Contenu', 'LienImage')
        )
        try:
            self.cursor.execute("INSERT INTO scrapeddata2 ( nomOrganePresse, titreArticle, url, jour, contenuArticle, lienImage) VALUES (%s, %s, %s,%s,%s, %s)", row)
            self.conn.commit()
        except MySQLdb.Error as e:
            # Parenthesised form prints identically on Python 2 and 3.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
And this is my spider file
import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from projetpfe.items import ProjetpfeItem
class ProjetpfeSpider(CrawlSpider):
    """Crawl telquel.ma and yield one ``ProjetpfeItem`` per article page."""

    name = 'telquel'
    start_urls = ['http://telquel.ma']  # urls from which the spider will start crawling
    rules = [
        # r'page/\d+': links containing page/<number> are followed, not parsed.
        Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
        # r'\d{4}/\d{2}/\d{2}/\w+': links containing YYYY/MM/DD/<title> are
        # handed to parse_telquel.
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']), callback='parse_telquel'),
    ]

    def parse_telquel(self, response):
        """Extract the article fields from ``response`` into an item.

        Every field receives the list returned by ``extract()``.
        """
        hxs = HtmlXPathSelector(response)
        item = ProjetpfeItem()
        # XPath attribute tests use '@' ('#' is not valid XPath).
        item['Titre'] = hxs.select("//h1[@class='article-title']/text()").extract()
        item['LienImage'] = hxs.select("//div[@class='main-article-content']//img[@class='setborder']/@src").extract()
        item['OrganePresse'] = hxs.select("//img[@class='logo']/@alt").extract()
        item['Jour'] = hxs.select("//div[@class='calendar-date']/text()").extract()
        item['Contenu'] = hxs.select("//div[@class='shortcode-content']").extract()
        item['URL'] = hxs.select("/html/head/link[5]/@href").extract()
        return item
This is the settings file
# Scrapy project settings.
BOT_NAME = 'projetpfe'

# Where spiders live, and where newly generated spiders are created.
SPIDER_MODULES = ['projetpfe.spiders']
NEWSPIDER_MODULE = 'projetpfe.spiders'

# Enabled item pipelines, keyed by dotted path, with their order value.
ITEM_PIPELINES = {
    'projetpfe.pipelines.MySQLStorePipeline': 300,
}
and finally my items
from scrapy.item import Item, Field
class ProjetpfeItem(Item):
    """Container for the fields scraped from one press article."""

    # Source and headline
    OrganePresse = Field()
    Titre = Field()
    Journaliste = Field()

    # Publication date and classification
    Jour = Field()
    Annee = Field()
    Categorie = Field()

    # Body, illustration and address of the article
    Contenu = Field()
    LienImage = Field()
    URL = Field()
So the spider works fine, but nothing is stored in the database. Help!

Related

SQLAlchemy Table classes and imports

I started a project using PostgreSQL and SQLAlchemy. Since I'm not an experienced programmer (I just started using classes) and am also quite new to databases, I noticed some workflows I don't really understand.
What i understand up till now from classes is the following workflow:
# filename.py
class ClassName():
    """Illustrative class: construct it, then call a method with arguments."""

    def __init__(self):
        # do something
        pass

    def some_function(self, var1, var2):
        """Placeholder method; the name matches the call made in main.py."""
        # do something with parameters
        pass
---------------------------------------
# main.py
from filename import ClassName
# Placeholder arguments (literal Ellipsis) standing in for real values.
par1 = ...
par2 = ...
# Instantiate the class, then call its method with the two arguments.
a = ClassName()
b = a.some_function(par1, par2)
Now i am creating tables from classes:
# base.py
from sqlalchemy.orm import declarative_base
Base = declarative_base()
# tables.py
from base import Base
from sqlalchemy import Column
from sqlalchemy import Integer, String
class A(Base):
    """Declarative model mapped to table ``a``."""

    __tablename__ = "a"

    a_id = Column(Integer, primary_key=True)  # primary key
    a_column = Column(String(30))             # text, up to 30 characters
class B(Base):
    """Declarative model mapped to table ``b``."""

    __tablename__ = "b"

    b_id = Column(Integer, primary_key=True)  # primary key
    b_column = Column(String(30))             # text, up to 30 characters
and
import typing
from base import Base
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy.orm import sessionmaker
from tables import A, B
metadata_obj = MetaData()
def create_tables(engine):
    """Create every table registered on ``Base`` and return tables a and b.

    The returned ``Table`` objects are reflected from the database into
    ``metadata_obj`` after creation.
    """
    session_factory = sessionmaker()
    session_factory.configure(bind=engine)

    Base.metadata.create_all(bind=engine)

    table_a, table_b = (
        Table(table_name, metadata_obj, autoload_with=engine)
        for table_name in ("a", "b")
    )
    return (table_a, table_b)  # not sure return is needed
if __name__ == "__main__":
    from urllib.parse import quote_plus

    username = "username"
    password = "AtPasswordHere!"
    dbname = "dbname"
    # Credentials are separated from the host by '@'. The password is
    # URL-escaped so characters such as '@' or '/' cannot break URL parsing.
    url = "postgresql://" + username + ":" + quote_plus(password) + "@localhost/" + dbname
    engine = create_engine(url, echo=True, future=True)
    a, b = create_tables(engine)
Everything works fine in that it creates Table A and Table B in the database. The point i don't understand is the following:
Both my IDE (pyflakes) and LGTM complain 'Tables. ... imported but not used'. (EDIT: I understand why it complains, since this is not the normal class flow. My question is more about why this is not the normal class workflow.)
Is this normal behavior for this usecase? I only see examples that make use of the above workflow
Are there better methods to create the same results (but without the warnings)
If this is the normal behavior: Is there an explanation for this? I didn't read it anywhere.

Bizarre Environment-dependent Bad Request 400 error

I'm writing a program to convert a repository into a Docker with an API based on some specification files. When I run the app on my Macbook's base environment, the computer-generated API works perfectly with both gunicorn and uwsgi. However, within the miniconda-based docker container, it failed with Bad Request 400: The browser (or proxy) sent a request that this server could not understand. My goal is to eliminate this error. Obviously, this has to do with the versions of some dependency or set of dependencies. Interestingly, the last endpoint in the API, which has a request parser within a namespace with no arguments, works perfectly, unlike the two other endpoints in the default namespace that do have arguments.
The API is built on flask_restx and uses reqparse.
The API code is here:
from flask_restx import Api, Resource, Namespace, reqparse, inputs
import flask
import process
from load_data import store_data
# Flask application and the flask_restx Api wrapping it.
app = flask.Flask("restful_api")
api = Api(app, title="My API", description="This is an extremely useful API for performing tasks you would do with an API.", version="3.14")
# Module-level data store, filled once at import time from store_data().
data = {}
data.update(store_data())
class DefaultClass():
    """Worker behind the default-namespace endpoints.

    Holds a reference to the module-level ``data`` and forwards parsed
    request arguments to the functions in ``process``.
    """

    def __init__(self):
        self.data = data

    def _replace_get(self, **args):
        """Call ``process.replace`` with the request arguments."""
        call_kwargs = {}
        call_kwargs.update(args)
        return process.replace(**call_kwargs)

    def _find_get(self, **args):
        """Call ``process.find_in_data_string``; request arguments take
        precedence over the server-side ``data`` default."""
        call_kwargs = {"data": self.data["data"]}
        call_kwargs.update(args)
        return process.find_in_data_string(**call_kwargs)
def set_up_worker():
    """Create the module-level ``defaultClass`` worker instance."""
    global defaultClass
    defaultClass = DefaultClass()


# Build the worker once, at import time.
set_up_worker()
# Request parsers for the GET endpoints.
#
# Every argument is pinned to location="args" (the query string). Without an
# explicit location the parser also consults the request's JSON body, and on
# newer Werkzeug releases reading that body from a GET request without a JSON
# content type fails with "400 Bad Request" before any argument is parsed.
_replaceGetParser = reqparse.RequestParser()
_replaceGetParser.add_argument("txt",
                               type=str,
                               required=True,
                               location="args",
                               help="Text to search ")
_replaceGetParser.add_argument("old",
                               type=str,
                               required=True,
                               location="args",
                               help="Substring to replace ")
_replaceGetParser.add_argument("new",
                               type=str,
                               required=True,
                               location="args",
                               help="Replacement for old ")
_replaceGetParser.add_argument("irrelevant_parameter",
                               type=int,
                               required=False,
                               default=5,
                               location="args",
                               help="")
_replaceGetParser.add_argument("smart_casing",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               location="args",
                               help="True if we should infer replacement capitalization from original casing. ")
_replaceGetParser.add_argument("case_sensitive",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               location="args",
                               help="True if we should only replace case-sensitive matches ")
_findGetParser = reqparse.RequestParser()
_findGetParser.add_argument("window",
                            type=int,
                            required=False,
                            default=5,
                            location="args",
                            help="Number of characters before and after first match to return ")
_findGetParser.add_argument("txt",
                            type=str,
                            required=False,
                            default="quick",
                            location="args",
                            help="Your search term ")
@api.route('/replace', endpoint='replace', methods=['GET'])
@api.doc('defaultClass')
class ReplaceFrontend(Resource):
    """GET /replace: parse the query arguments and run the replace worker."""

    @api.expect(_replaceGetParser)
    def get(self):
        args = _replaceGetParser.parse_args()
        return defaultClass._replace_get(**args)
@api.route('/find', endpoint='find', methods=['GET'])
@api.doc('defaultClass')
class FindFrontend(Resource):
    """GET /find: parse the query arguments and run the find worker."""

    @api.expect(_findGetParser)
    def get(self):
        args = _findGetParser.parse_args()
        return defaultClass._find_get(**args)
retrievalNamespace = Namespace("retrieval", description="Data retrieval operations")
class RetrievalNamespaceClass():
    """Worker behind the ``retrieval`` namespace endpoints."""

    def __init__(self):
        self.data = data

    def _retrieval_retrieve_data_get(self, **args):
        """Call ``process.return_data``; request arguments take precedence
        over the server-side ``data`` default."""
        call_kwargs = {"data": self.data["data"]}
        call_kwargs.update(args)
        return process.return_data(**call_kwargs)
def set_up_retrieval_worker():
    """Create the module-level ``retrievalNamespaceClass`` worker instance."""
    global retrievalNamespaceClass
    retrievalNamespaceClass = RetrievalNamespaceClass()


# Build the worker once, at import time.
set_up_retrieval_worker()
_retrieval_retrieve_dataGetParser = reqparse.RequestParser()
@retrievalNamespace.route('/retrieval/retrieve_data', endpoint='retrieval/retrieve_data', methods=['GET'])
@retrievalNamespace.doc('retrievalNamespaceClass')
class Retrieval_retrieve_dataFrontend(Resource):
    """GET handler returning the stored data (the parser has no arguments)."""

    @retrievalNamespace.expect(_retrieval_retrieve_dataGetParser)
    def get(self):
        args = _retrieval_retrieve_dataGetParser.parse_args()
        return retrievalNamespaceClass._retrieval_retrieve_data_get(**args)
api.add_namespace(retrievalNamespace)
I have had this problem with both pip-installed gunicorn and conda-installed uwsgi. I'm putting the file imported by the API at the end, since I think it is likely irrelevant what the function definitions are.
import numpy as np
import pandas as pd
import re
from subprocess import Popen, PIPE
from flask_restx import abort
def replace(txt: str = '',  # apireq
            old: str = '',  # apireq
            new: str = '',  # apireq
            case_sensitive: bool = True,
            smart_casing: bool = True,
            irrelevant_parameter: int = 5):
    """
    Search and replace within a string, as long as the string and replacement
    contain no four letter words.

    arguments:
        txt: Text to search
        old: Substring to replace
        new: Replacement for old
        case_sensitive: True if we should only replace case-sensitive matches
        smart_casing: True if we should infer replacement capitalization
                      from original casing. (Accepted but not implemented.)
        irrelevant_parameter: Accepted and ignored.

    return
        dict with the single key 'output' holding the resulting string.
        Aborts with HTTP 403 when txt or new contains a four letter word.
    """
    four_letter_words = [word
                         for word in ('%s %s' % (txt, new)).split()
                         if re.match(r'[a-zA-Z]{4}$', word)]
    if four_letter_words:
        error_message = ('Server refuses to process four letter word(s) %s'
                         % ', '.join(four_letter_words[:5])
                         + (', etc' if len(four_letter_words) > 5 else ''))
        abort(403, custom=error_message)
    if case_sensitive:
        # Exact-case matches only.
        output = txt.replace(old, new)
    else:
        # Match `old` regardless of case. The callable replacement keeps
        # backslashes in `new` from being read as regex escapes.
        output = re.sub(re.escape(old), lambda _match: new, txt,
                        flags=re.IGNORECASE)
    return {'output': output}
def find_in_data_string(txt: str = "quick",  # req
                        window: int = 5,
                        data=None):  # noapi
    """
    Check if there is a match for your search string in our extensive database,
    and return the position of the first match with the surrounding text.

    arguments:
        txt: Your search term
        data: The server's text data
        window: Number of characters before and after first match to return

    return
        {'string_found': False} when there is no match (or no data);
        otherwise a dict with 'string_found', 'position',
        'surrounding_string' and 'surrounding_string_indices'
        ([start, end), end exclusive).
    """
    not_found = {'string_found': False}
    if data is None:
        return not_found
    idx = data.find(txt)
    if idx < 0:
        return not_found
    min_idx = max(idx - window, 0)
    # The slice end is exclusive, so the upper bound is len(data) itself;
    # capping at len(data) - 1 dropped the last character.
    max_idx = min(idx + len(txt) + window, len(data))
    return {
        'string_found': True,
        'position': idx,
        'surrounding_string': data[min_idx:max_idx],
        'surrounding_string_indices': [min_idx, max_idx],
    }
def return_data(data=None):  # noapi
    """
    Return all the data in our text database.

    Aborts with HTTP 503 when the probe command below prints anything to
    stdout; otherwise returns {'data': data}.
    """
    # NOTE(review): with shell=True and a list argv, POSIX runs only the first
    # element ('which') as the command; 'aws' becomes an argument to the shell
    # itself. Left unchanged because the intended check is unclear — confirm.
    with Popen(['which', 'aws'], shell=True, stdout=PIPE) as p:
        output = p.stdout.read()
    # Explicit test instead of `assert`, which is stripped under `python -O`.
    if output.strip():
        abort(503, custom='The server is incorrectly configured.')
    return {'data': data}

How do I write SQLAlchemy test fixtures for FastAPI applications

I am writing a FastAPI application that uses a SQLAlchemy database. I have copied the example from the FastAPI documentation, simplifying the database schema for concision's sake. The complete source is at the bottom of this post.
This works. I can run it with uvicorn sql_app.main:app and interact with the database via the Swagger docs. When it runs it creates a test.db in the working directory.
Now I want to add a unit test. Something like this.
from fastapi import status
from fastapi.testclient import TestClient
from pytest import fixture
from main import app
@fixture
def client() -> TestClient:
    """Provide a TestClient bound to the application under test."""
    return TestClient(app)
def test_fast_sql(client: TestClient):
    """Listing users succeeds and returns an empty list."""
    reply = client.get("/users/")
    assert reply.status_code == status.HTTP_200_OK
    assert reply.json() == []
Using the source code below, this takes the test.db in the working directory as the database. Instead I want to create a new database for every unit test that is deleted at the end of the test.
I could put the global database.engine and database.SessionLocal inside an object that is created at runtime, like so:
class UserDatabase:
    """Owns a SQLite engine and session factory stored under ``directory``."""

    def __init__(self, directory: Path):
        # Make sure the target directory exists before SQLite opens the file.
        directory.mkdir(exist_ok=True, parents=True)
        db_url = f"sqlite:///{directory}/store.db"
        self.engine = create_engine(db_url, connect_args={"check_same_thread": False})
        self.SessionLocal = sessionmaker(
            autocommit=False,
            autoflush=False,
            bind=self.engine,
        )
        # Create every table declared on the models' Base in this database.
        models.Base.metadata.create_all(bind=self.engine)
but I don't know how to make that work with main.get_db, since the Depends(get_db) logic ultimately assumes database.engine and database.SessionLocal are available globally.
I'm used to working with Flask, whose unit testing facilities handle all this for you. I don't know how to write it myself. Can someone show me the minimal changes I'd have to make in order to generate a new database for each unit test in this framework?
The complete source of the simplified FastAPI/SQLAlchemy app is as follows.
database.py
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# SQLite database file in the current working directory.
SQLALCHEMY_DATABASE_URL = "sqlite:///./test.db"
# check_same_thread=False is passed through to the SQLite driver.
engine = create_engine(
SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
# Session factory bound to the engine above; autocommit and autoflush are off.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class that the ORM models inherit from.
Base = declarative_base()
models.py
from sqlalchemy import Column, Integer, String
from database import Base
class User(Base):
    """ORM model mapped to the ``users`` table."""

    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)  # indexed primary key
    name = Column(String)
    age = Column(Integer)
schemas.py
from pydantic import BaseModel
class UserBase(BaseModel):
    """Fields shared by every user schema."""

    name: str
    age: int
class UserCreate(UserBase):
    """Payload for creating a user; adds nothing to ``UserBase``."""
class User(UserBase):
    """User as returned by the API, including its database id."""

    id: int

    class Config:
        # Allow building the schema from ORM objects, not only from dicts.
        orm_mode = True
crud.py
from sqlalchemy.orm import Session
import schemas
import models
def get_user(db: Session, user_id: int):
    """Return the user whose id is ``user_id``, or None when there is none."""
    matching = db.query(models.User).filter(models.User.id == user_id)
    return matching.first()
def get_users(db: Session, skip: int = 0, limit: int = 100):
    """Return up to ``limit`` users, skipping the first ``skip`` rows."""
    page = db.query(models.User).offset(skip).limit(limit)
    return page.all()
def create_user(db: Session, user: schemas.UserCreate):
    """Persist a new user built from ``user`` and return the stored row."""
    new_row = models.User(name=user.name, age=user.age)
    db.add(new_row)
    db.commit()
    # Reload so database-generated values (such as the id) are populated.
    db.refresh(new_row)
    return new_row
main.py
from typing import List
from fastapi import Depends, FastAPI, HTTPException
from sqlalchemy.orm import Session
import schemas
import models
import crud
from database import SessionLocal, engine
models.Base.metadata.create_all(bind=engine)
app = FastAPI()
# Dependency
def get_db():
    """Yield a database session and always close it afterwards.

    The session is created before the ``try`` so that a failure in
    ``SessionLocal()`` surfaces as itself instead of a NameError raised by
    ``db.close()`` in the ``finally`` clause.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
@app.post("/users/", response_model=schemas.User)
def create_user(user: schemas.UserCreate, db: Session = Depends(get_db)):
    """POST /users/: store a new user and return it."""
    return crud.create_user(db=db, user=user)
@app.get("/users/", response_model=List[schemas.User])
def read_users(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)):
    """GET /users/: return a page of users."""
    users = crud.get_users(db, skip=skip, limit=limit)
    return users
@app.get("/users/{user_id}", response_model=schemas.User)
def read_user(user_id: int, db: Session = Depends(get_db)):
    """GET /users/{user_id}: return one user, or respond 404 if unknown."""
    db_user = crud.get_user(db, user_id=user_id)
    if db_user is None:
        raise HTTPException(status_code=404, detail="User not found")
    return db_user

You need to override your get_db dependency in your tests, see these docs.
Something like this for your fixture:
@fixture
def db_fixture() -> Session:
    """Placeholder fixture: replace the body with code returning a temporary session."""
    raise NotImplementedError()  # Make this return your temporary session
@fixture
def client(db_fixture):
    """Yield a TestClient whose ``get_db`` dependency returns ``db_fixture``.

    The override is removed again after the test so it cannot leak into
    tests that do not request this fixture.
    """
    def _get_db_override():
        return db_fixture

    app.dependency_overrides[get_db] = _get_db_override
    try:
        yield TestClient(app)
    finally:
        app.dependency_overrides.pop(get_db, None)

Retrieving MySQL with Kivy

I have a Kivy code, where the output is:
I want to replace the Box No. labels with strings retrieved from MySQL.
So far I have tried to implement the MySQL to the python script:
class RemoveScreen(MyLayout):
    """Look up a part name in MySQL and expose it through two labels."""

    def __init__(self,**kwargs):
        # NOTE(review): StringVar and Label(..., textvariable=...) look like
        # Tkinter idioms rather than Kivy ones, and the parent __init__ is
        # never called — confirm against MyLayout before relying on this.
        db = MySQLdb.connect("localhost", "root", "[PASSWORD]", "tcs_microrage_crm")
        try:
            cursor=db.cursor()
            self.var = StringVar()
            self.label1 = Label(self, text=0, textvariable=self.var)
            myvar=str(self.var)
            #http://stackoverflow.com/questions/775296/python-mysql-parameterized-queries
            # Query parameters must be a sequence: (myvar,) is a one-element
            # tuple, whereas (myvar) is just the string itself.
            cursor.execute("SELECT part_name FROM stock_lists WHERE part_number = %s", (myvar,))
            self.myvar=StringVar()
            self.myvar.set(cursor.fetchone())
            self.label2 = Label(self, text=0, textvariable=myvar)
        finally:
            # The connection is local to this method; close it so it is not leaked.
            db.close()
But this didn't work.
Q: How can I do MySQL queries and print individual strings in the kv file.

To show you how you could do that, I made a little search example.
This searches for fruit names in the database, and will output its name and price to the table.
from kivy.app import App
import MySQLdb
from kivy.uix.boxlayout import BoxLayout
from kivy.uix.gridlayout import GridLayout
from kivy.uix.label import Label
from kivy.uix.button import Button
from kivy.uix.textinput import TextInput
class DbCon:
    """Thin wrapper around a MySQL connection to the ``kivy`` database."""

    def __init__(self):
        self.db = MySQLdb.connect(user="root",passwd="pw",db="kivy")
        self.c = self.db.cursor()

    def get_rows(self,search = ""):
        """Return up to three ``fruit`` rows whose name matches ``search``.

        The search text comes from a user-editable input, so it is passed as
        a bound parameter instead of being interpolated into the SQL string.
        """
        pattern = ".*%s.*" % search
        self.c.execute("SELECT * FROM fruit WHERE name REGEXP %s LIMIT 3", (pattern,))
        return self.c.fetchall()
class Table(BoxLayout):
    """Vertical layout with a search bar on top of a three-row fruit/price table."""

    def __init__(self,**kwargs):
        super(Table,self).__init__(**kwargs)
        self.orientation = "vertical"

        # Search bar: a text input followed by a button that triggers search().
        self.search_field = BoxLayout(orientation="horizontal")
        self.search_input = TextInput(text='search',multiline=False)
        self.search_button = Button(text="search",on_press=self.search)
        for widget in (self.search_input, self.search_button):
            self.search_field.add_widget(widget)
        self.add_widget(self.search_field)
        self.add_widget(Label(text="table"))

        # Grid: one header row plus three data rows of (item, price) labels.
        self.table = GridLayout(cols=2,rows=4)
        for heading in ("Fruit", "Price"):
            self.table.add_widget(Label(text=heading))
        self.rows = [[Label(text="item"),Label(text="price")] for _ in range(3)]
        for item_label, price_label in self.rows:
            self.table.add_widget(item_label)
            self.table.add_widget(price_label)
        self.add_widget(self.table)

        self.db = DbCon()
        self.update_table()

    def update_table(self,search=""):
        """Write the rows returned for ``search`` into the table labels."""
        for position, record in enumerate(self.db.get_rows(search)):
            item_label, price_label = self.rows[position]
            item_label.text = record[1]
            price_label.text = str(record[2])

    def clear_table(self):
        """Blank the text of every data label."""
        for item_label, price_label in self.rows[:3]:
            item_label.text = ""
            price_label.text = ""

    def search(self, *args):
        """Button callback: clear the table, then show matches for the input text."""
        self.clear_table()
        self.update_table(self.search_input.text)
class MyApp(App):
    """Kivy application whose root widget is the search table."""

    def build(self):
        root_widget = Table()
        return root_widget


MyApp().run()

SQLAlchemy: replacing object with a new one, following defaults

I want to create a new instance of an SQLAlchemy object, so that fields are filled with default values, but I want to commit that to the database generating an UPDATE to a row that already exists with the same primary key, effectively resetting it to the default values. Is there any simple way to do that?

I tried to do that and failed, because the SQLAlchemy session tracks the state of objects, so there is no easy way to make the session treat a new object as a persistent one.
But you want to reset object to default, do you? There is a simple way to do that:
from sqlalchemy.ext.declarative import declarative_base
class Base(object):
    """Mixin giving every declarative model a ``reset`` method."""

    def reset(self):
        """Set each column that declares a default back to that default."""
        table_columns = self.__class__.__table__.columns
        for attr_name, col in table_columns.items():
            if column_default_missing(col):
                continue
            setattr(self, attr_name, col.default.execute())


def column_default_missing(col):
    """Return True when ``col`` declares no default."""
    return col.default is None
Base = declarative_base(bind=engine, cls=Base)
This adds reset method to all your model classes.
Here is the complete working example to fiddle with:
import os
from datetime import datetime
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql import functions
here = os.path.abspath(os.path.dirname(__file__))
engine = create_engine('sqlite:///%s/db.sqlite' % here, echo=True)
Session = sessionmaker(bind=engine)
class Base(object):
    """Mixin giving every declarative model a ``reset`` method."""

    def reset(self):
        """Set each column that declares a default back to that default."""
        columns_with_default = [
            (attr_name, col)
            for attr_name, col in self.__class__.__table__.columns.items()
            if col.default is not None
        ]
        for attr_name, col in columns_with_default:
            setattr(self, attr_name, col.default.execute())
Base = declarative_base(bind=engine, cls=Base)
class Thing(Base):
    """Example model with a literal, a Python-callable and a SQL-function default."""

    __tablename__ = 'things'

    id = Column(Integer, primary_key=True)
    value = Column(String(255), default='default')    # literal default
    ts1 = Column(DateTime, default=datetime.now)      # Python callable default
    ts2 = Column(DateTime, default=functions.now())   # SQL function default

    def __repr__(self):
        template = ('<Thing(id={0.id!r}, value={0.value!r}, '
                    'ts1={0.ts1!r}, ts2={0.ts2!r})>')
        return template.format(self)
if __name__ == '__main__':
    def _banner(title):
        """Print ``title`` framed by two dashed rules."""
        print("---------------------------------------")
        print(title)
        print("---------------------------------------")

    def _print_thing():
        """Load the row with id 1 in a fresh session and print it."""
        db_session = Session()
        loaded = db_session.query(Thing).filter(Thing.id == 1).one()
        print(loaded)
        db_session.close()

    # Start from an empty schema.
    Base.metadata.drop_all()
    Base.metadata.create_all()

    _banner("Create a new thing")
    session = Session()
    thing = Thing(
        value='some value',
        ts1=datetime(2014, 1, 1),
        ts2=datetime(2014, 2, 2),
    )
    session.add(thing)
    session.commit()
    session.close()

    _banner("Quering it from DB")
    _print_thing()

    _banner("Reset it to default")
    session = Session()
    thing = session.query(Thing).filter(Thing.id == 1).one()
    thing.reset()
    session.commit()
    session.close()

    _banner("Quering it from DB")
    _print_thing()

Is there any simple way to do that?
Upon further consideration, not really. The cleanest way will be to define your defaults in __init__. The constructor is never called when fetching objects from the DB, so it's perfectly safe. You can also use backend functions such as current_timestamp().
class MyObject(Base):
    """Model whose defaults are applied in the constructor."""

    id = Column(sa.Integer, primary_key=True)
    column1 = Column(sa.String)
    column2 = Column(sa.Integer)
    columnN = Column(sa.String)
    updated = Column(sa.DateTime)

    def __init__(self, **kwargs):
        """Fill in a default for every column the caller did not supply."""
        fallback_values = (
            ('column1', 'default value'),
            ('column2', 123),
            ('columnN', None),
            ('updated', sa.func.current_timestamp()),
        )
        for column_name, fallback in fallback_values:
            kwargs.setdefault(column_name, fallback)
        super(MyObject, self).__init__(**kwargs)
# Build an object carrying only the constructor defaults...
default_obj = MyObject()
# ...give it the primary key of the existing row...
default_obj.id = old_id
# ...then merge it into the session under that key and commit.
session.merge(default_obj)
session.commit()

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Storing scraped data with SCRAPY in MySQL database - mysql

Related

SQLAlchemy Table classes and imports

Bizarre Environment-dependent Bad Request 400 error

How do I write SQLAlchemy test fixtures for FastAPI applications

Retrieving MySQL with Kivy

SQLAlchemy: replacing object with a new one, following defaults

Categories

Resources