A CSV import process should import the valid records and report errors for the bad records in the resulting action.
Rescue output can go to the console and, in development mode, be rendered to the user, but in production these errors need to be captured and rendered in the response.
The model defines the import:
def self.importtest(file, analisi_id)
  n, errs = 0, []
  CSV.foreach(file.path, :col_sep => "\t", headers: true, skip_blanks: true) do |row|
    n += 1
    # skip blank rows
    next if row.join.blank?
    operativ = Operativ.find_by_account(row[3])
    if operativ.nil?
      errs << row
    end
  end
  if errs != []
    # send errors to form and render to user
  else
    # run full import routine
  end
end
and the controller:
def import
  params[:analysis_id] = session[:analysis_id]
  Bilancino.import(params[:file], params[:analysis_id])
  redirect_to loaded_registrations_path, notice: "data imported"
end
There are two problems here. The first is how to populate a form with the error data. The second is how to redirect in each of the two cases, errs != [] and errs == []...
I am working on a personal project and was wondering if my solution for inserting data into a MySQL database would be considered "pythonic" and efficient.
I have written a separate class for this, which is called from an object that holds a dataframe. From there I call its save() function to write the dataframe to the database.
The script runs once a day: I scrape some data from a few websites and save it to my database. It is therefore important that it runs through completely even when there is bad data or a temporary connection issue (the script and the database run on different machines).
import mysql.connector

# custom logger
from myLog import logger
# custom class for formatting the data; a lot of potential errors are handled here
from myFormat import myFormat
# insert strings for mysql are stored and referenced here
import sqlStrings


class saveSQL:

    def __init__(self):
        self.frmt = myFormat()
        self.host = 'XXX.XXX.XXX.XXX'
        self.user = 'XXXXXXXX'
        self.password = 'XXXXXXXX'
        self.database = 'XXXXXXXX'

    def save(self, payload, type):
        match type:
            case 'First':
                return self.__first(payload)
            case 'Second':
                ...
            case _:
                logger.error('Undefined Input for Type!')

    def __first(self, payload):
        try:
            self.mydb = mysql.connector.connect(host=self.host, user=self.user,
                                                password=self.password, database=self.database)
            mycursor = self.mydb.cursor()
        except mysql.connector.Error as err:
            logger.error('Couldn\'t establish connection to DB!')

        try:
            tmpList = payload.values.tolist()
        except ValueError:
            logger.error('Value error in converting dataframe to list: %s' % payload)

        try:
            mycursor.executemany(sqlStrings.First, tmpList)
            self.mydb.commit()
            dbWrite = mycursor.rowcount
        except mysql.connector.Error as err:
            # batch insert failed: fall back to row-by-row inserts so one bad row
            # doesn't abort the rest
            logger.error('Error in writing to database: %s' % err)
            dbWrite = 0
            for ele in tmpList:
                try:
                    mycursor.execute(sqlStrings.First, ele)
                    self.mydb.commit()
                    dbWrite = dbWrite + mycursor.rowcount
                except mysql.connector.Error as err:
                    logger.error('Error in writing to database: %s \n ele: %s' % (err, ele))
                    continue
            pass
        mycursor.close()
        return dbWrite
Things I am wondering about:
Is match/case a good option to distinguish between writing to different tables depending on the data? (A small sketch of this dispatch pattern follows after these questions.)
Are the different try/except blocks really necessary, or are there easier ways of handling potential errors?
Do I really need the pass statement at the end of the for loop?
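For reference, here is a stripped-down sketch of the dispatch pattern being asked about, comparing match/case with a plain dict of bound methods; the writer methods and table labels are invented for illustration and are not part of the script above:

class DispatchSketch:
    """Toy example only: match/case dispatch versus a dict of methods."""

    def save_with_match(self, payload, kind):
        # match/case dispatch (Python 3.10+), mirroring the structure above
        match kind:
            case 'First':
                return self._write_first(payload)
            case 'Second':
                return self._write_second(payload)
            case _:
                raise ValueError('Undefined input for kind: %r' % kind)

    def save_with_dict(self, payload, kind):
        # equivalent dispatch via a dict lookup
        writers = {'First': self._write_first, 'Second': self._write_second}
        if kind not in writers:
            raise ValueError('Undefined input for kind: %r' % kind)
        return writers[kind](payload)

    def _write_first(self, payload):
        # hypothetical table-specific writer
        return ('first_table', payload)

    def _write_second(self, payload):
        # hypothetical table-specific writer
        return ('second_table', payload)

Both variants behave the same; the dict version has the advantage of working on older Python versions and of being easy to extend with new table writers.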
I'm trying to get a unit test working that validates a function that reads credentials from a JSON-encoded file. Since the credentials themselves aren't fixed, the unit test needs to provide some and then test that they are correctly retrieved.
Here is the credentials function:
def read_credentials():
    basedir = os.path.dirname(__file__)
    with open(os.path.join(basedir, "authentication.json")) as f:
        data = json.load(f)
    return data["bot_name"], data["bot_password"]
and here is the test:
def test_credentials(self):
    with patch("builtins.open", mock_open(
        read_data='{"bot_name": "name", "bot_password": "password"}\n'
    )):
        name, password = shared.read_credentials()
        self.assertEqual(name, "name")
        self.assertEqual(password, "password")
However, when I run the test, the json code blows up with a decode error. Looking at the json code itself, I'm struggling to see why the mocked test fails, because json.load(f) simply calls f.read() and then json.loads().
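For reference, with the standard-library json module that is indeed all json.load does, and the mock_open pattern does parse end to end; a minimal standalone check using only the standard library:

import json
from unittest.mock import mock_open, patch

with patch("builtins.open", mock_open(
    read_data='{"bot_name": "name", "bot_password": "password"}\n'
)):
    with open("authentication.json") as f:
        data = json.load(f)

print(data["bot_name"], data["bot_password"])  # -> name password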
Indeed, if I change my authentication function to the following, the unit test works:
def read_credentials():
    # Read the authentication file from the current directory and create a
    # HTTPBasicAuth object that can then be used for future calls.
    basedir = os.path.dirname(__file__)
    with open(os.path.join(basedir, "authentication.json")) as f:
        content = f.read()
        data = json.loads(content)
    return data["bot_name"], data["bot_password"]
I don't necessarily mind leaving my code in this form, but I'd like to understand if I've got something wrong in my test that would allow me to keep my function in its original form.
Stack trace:
Traceback (most recent call last):
  File "test_shared.py", line 56, in test_credentials
    shared.read_credentials()
  File "shared.py", line 60, in read_credentials
    data = json.loads(content)
  File "/home/philip/.local/share/virtualenvs/atlassian-webhook-basic-3gOncDp4/lib/python3.6/site-packages/flask/json/__init__.py", line 205, in loads
    return _json.loads(s, **kwargs)
  File "/usr/lib/python3.6/json/__init__.py", line 367, in loads
    return cls(**kw).decode(s)
  File "/usr/lib/python3.6/json/decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.6/json/decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
I had the same issue and got around it by mocking json.load and builtins.open:
import json
from unittest.mock import patch, MagicMock

# I don't care about the actual open
p1 = patch("builtins.open", MagicMock())
m = MagicMock(side_effect=[{"foo": "bar"}])
p2 = patch("json.load", m)

with p1 as p_open:
    with p2 as p_json_load:
        f = open("filename")
        print(json.load(f))
Result:
{'foo': 'bar'}
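Applied to the read_credentials function from the question, the same idea looks roughly like this. It is only a sketch: it assumes the shared module resolves json to the standard-library module, whereas the traceback above suggests it actually goes through Flask's json wrapper, in which case the patch target has to match that import.

import unittest
from unittest.mock import patch, MagicMock

import shared  # the module containing read_credentials()


class TestCredentials(unittest.TestCase):
    def test_credentials(self):
        fake = {"bot_name": "name", "bot_password": "password"}
        # Don't touch the filesystem, and hand json.load a canned dict.
        with patch("builtins.open", MagicMock()), \
             patch("json.load", MagicMock(return_value=fake)):
            name, password = shared.read_credentials()
        self.assertEqual(name, "name")
        self.assertEqual(password, "password")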
I had the exact same issue and solved it. Full code below, first the function to test, then the test itself.
The original function I want to test loads a json file that is structured like a dictionary, and checks to see if there's a specific key-value pair in it:
def check_if_file_has_real_data(filepath):
    with open(filepath, "r") as f:
        data = json.load(f)
    if "fake" in data["the_data"]:
        return False
    else:
        return True
But I want to test this without loading any actual file, exactly as you describe. Here's how I solved it:
from my_module import check_if_file_has_real_data
import mock


@mock.patch("my_module.json.load")
@mock.patch("my_module.open")
def test_check_if_file_has_real_data(mock_open, mock_json_load):
    mock_json_load.return_value = dict({"the_data": "This is fake data"})
    assert check_if_file_has_real_data("filepath") == False

    mock_json_load.return_value = dict({"the_data": "This is real data"})
    assert check_if_file_has_real_data("filepath") == True
The mock_open object isn't called explicitly in the test function, but without that decorator and argument you get a file-path error, because the with open part of check_if_file_has_real_data would run with the real open function rather than the MagicMock that has been patched in.
Then you overwrite the response provided by the json.load mock with whatever you want to test.
I've tried to implement this pipeline in my spider.
After installing the necessary dependencies I am able to run the spider without any errors, but for some reason it doesn't write to my database.
I'm pretty sure something is going wrong with connecting to the database: when I enter a wrong password, I still don't get any error.
When the spider has scraped all the data, it takes a few minutes before it starts dumping the stats.
2017-08-31 13:17:12 [scrapy] INFO: Closing spider (finished)
2017-08-31 13:17:12 [scrapy] INFO: Stored csv feed (27 items) in: test.csv
2017-08-31 13:24:46 [scrapy] INFO: Dumping Scrapy stats:
Pipeline:
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = {}
SETTINGS['DB_HOST'] = 'mysql.domain.com'
SETTINGS['DB_USER'] = 'username'
SETTINGS['DB_PASSWD'] = 'password'
SETTINGS['DB_PORT'] = 3306
SETTINGS['DB_DB'] = 'database_name'


class MySQLPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        print "init"
        # Instantiate DB connection pool
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print "close"
        """ Cleanup function, called after crawling has finished, to close open
        objects.
        Close ConnectionPool. """
        self.dbpool.close()

    def process_item(self, item, spider):
        print "process"
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        print "insert"
        result = tx.execute(
            " INSERT INTO matches(type,home,away,home_score,away_score) VALUES (soccer,"+item["home"]+","+item["away"]+","+item["score"].explode("-")[0]+","+item["score"].explode("-")[1]+")"
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        print "error"
        log.err(e)
Spider:
import scrapy
import dateparser
from crawling.items import KNVBItem


class KNVBspider(scrapy.Spider):
    name = "knvb"
    start_urls = [
        'http://www.knvb.nl/competities/eredivisie/uitslagen',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawling.pipelines.MySQLPipeline': 301,
        }
    }

    def parse(self, response):
        # www.knvb.nl/competities/eredivisie/uitslagen
        for row in response.xpath('//div[@class="table"]'):
            for div in row.xpath('./div[@class="row"]'):
                match = KNVBItem()
                match['home'] = div.xpath('./div[@class="value home"]/div[@class="team"]/text()').extract_first()
                match['away'] = div.xpath('./div[@class="value away"]/div[@class="team"]/text()').extract_first()
                match['score'] = div.xpath('./div[@class="value center"]/text()').extract_first()
                match['date'] = dateparser.parse(div.xpath('./preceding-sibling::div[@class="header"]/span/span/text()').extract_first(), languages=['nl']).strftime("%d-%m-%Y")
                yield match
If there are better pipelines available to do what I'm trying to achieve, that'd be welcome as well. Thanks!
Update:
With the link provided in the accepted answer, I eventually arrived at this working function (which solved my problem):
def process_item(self, item, spider):
    print "process"
    query = self.dbpool.runInteraction(self._insert_record, item)
    query.addErrback(self._handle_error)
    query.addBoth(lambda _: item)
    return query
Take a look at this for how to use adbapi with MySQL for saving scraped items. Note the difference between your process_item and their process_item implementation. While you return the item immediately, they return the Deferred produced by runInteraction, which returns the item once the interaction completes. I think this is the reason your _insert_record never gets called.
If you can see the insert in your output, that's already a good sign.
I'd rewrite the insert function this way:
def _insert_record(self, tx, item):
    print "insert"
    raw_sql = "INSERT INTO matches(type,home,away,home_score,away_score) VALUES ('%s', '%s', '%s', '%s', '%s')"
    sql = raw_sql % ('soccer', item['home'], item['away'], item['score'].split('-')[0], item['score'].split('-')[1])
    print sql
    result = tx.execute(sql)
    if result > 0:
        self.stats.inc_value('database/items_added')
This lets you see and debug the SQL you're generating. In your version you're not wrapping the string values in quotes ('), which is a syntax error in MySQL.
I'm not sure about your last values (the score parts), so I treated them as strings.
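As a further variation on the same idea (not part of the code above), the quoting problem disappears entirely if the values are passed as query parameters so that MySQLdb does the escaping; a minimal sketch, assuming the score string splits cleanly on '-':

def _insert_record(self, tx, item):
    # Parameterized query: the driver quotes and escapes the values itself.
    sql = ("INSERT INTO matches (type, home, away, home_score, away_score) "
           "VALUES (%s, %s, %s, %s, %s)")
    home_score, away_score = item['score'].split('-')
    result = tx.execute(sql, ('soccer', item['home'], item['away'],
                              home_score.strip(), away_score.strip()))
    if result > 0:
        self.stats.inc_value('database/items_added')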
I'm trying to download the Flickr Style dataset using the assemble_data.py provided in the examples folder. However, whenever I run it, Python crashes with the error 'python quit unexpectedly'.
It seems to be related to multiprocessing and urllib. When I replace pool.map with a single-threaded loop it works, but it is very slow. Also, if I run with multiprocessing but remove the urlretrieve call, it seems to work too.
Answering my own question here... I resolved this by using urllib3 instead.
import hashlib
import os

import urllib3
from skimage import io

http = urllib3.PoolManager(10)

def download_image(args_tuple):
    "For use with multiprocessing map. Returns filename on fail."
    url, filename = args_tuple
    try:
        if not os.path.exists(filename):
            print url + ' -> ' + filename
            # Don't redirect.
            response = http.request('GET', url, redirect=False)
            with open(filename, 'wb') as f:
                f.write(response.data)
        with open(filename) as f:
            # MISSING_IMAGE_SHA1 is defined elsewhere in the script
            assert hashlib.sha1(f.read()).hexdigest() != MISSING_IMAGE_SHA1
        test_read_image = io.imread(filename)
        return True
    except KeyboardInterrupt:
        raise Exception()  # multiprocessing doesn't catch keyboard exceptions
    except:
        os.remove(filename)
        return False
Gist here.
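For context, the function above is meant to be driven by a multiprocessing pool; a minimal sketch of that call, with made-up (url, filename) pairs standing in for the list the real script builds from the dataset metadata:

import multiprocessing

if __name__ == '__main__':
    # Hypothetical work list; the real script derives these pairs from the
    # Flickr Style metadata and its image URLs.
    jobs = [
        ('http://example.com/img1.jpg', '/tmp/img1.jpg'),
        ('http://example.com/img2.jpg', '/tmp/img2.jpg'),
    ]
    pool = multiprocessing.Pool(processes=4)
    results = pool.map(download_image, jobs)  # one True/False per job
    pool.close()
    pool.join()
    num_failed = results.count(False)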
I am trying to scrape a webpage, extract the data, and then store it all in a CSV file. Before adding the ScrapeCallback class and calling it, everything works fine. However, after adding the new class, nothing except the headers gets stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html


class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())