Failing to store data in a CSV file when scraping - csv

I am trying to scrape a webpage, extract data, and store it all in a CSV file. Before adding the ScrapeCallback class and calling it, everything works fine. However, after adding the new class, nothing except the header row is stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html
class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())
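No answer was posted here, but as a hedged debugging aid (my addition, not part of the original post or of the book code it is based on): a variant of ScrapeCallback that flushes after every row and reports why a page was skipped can show whether it is the '/view/' URL filter or the CSS selector that never matches. The class name ScrapeCallbackDebug, the explicit file handle, and the per-row flush are assumptions for illustration only.

class ScrapeCallbackDebug:
    """Diagnostic variant: keep the file handle, flush each row, report skips."""
    def __init__(self):
        self.csv_file = open('countries.csv', 'wb')
        self.writer = csv.writer(self.csv_file)
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)
        self.csv_file.flush()

    def __call__(self, url, html):
        if not re.search('/view/', url):
            print 'Skipped (no /view/ in URL):', url
            return
        tree = lxml.html.fromstring(html)
        row = []
        for field in self.fields:
            cells = tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))
            if not cells:
                print 'Selector found nothing for field:', field, 'on', url
                return
            row.append(cells[0].text_content())
        self.writer.writerow(row)
        self.csv_file.flush()  # push the row to disk immediately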

Related

How to add user input to a URL with Tkinter?

I'm trying to create a program in which currency conversion is crucial. I have an API URL to which I want to append user input (the amount to be converted), not to display it in a label but to work with it later on.
input = Entry(root)
URL = "https://www.myawesomeurl.com/exchangerate/amout="
payload = {}
headers = {"apikey": "awjdwahduwahdwauduw"}
response = requests.request("GET", URL, headers=headers, data = payload)
status_code = response.status_code
result = response.text
I'm new to programming so any help and tips would be much appreciated.
You can use a submit function to process your data. Ex.
import tkinter
import requests

root = tkinter.Tk()

def submitamount():
    amounttext = amount.get()
    URL = f"https://www.myawesomeurl.com/exchangerate/amout={amounttext}"
    payload = {}
    headers = {"apikey": "awjdwahduwahdwauduw"}
    response = requests.request("GET", URL, headers=headers, data=payload)
    status_code = response.status_code
    result = response.text
    # Do stuff here / call another function to do stuff
    #
    # If you want to make some variables accessible to other parts of
    # the code, simply define them before this function is declared;
    # Ex.
    #
    # Instead of:
    # root = tkinter.Tk()
    # ...
    # def submitamount():
    #     ...
    #
    # Do:
    # root = tkinter.Tk()
    # ...
    # response = None
    # status_code = None
    # result = None
    #
    # def submitamount():
    #     ...
    #

amount = tkinter.Entry(root)
amount.pack()

submit = tkinter.Button(text="Submit", command=submitamount)
submit.pack()

root.mainloop()
If you need a more thorough explanation, please let me know!
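As a small follow-up (not part of the original answer), here is a minimal sketch of what the comment in that answer describes: define a name before the function and assign to it inside the callback so the fetched result is usable elsewhere. The name result_var is mine, purely for illustration.

result_var = None

def submitamount():
    global result_var
    amounttext = amount.get()
    response = requests.get(
        f"https://www.myawesomeurl.com/exchangerate/amout={amounttext}",
        headers={"apikey": "awjdwahduwahdwauduw"},
    )
    result_var = response.text  # now available to other functions after the user submits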

Bizarre Environment-dependent Bad Request 400 error

I'm writing a program to convert a repository into a Docker container with an API, based on some specification files. When I run the app in my MacBook's base environment, the computer-generated API works perfectly with both gunicorn and uwsgi. However, inside the miniconda-based Docker container it fails with Bad Request 400: The browser (or proxy) sent a request that this server could not understand. My goal is to eliminate this error. Obviously this has to do with the versions of some dependency or set of dependencies. Interestingly, the last endpoint in the API, which lives in a namespace and has a request parser with no arguments, works perfectly, unlike the two other endpoints in the default namespace that do have arguments.
The API is built on flask_restx and uses reqparse.
The API code is here:
from flask_restx import Api, Resource, Namespace, reqparse, inputs
import flask
import process
from load_data import store_data

app = flask.Flask("restful_api")
api = Api(app, title="My API", description="This is an extremely useful API for performing tasks you would do with an API.", version="3.14")

data = {}
data.update(store_data())


class DefaultClass():

    def __init__(self):
        self.data = data

    def _replace_get(self, **args):
        default_args = {}
        args = {**default_args, **args}
        return process.replace(**args)

    def _find_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.find_in_data_string(**args)


def set_up_worker():
    global defaultClass
    defaultClass = DefaultClass()


set_up_worker()

_replaceGetParser = reqparse.RequestParser()
_replaceGetParser.add_argument("txt",
                               type=str,
                               required=True,
                               help="Text to search ")
_replaceGetParser.add_argument("old",
                               type=str,
                               required=True,
                               help="Substring to replace ")
_replaceGetParser.add_argument("new",
                               type=str,
                               required=True,
                               help="Replacement for old ")
_replaceGetParser.add_argument("irrelevant_parameter",
                               type=int,
                               required=False,
                               default=5,
                               help="")
_replaceGetParser.add_argument("smart_casing",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should infer replacement capitalization from original casing. ")
_replaceGetParser.add_argument("case_sensitive",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should only replace case-sensitive matches ")

_findGetParser = reqparse.RequestParser()
_findGetParser.add_argument("window",
                            type=int,
                            required=False,
                            default=5,
                            help="Number of characters before and after first match to return ")
_findGetParser.add_argument("txt",
                            type=str,
                            required=False,
                            default="quick",
                            help="Your search term ")


@api.route('/replace', endpoint='replace', methods=['GET'])
@api.doc('defaultClass')
class ReplaceFrontend(Resource):

    @api.expect(_replaceGetParser)
    def get(self):
        args = _replaceGetParser.parse_args()
        return defaultClass._replace_get(**args)


@api.route('/find', endpoint='find', methods=['GET'])
@api.doc('defaultClass')
class FindFrontend(Resource):

    @api.expect(_findGetParser)
    def get(self):
        args = _findGetParser.parse_args()
        return defaultClass._find_get(**args)


retrievalNamespace = Namespace("retrieval", description="Data retrieval operations")


class RetrievalNamespaceClass():

    def __init__(self):
        self.data = data

    def _retrieval_retrieve_data_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.return_data(**args)


def set_up_retrieval_worker():
    global retrievalNamespaceClass
    retrievalNamespaceClass = RetrievalNamespaceClass()


set_up_retrieval_worker()

_retrieval_retrieve_dataGetParser = reqparse.RequestParser()


@retrievalNamespace.route('/retrieval/retrieve_data', endpoint='retrieval/retrieve_data', methods=['GET'])
@retrievalNamespace.doc('retrievalNamespaceClass')
class Retrieval_retrieve_dataFrontend(Resource):

    @retrievalNamespace.expect(_retrieval_retrieve_dataGetParser)
    def get(self):
        args = _retrieval_retrieve_dataGetParser.parse_args()
        return retrievalNamespaceClass._retrieval_retrieve_data_get(**args)


api.add_namespace(retrievalNamespace)
I have had this problem with both pip-installed gunicorn and conda-installed uwsgi. I'm putting the file imported by the API at the end, since I think it is likely irrelevant what the function definitions are.
import numpy as np
import pandas as pd
import re
from subprocess import Popen, PIPE
from flask_restx import abort


def replace(txt: str = '',  # apireq
            old: str = '',  # apireq
            new: str = '',  # apireq
            case_sensitive: bool = True,
            smart_casing: bool = True,
            irrelevant_parameter: int = 5):
    """
    Search and replace within a string, as long as the string and replacement
    contain no four letter words.

    arguments:
        txt: Text to search
        old: Substring to replace
        new: Replacement for old
        case_sensitive: True if we should only replace case-sensitive matches
        smart_casing: True if we should infer replacement capitalization
            from original casing.

    return
        return value
    """
    four_letter_words = [re.match('[a-zA-Z]{4}$', word).string
                         for word in ('%s %s' % (txt, new)).split()
                         if re.match('[a-zA-Z]{4}$', word)]
    if four_letter_words:
        error_message = ('Server refuses to process four letter word(s) %s'
                         % ', '.join(four_letter_words[:5])
                         + (', etc' if len(four_letter_words) > 5 else ''))
        abort(403, custom=error_message)
    return_value = {}
    if not case_sensitive:
        return_value['output'] = txt.replace(old, new)
    else:
        lowered = txt.replace(old, old.lower())
        return_value['output'] = lowered.replace(old.lower(), new)
    return return_value
def find_in_data_string(txt: str = "quick",  # req
                        window: int = 5,
                        data=None):  # noapi
    """
    Check if there is a match for your search string in our extensive database,
    and return the position of the first match with the surrounding text.

    arguments:
        txt: Your search term
        data: The server's text data
        window: Number of characters before and after first match to return
    """
    return_value = {}
    if txt in data:
        idx = data.find(txt)
        min_idx = max(idx - window, 0)
        max_idx = min(idx + len(txt) + window, len(data) - 1)
        return_value['string_found'] = True
        return_value['position'] = idx
        return_value['surrounding_string'] = data[min_idx:max_idx]
        return_value['surrounding_string_indices'] = [min_idx, max_idx]
    else:
        return_value = {'string_found': False}
    return return_value
def return_data(data=None):  # noapi
    """
    Return all the data in our text database.
    """
    with Popen(['which', 'aws'], shell=True, stdout=PIPE) as p:
        output = p.stdout.read()
    try:
        assert not output.strip()
    except AssertionError:
        abort(503, custom='The server is incorrectly configured.')
    return_value = {'data': data}
    return return_value
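No fix is recorded in this excerpt, so the following is only a hedged suggestion of mine, not something from the thread: since both failing endpoints are GET endpoints whose arguments arrive in the query string, giving every add_argument call an explicit location="args" keeps reqparse from also trying to read a request body, which on some Flask/Werkzeug version combinations produces exactly this kind of Bad Request 400. Pinning the container's dependency versions to match the working MacBook environment is the other obvious check. A sketch for one parser:

# Hedged suggestion: parse query-string arguments only on these GET endpoints.
_replaceGetParser = reqparse.RequestParser()
_replaceGetParser.add_argument("txt",
                               type=str,
                               required=True,
                               location="args",
                               help="Text to search ")
# ...repeat location="args" for the remaining arguments of both parsers...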

Scrapinghub puts my results in the log and not in items

I have a functioning spider project to extract URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to work on Scrapinghub for a long scraping run.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set up my pipelines/extractor to return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline
class positionsPipeline(object):

    def process_item(self, item, spider):
        return item
The settings
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item:
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a JSON file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
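A hedged side note that is not part of the original answer: on newer Scrapy releases (2.1 and later) FEED_URI and FEED_FORMAT are deprecated in favour of the single FEEDS setting, so the local CSV export above would instead look roughly like this:

# Newer Scrapy (2.1+) equivalent of the FEED_URI / FEED_FORMAT pair above
FEEDS = {
    'file://NAME_OF_FILE_PATH.csv': {'format': 'csv'},
}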

Dynamically create REST test steps in Groovy

I have some JSON files that I want to use as input. Let's say I have these folders:
mainFolder --> Folder 1 : 10 JSON files (req)
           --> Folder 2 : 10 JSON files (req)
From these folders I want to create: each directory is a testCase, and each file is a testStep.
Here's my code:
import com.eviware.soapui.impl.wsdl.teststeps.registry.GroovyScriptStepFactory
import com.eviware.soapui.support.UISupport;
import com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory
import com.eviware.soapui.config.TestStepConfig
import com.eviware.soapui.impl.rest.*;

def myTestCase = context.testCase
log.info myTestCase

def projectPath = Path
def endPoint = "anEndPoint";

def addTestStep(operation, requestFile, testCase, projectPath, endpoint){
    def usageId = requestFile.name.replace("_request.json","")
    def projectPathTest = projectPath+"SPecificPath";
    def testStepName = usageId;
    def iface = testCase.testSuite.project.getInterfaceList()[0];
    def operationName = operation;
    def op = iface.operations[operationName];
    def config = com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory.createConfig( op, testStepName);
    def newTestStep = testCase.addTestStep( config );
    newTestStep.getTestRequest().setRequestContent(requestFile.text)
    newTestStep.httpRequest.endpoint = endpoint
}

if ( com.eviware.soapui.support.UISupport.confirm("Reconstruct ?","Confirm") ){
    testSuite.getTestCaseList().each{ testCase -> testSuite.removeTestCase(testCase) }
    new File(projectPathTest).eachDir{ dir ->
        operation = dir.name
        def RestTestCase = testSuite.addNewTestCase(operation)
        RestTestCase.setFailOnError(false)
        dir.eachFileMatch(~/.*_request\.json/){ file ->
            addTestStep(operation, file, RestTestCase, projectPath, endPoint)
        }
    }
}
I have checked this many times, across many pages and forums, and it seems that I have the correct form and algorithm for what I want. I succeed in creating the test cases with the names of the folders, and I succeed in reading the request JSON files, but I fail to create the test steps, and I'm pretty sure it's either the config or the interface/operation that makes it fail:
def config = com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory.createConfig( op, testStepName);
Any help, please?

I'm using the TMDB API to show movie posters but I'm getting some errors

Here is my code:
import os
import requests

CONFIG_PATTERN = 'http://api.themoviedb.org/3/search/movie?query=Monsters+University&api_key=xxx'
IMG_PATTERN = 'http://api.themoviedb.org/3/movie?query=Monsters+University&api_key=xxx'
KEY = 'xxx'

def _get_json(url):
    r = requests.get(url)
    return r.json()

def _download_images(urls, path='.'):
    """download all images in list 'urls' to 'path' """
    for nr, url in enumerate(urls):
        r = requests.get(url)
        filetype = r.headers['content-type'].split('/')[-1]
        filename = 'poster_{0}.{1}'.format(nr+1, filetype)
        filepath = os.path.join(path, filename)
        with open(filepath, 'wb') as w:
            w.write(r.content)

def get_poster_urls(imdbid):
    """ return image urls of posters for IMDB id
    returns all poster images from 'themoviedb.org'. Uses the
    maximum available size.
    Args:
        imdbid (str): IMDB id of the movie
    Returns:
        list: list of urls to the images
    """
    config = _get_json(CONFIG_PATTERN.format(key=KEY))
    base_url = config['images']['base_url']
    sizes = config['images']['poster_sizes']

    def size_str_to_int(x):
        return float("inf") if x == 'original' else int(x[1:])

    max_size = max(sizes, key=size_str_to_int)

    posters = _get_json(IMG_PATTERN.format(key=KEY, imdbid=imdbid))['posters']
    poster_urls = []
    for poster in posters:
        rel_path = poster['file_path']
        url = "{0}{1}{2}".format(base_url, max_size, rel_path)
        poster_urls.append(url)
    return poster_urls

def tmdb_posters(imdbid, count=None, outpath='.'):
    urls = get_poster_urls(imdbid)
    if count is not None:
        urls = urls[:count]
    _download_images(urls, outpath)

if __name__ == "__main__":
    tmdb_posters('tt0095016')
I am fetching the JSON data using the TMDB API but I am getting errors.
Here are the errors:
Traceback (most recent call last):
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 57, in <module>
    tmdb_posters('tt0095016')
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 51, in tmdb_posters
    urls = get_poster_urls(imdbid)
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 33, in get_poster_urls
    base_url = config['images']['base_url']
KeyError: 'images'

Process finished with exit code 1
But I am unable to figure out the errors. It seems like everything is right, but these errors are not going away. I have tried everything.
I guess it should work if you change CONFIG_PATTERN to http://api.themoviedb.org/3/configuration?api_key=<your_key>
BTW, edited your question since you posted your API key in it.
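To make that concrete (the second URL is my assumption, inferred from the code's use of .format(key=KEY, imdbid=imdbid) and ['posters'], not something stated in the answer), the two constants would look roughly like this:

# Configuration endpoint, as suggested in the answer, with a {key} placeholder
# so CONFIG_PATTERN.format(key=KEY) works:
CONFIG_PATTERN = 'http://api.themoviedb.org/3/configuration?api_key={key}'
# Assumed movie-images endpoint, matching the {key}/{imdbid} placeholders the code expects:
IMG_PATTERN = 'http://api.themoviedb.org/3/movie/{imdbid}/images?api_key={key}'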