I opened port 8080 using a Python socket:
Sk.bind(ip_addrr, 8080)
But I want it to serve an HTML page on that port, so that when I navigate to :8080 in the browser I get a web page. Any ideas?
My problem is that I need an HTML page which I created to be served on port 8080. For example, I have index.html served on port 80; similarly, I need to serve an HTML page on port 8080. How will I do that?
A very minimal example I was able to put together after some research. You can also find it at https://replit.com/#bluebrown/python-socket-html
import socket

s = socket.socket()
s.bind(('0.0.0.0', 8080))
s.listen(1)

# read the page to serve once, up front
with open('index.html', 'rb') as file:
    html = file.read()

while True:
    conn, addr = s.accept()
    with conn:
        print('Connected by', addr)
        req = conn.recv(1024)
        print('request:', req)
        # HTTP header lines are terminated by CRLF; a blank line ends the headers
        conn.send('HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'.encode())
        conn.sendall(html)
Based on your request in the comments, I have taken it a bit further. Not much though, just enough to give you an idea of what it takes to serve different pages.
import socket

def parseRequest(request):
    # split the raw request into method, path, protocol and headers
    output = {}
    r = request.decode("utf-8").split("\r\n")
    parts = r[0].split(' ')
    output["method"] = parts[0]
    output["path"] = parts[1]
    output["protocol"] = parts[2]
    output["headers"] = {kv.split(':')[0]: kv.split(':')[1].strip() for kv in r[1:] if len(kv.split(':')) > 1}
    return output

s = socket.socket()
s.bind(('0.0.0.0', 8080))
s.listen(1)

while True:
    conn, addr = s.accept()
    with conn:
        print('Connected by', addr)
        req = conn.recv(1024)
        r = parseRequest(req)
        # map the request path to a .html file, defaulting to index.html
        path = r["path"][1:]
        if path == "":
            path = "index"
        with open(f'{path}.html', 'rb') as file:
            html = file.read()
        conn.send('HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n'.encode())
        conn.sendall(html)
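To sanity-check the routing, the pages can be requested with a small client. A sketch, assuming the server above is running and that both index.html and a hypothetical about.html exist next to it (about.html is just an example name):

import urllib.request

# request the index page and a hypothetical second page served by the socket server above
for page in ('', 'about'):
    with urllib.request.urlopen(f'http://localhost:8080/{page}') as resp:
        print(page or 'index', resp.status, len(resp.read()), 'bytes')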
You can check the updated repl.
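As an aside, if hand-rolling the HTTP exchange is not the point of the exercise, the standard library can serve index.html (and any other files in the directory) on port 8080 in a few lines; a minimal sketch:

# serve the current working directory (index.html included) on port 8080
import http.server
import socketserver

with socketserver.TCPServer(('0.0.0.0', 8080), http.server.SimpleHTTPRequestHandler) as httpd:
    httpd.serve_forever()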
I've written a primitive HTTP server for testing my Emscripten apps. It serves static files from the current directory. The specific issue is that I have large binary files (.data and .wasm), some of which rarely change, so it makes sense to have the browser cache them indefinitely.
Chrome successfully sends If-None-Match for .html and .js files, but fails to do so for .data (in my case ~70 MB). The .html and .js files thus receive a 304, while .data gets a 200 and the whole large file is sent again, which is slow-ish even on localhost.
How do I force Chrome to cache large binary files?
import os
import hashlib
import http.server

root = '.'

mime = {
    '.manifest': 'text/cache-manifest',
    '.html': 'text/html',
    '.png': 'image/png',
    '.jpg': 'image/jpg',
    '.svg': 'image/svg+xml',
    '.css': 'text/css',
    '.js': 'application/x-javascript',
    '.wasm': 'application/wasm',
    '.data': 'application/octet-stream',
}
mime_fallback = 'application/octet-stream'

def md5(file_path):
    hash = hashlib.md5()
    with open(file_path, 'rb') as f:
        hash.update(f.read())
    return hash.hexdigest()

cache = {os.path.join(root, f): md5(f) for f in os.listdir() if any(map(f.endswith, mime)) and os.path.isfile(f)}

class EtagHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self, body=True):
        self.protocol_version = 'HTTP/1.1'
        self.path = os.path.join(root, self.path.lstrip('/') + ('index.html' if self.path == '/' else ''))
        if not os.path.exists(self.path) or not os.path.isfile(self.path):
            self.send_response(404)
        elif self.path not in cache or cache[self.path] != self.headers.get('If-None-Match'):
            content_type = ([content_type for ext, content_type in sorted(mime.items(), reverse=True) if self.path.endswith(ext)] + [mime_fallback])[0]
            with open(self.path, 'rb') as f:
                content = f.read()
            self.send_response(200)
            self.send_header('Content-Length', len(content))
            self.send_header('Content-Type', content_type)
            self.send_header('ETag', cache[self.path])
            self.end_headers()
            self.wfile.write(content)
        else:
            self.send_response(304)
            self.send_header('ETag', cache[self.path])
            self.end_headers()

if __name__ == '__main__':
    PORT = 8080
    print("serving at port", PORT)
    httpd = http.server.HTTPServer(("", PORT), EtagHandler)
    httpd.serve_forever()
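For what it's worth, when debugging this kind of issue it can help to replay the conditional request outside the browser, to confirm the server really does answer 304 for the large file when a correct If-None-Match is sent. A quick sketch, assuming the server above is running on port 8080 and serves a file named app.data (the file name is just an example):

import urllib.request
import urllib.error

url = 'http://localhost:8080/app.data'  # hypothetical large file served by the handler above

# 1) plain GET: expect 200 plus the ETag computed at server start-up
first = urllib.request.urlopen(url)
etag = first.headers['ETag']
print(first.status, len(first.read()), 'bytes, ETag:', etag)

# 2) conditional GET: echo the ETag back; the handler above should answer 304
req = urllib.request.Request(url, headers={'If-None-Match': etag})
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as err:
    print('conditional request answered with', err.code)  # expect 304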
I am writing a Python 3 script that listens to a tunnel and saves or updates data in MySQL depending on the message received.
I ran into weird behaviour: I made a simple connection to MySQL using the pymysql module and everything worked fine, but after some time this simple connection closes.
So I decided to implement a connection pool to MySQL, and here the problem arises. Nothing errors out, but the issue is the following:
my cursor = yield self._pool.execute(query, list(filters.values()))
the cursor result = <tornado_mysql.pools.Pool object at 0x0000019DE5D71F98>
and it gets stuck like that, not doing anything more.
If I remove yield from the cursor it passes that line, and the next line throws an error:
response = yield c.fetchall()
AttributeError: 'Future' object has no attribute 'fetchall'
How can I fix the MySQL pool connection to work properly?
What I tried:
I used a few modules for pooled connections; all of them run into the same issue.
I went back to a simple connection with pymysql and it worked again.
Below is my code:
Python script file
import pika
from model import SyncModel

_model = SyncModel(conf, _server_id)

#coroutine
def main():
    credentials = pika.PlainCredentials('user', 'password')
    try:
        cp = pika.ConnectionParameters(
            host='127.0.0.1',
            port=5671,
            credentials=credentials,
            ssl=False,
        )
        connection = pika.BlockingConnection(cp)
        channel = connection.channel()

        #coroutine
        def callback(ch, method, properties, body):
            if 'messageType' in properties.headers:
                message_type = properties.headers['messageType']
                if message_type in allowed_message_types:
                    result = ptoto_file._reflection.ParseMessage(descriptors[message_type], body)
                    if result:
                        result = protobuf_to_dict(result)
                        if message_type == 'MyMessage':
                            yield _model.message_event(data=result)
                else:
                    print('Message type not in allowed list = ' + str(message_type))
                    print('continue listening...')

        channel.basic_consume(callback, queue='queue', no_ack=True)
        print(' [*] Waiting for messages. To exit press CTRL+C')
        channel.start_consuming()
    except Exception as e:
        print('Could not connect to host 127.0.0.1 on port 5671')
        print(str(e))

if __name__ == '__main__':
    main()
SyncModel
from tornado_mysql import pools
from tornado.gen import coroutine, Return
from tornado_mysql.cursors import DictCursor

class SyncModel(object):
    def __init__(self, conf, server_id):
        self.conf = conf
        servers = [i for i in conf.mysql.servers]
        for s in servers:
            if s['server_id'] == server_id:
                # s holds all connection data: host, user, port, autocommit, charset, db, password
                s['cursorclass'] = DictCursor
                self._pool = pools.Pool(s, max_idle_connections=1, max_recycle_sec=3)

    #coroutine
    def message_event(self, data):
        table_name = 'table_name'
        query = ''
        data = data['message']
        filters = {
            'id': data['id']
        }
        # here the connection fails as described above
        response = yield self.query_select(table_name, self._pool, filters=filters)

    #coroutine
    def query_select(self, table_name, _pool, filters=None):
        if filters is None:
            filters = {}
        combined_filters = ['`%s` = %%s' % i for i in filters.keys()]
        where = 'WHERE ' + ' AND '.join(combined_filters) if combined_filters else ''
        query = """SELECT * FROM `%s` %s""" % (table_name, where)
        c = self._pool.execute(query, list(filters.values()))
        response = yield c.fetchall()
        raise Return({response})
All the code was working with just a simple connection to the database; after I started to use the pool, the example stopped working. I will appreciate any help with this issue.
This is a standalone script.
The pool connection was not working, so I switched back to pymysql while double-checking the connection.
I would like to post the answer that worked; only this solution worked for me.
Before connecting to MySQL, check whether the connection is open and reconnect if it is not:
if not self.mysql.open:
    self.mysql.ping(reconnect=True)
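For context, a minimal sketch of how that check can be wrapped around every query with plain pymysql (the host, credentials, database name and cursor class below are placeholders, not taken from the question):

import pymysql

class Db:
    def __init__(self):
        # placeholder connection parameters; replace with your own
        self.mysql = pymysql.connect(host='127.0.0.1', user='user', password='password',
                                     database='db', cursorclass=pymysql.cursors.DictCursor)

    def execute(self, query, params=None):
        # reconnect if the server closed the idle connection (e.g. after wait_timeout)
        if not self.mysql.open:
            self.mysql.ping(reconnect=True)
        with self.mysql.cursor() as cursor:
            cursor.execute(query, params or ())
            return cursor.fetchall()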
I have a functioning spider project to extract URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to work on Scrapinghub in order to do a long scraping run.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set my pipelines/extractor up to return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline
class positionsPipeline(object):
    def process_item(self, item, spider):
        return item
The settings
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a JSON file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
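As a side note, on Scrapy 2.1 and later the FEED_URI / FEED_FORMAT pair is deprecated in favour of the single FEEDS setting; the equivalent of the above would look roughly like this (same placeholder paths as above):

# settings.py, Scrapy >= 2.1 equivalent of the FEED_URI / FEED_FORMAT pairs above
FEEDS = {
    'file://NAME_OF_FILE_PATH.csv': {'format': 'csv'},
    # or, for the S3 variant (with the AWS credentials configured as above):
    # 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json': {'format': 'json'},
}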
I'm trying to use AssumeRole in such a way that I'm traversing multiple accounts and retrieving assets for those accounts. I've made it to this point:
import boto3

stsclient = boto3.client('sts')
assumedRoleObject = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1")
Great, I have the assumedRoleObject. But now I want to use that to list things like ELBs or something that isn't a built-in low-level resource.
How does one go about doing that? If I may ask, please code out a full example, so that everyone can benefit.
Here's a code snippet from the official AWS documentation, where an S3 resource is created to list all S3 buckets. boto3 resources or clients for other services can be built in a similar fashion.
# create an STS client object that represents a live connection to the
# STS service
sts_client = boto3.client('sts')

# Call the assume_role method of the STSConnection object and pass the role
# ARN and a role session name.
assumed_role_object = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1"
)

# From the response that contains the assumed role, get the temporary
# credentials that can be used to make subsequent API calls
credentials = assumed_role_object['Credentials']

# Use the temporary credentials that AssumeRole returns to make a
# connection to Amazon S3
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

# Use the Amazon S3 resource object that is now configured with the
# credentials to access your S3 buckets.
for bucket in s3_resource.buckets.all():
    print(bucket.name)
To get a session with an assumed role:
import botocore
import boto3
import datetime
from dateutil.tz import tzlocal

assume_role_cache: dict = {}

def assumed_role_session(role_arn: str, base_session: botocore.session.Session = None):
    base_session = base_session or boto3.session.Session()._session
    fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
        client_creator=base_session.create_client,
        source_credentials=base_session.get_credentials(),
        role_arn=role_arn,
        extra_args={
            # 'RoleSessionName': None # set this if you want something non-default
        }
    )
    creds = botocore.credentials.DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials,
        time_fetcher=lambda: datetime.datetime.now(tzlocal())
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = creds
    return boto3.Session(botocore_session=botocore_session)

# usage:
session = assumed_role_session('arn:aws:iam::ACCOUNTID:role/ROLE_NAME')
ec2 = session.client('ec2')  # ... etc.
The resulting session's credentials will be automatically refreshed when required, which is quite nice.
Note: my previous answer was outright wrong but I can't delete it, so I've replaced it with a better and working answer.
You can assume a role using an STS token, like this:
from boto3.session import Session  # Session here is boto3's Session class

class Boto3STSService(object):
    def __init__(self, arn):
        sess = Session(aws_access_key_id=ARN_ACCESS_KEY,
                       aws_secret_access_key=ARN_SECRET_KEY)
        sts_connection = sess.client('sts')
        assume_role_object = sts_connection.assume_role(
            RoleArn=arn, RoleSessionName=ARN_ROLE_SESSION_NAME,
            DurationSeconds=3600)
        self.credentials = assume_role_object['Credentials']
This will give you a temporary access key and secret key, along with a session token. With these temporary credentials, you can access any service. For example, if you want to access ELB, you can use the code below:
self.tmp_credentials = Boto3STSService(arn).credentials

def get_boto3_session(self):
    tmp_access_key = self.tmp_credentials['AccessKeyId']
    tmp_secret_key = self.tmp_credentials['SecretAccessKey']
    security_token = self.tmp_credentials['SessionToken']
    boto3_session = Session(
        aws_access_key_id=tmp_access_key,
        aws_secret_access_key=tmp_secret_key, aws_session_token=security_token
    )
    return boto3_session

def get_elb_boto3_connection(self, region):
    sess = self.get_boto3_session()
    elb_conn = sess.client(service_name='elb', region_name=region)
    return elb_conn
With reference to the solution by @jarrad, which is not working as of Feb 2021, and as a solution that does not use STS explicitly, please see the following:
import boto3
import botocore.session
from botocore.credentials import AssumeRoleCredentialFetcher, DeferredRefreshableCredentials

def get_boto3_session(assume_role_arn=None):
    session = boto3.Session(aws_access_key_id="abc", aws_secret_access_key="def")
    if not assume_role_arn:
        return session

    fetcher = AssumeRoleCredentialFetcher(
        client_creator=_get_client_creator(session),
        source_credentials=session.get_credentials(),
        role_arn=assume_role_arn,
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials
    )
    return boto3.Session(botocore_session=botocore_session)

def _get_client_creator(session):
    def client_creator(service_name, **kwargs):
        return session.client(service_name, **kwargs)
    return client_creator
The function can be called as follows:
ec2_client = get_boto3_session(assume_role_arn='my_role_arn').client('ec2', region_name='us-east-1')
If you want a functional implementation, this is what I settled on:
from typing import Optional

import boto3

def filter_none_values(kwargs: dict) -> dict:
    """Returns a new dictionary excluding items where value was None"""
    return {k: v for k, v in kwargs.items() if v is not None}

def assume_session(
    role_session_name: str,
    role_arn: str,
    duration_seconds: Optional[int] = None,
    region_name: Optional[str] = None,
) -> boto3.Session:
    """
    Returns a session with the given name and role.
    If not specified, duration will be set by AWS, probably at 1 hour.
    If not specified, region will be left unset.
    Region can be overridden by each client or resource spawned from this session.
    """
    assume_role_kwargs = filter_none_values(
        {
            "RoleSessionName": role_session_name,
            "RoleArn": role_arn,
            "DurationSeconds": duration_seconds,
        }
    )
    credentials = boto3.client("sts").assume_role(**assume_role_kwargs)["Credentials"]
    create_session_kwargs = filter_none_values(
        {
            "aws_access_key_id": credentials["AccessKeyId"],
            "aws_secret_access_key": credentials["SecretAccessKey"],
            "aws_session_token": credentials["SessionToken"],
            "region_name": region_name,
        }
    )
    return boto3.Session(**create_session_kwargs)

def main() -> None:
    session = assume_session(
        "MyCustomSessionName",
        "arn:aws:iam::XXXXXXXXXXXX:role/TheRoleIWantToAssume",
        region_name="us-east-1",
    )
    client = session.client(service_name="ec2")
    print(client.describe_key_pairs())
import json
import boto3

roleARN = 'arn:aws:iam::account-of-role-to-assume:role/name-of-role'
client = boto3.client('sts')
response = client.assume_role(RoleArn=roleARN,
                              RoleSessionName='RoleSessionName',
                              DurationSeconds=900)

dynamodb_client = boto3.client('dynamodb', region_name='us-east-1',
                               aws_access_key_id=response['Credentials']['AccessKeyId'],
                               aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                               aws_session_token=response['Credentials']['SessionToken'])

response = dynamodb_client.get_item(
    Key={
        'key1': {
            'S': '1',
        },
        'key2': {
            'S': '2',
        },
    },
    TableName='TestTable')
print(response)
#!/usr/bin/env python3
import boto3

sts_client = boto3.client('sts')

assumed_role = sts_client.assume_role(RoleArn="arn:aws:iam::123456789012:role/example_role",
                                      RoleSessionName="AssumeRoleSession1",
                                      DurationSeconds=1800)

session = boto3.Session(
    aws_access_key_id=assumed_role['Credentials']['AccessKeyId'],
    aws_secret_access_key=assumed_role['Credentials']['SecretAccessKey'],
    aws_session_token=assumed_role['Credentials']['SessionToken'],
    region_name='us-west-1'
)

# now we make use of the role to retrieve a parameter from SSM
client = session.client('ssm')
response = client.get_parameter(
    Name='/this/is/a/path/parameter',
    WithDecryption=True
)
print(response)
Assuming that 1) the ~/.aws/config or ~/.aws/credentials file is populated with each of the roles that you wish to assume, and that 2) the default role has AssumeRole defined in its IAM policy for each of those roles, then you can simply do the following (in pseudo-code) and not have to fuss with STS:
import boto3

# get all of the roles from the AWS config/credentials file using a config file parser
profiles = get_profiles()

for profile in profiles:
    # this is only used to fetch the available regions
    initial_session = boto3.Session(profile_name=profile)
    # get the regions
    regions = initial_session.get_available_regions('ec2')
    # cycle through the regions, setting up session, resource and client objects
    for region in regions:
        boto3_session = boto3.Session(profile_name=profile, region_name=region)
        boto3_resource = boto3_session.resource(service_name='s3', region_name=region)
        boto3_client = boto3_session.client(service_name='s3', region_name=region)
        # [ do something interesting with your session/resource/client here ]
Credential Setup (boto3 - Shared Credentials File)
Assume Role Setup (AWS)
After a few days of searching, this is the simplest solution I have found. It is explained here, but it does not have a usage example.
import boto3

for profile in boto3.Session().available_profiles:
    boto3.DEFAULT_SESSION = boto3.session.Session(profile_name=profile)
    s3 = boto3.resource('s3')
    for bucket in s3.buckets.all():
        print(bucket)
This will switch the default role you are using. To avoid making the profile the default, just do not assign it to boto3.DEFAULT_SESSION; instead, do the following:
testing_profile = boto3.session.Session(profile_name='mainTesting')
s3 = testing_profile.resource('s3')
for bucket in s3.buckets.all():
    print(bucket)
It is important to note that the .aws credentials file needs to be set up in a specific way:
[default]
aws_access_key_id = default_access_id
aws_secret_access_key = default_access_key
[main]
aws_access_key_id = main_profile_access_id
aws_secret_access_key = main_profile_access_key
[mainTesting]
source_profile = main
role_arn = Testing role arn
mfa_serial = mfa_arn_for_main_role
[mainProduction]
source_profile = main
role_arn = Production role arn
mfa_serial = mfa_arn_for_main_role
I don't know why, but the mfa_serial key has to be on the role profiles for this to work instead of on the source account, which would make more sense.
Here's the code snippet I used
sts_client = boto3.client('sts')
assumed_role_object = sts_client.assume_role(
    RoleArn=<arn of the role to assume>,
    RoleSessionName="<role session name>"
)
print(assumed_role_object)
credentials = assumed_role_object['Credentials']
session = Session(
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken']
)
self.s3 = session.client('s3')
I am trying to scrape a web page, extract data, and then store all the data in a CSV file. Before adding the ScrapeCallback class and calling it, everything worked fine. However, after adding the new class, it does not store any data in the CSV file except the headers. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html

class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)

def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url

class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html

def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())