Scrapy unhandled exception

I am using Scrapy 0.16.2 on Linux, running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting the error below, which blocks Scrapy (it hangs and doesn't finish on its own; only ^C stops it):
2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
By the way, this worked in version 0.14.
Here is the code:
class MySpider(CrawlSpider):
    name = 'alrroya'

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
    NEW_IGNORED_EXTENSIONS.remove('pdf')

    download_delay = 0.05

    # Stay within these domains when crawling
    allowed_domains = []
    all_domains = {}
    start_urls = []

    # Add our callback which will be called for every found link
    rules = [
        Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS,
                               tags=('a', 'area', 'frame', 'iframe'),
                               attrs=('href', 'src')),
             follow=True, callback='parse_crawled_page')
    ]

    # How many pages crawled
    crawl_count = 0
    # How many PDFs we have found
    pdf_count = 0

    def __init__(self, *args, **kwargs):
        CrawlSpider.__init__(self, *args, **kwargs)
        dispatcher.connect(self._spider_closed, signals.spider_closed)
        dispatcher.connect(self._spider_opened, signals.spider_opened)
        self.load_allowed_domains_and_start_urls()

    def allowed_to_start(self):
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = self.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None
            if reason_date and 'shutdown' in reason:
                reason = True
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    reason = True
                else:
                    reason = False
        else:
            reason = True
        return reason

    def _spider_opened(self, spider):
        if spider is not self:
            return
        curr_date = datetime.today()
        curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        day = timedelta(days=1)
        if os.path.exists(checkfile):
            f = open(checkfile, 'r')
            data = f.read()
            f.close()
            data = data.split('\n')
            reason = data[0]
            try:
                reason_date = datetime.strptime(data[1], '%Y-%m-%d')
            except Exception as ex:
                reason_date = None
            if reason_date and 'shutdown' in reason:
                f = open(checkfile, 'w')
                f.write('started\n')
                f.write(str(date.today()))
                f.close()
            else:
                if reason_date and reason_date + day <= curr_date and 'finished' in reason:
                    f = open(checkfile, 'w')
                    f.write('started\n')
                    f.write(str(date.today()))
                    f.close()
                else:
                    crawler.engine.close_spider(self, 'finished')
                    if jobdir and os.path.exists(jobdir):
                        shutil.rmtree(jobdir)
                    f = open(checkfile, 'w')
                    f.write('finished\n')
                    f.write(str(date.today()))
                    f.close()
                    os._exit(1)
        else:
            f = open(checkfile, 'w')
            f.write('started\n')
            f.write(str(date.today()))
            f.close()

    def _spider_closed(self, spider, reason):
        if spider is not self:
            return
        jobdir = spider.settings['JOBDIR']
        if jobdir:
            mnt = os.path.dirname(os.path.normpath(jobdir))
        else:
            mnt = ''
        checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
        if 'shutdown' in reason:
            f = open(checkfile, 'w')
            f.write('shutdown\n')
            f.write(str(date.today()))
            f.close()
        else:
            if jobdir and os.path.exists(jobdir):
                shutil.rmtree(jobdir)
            f = open(checkfile, 'w')
            f.write('finished\n')
            f.write(str(date.today()))
            f.close()

    def _requests_to_follow(self, response):
        if getattr(response, 'encoding', None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []

    def make_requests_from_url(self, url):
        http_client = httplib2.Http()
        try:
            headers = {
                'content-type': 'text/html',
                'user-agent': random.choice(USER_AGENT_LIST)
            }
            response, content = http_client.request(url, method='HEAD', headers=headers)
            #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
            if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
                if self.allowed_to_start():
                    self.get_pdf_link(url)
            else:
                return CrawlSpider.make_requests_from_url(self, url)
        except Exception as ex:
            return CrawlSpider.make_requests_from_url(self, url)

    def get_pdf_link(self, url):
        source = self.__class__.name
        parsed_url = urlparse(url)
        url_domain = parsed_url.netloc
        url_path = parsed_url.path
        if url_domain:
            for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
                if url_domain.endswith(domain):
                    pre_and = False
                    pre_or = False
                    and_cond = True
                    or_cond = False
                    for path in paths:
                        if path[0:1] == '!':
                            pre_and = True
                            if path[1:] not in url_path:
                                and_cond = and_cond and True
                            else:
                                and_cond = and_cond and False
                        else:
                            pre_or = True
                            if path in url_path:
                                or_cond = or_cond or True
                            else:
                                or_cond = or_cond or False
                    if pre_and and pre_or:
                        if and_cond and or_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_and:
                        if and_cond:
                            self.pdf_process(source, url)
                            return
                    elif pre_or:
                        if or_cond:
                            self.pdf_process(source, url)
                            return
                    else:
                        self.pdf_process(source, url)
                        return

    def parse_crawled_page(self, response):
        self.__class__.crawl_count += 1
        crawl_count = self.__class__.crawl_count
        if crawl_count % 100 == 0:
            print 'Crawled %d pages' % crawl_count
        if 'pdf' in response.headers.get('content-type', '').lower():
            self.get_pdf_link(response.url)
        return Item()

    def load_allowed_domains_and_start_urls(self):
        day = timedelta(days=1)
        currdate = date.today()
        alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
        self.__class__.all_domains = {
            'alrroya': {
                'start_urls': alrroya,
                'allow_domains': {
                    'epaper.alrroya.com': frozenset(()),
                }
            }
        }
        for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
            self.__class__.allowed_domains.append(domain)
        self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])

    def pdf_process(self, source, url):
        print '!!! ' + source + ' ' + url

This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code as follows to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py, change:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
To:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
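One caveat with this patch (my observation, not part of the original workaround): make_requests_from_url() in the spider above can also fall through its PDF branch without returning anything, so start_requests() would yield None, which is exactly what the 'NoneType' object has no attribute 'dont_filter' traceback complains about. A variant of the same workaround that additionally skips None, untested:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if requests is None:
            continue  # nothing to schedule (e.g. the PDF branch returned nothing)
        if isinstance(requests, list):
            for request in requests:
                yield request
        else:
            yield requests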
I expect that the official Scrapy people will fix this eventually.

Related

Unable to use a method of one class in a different class - missing 2 required positional arguments

I have two Python classes: one class (CloudLink) is responsible for sending JSON events to the app, and another (ReadData) is responsible for building the JSON data.
The ReadData class uses the CloudLink methods to send the JSON data to the app, but I'm getting the error _buildJSONdata() missing 1 required positional argument: 'Data'.
ReadData class:
from pyspark.sql import SparkSession
import functools
from pyspark.sql import DataFrame
from pyspark.sql.functions import explode
from cosmosconnect import azurecosmos

class ReadData:
    #exception(logger)
    def __init__(self):
        self.spark_session = (
            SparkSession.builder
            .appName("readData")
            .getOrCreate()
        )
        mssparkutils.fs.unmount('/mnt/test')
        logger.info("Drive unmounted")
        mssparkutils.fs.mount(
            'abfss://abc#transl.dfs.core.windows.net/',
            '/mnt/test',
            {'linkedService': "linkCosmos"}
        )
        logger.info("Mounted Successfully")
        self.input_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/input_path")
        self.output_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/output_path")

    '''
    Reading the schema from csv file
    '''
    #exception(logger)
    def readConfig(self):
        try:
            logger.info(f"Reading the Config present in {self.input_directory} ")
            dfConfig = self.spark_session.read.option("multiline", "true") \
                .json(self.input_directory)
            #for f in dfConfig.select("Entity","Query","Business_Rule").collect():
            dfConfig = dfConfig.select(explode('Input').alias('Input_Data')) \
                .select('Input_Data.Validation_Type', 'Input_Data.Entity', 'Input_Data.Query', 'Input_Data.Business_Rule')
            for f in dfConfig.rdd.toLocalIterator():
                #for index, f in dfConfig.toPandas().iterrows():
                self.Validation_Type = f[0]
                self.container = f[1]
                self.query = f[2]
                self.rule = f[3]
                self.readCosmos(self)
        except:
            raise ValueError("")

    #exception(logger)
    def readCosmos(self, *params):
        #from cosmosconnect import azurecosmos
        #a=[]
        linkedService = 'fg'
        df = azurecosmos.cosmosConnect(linkedService, self.query, self.container)
        df.cache()
        if len(df.head(1)) > 0:
            outputpath = self.output_directory + '/' + self.container
            df.coalesce(1).write.mode('overwrite').parquet(outputpath)
            Status = "Validation Failure"
            Data = {"Validation_Type": [], "Status": [], "Container": [], "Business_Rule": []}
            Data["Validation_Type"].append(self.Validation_Type)
            Data["Status"].append(Status)
            Data["Container"].append(self.container)
            Data["Business_Rule"].append(self.rule)
            CloudLink._buildJSONdata(Data)

if __name__ == "__main__":
    p = ReadData()
    p.readConfig()
CloudLink class
import json
import datetime
import hashlib
import json
import sys
import traceback
import adal
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import logging
from functools import wraps
import sys

def create_logger():
    # create a logger object
    #logger = logging.getLogger()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logfile = logging.FileHandler('exc_logger.log')
    #logfile = logging.StreamHandler(sys.stdout)
    fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(fmt)
    logfile.setFormatter(formatter)
    logger.addHandler(logfile)
    return logger

logger = create_logger()

def exception(logger):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except:
                issue = "exception in " + func.__name__ + "\n"
                issue = issue + "-------------------------\
------------------------------------------------\n"
                logger.exception(issue)
                raise
        return wrapper
    return decorator

class CloudLink(object):
    _token = None
    _instance = None
    http = None
    cloudclient = TokenLibrary.getSecret("xxxx", "rtrt")
    clientid = TokenLibrary.getSecret("xxxx", "tyty")
    clientcredentials = TokenLibrary.getSecret("xxxx", "abcabc")
    authority_url = TokenLibrary.getSecret("xxxx", "abab")
    cloudtest = TokenLibrary.getSecret("xxxx", "yyyy")

    @staticmethod
    def getInstance():
        if not CloudLink._instance:
            CloudLink._instance = CloudLink()
        return CloudLink._instance

    def __init__(self):
        retry_strategy = Retry(
            total=3,
            backoff_factor=0,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.http = requests.Session()
        self.http.mount("https://", adapter)
        self.http.mount("http://", adapter)
        print("Inside init")

    def parseJSON(self, t):
        try:
            eventData = json.loads(t)
            logger.info(f"Sending {eventData} to cloud")
            self.sendToCloud(eventData)
        except ValueError as e:
            print("Error: %s Please validate JSON in https://www.jsonschemavalidator.net/" % e)
            return None  # or: raise

    def sendToCloud(self, eventData):
        cloudData = {"eventData": eventData, "metadata": self._buildMetadata()}
        logger.info(f"Raising alert with data=({cloudData}")
        response = self.http.post(
            self.cloudtest, headers=self._buildHeaders(), json=cloudData
        )
        logger.info(f"cloud alert response={response}")
        if response.status_code == 202 or response.status_code == 200:
            logger.info("Mail sent to Cloud")
        else:
            raise Exception(f"Cloud reporting failed with Error {response}")

    def _buildJSONdata(self, Data):
        if len(Data) == 0:
            raise Exception("JSON is empty")
        else:
            t = json.dumps(self.Data)
            self.parseJSON(t)

    def _buildMetadata(self):
        return {
            "messageType": "Send Email",
            "messageVersion": "0.0.1",
            "sender": "Send Email",
        }

    def _buildHeaders(self):
        self._refreshADToken()
        headers = {
            "Authorization": "Bearer {}".format(self._token["accessToken"]),
            "Content-type": "application/json",
            "Accept": "text/plain",
        }
        return headers

    def _refreshADToken(self):
        def shouldRenew(token):
            """Returns True if the token should be renewed"""
            expiresOn = datetime.datetime.strptime(
                token["expiresOn"], "%Y-%m-%d %H:%M:%S.%f"
            )
            now = datetime.datetime.now()
            return (expiresOn - now) < datetime.timedelta(minutes=5)

        if not self._token or shouldRenew(self._token):
            logger.info("Renewing credentials for Alerting")
            result = None
            try:
                context = adal.AuthenticationContext(CloudLink.authority_url)
                result = context.acquire_token_with_client_credentials(CloudLink.cloudclient, CloudLink.clientid, CloudLink.clientcredentials)
            except Exception as e:
                error = "Failed to renew client credentials."
                logger.info(error)
                raise
            if result and "accessToken" in result:
                self._token = result
            else:
                logger.error(
                    "Failed to acquire bearer token. accessToken not found in result object on renewing credentials."
                )
                raise Exception("Could not acquire a bearer token")

expected str, bytes or os.PathLike object, not _io.BytesIO

This is my error while running the code below: "expected str, bytes or os.PathLike object, not _io.BytesIO".
Here is my APIView:
class UploadViewSet(APIView):
    parser_classes = (MultiPartParser, FormParser)
    permission_classes = (AllowAny,)

    def post(self, request, *args, **kwargs):
        file = BytesIO(request.FILES['file'].read())
        with open(file, "r") as csv_file:
            reader = csv.reader(csv_file)
            for row in reader():
                new_company = Company(
                    name=row['name'],
                    hr_name=row['hr_name'],
                    hr_email=row['hr_email'],
                    hr_verified=row['hr_verified'],
                    user_id=row['user_id'],
                    primary_phone=row['primary_phone'],
                    comments=row['comments'],
                )
                new_company.save()
        return Response({"status": "success"}, status.HTTP_201_CREATED)

Python 2's webbrowser is not working in repl.it

I am using repl.it, along with the webbrowser module, to open a link using webbrowser.open(<link to open>).
But my link doesn't open. Can you help me with this?
Here is my code:
import webbrowser
webbrowser.open("https://www.daffodilday.com.au/get-involved/register-your-school/", 2)
And a not-so-quick overview of the webbrowser module:
import os, sys, shlex, stat, subprocess, time
__all__ = ["Error", "open", "open_new", "open_new_tab", "get", "register"]
class Error(Exception):
pass
_browsers = {}
_tryorder = []
def register(name, klass, instance=None, update_tryorder=1):
_browsers[name.lower()] = [klass, instance]
if update_tryorder > 0:
_tryorder.append(name)
elif update_tryorder < 0:
_tryorder.insert(0, name)
def get(using=None):
if using is not None:
alternatives = [using]
else:
alternatives = _tryorder
for browser in alternatives:
if '%s' in browser:
browser = shlex.split(browser)
if browser[-1] == '&':
return BackgroundBrowser(browser[:-1])
else:
return GenericBrowser(browser)
else:
try:
command = _browsers[browser.lower()]
except KeyError:
command = _synthesize(browser)
if command[1] is not None:
return command[1]
elif command[0] is not None:
return command[0]()
raise Error("could not locate runnable browser")
def open(url, new=0, autoraise=True):
for name in _tryorder:
browser = get(name)
if browser.open(url, new, autoraise):
return True
return False
def open_new(url):
return open(url, 1)
def open_new_tab(url):
return open(url, 2)
def _synthesize(browser, update_tryorder=1):
cmd = browser.split()[0]
if not _iscommand(cmd):
return [None, None]
name = os.path.basename(cmd)
try:
command = _browsers[name.lower()]
except KeyError:
return [None, None]
# now attempt to clone to fit the new name:
controller = command[1]
if controller and name.lower() == controller.basename:
import copy
controller = copy.copy(controller)
controller.name = browser
controller.basename = os.path.basename(browser)
register(browser, None, controller, update_tryorder)
return [None, controller]
return [None, None]
if sys.platform[:3] == "win":
def _isexecutable(cmd):
cmd = cmd.lower()
if os.path.isfile(cmd) and cmd.endswith((".exe", ".bat")):
return True
for ext in ".exe", ".bat":
if os.path.isfile(cmd + ext):
return True
return False
else:
def _isexecutable(cmd):
if os.path.isfile(cmd):
mode = os.stat(cmd)[stat.ST_MODE]
if mode & stat.S_IXUSR or mode & stat.S_IXGRP or mode & stat.S_IXOTH:
return True
return False
def _iscommand(cmd):
if _isexecutable(cmd):
return True
path = os.environ.get("PATH")
if not path:
return False
for d in path.split(os.pathsep):
exe = os.path.join(d, cmd)
if _isexecutable(exe):
return True
return False
class BaseBrowser(object):
"""Parent class for all browsers. Do not use directly."""
args = ['%s']
def __init__(self, name=""):
self.name = name
self.basename = name
def open(self, url, new=0, autoraise=True):
raise NotImplementedError
def open_new(self, url):
return self.open(url, 1)
def open_new_tab(self, url):
return self.open(url, 2)
class GenericBrowser(BaseBrowser):
def __init__(self, name):
if isinstance(name, basestring):
self.name = name
self.args = ["%s"]
else:
self.name = name[0]
self.args = name[1:]
self.basename = os.path.basename(self.name)
def open(self, url, new=0, autoraise=True):
cmdline = [self.name] + [arg.replace("%s", url)
for arg in self.args]
try:
if sys.platform[:3] == 'win':
p = subprocess.Popen(cmdline)
else:
p = subprocess.Popen(cmdline, close_fds=True)
return not p.wait()
except OSError:
return False
class BackgroundBrowser(GenericBrowser):
def open(self, url, new=0, autoraise=True):
cmdline = [self.name] + [arg.replace("%s", url)
for arg in self.args]
try:
if sys.platform[:3] == 'win':
p = subprocess.Popen(cmdline)
else:
setsid = getattr(os, 'setsid', None)
if not setsid:
setsid = getattr(os, 'setpgrp', None)
p = subprocess.Popen(cmdline, close_fds=True, preexec_fn=setsid)
return (p.poll() is None)
except OSError:
return False
class UnixBrowser(BaseBrowser):
raise_opts = None
remote_args = ['%action', '%s']
remote_action = None
remote_action_newwin = None
remote_action_newtab = None
background = False
redirect_stdout = True
def _invoke(self, args, remote, autoraise):
raise_opt = []
if remote and self.raise_opts:
# use autoraise argument only for remote invocation
autoraise = int(autoraise)
opt = self.raise_opts[autoraise]
if opt: raise_opt = [opt]
cmdline = [self.name] + raise_opt + args
if remote or self.background:
inout = file(os.devnull, "r+")
else:
inout = None
setsid = getattr(os, 'setsid', None)
if not setsid:
setsid = getattr(os, 'setpgrp', None)
p = subprocess.Popen(cmdline, close_fds=True, stdin=inout,
stdout=(self.redirect_stdout and inout or None),
stderr=inout, preexec_fn=setsid)
if remote:
time.sleep(1)
rc = p.poll()
if rc is None:
time.sleep(4)
rc = p.poll()
if rc is None:
return True
return not rc
elif self.background:
if p.poll() is None:
return True
else:
return False
else:
return not p.wait()
def open(self, url, new=0, autoraise=True):
if new == 0:
action = self.remote_action
elif new == 1:
action = self.remote_action_newwin
elif new == 2:
if self.remote_action_newtab is None:
action = self.remote_action_newwin
else:
action = self.remote_action_newtab
else:
raise Error("Bad 'new' parameter to open(); " +
"expected 0, 1, or 2, got %s" % new)
args = [arg.replace("%s", url).replace("%action", action)
for arg in self.remote_args]
success = self._invoke(args, True, autoraise)
if not success:
args = [arg.replace("%s", url) for arg in self.args]
return self._invoke(args, False, False)
else:
return True
class Mozilla(UnixBrowser):
raise_opts = ["-noraise", "-raise"]
remote_args = ['-remote', 'openURL(%s%action)']
remote_action = ""
remote_action_newwin = ",new-window"
remote_action_newtab = ",new-tab"
background = True
Netscape = Mozilla
class Galeon(UnixBrowser):
raise_opts = ["-noraise", ""]
remote_args = ['%action', '%s']
remote_action = "-n"
remote_action_newwin = "-w"
background = True
class Chrome(UnixBrowser):
remote_args = ['%action', '%s']
remote_action = ""
remote_action_newwin = "--new-window"
remote_action_newtab = ""
background = True
Chromium = Chrome
class Opera(UnixBrowser):
raise_opts = ["-noraise", ""]
remote_args = ['-remote', 'openURL(%s%action)']
remote_action = ""
remote_action_newwin = ",new-window"
remote_action_newtab = ",new-page"
background = True
class Elinks(UnixBrowser):
remote_args = ['-remote', 'openURL(%s%action)']
remote_action = ""
remote_action_newwin = ",new-window"
remote_action_newtab = ",new-tab"
background = False
redirect_stdout = False
class Konqueror(BaseBrowser):
def open(self, url, new=0, autoraise=True):
if new == 2:
action = "newTab"
else:
action = "openURL"
devnull = file(os.devnull, "r+")
setsid = getattr(os, 'setsid', None)
if not setsid:
setsid = getattr(os, 'setpgrp', None)
try:
p = subprocess.Popen(["kfmclient", action, url],
close_fds=True, stdin=devnull,
stdout=devnull, stderr=devnull)
except OSError:
pass
else:
p.wait()
return True
try:
p = subprocess.Popen(["konqueror", "--silent", url],
close_fds=True, stdin=devnull,
stdout=devnull, stderr=devnull,
preexec_fn=setsid)
except OSError:
pass
else:
if p.poll() is None:
return True
try:
p = subprocess.Popen(["kfm", "-d", url],
close_fds=True, stdin=devnull,
stdout=devnull, stderr=devnull,
preexec_fn=setsid)
except OSError:
return False
else:
return (p.poll() is None)
class Grail(BaseBrowser):
def _find_grail_rc(self):
import glob
import pwd
import socket
import tempfile
tempdir = os.path.join(tempfile.gettempdir(),
".grail-unix")
user = pwd.getpwuid(os.getuid())[0]
filename = os.path.join(tempdir, user + "-*")
maybes = glob.glob(filename)
if not maybes:
return None
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
for fn in maybes:
# need to PING each one until we find one that's live
try:
s.connect(fn)
except socket.error:
# no good; attempt to clean it out, but don't fail:
try:
os.unlink(fn)
except IOError:
pass
else:
return s
def _remote(self, action):
s = self._find_grail_rc()
if not s:
return 0
s.send(action)
s.close()
return 1
def open(self, url, new=0, autoraise=True):
if new:
ok = self._remote("LOADNEW " + url)
else:
ok = self._remote("LOAD " + url)
return ok
def register_X_browsers():
if _iscommand("xdg-open"):
register("xdg-open", None, BackgroundBrowser("xdg-open"))
if "GNOME_DESKTOP_SESSION_ID" in os.environ and _iscommand("gvfs-open"):
register("gvfs-open", None, BackgroundBrowser("gvfs-open"))
if "GNOME_DESKTOP_SESSION_ID" in os.environ and _iscommand("gnome-open"):
register("gnome-open", None, BackgroundBrowser("gnome-open"))
if "KDE_FULL_SESSION" in os.environ and _iscommand("kfmclient"):
register("kfmclient", Konqueror, Konqueror("kfmclient"))
if _iscommand("x-www-browser"):
register("x-www-browser", None, BackgroundBrowser("x-www-browser"))
for browser in ("mozilla-firefox", "firefox",
"mozilla-firebird", "firebird",
"iceweasel", "iceape",
"seamonkey", "mozilla", "netscape"):
if _iscommand(browser):
register(browser, None, Mozilla(browser))
if _iscommand("kfm"):
register("kfm", Konqueror, Konqueror("kfm"))
elif _iscommand("konqueror"):
register("konqueror", Konqueror, Konqueror("konqueror"))
for browser in ("galeon", "epiphany"):
if _iscommand(browser):
register(browser, None, Galeon(browser))
if _iscommand("skipstone"):
register("skipstone", None, BackgroundBrowser("skipstone"))
for browser in ("google-chrome", "chrome", "chromium", "chromium-browser"):
if _iscommand(browser):
register(browser, None, Chrome(browser))
if _iscommand("opera"):
register("opera", None, Opera("opera")).
if _iscommand("mosaic"):
register("mosaic", None, BackgroundBrowser("mosaic")
if _iscommand("grail"):
register("grail", Grail, None)
if os.environ.get("DISPLAY"):
register_X_browsers()
if os.environ.get("TERM"):
if _iscommand("www-browser"):
register("www-browser", None, GenericBrowser("www-browser"))
if _iscommand("links"):
register("links", None, GenericBrowser("links"))
if _iscommand("elinks"):
register("elinks", None, Elinks("elinks"))
if _iscommand("lynx"):
register("lynx", None, GenericBrowser("lynx"))
if _iscommand("w3m"):
register("w3m", None, GenericBrowser("w3m"))
if sys.platform[:3] == "win":
class WindowsDefault(BaseBrowser):
def open(self, url, new=0, autoraise=True):
try:
os.startfile(url)
except WindowsError:
return False
else:
return True
_tryorder = []
_browsers = {}
register("windows-default", WindowsDefault)
iexplore = os.path.join(os.environ.get("PROGRAMFILES", "C:\\Program Files"),
"Internet Explorer\\IEXPLORE.EXE")
for browser in ("firefox", "firebird", "seamonkey", "mozilla",
"netscape", "opera", iexplore):
if _iscommand(browser):
register(browser, None, BackgroundBrowser(browser))
if sys.platform == 'darwin':
class MacOSX(BaseBrowser):
def __init__(self, name):
self.name = name
def open(self, url, new=0, autoraise=True):
assert "'" not in url
if not ':' in url:
url = 'file:'+url
new = int(bool(new))
if self.name == "default":
script = 'open location "%s"' % url.replace('"', '%22')
else:
if self.name == "OmniWeb":
toWindow = ""
else:
toWindow = "toWindow %d" % (new - 1)
cmd = 'OpenURL "%s"' % url.replace('"', '%22')
script = '''tell application "%s"
activate
%s %s
end tell''' % (self.name, cmd, toWindow)
osapipe = os.popen("osascript", "w")
if osapipe is None:
return False
osapipe.write(script)
rc = osapipe.close()
return not rc
class MacOSXOSAScript(BaseBrowser):
def __init__(self, name):
self._name = name
def open(self, url, new=0, autoraise=True):
if self._name == 'default':
script = 'open location "%s"' % url.replace('"', '%22')
else:
script = '''
tell application "%s"
activate
open location "%s"
end
'''%(self._name, url.replace('"', '%22'))
osapipe = os.popen("osascript", "w")
if osapipe is None:
return False
osapipe.write(script)
rc = osapipe.close()
return not rc
register("safari", None, MacOSXOSAScript('safari'), -1)
register("firefox", None, MacOSXOSAScript('firefox'), -1)
register("MacOSX", None, MacOSXOSAScript('default'), -1)
if sys.platform[:3] == "os2" and _iscommand("netscape"):
_tryorder = []
_browsers = {}
register("os2netscape", None,
GenericBrowser(["start", "netscape", "%s"]), -1)
if "BROWSER" in os.environ:
_userchoices = os.environ["BROWSER"].split(os.pathsep)
_userchoices.reverse()
for cmdline in _userchoices:
if cmdline != '':
cmd = _synthesize(cmdline, -1)
if cmd[1] is None:
register(cmdline, None, GenericBrowser(cmdline), -1)
cmdline = None
del cmdline
del _userchoices
def main():
import getopt
usage = """Usage: %s [-n | -t] url
-n: open new window
-t: open new tab""" % sys.argv[0]
try:
opts, args = getopt.getopt(sys.argv[1:], 'ntd')
except getopt.error, msg:
print >>sys.stderr, msg
print >>sys.stderr, usage
sys.exit(1)
new_win = 0
for o, a in opts:
if o == '-n': new_win = 1
elif o == '-t': new_win = 2
if len(args) != 1:
print >>sys.stderr, usage
sys.exit(1)
url = args[0]
open(url, new_win)
print "\a"
if __name__ == "__main__":
main()
Can you help me? I know my way around Python fairly well, but some things still take quite a lot of effort.
You can simplify it like this:
import webbrowser
webbrowser.open(website)
Of course, replace website with the URL of the site you want to open; there is no need to pass the 2.
Then tell me if it works.
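(For reference, the 2 only asks for a new tab; the module source above defines open_new_tab(url) as open(url, 2), so the following is equivalent and just as dependent on a browser being available:)
import webbrowser
webbrowser.open_new_tab("https://www.daffodilday.com.au/get-involved/register-your-school/")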
And by the way, some online Python environments, like PythonAnywhere, do not allow opening all websites; they only allow some specific ones.
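A small check that makes the underlying failure visible (my addition, untested on repl.it): per the source pasted above, webbrowser.get() raises webbrowser.Error when no runnable browser can be located, which is the usual situation in a headless sandbox with no DISPLAY and no browser on PATH:
import webbrowser

try:
    controller = webbrowser.get()  # raises webbrowser.Error if no runnable browser
    controller.open("https://www.daffodilday.com.au/get-involved/register-your-school/")
except webbrowser.Error:
    print("No runnable browser in this environment (likely the repl.it case)")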

How to parse next page by Beautiful Soup?

I use the code below to parse a page that has a "next page" link:
def parseNextThemeUrl(url):
    ret = []
    ret1 = []
    html = urllib.request.urlopen(url)
    html = BeautifulSoup(html, PARSER)
    html = html.find('a', class_='pager_next')
    if html:
        html = urljoin(url, html.get('href'))
        ret1 = parseNextThemeUrl(html)
        for r in ret1:
            ret.append(r)
    else:
        ret.append(url)
    return ret
But I get the error below. How can I parse the next link if there is one?
Traceback (most recent call last):
  html = urllib.request.urlopen(url)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
    return opener.open(url, data, timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 456, in open
    req.timeout = timeout
AttributeError: 'list' object has no attribute 'timeout'
I found my own answer, as below:
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def parseNextThemeUrl(url):
    urls = []
    urls.append(url)
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    new_page = soup.find('a', class_='pager_next')
    if new_page:
        new_url = urljoin(url, new_page.get('href'))
        urls1 = parseNextThemeUrl(new_url)
        for url1 in urls1:
            urls.append(url1)
    return urls
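The recursion works, but each page deepens the call stack, so a very long pager chain could hit Python's recursion limit. An iterative sketch under the same assumptions (a 'pager_next' anchor and the lxml parser); collectThemeUrls is a hypothetical name:
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def collectThemeUrls(start_url):
    # Follow 'pager_next' links in a loop instead of recursing.
    urls = [start_url]
    while True:
        soup = BeautifulSoup(urllib.request.urlopen(urls[-1]), 'lxml')
        next_link = soup.find('a', class_='pager_next')
        if not next_link:
            return urls
        urls.append(urljoin(urls[-1], next_link.get('href')))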

TypeError: string indices must be integers

Hi, I have a problem with my code: I get an error in a loop that works a few times but then throws a TypeError: string indices must be integers.
I want to call an API to get JSON back and extract some parts of the JSON response. Here's the code:
class API(object):
    def __init__(self, api_key):
        self.api_key = api_key

    def _request(self, api_url, params={}):
        args = {'api_key': self.api_key}
        for key, value in params.items():
            if key not in args:
                args[key] = value
        response = requests.get(
            Consts.URL['base'].format(
                url=api_url
            ),
            params=args
        )
        if response.status_code == requests.codes.ok:
            return response.json()
        else:
            return "not possible"
        print(response.url)

    def get_list(self):
        excel = EXCEL('s6.xlsx')
        api_url = Consts.URL['list'].format(
            version=Consts.API_VERSIONS['matchversion'],
            start=excel.get_gamenr()
        )
        return self._request(api_url)

    def get_match(self, matchid):
        idlist = matchid
        api_url = Consts.URL['match'].format(
            version=Consts.API_VERSIONS['matchversion'],
            matchId=idlist
        )
        return self._request(api_url)

    def match_ids(self):
        api = API('c6ea2f68-7ed6-40fa-9b99-fd591c55c05f')
        x = api.get_list()
        y = x['matches']
        count = len(y)
        ids = []
        while count > 0:
            count = count - 1
            temp = y[0]
            ids.append(temp['matchId'])
            del y[0]
        return ids

    def match_info(self):
        matchids = self.match_ids()
        print(matchids)
        matchinfolist = {}
        counter = 1
        for gameids in matchids:
            info = self.get_match(gameids)
            myid = self.find_partid(info['participantIdentities'])
            prepdstats = info['participants'][myid - 1]
            print(prepdstats)
            matchinfolist['stats' + str(counter)] = prepdstats
        return matchinfolist

    def find_partid(self, partlist):
        partid = 0
        idlist = partlist
        while partid < 10:
            partid = partid + 1
            tempplayer = idlist[0]['player']
            if tempplayer['summonerId'] == 19204660:
                playernr = partid
                partid = 500
            del idlist[0]
        return playernr
When I run the match_info() function, I get this error:
Traceback (most recent call last):
  File "C:\Users\Niklas\Desktop\python riot\main.py", line 17, in <module>
    main()
  File "C:\Users\Niklas\Desktop\python riot\main.py", line 10, in main
    print(api.match_info())
  File "C:\Users\Niklas\Desktop\python riot\api.py", line 78, in match_info
    myid = self.find_partid(info['participantIdentities'])
TypeError: string indices must be integers
but only after the loop in the function has run a few times. I have no idea what I'm doing wrong. Any help would be nice.
Here is a link to the json: https://euw.api.pvp.net/api/lol/euw/v2.2/match/2492271473?api_key=c6ea2f68-7ed6-40fa-9b99-fd591c55c05f
The error shows up on
myid = self.find_partid(info['participantIdentities'])
For this line to execute, info must be a mapping with string keys, not a string itself. info is
info = self.get_match(gameids)
get_match ends with
return self._request(api_url)
_request ends with
if response.status_code == requests.codes.ok:
    return response.json()
else:
    return "not possible"
For the loop to ever run, response.json() must be a dict with the key 'participantIdentities'. Your bug is expecting that to always be true.
One fix might be to make the expectation always true: if there is a satisfactory default value, return {'participantIdentities': <default value>}. Otherwise, return None and change the loop to:
info = self.get_match(gameids)
if info is not None:
    # as before
else:
    # whatever default action you want
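A sketch of how that suggestion could look wired into the class from the question (names taken from the question; the None return replaces the "not possible" string; untested):
def _request(self, api_url, params={}):
    args = {'api_key': self.api_key}
    for key, value in params.items():
        if key not in args:
            args[key] = value
    response = requests.get(Consts.URL['base'].format(url=api_url), params=args)
    if response.status_code == requests.codes.ok:
        return response.json()
    return None  # was "not possible", a str, which match_info then indexed with a string

def match_info(self):
    matchinfolist = {}
    counter = 0
    for gameids in self.match_ids():
        counter += 1
        info = self.get_match(gameids)
        if info is None:
            continue  # skip failed requests instead of indexing a string
        myid = self.find_partid(info['participantIdentities'])
        matchinfolist['stats' + str(counter)] = info['participants'][myid - 1]
    return matchinfolist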