expected str, bytes or os.PathLike object, not _io.BytesIO - csv

thi is my error"expected str, bytes or os.PathLike object, not _io.BytesIO " while running this code.
here my Apiview
class UploadViewSet(APIView):
parser_classes = (MultiPartParser, FormParser)
permission_classes = (AllowAny,)
def post(self, request, *args, **kwargs):
file = BytesIO(request.FILES['file'].read())
with open(file, "r") as csv_file:
reader = csv.reader(csv_file)
for row in reader():
new_company = Company(
name=row['name'],
hr_name=row['hr_name'],
hr_email=row['hr_email'],
hr_verified=row['hr_verified'],
user_id=row['user_id'],
primary_phone=row['primary_phone'],
comments=row['comments'],
)
new_company.save()
return Response({"status": "success"},status.HTTP_201_CREATED)

Related

Unable to use method of a class in different class-missing 2 required positional arguments

I have two python classes:- One class(CloudLink) is responsible for sending JSON events to the app and another(ReadData) is responsible for building the JSON data.
The ReadData class will be using the CloudLink methods to send the JSON data to the App. But I'm getting error _buildJSONdata() missing 1 required positional argument: 'Data'.
ReadData class
from pyspark.sql import SparkSession
import functools
from pyspark.sql import DataFrame
from pyspark.sql.functions import explode
from cosmosconnect import azurecosmos
class ReadData:
#exception(logger)
def __init__(self):
self.spark_session = (
SparkSession.builder
.appName("readData")
.getOrCreate()
)
mssparkutils.fs.unmount('/mnt/test')
logger.info("Drive unmounted")
mssparkutils.fs.mount(
'abfss://abc#transl.dfs.core.windows.net/',
'/mnt/test',
{'linkedService': "linkCosmos"}
)
logger.info("Mounted Successfully")
self.input_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/input_path"
)
self.output_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/output_path"
)
'''
Reading the schema from csv file
'''
#exception(logger)
def readConfig(self):
try:
logger.info(f"Reading the Config present in {self.input_directory} ")
dfConfig = self.spark_session.read.option("multiline","true") \
.json(self.input_directory)
#for f in dfConfig.select("Entity","Query","Business_Rule").collect():
dfConfig=dfConfig.select(explode('Input').alias('Input_Data'))\
.select('Input_Data.Validation_Type','Input_Data.Entity','Input_Data.Query','Input_Data.Business_Rule')
for f in dfConfig.rdd.toLocalIterator():
#for index, f in dfConfig.toPandas().iterrows():
self.Validation_Type=f[0]
self.container=f[1]
self.query=f[2]
self.rule=f[3]
self.readCosmos(self)
except:
raise ValueError("")
#exception(logger)
def readCosmos(self,*params):
#from cosmosconnect import azurecosmos
#a=[]
linkedService='fg'
df=azurecosmos.cosmosConnect(linkedService,self.query,self.container)
df.cache()
if len(df.head(1)) >0:
outputpath=self.output_directory+'/'+self.container
df.coalesce(1).write.mode('overwrite').parquet(outputpath)
Status="Validation Failure"
Data= {"Validation_Type":[],"Status":[],"Container":[],"Business_Rule":[]}
Data["Validation_Type"].append(self.Validation_Type)
Data["Status"].append(Status)
Data["Container"].append(self.container)
Data["Business_Rule"].append(self.rule)
CloudLink._buildJSONdata(Data)
if __name__ == "__main__":
p = ReadData()
p.readConfig()
CloudLink class
import json
import datetime
import hashlib
import json
import sys
import traceback
import adal
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import logging
from functools import wraps
import sys
def create_logger():
#create a logger object
#logger = logging.getLogger()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logfile = logging.FileHandler('exc_logger.log')
#logfile = logging.StreamHandler(sys.stdout)
fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(fmt)
logfile.setFormatter(formatter)
logger.addHandler(logfile)
return logger
logger = create_logger()
def exception(logger):
def decorator(func):
#wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except:
issue = "exception in "+func.__name__+"\n"
issue = issue+"-------------------------\
------------------------------------------------\n"
logger.exception(issue)
raise
return wrapper
return decorator
class CloudLink(object):
_token = None
_instance = None
http = None
cloudclient = TokenLibrary.getSecret("xxxx", "rtrt")
clientid = TokenLibrary.getSecret("xxxx", "tyty")
clientcredentials = TokenLibrary.getSecret("xxxx", "abcabc")
authority_url = TokenLibrary.getSecret("xxxx", "abab")
cloudtest = TokenLibrary.getSecret("xxxx", "yyyy")
#staticmethod
def getInstance():
if not CloudLink._instance:
CloudLink._instance = CloudLink()
return CloudLink._instance
def __init__(self):
retry_strategy = Retry(
total=3,
backoff_factor=0,
status_forcelist=[429, 500, 502, 503, 504],
allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.http = requests.Session()
self.http.mount("https://", adapter)
self.http.mount("http://", adapter)
print("Inside init")
def parseJSON(self, t):
try:
eventData = json.loads(t)
logger.info(f"Sending {eventData} to cloud")
self.sendToCloud(eventData)
except ValueError as e:
print("Error: %s Please validate JSON in https://www.jsonschemavalidator.net/"% e)
return None # or: raise
def sendToCloud(self, eventData):
cloudData = {"eventData": eventData, "metadata": self._buildMetadata()}
logger.info(f"Raising alert with data=({cloudData}")
response = self.http.post(
self.cloudtest, headers=self._buildHeaders(), json=cloudData
)
logger.info(f"cloud alert response={response}")
if response.status_code == 202 or response.status_code == 200:
logger.info("Mail sent to Cloud")
else:
raise Exception(f"Cloud reporting failed with Error {response}")
def _buildJSONdata(self,Data):
if len(Data) == 0:
raise Exception("JSON is empty")
else:
t = json.dumps(self.Data)
self.parseJSON(t)
def _buildMetadata(self):
return {
"messageType": "Send Email",
"messageVersion": "0.0.1",
"sender": "Send Email",
}
def _buildHeaders(self):
self._refreshADToken()
headers = {
"Authorization": "Bearer {}".format(self._token["accessToken"]),
"Content-type": "application/json",
"Accept": "text/plain",
}
return headers
def _refreshADToken(self):
def shouldRenew(token):
"""Returns True if the token should be renewed"""
expiresOn = datetime.datetime.strptime(
token["expiresOn"], "%Y-%m-%d %H:%M:%S.%f"
)
now = datetime.datetime.now()
return (expiresOn - now) < datetime.timedelta(minutes=5)
if not self._token or shouldRenew(self._token):
logger.info("Renewing credentials for Alerting")
result = None
try:
context = adal.AuthenticationContext(CloudLink.authority_url)
result = context.acquire_token_with_client_credentials(CloudLink.cloudclient, CloudLink.clientid,CloudLink.clientcredentials)
except Exception as e:
error = "Failed to renew client credentials."
logger.info(error)
raise
if result and "accessToken" in result:
self._token = result
else:
logger.error(
"Failed to acquire bearer token. accessToken not found in result object on renewing credentials."
)
raise Exception("Could not acquire a bearer token")

How may I convert a Many to Many field into JSON Format in Django

I am working with APIs based in REST Architecture. I know Django has a framework to work with this APIs but my homework is do it from scratch. I got an API of a movies site where users can go and search information about a bunch of movies and i am trying to get the data into JSON format from the model Movie which has a Many-to-Many relationship whith the Actor model. I am using Class-based views for this.
The code from my models.py and views.py files is nested below:
class Actor(models.Model):
full_name = models.CharField(max_length=125)
role = models.CharField(max_length=125)
def __str__(self):
return self.full_name
class Movie(models.Model):
ACTION = 'AC'
DRAMA = 'DR'
COMEDY = 'CM'
SCIENCE_FICTION = 'SF'
THRILLER = 'TR'
RELIGIOUS = 'RG'
GENRE_CHOICES = [
(ACTION, 'Accion'),
(DRAMA, 'Drama'),
(COMEDY, 'Comedy'),
(SCIENCE_FICTION, 'Ciencia Ficcion'),
(THRILLER, 'Triler'),
(RELIGIOUS, 'Religioso')
]
title = models.CharField(max_length=155, blank=False)
synopsis = models.TextField(max_length=1000, blank=True)
genre = models.CharField(max_length=100, choices=GENRE_CHOICES, default='', blank=False)
tag = models.JSONField(default=dict, blank=True)
actors = models.ManyToManyField(Actor, related_name='movies', blank=True)
def __str__(self):
views.py
from django.views import View
from django.http.response import JsonResponse
from .models import Movie
from django.utils.decorators import method_decorator
from django.views.decorators.csrf import csrf_exempt
import json
class MovieView(View):
#method_decorator(csrf_exempt)
def dispatch(self, request, *args, **kwargs):
return super().dispatch(request, *args, **kwargs)
def get(self, request, pk=0):
"""
Return the list of all movies, or a single movie
:param pk:
:param request:
:return:
"""
if pk > 0:
movies = list(Movie.objects.filter(pk=pk).values())
if len(movies) > 0:
movie = movies[0]
data = {'message': "Success", 'movie': movie}
else:
data = {'message': "Movie not found... "}
return JsonResponse(data)
else:
movies = list(Movie.objects.values('title', 'synopsis', 'genre', 'actors__full_name').order_by('pk'))
if len(movies) > 0:
data = {'message': "Success", 'movies': movies}
else:
data = {'message': "Movies not found ..."}
return JsonResponse(data)
def post(self, request):
"""
Create a new movie
:param request:
:return:
"""
json_data = json.loads(request.body)
Movie.objects.create(title=json_data['title'], synopsis=json_data['synopsis'], genre=json_data['genre'],
tag=json_data['tag'], actors=json_data['actors'])
data = {'message': "Success"}
return JsonResponse(data)
def put(self, request, pk):
"""
Update a single movie
:param request:
:param pk:
:return:
"""
json_data = json.loads(request.body)
movies = list(Movie.objects.filter(pk=pk).values())
if len(movies) > 0:
movie = Movie.objects.get(pk=pk)
movie.title = json_data['title']
movie.synopsis = json_data['synopsis']
movie.genre = json_data['genre']
movie.tag = json_data['tag']
movie.save()
data = {'message': "Success"}
else:
data = {'message': "Movie not found ..."}
return JsonResponse(data)
def delete(self, request, pk):
movies = list(Movie.objects.filter(pk=pk).values())
if len(movies) > 0:
Movie.objects.filter(pk=pk).delete()
data = {'message': "Success"}
else:
data = {'message': "Movie not found ..."}
return JsonResponse(data)

How can I change the color on the like button in django?

I did create (like and dislike) in my project and I need it when someone clicks on the button. the color will change to blue
I saw something like that where I could create a variable called something like: is_liked = False, and I can place that in HTML by context to trigger it in (if condition) but it's not working with me so, How can I run the color on the like button?
views.py
# Detail question and Create comment
class QuestionDetail(DetailView, SingleObjectMixin):
template_name = 'community/question_view.html'
slug_field = 'ask_slug'
slug_url_kwarg = 'user_slug'
model = UserAsking
queryset = UserAsking.objects.all()
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['my_question'] = UserAsking.objects.get(title=self.object)
self_post = UserAsking.objects.get(title=self.object)
post_slug = UserAsking.objects.get(ask_slug=self_post.ask_slug)
context['summation'] = post_slug.likes.count() - post_slug.dislikes.count()
context['comment_form'] = CommentForm
comments_count = Comment.objects.filter(userasking=UserAsking.objects.get(title=self.object))
context['comments_count'] = comments_count.count()
# liked_post = User.objects.get(username=self.request.user.username).likes.exists()
# context['liked_post'] = liked_post
# disliked_post = User.objects.get(username=self.request.user.username).dislikes.exists()
# context['disliked_post'] = disliked_post
return context
def post(self, request, user_slug, *args, **kwargs):
my_question = UserAsking.objects.get(ask_slug=user_slug)
userprof = UserProfile.objects.get(userasking__ask_slug=user_slug)
comment_form = CommentForm(request.POST, instance=request.user)
name = "%s %s" % (self.request.user.first_name, self.request.user.last_name)
username = self.request.user.username
logo = self.request.user.userprofile.logo.url
c = CommentForm(self.request.POST).add_error('comment', 'error')
if comment_form.is_valid():
comment_form = Comment.objects.create(comment=self.request.POST.get('comment', None),
userasking_id=my_question.id,
userprofile_id=userprof.id,
name=name,
username=username,
logo=logo,
comment_slug=my_question.ask_slug
)
comment_form.save()
return redirect('community:question_view', comment_form.userasking.ask_slug)
return render(request, 'community/question_view.html', {'comment_form': comment_form,
'c': c})
# Like post function
class LikePost(View, SingleObjectMixin):
template_name = 'community/question_view.html'
def post(self, request, *args, **kwargs):
post = get_object_or_404(UserAsking, ask_slug=request.POST.get('post_slug'))
if post.dislikes.filter(username=request.user).exists():
post.dislikes.remove(request.user)
post.likes.add(request.user)
models.py
class UserAsking(models.Model):
userprofile = models.ForeignKey(UserProfile, on_delete=models.CASCADE)
title = models.CharField(max_length=100, blank=False, help_text='Be specific and imagine you’re asking a question to another person')
question = models.TextField(max_length=500, blank=False, help_text='Include all the information someone would need to answer your question')
field = models.CharField(max_length=20, choices=CHOICE, default='Technology', help_text='Add the field to describe what your question is about')
date = models.DateTimeField(auto_now_add=True)
ask_slug = models.SlugField(max_length=100)
likes = models.ManyToManyField(User, related_name='likes', blank=True)
dislikes = models.ManyToManyField(User, related_name='dislikes', blank=True)
def __str__(self):
return self.title
def get_absolute_url(self):
return reverse('community:question_view', kwargs={'user_slug': self.ask_slug})
def save(self, *args, **kwargs):
self.ask_slug = slugify(self.title)
super().save(*args, **kwargs)
elif post.likes.filter(username=request.user).exists():
post.likes.remove(request.user)
else:
post.likes.add(request.user)
return redirect(post.get_absolute_url())
# Dislike post function
class DisLikePost(View, SingleObjectMixin):
def post(self, request, *args, **kwargs):
post = get_object_or_404(UserAsking, ask_slug=request.POST.get('post_dislike_slug'))
if post.likes.filter(username=request.user).exists():
post.likes.remove(request.user)
post.dislikes.add(request.user)
elif post.dislikes.filter(username=request.user).exists():
post.dislikes.remove(request.user)
else:
post.dislikes.add(request.user)
return redirect(post.get_absolute_url())
how can I put the condition in the HTML page to check if is_liked is True or False?
In Django Html use template like this
{% if query_set.is_like %}
...do something
change back color like <h1 style:background:''blue></h1>
{% else %}
No change... <h1></h1>
{% endif%}
I had to add a condition to get_context where that way did work with me. this way I didn't see before but it works perfectly:
views.py
class QuestionDetail(DetailView, SingleObjectMixin):
template_name = 'community/question_view.html'
slug_field = 'ask_slug'
slug_url_kwarg = 'user_slug'
model = UserAsking
queryset = UserAsking.objects.all()
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['my_question'] = UserAsking.objects.get(title=self.object)
self_post = UserAsking.objects.get(title=self.object)
post_slug = UserAsking.objects.get(ask_slug=self_post.ask_slug)
context['summation'] = post_slug.likes.count() - post_slug.dislikes.count()
context['comment_form'] = CommentForm
comments_count = Comment.objects.filter(userasking=UserAsking.objects.get(title=self.object))
context['comments_count'] = comments_count.count()
context['is_liked'] = False
context['is_dislike'] = False
# context to like the post
if LikePost.as_view():
if post_slug.dislikes.filter(username=self.request.user).exists():
context['is_liked'] = False
elif post_slug.likes.filter(username=self.request.user).exists():
context['is_liked'] = True
else:
context['is_liked'] = False
# context to dis-like the post
if DisLikePost.as_view():
if post_slug.likes.filter(username=self.request.user).exists():
context['is_dislike'] = False
elif post_slug.dislikes.filter(username=self.request.user).exists():
context['is_dislike'] = True
else:
context['is_dislike'] = False
return context
def post(self, request, user_slug, *args, **kwargs):
my_question = UserAsking.objects.get(ask_slug=user_slug)
userprof = UserProfile.objects.get(userasking__ask_slug=user_slug)
comment_form = CommentForm(request.POST, instance=request.user)
name = "%s %s" % (self.request.user.first_name, self.request.user.last_name)
username = self.request.user.username
logo = self.request.user.userprofile.logo.url
c = CommentForm(self.request.POST).add_error('comment', 'error')
if comment_form.is_valid():
comment_form = Comment.objects.create(comment=self.request.POST.get('comment', None),
userasking_id=my_question.id,
userprofile_id=userprof.id,
name=name,
username=username,
logo=logo,
comment_slug=my_question.ask_slug
)
comment_form.save()
return redirect('community:question_view', comment_form.userasking.ask_slug)
return render(request, 'community/question_view.html', {'comment_form': comment_form,
'c': c})
# Like post function
class LikePost(View, SingleObjectMixin):
template_name = 'community/question_view.html'
def post(self, request, *args, **kwargs):
post = get_object_or_404(UserAsking, ask_slug=request.POST.get('post_slug'))
if post.dislikes.filter(username=request.user).exists():
post.dislikes.remove(request.user)
post.likes.add(request.user)
elif post.likes.filter(username=request.user).exists():
post.likes.remove(request.user)
else:
post.likes.add(request.user)
return redirect(post.get_absolute_url())
# Dislike post function
class DisLikePost(View, SingleObjectMixin):
def post(self, request, *args, **kwargs):
post = get_object_or_404(UserAsking, ask_slug=request.POST.get('post_dislike_slug'))
if post.likes.filter(username=request.user).exists():
post.likes.remove(request.user)
post.dislikes.add(request.user)
elif post.dislikes.filter(username=request.user).exists():
post.dislikes.remove(request.user)
else:
post.dislikes.add(request.user)
return redirect(post.get_absolute_url())
at this moment, I add a condition on the view I already handle as you see above in the QuestionDetail model.

Django Jsonresponse filtered queryset by id

My objective is to get the id and make a queryset filtered by id, as in the following code:
views.py
class MyProfile(TemplateView):
model = Reports
template_name = 'template.html'
def get_context_data(request, *args, **kwargs):
if kwargs.get('pk', None):
q = kwargs.get('pk', None)
queryset = Reports.objects.all().values('id','line_x','line_y',).filter(id = q)
data = list(queryset)
return JsonResponse(data, safe=False)
urls.py
url(r'^profiles/(?P<pk>\d+)/$', views.MyProfile.as_view())
It returns the following error:
context must be a dict rather than JsonResponse
Django 1.11.8
from django.http import JsonResponse
def different_function_name(request, *args, **kwargs):
if kwargs.get('pk', None):
q = kwargs.get('pk', None)
queryset = Reports.objects.all().values('id','line_x','line_y',).filter(id = q)
query_list = list(queryset)
return JsonResponse(query_list, safe=False)

scrapy unhandled exception

I am using scrapy 0.16.2 version on linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting this error which blocks scrapy (hangs and doesn't finish automatically, only ^C stops it)
2012-11-20 15:04:51+0000 [-] Unhandled Error Traceback (most recent call last): File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
self.crawler.start() File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
reactor.run(installSignalHandlers=False) # blocking call File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
self.mainLoop() File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
self.runUntilCurrent() --- <exception caught here> --- File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw) File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
return self._func(*self._a, **self._kw) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in
_next_request
self.crawl(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
self.schedule(request, spider) File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
return self.slots[spider].scheduler.enqueue_request(request) File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
if not request.dont_filter and self.df.request_seen(request): exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW this worked in version 0.14
Here is the code:
class MySpider(CrawlSpider):
name = 'alrroya'
NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
NEW_IGNORED_EXTENSIONS.remove('pdf')
download_delay = 0.05
# Stay within these domains when crawling
allowed_domains = []
all_domains = {}
start_urls = []
# Add our callback which will be called for every found link
rules = [
Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
]
# How many pages crawled
crawl_count = 0
# How many PDFs we have found
pdf_count = 0
def __init__(self, *args, **kwargs):
CrawlSpider.__init__(self, *args, **kwargs)
dispatcher.connect(self._spider_closed, signals.spider_closed)
dispatcher.connect(self._spider_opened, signals.spider_opened)
self.load_allowed_domains_and_start_urls()
def allowed_to_start(self):
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = self.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
reason = True
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
reason = True
else:
reason = False
else:
reason = True
return reason
def _spider_opened(self, spider):
if spider is not self:
return
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
crawler.engine.close_spider(self, 'finished')
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
os._exit(1)
else:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
def _spider_closed(self, spider, reason):
if spider is not self:
return
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
if 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('shutdown\n')
f.write(str(date.today()))
f.close()
else:
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
def _requests_to_follow(self, response):
if getattr(response, 'encoding', None) != None:
return CrawlSpider._requests_to_follow(self, response)
else:
return []
def make_requests_from_url(self, url):
http_client = httplib2.Http()
try:
headers = {
'content-type': 'text/html',
'user-agent': random.choice(USER_AGENT_LIST)
}
response, content = http_client.request(url, method='HEAD', headers=headers)
#~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
if self.allowed_to_start():
self.get_pdf_link(url)
else:
return CrawlSpider.make_requests_from_url(self, url)
except Exception as ex:
return CrawlSpider.make_requests_from_url(self, url)
def get_pdf_link(self, url):
source = self.__class__.name
parsed_url = urlparse(url)
url_domain = parsed_url.netloc
url_path = parsed_url.path
if url_domain:
for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
if url_domain.endswith(domain):
pre_and = False
pre_or = False
and_cond = True
or_cond = False
for path in paths:
if path[0:1] == '!':
pre_and = True
if path[1:] not in url_path:
and_cond = and_cond and True
else:
and_cond = and_cond and False
else:
pre_or = True
if path in url_path:
or_cond = or_cond or True
else:
or_cond = or_cond or False
if pre_and and pre_or:
if and_cond and or_cond:
self.pdf_process(source, url)
return
elif pre_and:
if and_cond:
self.pdf_process(source, url)
return
elif pre_or:
if or_cond:
self.pdf_process(source, url)
return
else:
self.pdf_process(source, url)
return
def parse_crawled_page(self, response):
self.__class__.crawl_count += 1
crawl_count = self.__class__.crawl_count
if crawl_count % 100 == 0:
print 'Crawled %d pages' % crawl_count
if 'pdf' in response.headers.get('content-type', '').lower():
self.get_pdf_link(response.url)
return Item()
def load_allowed_domains_and_start_urls(self):
day = timedelta(days=1)
currdate = date.today()
alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
self.__class__.all_domains = {
'alrroya': {
'start_urls': alrroya,
'allow_domains': {
'epaper.alrroya.com': frozenset(()),
}
}
}
for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
self.__class__.allowed_domains.append(domain)
self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])
def pdf_process(self, source, url):
print '!!! ' + source + ' ' + url
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py
Change:
def start_requests(self):
for url in self.start_urls:
yield self.make_requests_from_url(url)
To:
def start_requests(self):
for url in self.start_urls:
requests = self.make_requests_from_url(url)
if type(requests) is list:
for request in requests:
yield request
else:
yield requests
I expect that the official Scrapy people will fix this eventually.