I'm using the TMDB API to show movie posters, but I'm getting some errors (json)

**Here is my code:**
import os
import requests

CONFIG_PATTERN = 'http://api.themoviedb.org/3/search/movie?query=Monsters+University&api_key=xxx'
IMG_PATTERN = 'http://api.themoviedb.org/3/movie?query=Monsters+University&api_key=xxx'
KEY = 'xxx'

def _get_json(url):
    r = requests.get(url)
    return r.json()

def _download_images(urls, path='.'):
    """download all images in list 'urls' to 'path' """
    for nr, url in enumerate(urls):
        r = requests.get(url)
        filetype = r.headers['content-type'].split('/')[-1]
        filename = 'poster_{0}.{1}'.format(nr+1, filetype)
        filepath = os.path.join(path, filename)
        with open(filepath, 'wb') as w:
            w.write(r.content)

def get_poster_urls(imdbid):
    """ return image urls of posters for IMDB id
        returns all poster images from 'themoviedb.org'. Uses the
        maximum available size.
        Args:
            imdbid (str): IMDB id of the movie
        Returns:
            list: list of urls to the images
    """
    config = _get_json(CONFIG_PATTERN.format(key=KEY))
    base_url = config['images']['base_url']
    sizes = config['images']['poster_sizes']

    def size_str_to_int(x):
        return float("inf") if x == 'original' else int(x[1:])

    max_size = max(sizes, key=size_str_to_int)
    posters = _get_json(IMG_PATTERN.format(key=KEY, imdbid=imdbid))['posters']
    poster_urls = []
    for poster in posters:
        rel_path = poster['file_path']
        url = "{0}{1}{2}".format(base_url, max_size, rel_path)
        poster_urls.append(url)
    return poster_urls

def tmdb_posters(imdbid, count=None, outpath='.'):
    urls = get_poster_urls(imdbid)
    if count is not None:
        urls = urls[:count]
    _download_images(urls, outpath)

if __name__ == "__main__":
    tmdb_posters('tt0095016')
I'm fetching the JSON data using the TMDB API, but I'm getting errors. Here is the traceback:
Traceback (most recent call last):
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 57, in <module>
    tmdb_posters('tt0095016')
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 51, in tmdb_posters
    urls = get_poster_urls(imdbid)
  File "C:/Users/ayushblueluck/PycharmProjects/MovieDatabase/test.py", line 33, in get_poster_urls
    base_url = config['images']['base_url']
KeyError: 'images'

Process finished with exit code 1
I'm unable to figure out what's causing this. Everything looks right to me, but the errors won't go away; I've tried everything.

I guess it should work if you change CONFIG_PATTERN to http://api.themoviedb.org/3/configuration?api_key=<your_key>
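For reference, here is a minimal sketch of what the two patterns could look like once they point at the configuration and per-movie images endpoints and use a {key} placeholder (the exact endpoint paths are my assumption based on the TMDB v3 docs, so double-check them):

# Hypothetical corrected patterns: the configuration endpoint supplies
# base_url and poster_sizes, the images endpoint supplies the poster list.
CONFIG_PATTERN = 'http://api.themoviedb.org/3/configuration?api_key={key}'
IMG_PATTERN = 'http://api.themoviedb.org/3/movie/{imdbid}/images?api_key={key}'

With those placeholders in place, the existing CONFIG_PATTERN.format(key=KEY) and IMG_PATTERN.format(key=KEY, imdbid=imdbid) calls fill in the key and the id the way the rest of the code already expects.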
BTW, edited your question since you posted your API key in it.

Related

BeautifulSoup find method returns a non-subscriptable object

I was trying to create a Twitter scraper using BeautifulSoup, requests, and json. However, when I ran the code, it raised the error 'NoneType' object is not subscriptable. I checked the lines where the error occurs, but I couldn't find what raises it. Can someone please help me fix it?
File "tweetscraper.py", line 131, in <module>
start()
File "tweetscraper.py", line 125, in start
tweets = get_tweets_data(username, soup)
File "tweetscraper.py", line 54, in get_tweets_data
next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
TypeError: 'NoneType' object is not subscriptable
Here is my code:
def get_tweet_text(tweet):
    tweet_text_box = tweet.find("p", {"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"})
    images_in_tweet_tag = tweet_text_box.find_all("a", {"class": "twitter-timeline-link u-hidden"})
    tweet_text = tweet_text_box.text
    for image_in_tweet_tag in images_in_tweet_tag:
        tweet_text = tweet_text.replace(image_in_tweet_tag.text, '')
    return tweet_text

def get_this_page_tweets(soup):
    tweets_list = list()
    tweets = soup.find_all("li", {"data-item-type": "tweet"})
    for tweet in tweets:
        tweet_data = None
        try:
            tweet_data = get_tweet_text(tweet)
        except Exception as e:
            continue
            # ignore if there is any loading or tweet error
        if tweet_data:
            tweets_list.append(tweet_data)
            print(".", end="")
            sys.stdout.flush()
    return tweets_list

def get_tweets_data(username, soup):
    tweets_list = list()
    tweets_list.extend(get_this_page_tweets(soup))
    next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
    while True:
        next_url = "https://twitter.com/i/profiles/show/" + username + \
                   "/timeline/tweets?include_available_features=1&" \
                   "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false"
        next_response = None
        try:
            next_response = requests.get(next_url)
        except Exception as e:
            # in case there is some issue with request. None encountered so far.
            print(e)
            return tweets_list
        tweets_data = next_response.text
        tweets_obj = json.loads(tweets_data)
        if not tweets_obj["has_more_items"] and not tweets_obj["min_position"]:
            # using two checks here bcz in one case has_more_items was false but there were more items
            print("\nNo more tweets returned")
            break
        next_pointer = tweets_obj["min_position"]
        html = tweets_obj["items_html"]
        soup = BeautifulSoup(html, 'lxml')
        tweets_list.extend(get_this_page_tweets(soup))
    return tweets_list

# dump final result in a json file
def dump_data(username, tweets):
    filename = username + "_twitter.json"
    print("\nDumping data in file " + filename)
    data = dict()
    data["tweets"] = tweets
    with open(filename, 'w') as fh:
        fh.write(json.dumps(data))
    return filename

def get_username():
    # if username is not passed
    if len(sys.argv) < 2:
        usage()
    username = sys.argv[1].strip().lower()
    if not username:
        usage()
    return username

def start(username=None):
    username = get_username()
    url = "http://www.twitter.com/" + username
    print("\n\nDownloading tweets for " + username)
    response = None
    try:
        response = requests.get(url)
    except Exception as e:
        print(repr(e))
        sys.exit(1)
    if response.status_code != 200:
        print("Non success status code returned " + str(response.status_code))
        sys.exit(1)
    soup = BeautifulSoup(response.text, 'lxml')
    if soup.find("div", {"class": "errorpage-topbar"}):
        print("\n\n Error: Invalid username.")
        sys.exit(1)
    tweets = get_tweets_data(username, soup)
    # dump data in a text file
    dump_data(username, tweets)
    print(str(len(tweets)) + " tweets dumped.")

start()
The method find() only returns the first occurrence that matches, as a single object, or None when nothing matches, which is why subscripting its result can raise 'NoneType' object is not subscriptable. The method find_all(), on the other hand, returns every matching occurrence as a list, and a list is subscriptable.
Find out more about this in the Beautiful Soup documentation.
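As a minimal sketch of the difference (the tag and class here come from your traceback; whether that div actually exists in the HTML Twitter returns is a separate question):

from bs4 import BeautifulSoup

html = '<div class="stream-container" data-min-position="123"></div>'
soup = BeautifulSoup(html, 'html.parser')

# find(): a single Tag, or None when nothing matches, so guard before subscripting
container = soup.find("div", {"class": "stream-container"})
next_pointer = container["data-min-position"] if container is not None else None

# find_all(): always a list (possibly empty), and a list is subscriptable
containers = soup.find_all("div", {"class": "stream-container"})
if containers:
    next_pointer = containers[0]["data-min-position"]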

Scrapinghub puts my results in the log and not in the items

I have a functioning spider project that extracts URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up on Scrapinghub for a long scraping run.
So far, I am able to upload the spider and run it on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity, which gives me an error.
How can I set up my pipelines/extractor to return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline:
class positionsPipeline(object):
    def process_item(self, item, spider):
        return item
The settings:
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item:
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (in producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a .json file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = 'YOUR_AWS_ACCESS_KEY_ID'
AWS_SECRET_ACCESS_KEY = 'YOUR_AWS_SECRET_ACCESS_KEY'
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
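If you would rather keep the export inside the pipeline you already have registered in ITEM_PIPELINES, a sketch along these lines should also work (the output filename is my placeholder; this uses Scrapy's built-in CsvItemExporter instead of the feed settings):

from scrapy.exporters import CsvItemExporter

class positionsPipeline(object):

    def open_spider(self, spider):
        # opened once per crawl; CsvItemExporter wants a binary file handle
        self.file = open('positions_output.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

Note that on Scrapinghub a file written this way lives inside the job's container, so for running there the FEED_URI route (local for testing, S3 for the long run) is usually the more practical choice.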

Twitter streaming script is throwing a KeyError on the location field of the tweet

I have written a Python script that streams tweets using the tweepy module. After streaming for around 3 minutes, I dump the tweets into a .json file. I then try to populate a pandas dataframe with the location and text fields of each tweet. The text field gets populated, but not for every tweet (problem 1), and as far as the location field is concerned, a KeyError is thrown (problem 2). What exactly is going wrong?
twitter_stream_dump.py
import time
import json
import pandas as pd
import re

# tweepy based modules
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# initializing authentication credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    def __init__(self, time_limit):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open('requests.json', 'a')
        super(StdOutListener, self).__init__()

    def on_data(self, data):
        if (time.time() - self.start_time) < self.limit:
            self.saveFile.write(data)
            self.saveFile.write('\n')
            return True
        else:
            self.saveFile.close()
            return False

    def on_error(self, status):
        print(status)

def getwords(string):
    return re.findall(r"[\w'#]+|[.,!?;]", string)

if __name__ == '__main__':
    # This handles Twitter authetification and the connection to Twitter Streaming API
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    time_limit = input("Enter the time limit in minutes : ")
    time_limit *= 60
    stream = Stream(auth, listener=StdOutListener(time_limit))
    string = raw_input("Enter the list of keywords/hashtags to be compared : ")
    keyword_list = getwords(string)
    # This line filter Twitter Streams to capture data by the keywords: 'python', 'javascript', 'ruby'
    stream.filter(track=keyword_list)

    tweets_data_path = 'requests.json'
    tweets_data = []
    tweet_list = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweet_list.append(tweet)
        except:
            continue
    num_tweets_collected = len(tweet_list)

    # Creates a data frame structure
    tweet_dataframe = pd.DataFrame()
    text_dump = open('text_dump.txt', 'w')

    # Populating the location field of the data frame
    # tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
    tweet_dataframe['text'] = map(lambda tweet : tweet['text'], tweet_list)
    print(tweet_dataframe['text'])
Errors:
abhijeet-mohanty-2:Desktop SubrataMohanty$ python twitter_stream_dump.py
Enter the time limit in minutes : 3
Enter the list of keywords/hashtags to be compared : python ruby scala
Traceback (most recent call last):
  File "twitter_stream_dump.py", line 81, in <module>
    tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
  File "twitter_stream_dump.py", line 81, in <lambda>
    tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
KeyError: 'location'
requests.json (My .json file)
https://drive.google.com/file/d/0B1p05OszaBkXLWFsQ2VmeWVjbDQ/view?usp=sharing
The location field is a user-defined value and will sometimes not be present.
That's why you're getting the KeyError.
Note that location is part of the "user profile" metadata that comes with a tweet. It's intended to describe a user's location (like their hometown), and not the geotagged location of a given tweet.
In case you're interested in geotags, first check a tweet to see if the geo_enabled field is true. If so, the geo, coordinates, and place fields may contain geotagged information.
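For example, a small sketch that pulls the profile location without raising the KeyError (this assumes the standard Tweet JSON layout, where location sits under the user object, and it is meant as a drop-in next to your existing tweet_list / tweet_dataframe code):

# Hypothetical: build the column with dict.get so absent fields become None
locations = [tweet.get('user', {}).get('location') for tweet in tweet_list]
tweet_dataframe['location'] = locations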
As for missing text entries, I don't see the same issue when using the data you provided. It's possible the issue was caused by your try/except clause when reading in lines of data. Consider this approach:
for i, line in enumerate(tweets_file):
    if line.rstrip():
        tweet = json.loads(line)
        tweet_list.append(tweet)

num_tweets_collected = len(tweet_list)
texts = [tweet['text'] for tweet in tweet_list]
tweet_dataframe = pd.DataFrame(texts, columns=['text'])
Sample output:
print(tweet_dataframe.head())
# text
# 0 Tweets and python BFF <3 15121629.976126991
# 1 RT #zeroSteiner: Can now write more post modul...
# 2 •ruby• #MtvInstagLSelena #MtvColabTaylors
# 3 Ruby Necklace July Birthstone Jewelry Rosary...
# 4 #ossia I didn't see any such thing as Python. ...
A few quick summary stats show that no lines are missing, and no entries are null:
print("N tweets: {}".format(num_tweets_collected))
# N tweets: 286
print("N rows in dataframe: {}".format(tweet_dataframe.shape[0]))
# N rows in dataframe: 286
null_count = tweet_dataframe.text.isnull().sum()
print("Tweets with no text field extracted: {}".format(null_count))
# Tweets with no text field extracted: 0

How to soup a browser response

I've got a program that sends a lot of requests to a website using RoboBrowser and gets the answers. Now I need to filter these answers to keep only the ones that don't contain the string "Case Status Not Available". I tried to use BeautifulSoup for it, but it returns an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)

rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)

# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " % args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)  # To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    # Use BeautifulSoup to parse this data
                    print(browser.response.text)
                    souptwo = BeautifulSoup(browser.response.text)
                    texttwo = soup.get_text()
                    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
                    if not matchtwo:
                        soupthree = BeautifulSoup(browser.response.text)
                        print soupthree
The error that returns is:
Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable
Line 87 includes an attempt to call the method findall of soup, but findall is not a BeautifulSoup method; the method is find_all. When you access an attribute BeautifulSoup doesn't recognise, it treats it as a search for a tag of that name (equivalent to soup.find('findall')), which returns None because no <findall> tag exists, and calling None is what raises TypeError: 'NoneType' object is not callable. Two smaller issues on the same line: it searches soup (the file parsed on line 65) instead of souptwo (the response you just parsed), and find_all expects a tag name and attributes rather than a raw HTML snippet.
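A sketch of what that block could look like with find_all on the freshly parsed response, matching on the cell text instead of pasting markup into the query (the td class and status text are taken from your snippet, and this assumes the cell holds its text directly; bs4 and re are already imported at the top of your script, and string= needs BeautifulSoup 4.4+, older versions call it text=):

souptwo = BeautifulSoup(browser.response.text, 'html.parser')
# look for a fieldData cell whose text mentions the unavailable status
matchtwo = souptwo.find_all('td', {'class': 'fieldData'},
                            string=re.compile('Case Status Not Available'))
if not matchtwo:
    # no "Case Status Not Available" cell, so keep this response
    soupthree = BeautifulSoup(browser.response.text, 'html.parser')
    print(soupthree)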

Failing to store data in a CSV file while scraping

I am trying to scrape a webpage, extract data, and store it all in a CSV file. Before adding the ScrapeCallback class and calling it, everything worked fine. However, after adding the new class, nothing except the headers gets stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html

class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)

def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url

class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html

def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())