How to parse Russian text properly using Python 2.7 and BeautifulSoup?

I am trying to parse all posts from a Russian website (http://games4you.ucoz.ua/news/). I am using Python 2.7.9 and BeautifulSoup 4, working in PyCharm. I've tried a lot of things to make it work, but I still get this instead of Russian text: '\u0421\u0442\u0440\u0430\u0442\u0435\u0433\u0456\u0457'
This is my code:
# Parsing information from games4you.ucoz.ua
# -*- coding: utf-8 -*-
import re
import csv
import urllib

from bs4 import BeautifulSoup

BASE_URL = "http://games4you.ucoz.ua/news/"


def get_html(url):
    response = urllib.urlopen(url)
    return response.read()


def get_page_count(html):
    soup = BeautifulSoup(html)
    pagination = soup.find('div', class_='catPages1')
    return int(pagination.find_all('a')[-2].text)


def save(games, path):
    # with open(path, 'w') as csvfile:
    #     writer = csv.writer(csvfile)
    #
    #     writer.writerow(('Title', 'Category', 'Date', 'Time'))
    #
    #     writer.writerows(
    #         (game['title'], ', '.join(game['category']), game['date'], game['time']) for game in games
    #     )
    with open(path, 'w+') as f:
        f.write(str(games).encode("UTF-8"))


def parse(html):
    soup = BeautifulSoup(html)
    # Getting the <div> that contains all posts on page
    all_entries = soup.find('div', id='allEntries')
    # Getting all of the posts (every table represents one post)
    tables = all_entries.find_all('table', class_='eBlock')
    # Creating a list of dictionaries for games information
    games = []
    for table in tables:
        try:
            # Getting the game title
            game_title = table.tr.td.a.text
            game_post_body = table.find('div', class_='eMessage')
            # Getting the game description
            game_description = game_post_body.p.text.split('....')[0] + '.'
            game_details = table.find('div', class_='eDetails')
            # Getting the game category
            game_category = game_details.a.text
            game_post_details = game_details.text
        except:
            print 'Some error'
            continue
        # Getting the post views count
        post_views = game_post_details[:game_post_details.find('function')].split()[-2]
        # Getting the post date
        post_date = game_details.span.text
        # Getting the post time
        post_time = game_details.span['title']
        # print 'Game title: ',game_title,'\n'
        # print 'Views: ',post_views,'\n'
        # print 'Game category: ',game_category,'\n'
        # print 'Game description: ','\n',game_description,'\n'
        # print 'Post date: ',post_date,'\n'
        # print 'Post time: ',post_time,'\n'
        games.append({
            'title': game_title,
            'category': game_category,
            'description': game_description,
            'date': post_date,
            'time': post_time
        })
    return games


def main():
    total_pages = get_page_count(get_html(BASE_URL))
    print('Total found %d pages...' % total_pages)
    games = []
    for page in range(1, total_pages + 1):
        print('Parsing %d%% (%d/%d)' % (page / total_pages * 100, page, total_pages))
        games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
    print('Saving...')
    save(games, 'games.txt')

main()

In Python 2:
>>> import HTMLParser
>>> s = 'Ell &#233;s la v&#237;ctima que expia els nostres pecats, i no tan sols els nostres, sin&#243; els del m&#243;n sencer.'
>>> print HTMLParser.HTMLParser().unescape(s)
Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.
In Python 3:
>>> import html
>>> html.unescape(s)
'Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.'
Your example:
'Стратегії'
For "normal" UTF-8 file writing (and reading), use:
import codecs
f = codecs.open(filename, 'w', 'utf-8')
Hope this helps.
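For instance, here is a minimal sketch (Python 2; not from the original answer, the file name and title value are just illustrative) of writing one of the parsed unicode strings through codecs so the Cyrillic ends up as readable UTF-8 bytes on disk rather than as \u.... escapes:
# -*- coding: utf-8 -*-
import codecs

# The asker's example string, i.e. u'Стратегії'
title = u'\u0421\u0442\u0440\u0430\u0442\u0435\u0433\u0456\u0457'

# codecs.open handles the unicode -> UTF-8 encoding on write
with codecs.open('games.txt', 'w', 'utf-8') as f:
    f.write(title + u'\n')

print title  # prints Стратегії in a UTF-8 terminal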

Yes, I did it! I guess I had messed up the decoding/encoding and was mixing different charsets. All I had to do was convert the data I get from BeautifulSoup from Unicode to UTF-8, like this:
game_title = game_title.encode("utf-8")
game_category = game_category.encode("utf-8")
game_description = game_description.encode("utf-8")
post_date = post_date.encode("utf-8")
post_time = post_time.encode("utf-8")
Nothing else was needed. This is the resulting code that worked for me:
# Parsing information from games4you.ucoz.ua
import csv
import urllib

from bs4 import BeautifulSoup

BASE_URL = "http://games4you.ucoz.ua/news/"


def get_html(url):
    response = urllib.urlopen(url)
    return response.read()


def get_page_count(html):
    soup = BeautifulSoup(html)
    pagination = soup.find('div', class_='catPages1')
    return int(pagination.find_all('a')[-2].text)


def save(games, path):
    with open(path, 'w+') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('Title', 'Category', 'Date', 'Time'))
        writer.writerows(
            (game['title'], game['category'], game['date'], game['time']) for game in games
        )


def parse(html):
    soup = BeautifulSoup(html)
    # Getting the <div> that contains all posts on page
    all_entries = soup.find('div', id='allEntries')
    # Getting all of the posts (every table represents one post)
    tables = all_entries.find_all('table', class_='eBlock')
    # Creating a list of dictionaries for games information
    games = []
    for table in tables:
        try:
            # Getting the game title
            game_title = table.tr.td.a.text
            game_post_body = table.find('div', class_='eMessage')
            # Getting the game description
            game_description = game_post_body.p.text.split('....')[0] + '.'
            game_details = table.find('div', class_='eDetails')
            # Getting the game category
            game_category = game_details.a.text
            game_post_details = game_details.text
        except:
            print 'Some error'
            continue
        # Getting the post views count
        post_views = game_post_details[:game_post_details.find('function')].split()[-2]
        # Getting the post date
        post_date = game_details.span.text
        # Getting the post time
        post_time = game_details.span['title']
        # Converting all data from Unicode to UTF-8
        game_title = game_title.encode("utf-8")
        game_category = game_category.encode("utf-8")
        game_description = game_description.encode("utf-8")
        post_date = post_date.encode("utf-8")
        post_time = post_time.encode("utf-8")
        # Writing data to the list
        games.append({
            'title': game_title,
            'category': game_category,
            'description': game_description,
            'date': post_date,
            'time': post_time
        })
    return games


def main():
    total_pages = get_page_count(get_html(BASE_URL))
    print('Total found %d pages...' % total_pages)
    games = []
    for page in range(1, total_pages + 1):
        print('Parsing %d%% (%d/%d)' % (page / total_pages * 100, page, total_pages))
        games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
    print('Saving...')
    save(games, 'games.csv')

main()

Related

Instaloader JSON files: Convert 200 JSON files into a Single CSV (Python 3.7)

I want to automatically download pictures (or videos) along with their captions and other data from a specific Instagram hashtag (e.g. #moodoftheday) using Instaloader. Instaloader returns JSON files that include the posts' metadata.
The following code worked for a single #user_profile's metadata.
I want to do the same, but for a #hashtag rather than a specific #user.
The ultimate goal is to get all of the JSON files (e.g. 200) into a single CSV file.
How can I process my downloaded data into a clean Excel/CSV file?
Here is my code:
# Install Instaloader
import instaloader

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures = False, download_videos = False, download_comments= False, compress_json = False)
    # Log in with the instaloader object
    L.login("username" , "password")
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p : p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target = profile.username)

'''
This function will now save all instagram posts and related data to a folder in your current working directory.
Let's call this function on the instagram account of "moodoftheday" and let the script do its magic.
This might take a while, so be patient.
'''
import os
import datetime

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime(2020, 9, 1)
enddate = datetime(2020, 10, 1)
# get your current working directory
current_wkdir = os.get_cwd()
# Call the function. This will automatically store all the scrape data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)

'''
You notice that this data is NOT yet in the right format, since each post has a separate json file.
You will need to process all these json files into a consolidated excel file in order to perform analyses on the data.
'''
def parse_instafiles(username, path):
    """
    This function loads in all the json files generated by the instaloader package and parses them into a csv file.
    """
    #print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    #print('Traversing file tree...')
    glob('*UTC.json')
    for file in glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            #print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = orjson.loads(filecontent.read())
            except IOError as e:
                #print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            else:
                pass
            #print('Collecting relevant metadata...')
            time = datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments' : comments, 'username' : username, 'followers' : followers, 'post_id' : post_id}
            #print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            #print('Closing file...')
            del metadata
            filecontent.close()
    #print('Storing dataframe to CSV file...')
    #print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe

'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_excel("moodoftheday.csv")
I am very new to Python and programming overall, therefore any help is very much appreciated!!
Thank you in advance! Sofia
I made some changes; it no longer shows an error, but it still needs some professional work:
import instaloader
from datetime import datetime
import datetime
from itertools import takewhile
from itertools import dropwhile
import os
import glob as glob
import json
import pandas as pd
import csv

lusername = ''
lpassword = ''

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures = False, download_videos = False, download_comments= False, compress_json = False)
    # Log in with the instaloader object
    L.login("lusername" , "lpassword")
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p : p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target = profile.username)

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime.datetime(2020, 9, 1,0,0)
enddate = datetime.datetime(2022, 2, 1,0,0)
# get your current working directory
current_wkdir = os.getcwd()
# Call the function. This will automatically store all the scrape data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)

def parse_instafiles(username, path):
    #print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    #print('Traversing file tree...')
    # glob('*UTC.json')
    for file in glob.glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            #print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = json.load(filecontent)
            except IOError as e:
                #print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            else:
                pass
            #print('Collecting relevant metadata...')
            time = datetime.datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments' : comments, 'username' : username, 'followers' : followers, 'post_id' : post_id}
            #print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            #print('Closing file...')
            del metadata
            filecontent.close()
    #print('Storing dataframe to CSV file...')
    #print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe

'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_csv("moodoftheday.csv")
Instaloader has an example of hashtag search in its documentation; here's the code:
from datetime import datetime
import instaloader

L = instaloader.Instaloader()
posts = instaloader.Hashtag.from_name(L.context, "urbanphotography").get_posts()

SINCE = datetime(2020, 5, 10)  # further from today, inclusive
UNTIL = datetime(2020, 5, 11)  # closer to today, not inclusive

k = 0  # initiate k
#k_list = []  # uncomment this to tune k

for post in posts:
    postdate = post.date

    if postdate > UNTIL:
        continue
    elif postdate <= SINCE:
        k += 1
        if k == 50:
            break
        else:
            continue
    else:
        L.download_post(post, "#urbanphotography")
        # if you want to tune k, uncomment below to get your k max
        #k_list.append(k)
        k = 0  # set k to 0

#max(k_list)
Here's the link for more info:
https://instaloader.github.io/codesnippets.html
I'm trying to do something similar, but I'm still very new to programming, so I'm sorry if I can't offer much help.
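If the goal is still a single CSV, one option is to feed the hashtag download into the parse_instafiles function defined earlier in this thread. A minimal sketch (my own assumption, not part of the snippet above: it assumes the download left its *UTC.json files in a "#urbanphotography" folder under the current working directory):
import os

# "#urbanphotography" was the download target above, so that folder name is
# passed where parse_instafiles expects a username/profile folder name.
df_hashtag = parse_instafiles("#urbanphotography", os.getcwd())
df_hashtag.to_csv("urbanphotography.csv", index=False)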

BeautifulSoup4 & Python - multiple pages into DataFrame

I have some code which collects the description, price, and old price (if on sale) from online retailers over multiple pages. I'm looking to export this into a DataFrame; I've had a go, but I run into the following error:
ValueError: Shape of passed values is (1, 3210), indices imply (3, 3210).
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

# Start Timer
then = time.time()

# Headers
headers = {"User-Agent": "Mozilla/5.0"}

# Set HTTPCode = 200 and Counter = 1
Code = 200
i = 1

scraped_data = []
while Code == 200:
    # Put url together
    url = "https://www.asos.com/women/jumpers-cardigans/cat/?cid=2637&page="
    url = url + str(i)
    # Request URL
    r = requests.get(url, allow_redirects=False, headers=headers)  # No redirects to allow infinite page count
    data = r.text
    Code = r.status_code
    # Soup
    soup = BeautifulSoup(data, 'lxml')
    # For loop each product then scroll through title price, old price and description
    divs = soup.find_all('article', attrs={'class': '_2qG85dG'})  # want to cycle through each of these
    for div in divs:
        # Get Description
        Description = div.find('div', attrs={'class': '_3J74XsK'})
        Description = Description.text.strip()
        scraped_data.append(Description)
        # Fetch TitlePrice
        NewPrice = div.find('span', attrs={'data-auto-id': 'productTilePrice'})
        NewPrice = NewPrice.text.strip("£")
        scraped_data.append(NewPrice)
        # Fetch OldPrice
        try:
            OldPrice = div.find('span', attrs={'data-auto-id': 'productTileSaleAmount'})
            OldPrice = OldPrice.text.strip("£")
            scraped_data.append(OldPrice)
        except AttributeError:
            OldPrice = ""
            scraped_data.append(OldPrice)
    print('page', i, 'scraped')
    # Print Array
    #array = {"Description": str(Description), "CurrentPrice": str(NewPrice), "Old Price": str(OldPrice)}
    #print(array)
    i = i + 1
else:
    i = i - 2

now = time.time()
pd.DataFrame(scraped_data, columns=["A", "B", "C"])
print('Parse complete with', i, 'pages' + ' in', now-then, 'seconds')
Right now your data is appended to the list using an algorithm I can describe like this:
Load the web page
Append to list value A
Append to list value B
Append to list value C
What this creates for each run through the dataset is:
[A1, B1, C1, A2, B2, C2]
There exists only one column of data, which is what pandas is telling you. To construct the DataFrame properly, you either need to reshape the data into a format where each row entry is a tuple of three values, like:
[
(A1, B1, C1),
(A2, B2, C2)
]
Or, in my preferred way, because it's far more robust to coding errors and inconsistent data lengths: create each row as a dictionary of columns. Thus,
rowdict_list = []
for row in data_source:
    a = extract_a()
    b = extract_b()
    c = extract_c()
    rowdict_list.append({'column_a': a, 'column_b': b, 'column_c': c})
And the data frame is constructed easily without having to explicitly specify columns in the constructor with df = pd.DataFrame(rowdict_list).
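As a minimal sketch of the dictionary-per-row idea applied to the loop in the question (class names copied from the code above, which already imports pandas as pd; error handling and paging are trimmed, so treat it as an outline rather than a drop-in replacement):
rows = []
for div in divs:  # divs as found in the question's loop
    description = div.find('div', attrs={'class': '_3J74XsK'}).text.strip()
    new_price = div.find('span', attrs={'data-auto-id': 'productTilePrice'}).text.strip("£")
    old_price_tag = div.find('span', attrs={'data-auto-id': 'productTileSaleAmount'})
    old_price = old_price_tag.text.strip("£") if old_price_tag else ""
    rows.append({"Description": description, "CurrentPrice": new_price, "Old Price": old_price})

df = pd.DataFrame(rows)  # columns come from the dict keys

# Alternatively, the flat [A1, B1, C1, A2, B2, C2, ...] list you already have
# can be regrouped into 3-tuples before building the frame:
# triples = list(zip(*[iter(scraped_data)] * 3))
# df = pd.DataFrame(triples, columns=["Description", "CurrentPrice", "Old Price"])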
You can create a DataFrame using the array dictionary.
You would want to set the values of the array dict to empty lists, so that you can append the values from the webpage to the correct list. Also, move the array variable outside of the while loop.
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
    ...
On the line where you were previously defining the array variable, you would then want to append the description, price, and old price values like so:
array['Description'].append(str(Description))
array['CurrentPrice'].append(str(NewPrice))
array['Old Price'].append(str(OldPrice))
Then you can create a DataFrame using the array variable:
pd.DataFrame(array)
So the final solution would look something like this:
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
    ...
    # For loop
    for div in divs:
        # Get Description
        Description = div.find('h3', attrs={'class': 'product__title'})
        Description = Description.text.strip()
        # Fetch TitlePrice
        try:
            NewPrice = div.find('div', attrs={'class': 'price product__price--current'})
            NewPrice = NewPrice.text.strip()
        except AttributeError:
            NewPrice = div.find('p', attrs={'class': 'price price--reduced'})
            NewPrice = NewPrice.text.strip()
        # Fetch OldPrice
        try:
            OldPrice = div.find('p', attrs={'class': 'price price--previous'})
            OldPrice = OldPrice.text.strip()
        except AttributeError:
            OldPrice = ""
        array['Description'].append(str(Description))
        array['CurrentPrice'].append(str(NewPrice))
        array['Old Price'].append(str(OldPrice))
    # Print Array
    print(array)
    df = pd.DataFrame(array)
    i = i + 1
else:
    i = i - 2

now = time.time()
print('Parse complete with', i, 'pages' + ' in', now - then, 'seconds')
Finally, make sure you've imported pandas at the top of the module:
import pandas as pd

Trying to extract data and save it to Excel, but getting an error using Python BeautifulSoup

I am trying to extract data, but I get an error on the last field; I want to save all the fields to Excel.
I have tried using BeautifulSoup to extract the data, but it fails to catch the last field, giving the error below:
Traceback (most recent call last):
  File "C:/Users/acer/AppData/Local/Programs/Python/Python37/agri.py", line 30, in <module>
    specimens = soup2.find('h3',class_='trigger expanded').find_next_sibling('div',class_='collapsefaq-content').text
AttributeError: 'NoneType' object has no attribute 'find_next_sibling'
from bs4 import BeautifulSoup
import requests

page1 = requests.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases')
soup1 = BeautifulSoup(page1.text,'lxml')

for lis in soup1.find_all('li',class_='flex-item'):
    diseases = lis.find('img').next_sibling
    print("Diseases: " + diseases)
    image_link = lis.find('img')['src']
    print("Image_Link:http://www.agriculture.gov.au" + image_link)
    links = lis.find('a')['href']
    if links.startswith("http://"):
        link = links
    else:
        link = "http://www.agriculture.gov.au" + links
    page2 = requests.get(link)
    soup2 = BeautifulSoup(page2.text,'lxml')
    try:
        origin = soup2.find('strong',string='Origin: ').next_sibling
        print("Origin: " + origin)
    except:
        pass
    try:
        imported = soup2.find('strong',string='Pathways: ').next_sibling
        print("Imported: " + imported)
    except:
        pass
    specimens = soup2.find('h3',class_='trigger expanded').find_next_sibling('div',class_='collapsefaq-content').text
    print("Specimens: " + specimens)
I want to extract that last field and save all the fields into an Excel sheet using Python. Please help.
Minor typo:
data2,append("Image_Link:http://www.agriculture.gov.au" + image_link)
Should be:
data2.append("Image_Link:http://www.agriculture.gov.au" + image_link) #period instead of a comma
It seems the site wants headers to prevent being blocked, and also there is not a specimens section on every page. The following shows possible handling of the specimen info for each page:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent' : 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers = headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[ ( item.text.strip(), base + item.select_one('img')['src'] , item['href'] if 'http' in item['href'] else base + item['href']) for item in soup.select('.flex-item > a') ])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        if soup.select_one('.trigger'):  # could also use if soup.select_one('.trigger:nth-of-type(3) + div'):
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        else:
            info = 'None'
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
df.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig', index = False )
I have run the above lots of times without a problem; however, you can always switch my current test to a try/except block:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent' : 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers = headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[ ( item.text.strip(), base + item.select_one('img')['src'] , item['href'] if 'http' in item['href'] else base + item['href']) for item in soup.select('.flex-item > a') ])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except:
            info = 'None'
            print(link)
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
Example of csv output:

Getting the wrong text from a web scrape with BeautifulSoup

I'm getting the wrong text when I scrape this URL:
http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018
This is what I have:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

#Define year
year_number = 2018

# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1
    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')
    #this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')
    #this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # game_publishers = html_soup.find_all("ul", class_='more_stats')
    # game_ratings = html_soup.find_all("ul", class_='more_stats')
    # game_genres = html_soup.find_all("ul", class_='more_stats')
    #Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())
    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())
    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())
    for games4 in game_users:
        game_user = games4.find()
        userscores.append(game_user.text.strip())
    # print(name)
    # print(metascore)
    # print(userscore)

# for i in userscores:
#     temp = str(i)
#     temp2 = temp.replace("User:\n ", "")
#     userscoresNew.append(temp2)

for x in release_dates:
    temp = str(x)
    temp2 = temp.replace("Release Date:\n ", "")
    release_datesNew.append(temp2)

# df = pd.DataFrame({'Games:': names,
#                    'Metascore:': metascores,
#                    'Userscore:': userscoresNew})
# df.to_csv("metacritic scrape.csv")
The above is looking for the user score, but I get the text "User Score:" repeated 100 times, when what I want is the data in the next set of tags. However, when I try to change the above variable to:
game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')
I get an error when I run the code:
AttributeError: 'NoneType' object has no attribute 'text'
Also, I don't think the second option is a good approach, because when the user score falls below a certain level the class in the HTML changes (from "data textscore textscore_favorable" to "data textscore textscore_mixed").
Any help would be appreciated.
FYI, I am modifying code I have already written, but grabbing more details from a more detailed view.
This should help.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"
html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    userScore = i.find('span', class_="data textscore textscore_favorable")
    if userScore:
        print(userScore.text)
Output:
7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
Use html_soup.find_all("li", class_='stat product_avguserscore') to get the score.
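Since the class name varies with the score (textscore_favorable, textscore_mixed, ...), here is a minimal sketch (my own variation on the answer above, not part of it) that matches any variant by selecting only on the stable classes:
for li in game_users:
    # span.data.textscore matches the two stable classes and ignores the
    # variable third class (textscore_favorable / textscore_mixed / ...)
    userScore = li.select_one('span.data.textscore')
    if userScore:
        print(userScore.text)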

Failing to store data in a CSV file when scraping

I am trying to scrape a webpage, extract the data, and then store it all in a CSV file. Before adding the ScrapeCallback class and calling it, everything works fine. However, after adding the new class it does not store any data except the headers in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html


class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())