getting the wrong text from web scrape with beautifulsoup - html

I'm getting the wrong text when I scrape this url:
http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018
This is what I have:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
#Define year
year_number = 2018
# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []
for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1
    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')
    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')
    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # game_publishers = html_soup.find_all("ul", class_='more_stats')
    # game_ratings = html_soup.find_all("ul", class_='more_stats')
    # game_genres = html_soup.find_all("ul", class_='more_stats')
    # Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())
    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())
    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())
    for games4 in game_users:
        game_user = games4.find()
        userscores.append(game_user.text.strip())
    # print(name)
    # print(metascore)
    # print(userscore)

# for i in userscores:
#     temp = str(i)
#     temp2 = temp.replace("User:\n ", "")
#     userscoresNew.append(temp2)

for x in release_dates:
    temp = str(x)
    temp2 = temp.replace("Release Date:\n ", "")
    release_datesNew.append(temp2)

# df = pd.DataFrame({'Games:': names,
#                    'Metascore:': metascores,
#                    'Userscore:': userscoresNew})
# df.to_csv("metacritic scrape.csv")
The above is looking for the user score, but I get the text "User Score:" repeated 100x when what I want is the data in the next set of tags. However, when I try to change the above variable to:
game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')
I get an error when I run the code:
AttributeError: 'NoneType' object has no attribute 'text'
Also, I don't think the 2nd option is a good approach, because when the user score falls below a certain level the class changes in the HTML (from "data textscore textscore_favorable" to "data textscore textscore_mixed").
Any help would be appreciated.
FYI, I'm modifying code I have already written but grabbing more details from a more detailed view.

This should help.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"
html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    userScore = i.find('span', class_="data textscore textscore_favorable")
    if userScore:
        print(userScore.text)
Output:
7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
Use html_soup.find_all("li", class_='stat product_avguserscore') to get the score.
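Since the span's class changes with the score (textscore_favorable, textscore_mixed, and so on), a more robust option is to match on the shared textscore class instead of the full class string. A minimal sketch against the same soup:

for li in html_soup.find_all("li", class_="stat product_avguserscore"):
    # class_="textscore" matches any textscore_* variant, because BeautifulSoup
    # compares the search string against each class token individually
    score = li.find("span", class_="textscore")
    print(score.text if score else "n/a")   # "n/a" is only a placeholder for missing scores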

Related

Web scraping Reed.co.uk returns no results

I am trying to build a webscraper for Reed.co.uk to pull a list of jobs. However, when I run the code it returns no results. Everything is working and I get no errors. It just doesn't return any results. What am I doing wrong? I used the same method for Indeed and it works perfectly each time. I have inspected the website and I have tried a number of different titles for the divs section
import requests
from bs4 import BeautifulSoup
import pandas as pd

# get URL
def extract():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
    url = 'https://www.reed.co.uk/jobs/warehouse-operative-jobs-in-portsmouth'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

# extract required company data from adverts and append to joblist
def transform(soup):
    divs = soup.find_all('article', class_ = 'job-result ')
    for item in divs:
        title = item.find('h3', class_ = 'title')
        company = item.find('div', class_ = 'posted-by')
        try:
            salary = item.find('div', class_ = 'metadata')
        except:
            salary = ''
        summary = item.find('div', class_ = 'description')
        job = {
            'Title' : title,
            'Company' : company,
            'Salary' : salary,
            'Summary' : summary
        }
        joblist.append(job)
    return

joblist = []
for i in range(0,40,10):
    print(f'Getting page,{i}')
    c = extract()
    transform(c)

df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
You can use this example to get information from that page into a pandas DataFrame:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.reed.co.uk/jobs/warehouse-operative-jobs-in-portsmouth?pageno={}"

all_data = []
for page in range(1, 3):
    print("Getting page {}".format(page))
    soup = BeautifulSoup(requests.get(url.format(page)).content, "html.parser")
    for job in soup.select("#server-results article"):
        title = job.h3.get_text(strip=True)
        posted_by = job.select_one(".posted-by").a.get_text(strip=True)
        salary = job.select_one(".salary")
        salary = salary.get_text(strip=True) if salary else ""
        description = job.select_one(".description").p.get_text(strip=True)
        all_data.append((title, posted_by, salary, description))

df = pd.DataFrame(all_data, columns=["Title", "Company", "Salary", "Summary"])
df.to_csv("data.csv", index=False)
print(df)
Creates data.csv (screenshot from LibreOffice):
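As an aside, one likely reason the original transform() found nothing is the trailing space in class_ = 'job-result ': BeautifulSoup compares class tokens exactly, so a value with a trailing space never matches (the real Reed class names may differ, so treat this as a guess). A tiny illustration with made-up markup:

from bs4 import BeautifulSoup

# Hypothetical markup standing in for a job card on the page.
html = '<article class="job-result highlighted">Warehouse Operative</article>'
soup = BeautifulSoup(html, "html.parser")

print(soup.find_all("article", class_="job-result "))   # [] -- the trailing space matches nothing
print(soup.find_all("article", class_="job-result"))    # [<article ...>] -- exact token matches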

Scraping and save data from URLs to csv using BeautifulSoup

Well, I am new to BS in Python. I have written code that scrapes HTML and saves all the data that I need in a csv file. The values from the ALL_NUMBERS file are substituted into the URL, and thus a large number of URLs are obtained.
The code is below:
import requests
from bs4 import BeautifulSoup

# --- READ NAMES ---
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2)\
            AppleWebKit/537.36 (KHTML, like Gecko)\
            Chrome/63.0.3239.84 Safari/537.36',
           'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7'}

all_names = []  # TO KEEP ALL NAMES IN MEMORY
with open('ALL_NUMBERS.txt', 'r') as text_file:
    for line in text_file:
        line = line.strip()
        all_names.append(line)

url_template = 'https://www.investing.com/news/stock-market-news/learjet-the-private-plane-synonymous-with-the-jetset-nears-end-of-runway-{}'

all_urls = []  # TO KEEP ALL URLs IN MEMORY
with open("url_requests.txt", "w") as text_file:
    for name in all_names:
        url = url_template.format(name)
        print('url:', url)
        all_urls.append(url)
        text_file.write(url + "\n")

# --- read data ---
for name, url in zip(all_names, all_urls):
    # print('name:', name)
    # print('url:', url)
    r1 = requests.get(url, headers = headers)
    page = r1.content
    soup = BeautifulSoup(page, 'html5lib')

    results = soup.find('div', class_= 'WYSIWYG articlePage')
    para = results.findAll("p")

    results_2 = soup.find('div', class_= 'contentSectionDetails')
    para_2 = results_2.findAll("span")

    #for n in results_2:
    #    print n.find('p').text
    #cont = soup.select_one("div.contentSectionDetails")
    #ram = cont.select_one("span")
    #[x.extract() for x in ram.select_one('span')]

    with open('stock_market_news_' + name + '.csv', 'w') as text_file:
        text_file.write(str(para))
        text_file.write(str(para_2))
It works well, but only with one URL. I want to save para and para_2 from each URL in one csv file. That is, save two parameters from each URL in each line:
Text                  Time
para From URL(1)      para_2 From URL(1)
para From URL(2)      para_2 From URL(2)
...                   ...
Unfortunately, I don't know how to do it better for a lot of URLs in my case.
You could store all the params in a list and then save the result in your file:
import csv
# ...

# --- read data ---
params = []
for name, url in zip(all_names, all_urls):
    r1 = requests.get(url, headers = headers)
    page = r1.content
    soup = BeautifulSoup(page, 'html5lib')

    results = soup.find('div', class_= 'WYSIWYG articlePage')
    para = '\n'.join([r.text for r in results.findAll("p")])

    results_2 = soup.find('div', class_= 'contentSectionDetails')
    para_2 = results_2.findAll("span")[0].text

    params.append([str(para), str(para_2)])

with open('stock_market_news_' + name + '.csv', 'w', newline='') as text_file:
    wr = csv.writer(text_file, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    wr.writerows(params)
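One detail to watch: after the loop, name still holds the last entry from all_names, so the file above is named after the final URL only. If the goal is a single combined file for every URL, a fixed (hypothetical) file name is simpler, as in this sketch:

# Write every (Text, Time) pair collected above into one combined file.
with open('stock_market_news_all.csv', 'w', newline='') as combined:
    wr = csv.writer(combined, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    wr.writerows(params)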
Does this answer solve your problem?
Have a nice day!

Why do I have to declare the list inside the loop?

I just wrote the code below and it wasn't working like it's supposed to at first, because I declared the list (ltag) outside the loop. I just want to know what the difference is and why it didn't work in the first case.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter the URL mboy :')
pos = input("Enter position: ")
n = int(pos) - 1
count = input("Enter count: ")
c = int(count) + 1
times = 0

while times < c:
    ltag = list()
    print('Retrieving: ', url)
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    tags = soup('a')
    for i in range(c):
        for tag in tags:
            ltag.append(tag)
    url = ltag[n].get('href', None)
    times = times + 1
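For what it's worth, the likely difference, sketched with made-up link lists in place of soup('a'): when ltag is created once outside the while loop, every page's tags are appended to the same list, so ltag[n] keeps returning a link collected from the first page and url never moves on; recreating the list on each pass makes ltag[n] index into the current page's links.

# Sketch with made-up link lists standing in for soup('a') on two pages.
pages = [["a1", "b1", "c1"], ["a2", "b2", "c2"]]
n = 1

ltag = []                      # created once, outside the loop (the non-working version)
for tags in pages:
    for tag in tags:
        ltag.append(tag)
    print(ltag[n])             # prints "b1" both times: index 1 stays on the first page's links

for tags in pages:
    ltag = []                  # recreated every pass (the working version)
    for tag in tags:
        ltag.append(tag)
    print(ltag[n])             # prints "b1" then "b2": index 1 follows the current page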

trying to extract data and want to save in excel but getting error using python beautifulsoup

Trying to extract, but I'm getting an error on the last field; I want to save all the fields to Excel.
I have tried using BeautifulSoup to extract it, but it fails to catch that field and I get the error below:
Traceback (most recent call last):
  File "C:/Users/acer/AppData/Local/Programs/Python/Python37/agri.py", line 30, in <module>
    specimens = soup2.find('h3',class_='trigger expanded').find_next_sibling('div',class_='collapsefaq-content').text
AttributeError: 'NoneType' object has no attribute 'find_next_sibling'
from bs4 import BeautifulSoup
import requests

page1 = requests.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases')
soup1 = BeautifulSoup(page1.text,'lxml')

for lis in soup1.find_all('li',class_='flex-item'):
    diseases = lis.find('img').next_sibling
    print("Diseases: " + diseases)
    image_link = lis.find('img')['src']
    print("Image_Link:http://www.agriculture.gov.au" + image_link)
    links = lis.find('a')['href']
    if links.startswith("http://"):
        link = links
    else:
        link = "http://www.agriculture.gov.au" + links
    page2 = requests.get(link)
    soup2 = BeautifulSoup(page2.text,'lxml')
    try:
        origin = soup2.find('strong',string='Origin: ').next_sibling
        print("Origin: " + origin)
    except:
        pass
    try:
        imported = soup2.find('strong',string='Pathways: ').next_sibling
        print("Imported: " + imported)
    except:
        pass
    specimens = soup2.find('h3',class_='trigger expanded').find_next_sibling('div',class_='collapsefaq-content').text
    print("Specimens: " + specimens)
I want to extract that last field and save all the fields into an Excel sheet using Python; please help.
Minor typo:
data2,append("Image_Link:http://www.agriculture.gov.au" + image_link)
Should be:
data2.append("Image_Link:http://www.agriculture.gov.au" + image_link) #period instead of a comma
It seems the site wants headers to prevent requests being blocked, and there is not a specimens section on every page. The following shows a possible way of handling the specimen info for each page:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent' : 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers = headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[ ( item.text.strip(), base + item.select_one('img')['src'] , item['href'] if 'http' in item['href'] else base + item['href']) for item in soup.select('.flex-item > a') ])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        if soup.select_one('.trigger'):  # could also use: if soup.select_one('.trigger:nth-of-type(3) + div'):
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        else:
            info = 'None'
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
df.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig', index = False )
I have run the above lots of times without problem, however, you can always switch my current test to a try except block.
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent' : 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers = headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[ ( item.text.strip(), base + item.select_one('img')['src'] , item['href'] if 'http' in item['href'] else base + item['href']) for item in soup.select('.flex-item > a') ])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except:
            info = 'None'
            print(link)
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
Example of csv output:

Assigning beautifulsoup indexed values (html links and text) to a panda html DataFrame

The following code retrieves images and html links from a webpage and stores the values in a BeautifulSoup index. I am now using pandas in order to create an output html table for those images and links. I have managed to populate cells manually by calling on a specific index value, but I can't seem to find a way to add each indexed image and html text to the pandas DataFrame so that all the indexed values are displayed in the table. How could I do this?
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas import *
import pandas as pd

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)

from IPython.display import HTML

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

####################################

title_clean = soup.find('title')
print(title_clean)

image_links = [x['data-img'] for x in soup.find_all('a', rel='popover')]
for link in image_links:
    print(link)

image_links_0 = image_links[0]
print(image_links_0)

mytags = []
tags = soup.find_all('td', width='41%')
for tag in tags:
    image_text = tag.find('h5').text
    mytags.append(image_text)
    print(image_text)

for i in range(len(mytags)):
    mytags[i]

mytags_0 = mytags[0]
image_links_0 = image_links[0]

#df = DataFrame({'foo1' : 'test',
df = DataFrame({'foo1' : '<img src="' + image_links_0 + '"/><p>' + mytags_0 + '</p>',
                'foo2' : '' + mytags_0 + '',
                'foo3' : mytags_0,
                'foo4' : np.random.randn(2)})
print(df)

HTML(df.to_html('filename.html', escape=False))
print(tag)
This is the correct way to do it.
If you need any help with storing it and making an HTML page out of it, I'll be happy to provide a solution for that as well. Take care!
Update: everything is included: comments, scraping, writing to a file, and creating tags with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
mozila_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)\
                AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {'User-Agent': mozila_agent}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []
for tr in the_whole_table.find_all('tr')[1:]:
    # you want to start from the 1st item not the 0th, so [1:]
    # because the first is the thead, i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])
    # for pic do ... print(picture_link); as for partial text, only first 20
    # characters

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

theads = BeautifulSoup('<table border="1"></table>', 'lxml')
thekeys = BeautifulSoup('<thead></thead>', 'html.parser')

for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)

theads.table.append(thekeys)

###############################################################
# The code above will initiate a table;
# after that the for loop will create and populate the first row (thead).

for i in datalist:
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')
    # we loop through the data we collected
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, height='50', width='50')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.tr.append(td_tag)
        else:
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.tr.append(tag)
    theads.table.append(thedata)

with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())

# each of these, if you print them, will give you information that you can store
# we use `.prettify()` because we can't write a BeautifulSoup object into a file
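If you would rather stay in pandas for the whole table, a minimal sketch (assuming the image_links and mytags lists from the question line up one to one) builds the DataFrame from all items at once and lets to_html render the image tags:

import pandas as pd

# image_links and mytags are assumed to come from the question's scraping code.
df = pd.DataFrame({
    'Picture': ['<img src="{}" width="100"/>'.format(link) for link in image_links],
    'Title': mytags,
})
# escape=False keeps the <img> markup as HTML instead of escaping it.
df.to_html('filename.html', escape=False, index=False)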