Scraping and saving data from URLs to CSV using BeautifulSoup

Well, I am new to BeautifulSoup in Python. I have written code that scrapes HTML and saves all the data I need to a CSV file. The values from the ALL_NUMBERS file are substituted into the URL, which produces a large number of URLs.
The code is below:
import requests
from bs4 import BeautifulSoup

# --- read names ---
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/63.0.3239.84 Safari/537.36',
           'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7'}

all_names = []  # to keep all names in memory
with open('ALL_NUMBERS.txt', 'r') as text_file:
    for line in text_file:
        all_names.append(line.strip())

url_template = 'https://www.investing.com/news/stock-market-news/learjet-the-private-plane-synonymous-with-the-jetset-nears-end-of-runway-{}'

all_urls = []  # to keep all URLs in memory
with open('url_requests.txt', 'w') as text_file:
    for name in all_names:
        url = url_template.format(name)
        print('url:', url)
        all_urls.append(url)
        text_file.write(url + '\n')

# --- read data ---
for name, url in zip(all_names, all_urls):
    r1 = requests.get(url, headers=headers)
    soup = BeautifulSoup(r1.content, 'html5lib')
    results = soup.find('div', class_='WYSIWYG articlePage')
    para = results.findAll('p')
    results_2 = soup.find('div', class_='contentSectionDetails')
    para_2 = results_2.findAll('span')
    with open('stock_market_news_' + name + '.csv', 'w') as text_file:
        text_file.write(str(para))
        text_file.write(str(para_2))
It works well, but only with one URL. I want to save para and para_2 from every URL in a single CSV file, i.e. two fields per row:

Text                Time
para from URL(1)    para_2 from URL(1)
para from URL(2)    para_2 from URL(2)
...

Unfortunately, I don't know how best to do this for a large number of URLs.

You could store all the params in a list and then save the result in your file:
import csv
# ...
# --- read data ---
params = []
for name, url in zip(all_names, all_urls):
    r1 = requests.get(url, headers=headers)
    soup = BeautifulSoup(r1.content, 'html5lib')
    results = soup.find('div', class_='WYSIWYG articlePage')
    para = '\n'.join(r.text for r in results.findAll('p'))
    results_2 = soup.find('div', class_='contentSectionDetails')
    para_2 = results_2.findAll('span')[0].text
    params.append([para, para_2])

# one file for all rows, so open it once after the loop
with open('stock_market_news.csv', 'w', newline='') as text_file:
    wr = csv.writer(text_file, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    wr.writerows(params)
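If the list of URLs is long, a possible variant (a sketch reusing the same all_urls, headers and selectors as above) is to write each row as soon as it is scraped, so a failed request partway through does not lose the rows already collected:

import csv

with open('stock_market_news.csv', 'w', newline='') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    for url in all_urls:
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'html5lib')
        article = soup.find('div', class_='WYSIWYG articlePage')
        details = soup.find('div', class_='contentSectionDetails')
        if article is None or details is None:
            continue  # skip pages that don't have the expected layout
        text = '\n'.join(p.text for p in article.findAll('p'))
        posted = details.find('span').text
        wr.writerow([text, posted])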
Does this answer solve your problem?
Have a nice day!

Related

How do I get all hrefs in a ul on a page with a scrollbar

I would like to get all the hrefs within the li elements of a particular ul (shown in a screenshot in the original post, not reproduced here).
So far I have written this:
import bs4, requests, re

product_pages = []

def get_product_pages(openurl):
    global product_pages
    url = 'https://www.ah.nl/producten/aardappel-groente-fruit'
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for li in soup.findAll('li', attrs={'class': 'taxonomy-sub-selector_root__3rtWx'}):
        for a in li.findAll('a', href=True):
            print(a.attrs['href'])

get_product_pages('')
But it is only giving me the hrefs from the first three li elements. I am wondering why it is only the first three, and how to get all eight.
The page has a scroll bar, which might be causing trouble?
The taxonomies and all the other page data are stored inside the page in a <script> tag, so BeautifulSoup doesn't see them. To get all the child taxonomies of the current category, you can use the following example (parsing the <script> tag with re/json):
import re
import json
import requests

base_url = "https://www.ah.nl/producten"
url = base_url + "/aardappel-groente-fruit/fruit"

html_doc = requests.get(url).text

data = re.search(r"window\.__INITIAL_STATE__= ({.*})", html_doc)
data = data.group(1).replace("undefined", "null")
data = json.loads(data)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

taxonomies = {t["id"]: t for t in data["taxonomy"]["topLevel"]}
for t in data["taxonomy"]["taxonomies"]:
    taxonomies[t["id"]] = t

def get_taxonomy(t, current, dupl=None):
    if dupl is None:
        dupl = set()
    tmp = current + "/" + t["slugifiedName"]
    yield tmp
    for c in t["children"]:
        if c in taxonomies and c not in dupl:
            dupl.add(c)
            yield from get_taxonomy(taxonomies[c], tmp, dupl)

for t in taxonomies.values():
    if t["parents"] == [0]:
        for t in get_taxonomy(t, base_url):
            if url in t:  # print only URLs from the current category
                print(t)
Prints:
https://www.ah.nl/producten/aardappel-groente-fruit/fruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/appels
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/appels/groente-en-fruitbox
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/bananen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/sinaasappels-mandarijnen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/peren
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/ananas-mango-kiwi
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/aardbeien-frambozen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/druiven-kersen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/bramen-bessen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/abrikozen-pruimen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/abrikozen-pruimen/exotisch-fruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/perziken-nectarines
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/meloen-kokosnoot
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/grapefruit-minneola
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/citroen-limoen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/fruit-spread
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/vijgen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/kaki-papaya-cherimoya
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/granaatappel-passiefruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/fruitsalade-mix
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/gedroogd-fruit

How to rename a file with text from a certain HTML element when using urllib.request and BeautifulSoup to download files

I have an algorithm that downloads PDF articles with urllib.request and BeautifulSoup (Python 3.6):
import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request

# make a list of all web pages' URLs
webpages = []
for i in range(9):
    root_url = 'xxx.com/articles/page' + str(i)
    webpages.append(root_url)

# make a list of PDF links
pdf_links = []
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')
    links = [span.attrs['href'] for span in page_soup.find_all('a', href=True)]
    for link in links:
        link_string = str(link)
        if link_string.endswith('pdf'):
            pdf_links.append(link_string)

# download the files
for pdf_link in pdf_links:
    save_to = os.path.basename(pdf_link.strip())
    urllib.request.urlretrieve(pdf_link.strip(), save_to)
I need to rename each downloaded PDF article with the title of the article, which is stored in a specific div class:
<div class="article article title">
    <h2>The Disaster of Deforestation</h2>
</div>
There is a larger div that stores both the article title and the corresponding PDF link:
<div article-id="1741" class="online article_row_view">
    <div class="article article title">
        <h2>The Disaster of Deforestation</h2>
    </div>
    <span class="file-pdf"> PDF</span>
</div>
I have no idea how to automatically rename the files, much less with a specific HTML element. Any help would be appreciated!
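Before the complete solution below, here is a minimal sketch of the renaming idea against the markup shown above; the selectors, and the assumption that the PDF link is an <a> tag inside the row div, are guesses from the question rather than confirmed details:

import urllib.request

# hypothetical: pair each row's title with its PDF link and use the title
# as the saved filename; assumes an <a href="...pdf"> exists inside the row
for row in page_soup.select('div.online.article_row_view'):
    h2 = row.select_one('h2')
    pdf = row.select_one('a[href$=".pdf"]')
    if h2 and pdf:
        title = h2.get_text(strip=True)
        urllib.request.urlretrieve(pdf['href'].strip(), title + '.pdf')

Note that titles can contain characters that are illegal in filenames; the make_safe_filename helper in the answer below takes care of that.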
Here's a complete solution that walks all the pages in the navigation and downloads all the PDFs for you:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urljoin

BASE_URL = 'https://cross-currents.berkeley.edu/archives'

def make_soup(url: str) -> BeautifulSoup:
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'})
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')

def extract_articles(soup: BeautifulSoup) -> list:
    articles = []
    for result in soup.select('.node--view-mode-search-result'):
        author = result.select_one('.field--name-field-citation-authors').text.strip()
        date = result.select_one('.field--name-field-issue-date').text.strip()
        title = result.select_one('.field-name-node-title').text.strip()
        journal = result.find('em', recursive=False).text.strip()
        pdf_url = result.select_one('a[href*=".pdf"]')['href']
        articles.append({
            'author': author,
            'date': date,
            'title': title,
            'journal': journal,
            'pdf_url': pdf_url,
        })
    return articles

def make_safe_filename(text: str) -> str:
    """Convert forbidden characters to underscores."""
    return ''.join(c if (c.isalnum() or c.isspace()) else '_' for c in text).strip('_ ')

def get_next_page_url(soup: BeautifulSoup) -> str:
    try:
        path = soup.select_one('.pager a[rel="next"]')['href']
        return urljoin(BASE_URL, path)
    except (TypeError, KeyError):
        return None

def download_file(url: str, filename: str) -> str:
    with requests.get(url, stream=True) as res, open(filename, 'wb') as f:
        res.raise_for_status()
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return filename

def scrape_archive():
    save_dir = Path(r'd:\downloads')
    save_dir.mkdir(exist_ok=True, parents=True)
    url = 'https://cross-currents.berkeley.edu/archives?author=&title=&type=onlinearticle&issue=All&region=All&page=0'
    while True:
        soup = make_soup(url)
        articles = extract_articles(soup)
        for a in articles:
            pdf_url = a['pdf_url']
            filename = make_safe_filename(a['title'])
            save_path = str(save_dir / (filename + '.pdf'))
            print('Downloading:', a['title'])
            download_file(pdf_url, save_path)
        # go to the next page if one exists
        next_url = get_next_page_url(soup)
        if not next_url:
            break
        url = next_url
        print('Moving to next page', url)
    print('Finished')

scrape_archive()
Here I've used only the title to generate the PDF filename, but you can mix and combine journal, date, author, etc. to generate a better filename.
Also remember to change save_dir to your liking.
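For example, a filename mixing date, author and title could look like this (a sketch reusing the make_safe_filename helper and the article dict a from the loop above):

# hypothetical replacement for the two filename lines inside scrape_archive()
filename = make_safe_filename('{} - {} - {}'.format(a['date'], a['author'], a['title']))
save_path = str(save_dir / (filename + '.pdf'))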

Trying to extract data and save it to Excel, but getting an error using Python and BeautifulSoup

I am trying to extract several fields and save them all to Excel, but the last field raises an error.
I tried using BeautifulSoup to extract it, but it fails to catch it, giving the error below:
Traceback (most recent call last):
  File "C:/Users/acer/AppData/Local/Programs/Python/Python37/agri.py", line 30, in <module>
    specimens = soup2.find('h3', class_='trigger expanded').find_next_sibling('div', class_='collapsefaq-content').text
AttributeError: 'NoneType' object has no attribute 'find_next_sibling'
from bs4 import BeautifulSoup
import requests

page1 = requests.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases')
soup1 = BeautifulSoup(page1.text, 'lxml')

for lis in soup1.find_all('li', class_='flex-item'):
    diseases = lis.find('img').next_sibling
    print("Diseases: " + diseases)
    image_link = lis.find('img')['src']
    print("Image_Link:http://www.agriculture.gov.au" + image_link)
    links = lis.find('a')['href']
    if links.startswith("http://"):
        link = links
    else:
        link = "http://www.agriculture.gov.au" + links
    page2 = requests.get(link)
    soup2 = BeautifulSoup(page2.text, 'lxml')
    try:
        origin = soup2.find('strong', string='Origin: ').next_sibling
        print("Origin: " + origin)
    except:
        pass
    try:
        imported = soup2.find('strong', string='Pathways: ').next_sibling
        print("Imported: " + imported)
    except:
        pass
    specimens = soup2.find('h3', class_='trigger expanded').find_next_sibling('div', class_='collapsefaq-content').text
    print("Specimens: " + specimens)
I want to extract that last field and save all the fields into an Excel sheet using Python. Any help would be appreciated.
Minor typo:
data2,append("Image_Link:http://www.agriculture.gov.au" + image_link)
Should be:
data2.append("Image_Link:http://www.agriculture.gov.au" + image_link) #period instead of a comma
The site seems to require headers to avoid blocking requests, and not every page has a specimens section. The following shows one possible way of handling the specimen info for each page:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[(item.text.strip(),
                                  base + item.select_one('img')['src'],
                                  item['href'] if 'http' in item['href'] else base + item['href'])
                                 for item in soup.select('.flex-item > a')])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        if soup.select_one('.trigger'):  # could also use: if soup.select_one('.trigger:nth-of-type(3) + div'):
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        else:
            info = 'None'
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
df.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig', index=False)
I have run the above many times without a problem; however, you can always switch my current test to a try/except block:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[(item.text.strip(),
                                  base + item.select_one('img')['src'],
                                  item['href'] if 'http' in item['href'] else base + item['href'])
                                 for item in soup.select('.flex-item > a')])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except:
            info = 'None'
            print(link)
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
Example of the CSV output (screenshot omitted here).

Getting the wrong text from a web scrape with BeautifulSoup

I'm getting the wrong text when I scrape this URL:
http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018
This is what I have:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# define year
year_number = 2018

# define the URL
i = range(0, 1)

names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1

    # not sure about this but it works (I was getting blocked by something
    # and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')

    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # game_publishers = html_soup.find_all("ul", class_='more_stats')
    # game_ratings = html_soup.find_all("ul", class_='more_stats')
    # game_genres = html_soup.find_all("ul", class_='more_stats')

    # extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())
    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())
    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())
    for games4 in game_users:
        game_user = games4.find()
        userscores.append(game_user.text.strip())

# for i in userscores:
#     temp = str(i)
#     temp2 = temp.replace("User:\n ", "")
#     userscoresNew.append(temp2)

for x in release_dates:
    temp = str(x)
    temp2 = temp.replace("Release Date:\n ", "")
    release_datesNew.append(temp2)

# df = pd.DataFrame({'Games:': names,
#                    'Metascore:': metascores,
#                    'Userscore:': userscoresNew})
# df.to_csv("metacritic scrape.csv")
The above looks for the user score, but I get the text "User Score:" repeated 100 times, when what I want is the data in the next set of tags. However, when I try to change the variable above to:
game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')
I get an error when I run the code:
AttributeError: 'NoneType' object has no attribute 'text'
Also, I don't think the second option is a good approach, because when the user score falls below a certain level, the class in the HTML changes (from "data textscore textscore_favorable" to "data textscore textscore_mixed").
Any help would be appreciated.
FYI, I am modifying code I had already written, but grabbing more details from a more detailed view.
This should help.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"

html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")

game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    userScore = i.find('span', class_="data textscore textscore_favorable")
    if userScore:
        print(userScore.text)
Output:
7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
Use html_soup.find_all("li", class_='stat product_avguserscore') to get the score.
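Since the modifier class changes with the score level (textscore_favorable, textscore_mixed, and so on), a more robust sketch, assuming only the class names quoted in the question, is to match on the stable textscore class instead of the full class string:

for li in html_soup.find_all("li", class_='stat product_avguserscore'):
    # BeautifulSoup matches class_='textscore' against any element whose
    # class list contains "textscore", so favorable/mixed/negative all match
    score = li.find('span', class_='textscore')
    print(score.text if score else 'n/a')  # 'n/a' is a placeholder for unscored games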

Assigning BeautifulSoup indexed values (HTML links and text) to a pandas HTML DataFrame

The following code retrieves images and HTML links from a webpage and stores the values in a BeautifulSoup index. I am now using pandas to create an output HTML table for those images and links. I have managed to populate cells manually by referencing a specific index value, but I can't find a way to add each indexed image and HTML text to the pandas DataFrame so that all the indexed values are displayed in the table. How could I do this?
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas import *
import pandas as pd
from IPython.display import HTML

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
####################################

title_clean = soup.find('title')
print(title_clean)

image_links = [x['data-img'] for x in soup.find_all('a', rel='popover')]
for link in image_links:
    print(link)

image_links_0 = image_links[0]
print(image_links_0)

mytags = []
tags = soup.find_all('td', width='41%')
for tag in tags:
    image_text = tag.find('h5').text
    mytags.append(image_text)
    print(image_text)

mytags_0 = mytags[0]
image_links_0 = image_links[0]

df = DataFrame({'foo1': '<img src="' + image_links_0 + '"/><p>' + mytags_0 + '</p>',
                'foo2': '' + mytags_0 + '',
                'foo3': mytags_0,
                'foo4': np.random.randn(2)})
print(df)

HTML(df.to_html('filename.html', escape=False))
This is the correct way to do it. If you need any help with storing it and making HTML out of it, I'll be happy to provide a solution for that as well. Take care!
Update: everything is included: comments, scraping, writing to a file, and creating tags with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
mozila_agent = ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36')
headers = {'User-Agent': mozila_agent}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []
for tr in the_whole_table.find_all('tr')[1:]:
    # start from the 1st row, not the 0th, because the first row is the
    # thead, i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

# the code below initiates a table; the for loop creates and populates
# the first row (the thead)
theads = BeautifulSoup('<table border="1"></table>', 'lxml')
thekeys = BeautifulSoup('<thead></thead>', 'html.parser')
for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)
theads.table.append(thekeys)
###############################################################

# loop through the data we collected and build one <tr> per lot;
# picture URLs become <img> cells, everything else becomes a text cell
for i in datalist:
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, height='50', width='50')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.tr.append(td_tag)
        else:
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.tr.append(tag)
    theads.table.append(thedata)

# we use .prettify() as we can't write a BeautifulSoup object into a file
with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())
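If you would rather stay with pandas, as in the question, a rough equivalent (a sketch assuming the datalist built above) is to render the image column as <img> tags and let to_html write the table with escaping disabled:

import pandas as pd

df = pd.DataFrame(datalist, columns=['Index Number', 'Picture',
                                     'Informational text', 'Current BID',
                                     'Time Left now'])
# turn the picture URL column into <img> tags; escape=False keeps them as HTML
df['Picture'] = df['Picture'].map(lambda u: '<img src="{}" height="50" width="50"/>'.format(u))
df.to_html('lots.html', escape=False, index=False)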