I'm new to BeautifulSoup in Python. I have written code that scrapes HTML and saves the data I need to a CSV file. The values from the ALL_NUMBERS file are substituted into a URL template, producing a large number of URLs.
The code is below:
import requests
from bs4 import BeautifulSoup

# --- read names ---
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2)\
AppleWebKit/537.36 (KHTML, like Gecko)\
Chrome/63.0.3239.84 Safari/537.36',
           'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7'}

all_names = []  # TO KEEP ALL NAMES IN MEMORY
with open('ALL_NUMBERS.txt', 'r') as text_file:
    for line in text_file:
        line = line.strip()
        all_names.append(line)

url_template = 'https://www.investing.com/news/stock-market-news/learjet-the-private-plane-synonymous-with-the-jetset-nears-end-of-runway-{}'

all_urls = []  # TO KEEP ALL URLs IN MEMORY
with open("url_requests.txt", "w") as text_file:
    for name in all_names:
        url = url_template.format(name)
        print('url:', url)
        all_urls.append(url)
        text_file.write(url + "\n")
# --- read data ---
for name, url in zip(all_names, all_urls):
    # print('name:', name)
    # print('url:', url)
    r1 = requests.get(url, headers=headers)
    page = r1.content
    soup = BeautifulSoup(page, 'html5lib')

    results = soup.find('div', class_='WYSIWYG articlePage')
    para = results.findAll("p")

    results_2 = soup.find('div', class_='contentSectionDetails')
    para_2 = results_2.findAll("span")

    #for n in results_2:
    #    print n.find('p').text
    #cont = soup.select_one("div.contentSectionDetails")
    #ram = cont.select_one("span")
    #[x.extract() for x in ram.select_one('span')]

    with open('stock_market_news_' + name + '.csv', 'w') as text_file:
        text_file.write(str(para))
        text_file.write(str(para_2))
It works well, but it only handles one URL at a time and writes one file per URL. I want to save para and para_2 from each URL in a single CSV file, i.e. two fields per URL on each line:
Text                  Time
para from URL(1)      para_2 from URL(1)
para from URL(2)      para_2 from URL(2)
...
Unfortunately, I don't know how to do this cleanly for a lot of URLs in my case.
You could store all the params in a list and then save the result in your file:
import csv
# ...
# --- read data ---
params = []
for name, url in zip(all_names, all_urls):
    r1 = requests.get(url, headers=headers)
    page = r1.content
    soup = BeautifulSoup(page, 'html5lib')

    results = soup.find('div', class_='WYSIWYG articlePage')
    para = '\n'.join([r.text for r in results.findAll("p")])

    results_2 = soup.find('div', class_='contentSectionDetails')
    para_2 = results_2.findAll("span")[0].text

    params.append([str(para), str(para_2)])

# write one csv file for all URLs, once the loop has finished
with open('stock_market_news.csv', 'w') as text_file:
    wr = csv.writer(text_file, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    wr.writerows(params)
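One optional tweak, taken from the csv module documentation: open the file with newline='' so the writer doesn't insert blank lines between rows on Windows. A minimal sketch of the final write, assuming the same params list:

# newline='' prevents blank lines between rows when csv.writer runs on Windows
with open('stock_market_news.csv', 'w', newline='', encoding='utf-8') as text_file:
    wr = csv.writer(text_file, quoting=csv.QUOTE_ALL)
    wr.writerow(['Text', 'Time'])
    wr.writerows(params)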
Does this answer solve your problem?
Have a nice day!
I would like to get all hrefs that sit within the li elements of a particular ul (see screenshot).
So far I have written this:
import bs4, requests, re

product_pages = []

def get_product_pages(openurl):
    global product_pages
    url = 'https://www.ah.nl/producten/aardappel-groente-fruit'
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    for li in soup.findAll('li', attrs={'class': 'taxonomy-sub-selector_root__3rtWx'}):
        for a in li.findAll('a', href=True):
            print(a.attrs['href'])

get_product_pages('')
But it is only giving me the hrefs from the first three li's. I am wondering why it is only the first three, and how to get all eight.
There is a scroll bar on the page, which might be causing the trouble?
The taxonomies and all other page data are stored inside the page in a <script> tag, so BeautifulSoup doesn't see them in the parsed HTML. To get all child taxonomies of the current category you can use the next example (parsing the <script> tag with re/json):
import re
import json
import requests

base_url = "https://www.ah.nl/producten"
url = base_url + "/aardappel-groente-fruit/fruit"

html_doc = requests.get(url).text

data = re.search(r"window\.__INITIAL_STATE__= ({.*})", html_doc)
data = data.group(1).replace("undefined", "null")
data = json.loads(data)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

taxonomies = {t["id"]: t for t in data["taxonomy"]["topLevel"]}
for t in data["taxonomy"]["taxonomies"]:
    taxonomies[t["id"]] = t

def get_taxonomy(t, current, dupl=None):
    if dupl is None:
        dupl = set()
    tmp = current + "/" + t["slugifiedName"]
    yield tmp
    for c in t["children"]:
        if c in taxonomies and c not in dupl:
            dupl.add(c)
            yield from get_taxonomy(taxonomies[c], tmp, dupl)

for t in taxonomies.values():
    if t["parents"] == [0]:
        for t in get_taxonomy(t, base_url):
            if url in t:  # print only URLs from the current category
                print(t)
Prints:
https://www.ah.nl/producten/aardappel-groente-fruit/fruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/appels
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/appels/groente-en-fruitbox
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/bananen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/sinaasappels-mandarijnen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/peren
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/ananas-mango-kiwi
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/aardbeien-frambozen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/druiven-kersen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/bramen-bessen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/abrikozen-pruimen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/abrikozen-pruimen/exotisch-fruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/perziken-nectarines
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/meloen-kokosnoot
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/grapefruit-minneola
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/citroen-limoen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/fruit-spread
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/vijgen
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/kaki-papaya-cherimoya
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/granaatappel-passiefruit
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/fruitsalade-mix
https://www.ah.nl/producten/aardappel-groente-fruit/fruit/gedroogd-fruit
I have an algorithm that downloads PDF articles with urllib.request and BeautifulSoup (Python 3.6):
import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request

# make a list of all web pages' urls
webpages = []
for i in range(9):
    root_url = 'xxx.com/articles/page' + str(i)
    webpages.append(root_url)

# make a list of PDF links
pdf_links = []
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')
    links = [span.attrs['href'] for span in page_soup.find_all('a', href=True)]
    for link in links:
        link_string = str(link)
        if link_string.endswith('pdf'):
            pdf_links.append(link_string)

# download the files
for pdf_link in pdf_links:
    save_to = os.path.basename(pdf_link.strip())
    urllib.request.urlretrieve(pdf_link.strip(), save_to)
I need to rename each downloaded PDF article with the title of the article, which is stored in a specific div class:
<div class="article article title">
<h2>The Disaster of Deforestation</h2>
</div>
There is a larger div that stores both the article title and the corresponding PDF link:
<div article-id="1741" class="online article_row_view">
    <div class="article article title">
        <h2>The Disaster of Deforestation</h2>
    </div>
    <span class="file-pdf"> PDF</span>
</div>
I have no idea how to automatically rename the files, much less with a specific HTML element. Any help would be appreciated!
Here's a complete solution that walks all pages in the navigation and downloads all pdfs for you:
import requests
from bs4 import BeautifulSoup, Tag, Comment, NavigableString
from pathlib import Path
from urllib.parse import urljoin

BASE_URL = 'https://cross-currents.berkeley.edu/archives'

def make_soup(url: str) -> BeautifulSoup:
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'})
    res.raise_for_status()
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def extract_articles(soup: BeautifulSoup) -> list:
    articles = []
    for result in soup.select('.node--view-mode-search-result'):
        author = result.select_one('.field--name-field-citation-authors').text.strip()
        date = result.select_one('.field--name-field-issue-date').text.strip()
        title = result.select_one('.field-name-node-title').text.strip()
        journal = result.find('em', recursive=False).text.strip()
        pdf_url = result.select_one('a[href*=".pdf"]')['href']
        articles.append({
            'author': author,
            'date': date,
            'title': title,
            'journal': journal,
            'pdf_url': pdf_url,
        })
    return articles

def make_safe_filename(text: str) -> str:
    """convert forbidden chars to underscores"""
    return ''.join(c if (c.isalnum() or c.isspace()) else '_' for c in text).strip('_ ')

def get_next_page_url(soup: BeautifulSoup) -> str:
    try:
        path = soup.select_one('.pager a[rel="next"]')['href']
        return urljoin(BASE_URL, path)
    except (TypeError, KeyError):
        return None

def download_file(url: str, filename: str) -> str:
    with requests.get(url, stream=True) as res, open(filename, 'wb') as f:
        res.raise_for_status()
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return filename

def scrape_archive():
    save_dir = Path(r'd:\downloads')
    save_dir.mkdir(exist_ok=True, parents=True)
    url = 'https://cross-currents.berkeley.edu/archives?author=&title=&type=onlinearticle&issue=All&region=All&page=0'
    while True:
        soup = make_soup(url)
        articles = extract_articles(soup)
        for a in articles:
            pdf_url = a['pdf_url']
            filename = make_safe_filename(a['title'])
            save_path = str(save_dir / (filename + '.pdf'))
            print('Downloading:', a['title'])
            download_file(pdf_url, save_path)
        print('Finished')
        # go to next page if it exists
        next_url = get_next_page_url(soup)
        if not next_url:
            break
        url = next_url
        print('Moving to next page', url)

scrape_archive()
Here I've used only the title to generate the pdf filename, but you can mix and combine the journal, date, author, etc. to generate a better filename, as sketched below.
Also remember to change save_dir to your liking.
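For example, a variation of the download loop inside scrape_archive could look like this. This is an untested sketch that only uses fields the scraper already collects:

# sketch: build the filename from author, date and title instead of the title alone
for a in articles:
    stem = '{} - {} - {}'.format(a['author'], a['date'], a['title'])
    save_path = str(save_dir / (make_safe_filename(stem) + '.pdf'))
    print('Downloading:', stem)
    download_file(a['pdf_url'], save_path)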
I am trying to extract several fields, but the last field raises an error; I want to save all the fields to an Excel sheet.
I have tried using BeautifulSoup to extract it, but it fails to find the element and I get the error below:
Traceback (most recent call last):
  File "C:/Users/acer/AppData/Local/Programs/Python/Python37/agri.py", line 30, in <module>
    specimens = soup2.find('h3',class_='trigger expanded').find_next_sibling('div',class_='collapsefaq-content').text
AttributeError: 'NoneType' object has no attribute 'find_next_sibling'
from bs4 import BeautifulSoup
import requests

page1 = requests.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases')
soup1 = BeautifulSoup(page1.text, 'lxml')

for lis in soup1.find_all('li', class_='flex-item'):
    diseases = lis.find('img').next_sibling
    print("Diseases: " + diseases)
    image_link = lis.find('img')['src']
    print("Image_Link:http://www.agriculture.gov.au" + image_link)
    links = lis.find('a')['href']
    if links.startswith("http://"):
        link = links
    else:
        link = "http://www.agriculture.gov.au" + links
    page2 = requests.get(link)
    soup2 = BeautifulSoup(page2.text, 'lxml')
    try:
        origin = soup2.find('strong', string='Origin: ').next_sibling
        print("Origin: " + origin)
    except:
        pass
    try:
        imported = soup2.find('strong', string='Pathways: ').next_sibling
        print("Imported: " + imported)
    except:
        pass
    specimens = soup2.find('h3', class_='trigger expanded').find_next_sibling('div', class_='collapsefaq-content').text
    print("Specimens: " + specimens)
I want to extract that last field and save all the fields into an Excel sheet using Python. Any help would be appreciated.
Minor typo:
data2,append("Image_Link:http://www.agriculture.gov.au" + image_link)
Should be:
data2.append("Image_Link:http://www.agriculture.gov.au" + image_link) #period instead of a comma
The site seems to want headers to prevent requests being blocked, and there is not a specimens section on every page. The following shows possible handling of the specimen info for each page:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[(item.text.strip(),
                                  base + item.select_one('img')['src'],
                                  item['href'] if 'http' in item['href'] else base + item['href'])
                                 for item in soup.select('.flex-item > a')])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        if soup.select_one('.trigger'):  # could also use: if soup.select_one('.trigger:nth-of-type(3) + div'):
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        else:
            info = 'None'
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
df.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig', index=False)
I have run the above lots of times without a problem; however, you can always switch my current test to a try/except block:
from bs4 import BeautifulSoup
import requests
import pandas as pd

base = 'http://www.agriculture.gov.au'
headers = {'User-Agent': 'Mozilla/5.0'}
specimens = []

with requests.Session() as s:
    r = s.get('http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    names, images, links = zip(*[(item.text.strip(),
                                  base + item.select_one('img')['src'],
                                  item['href'] if 'http' in item['href'] else base + item['href'])
                                 for item in soup.select('.flex-item > a')])
    for link in links:
        r = s.get(link)
        soup = BeautifulSoup(r.content, 'lxml')
        try:
            info = soup.select_one('.trigger:nth-of-type(3) + div').text
        except:
            info = 'None'
            print(link)
        specimens.append(info)

df = pd.DataFrame([names, images, links, specimens])
df = df.transpose()
df.columns = ['names', 'image_link', 'link', 'specimen']
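Since the goal is an Excel sheet, you can also write the frame with to_excel instead of to_csv, assuming the openpyxl package is installed; the path here is just an example:

# write an .xlsx workbook instead of a csv (pandas uses openpyxl for .xlsx files)
df.to_excel(r"C:\Users\User\Desktop\Data.xlsx", index=False)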
I'm getting the wrong text when I scrape this url:
http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018
This is what I have:
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# Define year
year_number = 2018

# Define the URL
i = range(0, 1)

names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1

    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')

    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # game_publishers = html_soup.find_all("ul", class_='more_stats')
    # game_ratings = html_soup.find_all("ul", class_='more_stats')
    # game_genres = html_soup.find_all("ul", class_='more_stats')

    # Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())

    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())

    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())

    for games4 in game_users:
        game_user = games4.find()
        userscores.append(game_user.text.strip())

    # print(name)
    # print(metascore)
    # print(userscore)

# for i in userscores:
#     temp = str(i)
#     temp2 = temp.replace("User:\n ", "")
#     userscoresNew.append(temp2)

for x in release_dates:
    temp = str(x)
    temp2 = temp.replace("Release Date:\n ", "")
    release_datesNew.append(temp2)

# df = pd.DataFrame({'Games:': names,
#                    'Metascore:': metascores,
#                    'Userscore:': userscoresNew})
# df.to_csv("metacritic scrape.csv")
The above is looking for the user score, but I get the text "User Score:" repeated 100 times, when what I want is the data in the next set of tags. However, when I try to change the variable above to:
game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')
I get an error when I run the code:
AttributeError: 'NoneType' object has no attribute 'text'
Also, I don't think the second option is a good approach, because when the user score falls below a certain level the class in the HTML changes (from "data textscore textscore_favorable" to "data textscore textscore_mixed").
Any help would be appreciated.
FYI, I'm modifying code I have already written, but grabbing more details from a more detailed view.
This should help.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"

html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")

game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    userScore = i.find('span', class_="data textscore textscore_favorable")
    if userScore:
        print(userScore.text)
Output:
7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
Use html_soup.find_all("li", class_='stat product_avguserscore') to get the score.
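Since the class on the score span changes with its value (textscore_favorable, textscore_mixed, ...), as noted in the question, a slightly more robust variant is to match the span by its stable "data" class instead of the full class string. This is only an untested sketch of that idea:

# sketch: match on the stable 'data' class so favorable, mixed and negative scores are all caught
for li in html_soup.find_all("li", class_="stat product_avguserscore"):
    score_span = li.find("span", class_="data")  # class_ matches any element carrying this CSS class
    print(score_span.text if score_span else "tbd")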
The following code retrieves image links and item text from a webpage and stores the values in lists. I am now using pandas to create an output HTML table for those images and links. I have managed to populate cells manually by referencing a specific index value, but I can't find a way to add every indexed image and text entry to the pandas DataFrame so that all the values are displayed in the table. How could I do this?
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas import *
import pandas as pd
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)
from IPython.display import HTML

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

####################################

title_clean = soup.find('title')
print(title_clean)

image_links = [x['data-img'] for x in soup.find_all('a', rel='popover')]
for link in image_links:
    print(link)

image_links_0 = image_links[0]
print(image_links_0)

mytags = []
tags = soup.find_all('td', width='41%')
for tag in tags:
    image_text = tag.find('h5').text
    mytags.append(image_text)
    print(image_text)

for i in range(len(mytags)):
    mytags[i]

mytags_0 = mytags[0]
image_links_0 = image_links[0]

#df = DataFrame({'foo1' : 'test',
df = DataFrame({'foo1': '<img src="' + image_links_0 + '"/><p>' + mytags_0 + '</p>',
                'foo2': '' + mytags_0 + '',
                'foo3': mytags_0,
                'foo4': np.random.randn(2)})

print(df)
HTML(df.to_html('filename.html', escape=False))
print(tag)
This is the correct way to do it. If you need any help with storing it and making HTML out of it, I'll be happy to provide a solution for that as well. Take care!
Update: the code below now includes everything: comments, scraping, writing to a file, and creating tags with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
mozila_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {'User-Agent': mozila_agent}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []

for tr in the_whole_table.find_all('tr')[1:]:
    # you want to start from the 1st item, not the 0th, so [1:]
    # because the first row is the thead, i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])
    # for the picture do ... print(picture_link); for partial text, only the first 20
    # characters
    # each of these, if you print them, gives you information that you can store

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

theads = BeautifulSoup('<table border="1"></table>', 'lxml')
thekeys = BeautifulSoup('<thead></thead>', 'html.parser')

for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)

theads.table.append(thekeys)

###############################################################
# The code above will initiate a table
# after that the for loop will create and populate the first row (thead)

for i in datalist:
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')
    # we loop through the data we collected
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, height='50', width='50')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.tr.append(td_tag)
        else:
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.tr.append(tag)
    theads.table.append(thedata)

with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())
    # we use `.prettify()` as we can't write a BeautifulSoup object into a file.
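Since the question originally asked about pandas: a rough, untested sketch of the same output using a DataFrame and to_html could reuse the datalist built above and keep escape=False so the <img> tags are rendered. The variable name table_df and the output filename are my own:

import pandas as pd

# sketch: one row per lot, with the picture wrapped in an <img> tag so it renders in the HTML table
rows = [{'Index Number': idx,
         'Picture': '<img src="{}" height="50" width="50"/>'.format(pic),
         'Informational text': text,
         'Current BID': bid,
         'Time Left now': left}
        for idx, pic, text, bid, left in datalist]

table_df = pd.DataFrame(rows)
table_df.to_html('asdf_pandas.html', escape=False, index=False)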