Parsing HTML and writing PDFs to disk (Python)
My goal is to write a script that downloads all the PDF files from a user-entered site.
Problem 1: The code does not return the anchor tags located inside the iframe. I tried explicitly using the iframe tag name and then calling .contents, but the command returns an empty list.
Question 1: How do I parse the iframe? Why doesn't iframe.contents return its children, i.e. the <a> tags?
Problem 2: Writing the PDFs to disk appears to succeed, but when I attempt to open the files I get the following error:
"....could not open...because it is either not a supported file type or because the file has been damaged (for example, it was sent as an email...and wasn't correctly decoded)."
Question 2: Has anybody encountered this before?
The code is split into two blocks, one for each problem; delete the triple quotes around a block to run it.
Lastly, if anyone can explain why the two URLs don't match in the first block of code, that would be awesome. The code is commented and contains the URLs for each question. Thanks!
Python code:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# initializing counters
slide = 1
count = 0

# ignore SSL cert errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# get user url and create soup object
url = input("Enter the website name: ")
connect = urllib.request.urlopen(url, context=ctx)
soup = BeautifulSoup(connect, 'html.parser')

######## code block for question 1 revolving around parsing iframes and the issues with the
######## mismatching urls
# url used for code block 1: https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html
"""
# trying to retrieve all anchor tags; doesn't print the anchor tags within the iframe
tags = soup('a')
for tag in tags:
    print(tag)
print('\n')

# explicitly asking for the iframe tag
iframe = soup.iframe
# the url printed on this line doesn't match the url printed once I get the src attribute
# navigating to the url listed here is what I use for the second block of code because it
# isn't an iframe
print(iframe)
iframe_src_url = iframe['src']
# this url doesn't match the one shown in the previous print statement and it leaves you dealing
# with another iframe
print(iframe_src_url)
"""

######## code block for question 2 where I enter the url found in the iframe src attribute
# url for block 2: https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false
"""
tags = soup('a')
# iterate through tags, retrieve href addresses, navigate to the document, write data to file
for tag in tags:
    doc_url = tag.get('href')
    file = urllib.request.urlopen(doc_url, context=ctx)
    file = open("Week " + str(slide) + " slides.pdf", 'wb')
    file.write(connect.read())
    file.close()
    print("Finished file: ", slide)
    count = count + 1
    slide = slide + 1
print("Total files downloaded: ", count)
"""
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html')
soup = BeautifulSoup(r.content, 'html.parser')

for item in soup.findAll('iframe'):
    print(item.get('src'))
Output:
https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false
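The reason iframe.contents comes back empty is that an iframe's document lives at its src URL and is only loaded when a browser renders the page; the static HTML of the index page contains nothing but the bare <iframe> tag. A minimal, self-contained sketch of following the src yourself (same pages as above) would be:

import requests
from bs4 import BeautifulSoup

# The iframe element in the index page has no children of its own; its content is a
# separate document at the src URL, so fetch and parse that document directly.
r = requests.get('https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html')
soup = BeautifulSoup(r.content, 'html.parser')
inner = requests.get(soup.find('iframe')['src'])
inner_soup = BeautifulSoup(inner.content, 'html.parser')
print(len(inner_soup.find_all('a')))  # the anchor tags are visible in the inner document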
And regarding the second question:
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vRF408HaDlR6Q9fx6WF6YzeNrZIkXZBqwz_qyN8hz8N4rhIrcpc_GWNMrCODVmucMEUhXIElxcXyDpY/pubhtml?gid=0&single=true&widget=true&headers=false')
soup = BeautifulSoup(r.content, 'html.parser')

links = []
for item in soup.findAll('a', {'rel': 'noreferrer'}):
    links.append(item.get('href'))

for item in links:
    r = requests.get(item)
    source = r.headers.get('Location')
    print(f"Saving File {source[56:]}")
    r1 = requests.get(source)
    with open(f"{source[56:]}", 'wb') as f:
        f.write(r1.content)

print(f"\nTotal File Downloaded is {len(links)}")
Output (the files are saved to your local disk):
Saving File 01-intro-logistics.pdf
Saving File 02-data.pdf
Saving File 03-preprocessing.pdf
Saving File 03-preprocessing.pdf
Saving File 04-frequent-patterns.pdf
Saving File 05a-supervised.pdf
Saving File 05b-supervised.pdf
Saving File 05c-supervised.pdf
Saving File 06a-supervised-advanced.pdf
Saving File 06b-supervised-advanced.pdf
Saving File 07a-unsupervised.pdf
Saving File 07b-unsupervised.pdf
Saving File 07c-advanced-unsupervised.pdf
Saving File 08-graph-mining.pdf
Saving File 09-anomaly-detection.pdf
Saving File 10-time-series.pdf
Total File Downloaded is 16
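The source[56:] slice simply drops the fixed-length prefix of the redirect URL to leave the bare filename. A sketch of a less position-dependent way to get the same name, assuming the redirect target ends in the filename, would be:

import os
from urllib.parse import urlparse

# Take the last path component of the redirect URL instead of relying on a
# hard-coded character offset such as source[56:].
filename = os.path.basename(urlparse(source).path)  # e.g. '01-intro-logistics.pdf'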
Full Version:
import requests
from bs4 import BeautifulSoup
import html


def Get_Links():
    links = set()
    r = requests.get(
        'https://www.cs.ucr.edu/~epapalex/teaching/235_F19/index.html')
    soup = BeautifulSoup(r.text, 'html.parser')
    source = html.escape(soup.find('iframe').get('src'))
    r = requests.get(source)
    soup = BeautifulSoup(r.text, 'html.parser')
    for item in soup.findAll('a', {'rel': 'noreferrer'}):
        links.add(item.get('href'))
    return links, len(links)


def Save_Items():
    items, size = Get_Links()
    for item in items:
        r = requests.get(item)
        source = r.headers.get('Location')
        print(f"Saving File {source[56:]}")
        r = requests.get(source)
        with open(f"{source[56:]}", 'wb') as f:
            f.write(r.content)
    print(f"\nTotal File Downloaded is {size}")


Save_Items()
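As for why the original script's PDFs come out damaged (Question 2): the loop opens doc_url but then immediately rebinds file to the output handle, and it writes connect.read(), which is the index-page response that BeautifulSoup has already consumed, so every file ends up empty or holding the wrong bytes. A minimal sketch of a corrected loop, keeping the original variable names and assuming each href resolves directly to a PDF (Google Drive links may still need redirect handling), would be:

# Sketch of a fixed download loop; assumes tags, ctx, slide and count
# from the question's setup code are already defined.
for tag in tags:
    doc_url = tag.get('href')
    response = urllib.request.urlopen(doc_url, context=ctx)  # fetch the PDF itself
    with open("Week " + str(slide) + " slides.pdf", 'wb') as out:
        out.write(response.read())  # write the downloaded bytes, not connect.read()
    print("Finished file: ", slide)
    count = count + 1
    slide = slide + 1
print("Total files downloaded: ", count)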
Related
How to scrape only texts from specific HTML elements?
I have a problem with selecting the appropriate items from the list. For example, I want to omit the "1." and then the first "5" (as in the example). Additionally, I would like to write a condition that changes the letter "W" to "WIN".

import re
from selenium import webdriver
from bs4 import BeautifulSoup as BS2
from time import sleep

driver = webdriver.Chrome()
driver.get("https://www.flashscore.pl/druzyna/ajax/8UOvIwnb/tabela/")
sleep(10)
page = driver.page_source
soup = BS2(page, 'html.parser')

content = soup.find('div', {'class': 'ui-table__body'})
content_list = content.find_all('span', {"table__cell table__cell--value"})

res = []
for i in content:
    line = i.text.split()[0]
    if re.search('Ajax', line):
        res.append(line)
print(res)

Results:

['1.Ajax550016:315?WWWWW']

I need:

Ajax;5;5;0;16;3;W;W;W;W;W
I would recommend selecting your elements more specifically:

for e in soup.select('.ui-table__row'):

Iterate the ResultSet and decompose() the unwanted tag:

e.select_one('.wld--tbd').decompose()

Extract the texts with stripped_strings and join() them into your expected string:

data.append(';'.join(e.stripped_strings))

Example

Also making some replacements based on a dict, just to demonstrate how this would work, not knowing what R or P stand for.

...
soup = BS2(page, 'html.parser')

data = []
for e in soup.select('.ui-table__row'):
    e.select_one('.wld--tbd').decompose()
    e.select_one('.tableCellRank').decompose()
    e.select_one('.table__cell--points').decompose()
    e.select_one('.table__cell--score').string = ';'.join(e.select_one('.table__cell--score').text.split(':'))
    pattern = {'W': 'WIN', 'R': 'RRR', 'P': 'PPP'}
    data.append(';'.join([pattern.get(i, i) for i in e.stripped_strings]))
data

To only get the result for Ajax:

data = []
for e in soup.select('.ui-table__row:-soup-contains("Ajax")'):
    e.select_one('.wld--tbd').decompose()
    e.select_one('.tableCellRank').decompose()
    e.select_one('.table__cell--points').decompose()
    e.select_one('.table__cell--score').string = ';'.join(e.select_one('.table__cell--score').text.split(':'))
    pattern = {'W': 'WIN', 'R': 'RRR', 'P': 'PPP'}
    data.append(';'.join([pattern.get(i, i) for i in e.stripped_strings]))
data

Output

Based on the actual data, it may differ from the question's example.

['Ajax;6;6;0;0;21;3;WIN;WIN;WIN;WIN;WIN']
You had the right start by using bs4 to find the table div, but then you gave up and just tried to use re to extract from the text; as you can see, that's not going to work. Here is a simple way to hack it and get what you want: I keep grabbing divs from the table div you found, and then grab the text of the next eight divs after finding Ajax. Then I do some dirty string manipulation, because the WWWWW is all in the same top-level div.

import re
from selenium import webdriver
from bs4 import BeautifulSoup as BS2
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
#driver = webdriver.Chrome()
driver.get("https://www.flashscore.pl/druzyna/ajax/8UOvIwnb/tabela/")
driver.implicitly_wait(10)
page = driver.page_source
soup = BS2(page, 'html.parser')

content = soup.find('div', {'class': 'ui-table__body'})
content_list = content.find_all('span', {"table__cell table__cell--value"})

res = []
found = 0
for i in content.find('div'):
    line = i.text.split()[0]
    if re.search('Ajax', line):
        found = 8
    if found:
        found -= 1
        res.append(line)

# change field 5 into separate values and skip field 6
res = res[:4] + res[5].split(':') + res[7:]
# break the last field into separate values and drop the first '?'
res = res[:-1] + [i for i in res[-1]][1:]
print(";".join(res))

returns

Ajax;5;5;0;16;3;W;W;W;W;W

This works, but it is very brittle and will break as soon as the website changes its content; you should put in a lot of error checking. I also replaced the sleep with a wait call and added webdriver_manager, which allows me to use Selenium with Chrome.
How can I download a link from Yahoo Finance in BeautifulSoup?
Currently I'm trying to automatically scrape/download Yahoo Finance historical data. I plan to download the data using the download link provided on the website. My code lists all the available links and works from there; the problem is that the exact link doesn't appear in the result. Here is my code (partial):

def scrape_page(url, header):
    page = requests.get(url, headers=header)
    if page.status_code == 200:
        soup = bs.BeautifulSoup(page.content, 'html.parser')
        return soup
    return null

if __name__ == '__main__':
    symbol = 'GOOGL'
    dt_start = datetime.today() - timedelta(days=(365*5+1))
    dt_end = datetime.today()
    start = format_date(dt_start)
    end = format_date(dt_end)
    sub = subdomain(symbol, start, end)
    header = header_function(sub)
    base_url = 'https://finance.yahoo.com'
    url = base_url + sub
    soup = scrape_page(url, header)
    result = soup.find_all('a')
    for a in result:
        print('URL :', a['href'])

UPDATE 10/9/2020: I managed to find the span which is the parent of the link with this code:

spans = soup.find_all('span', {"class": "Fl(end) Pos(r) T(-6px)"})

However, when I print it out, it does not show the link. Here is the output:

>>> spans
[<span class="Fl(end) Pos(r) T(-6px)" data-reactid="31"></span>]
To download the historical data in CSV format from Yahoo Finance, you can use this example:

import requests
from datetime import datetime

csv_link = 'https://query1.finance.yahoo.com/v7/finance/download/{quote}?period1={from_}&period2={to_}&interval=1d&events=history'

quote = 'GOOGL'
from_ = str(datetime.timestamp(datetime(2019,9,27,0,0))).split('.')[0]
to_ = str(datetime.timestamp(datetime(2020,9,27,23,59))).split('.')[0]

print(requests.get(csv_link.format(quote=quote, from_=from_, to_=to_)).text)

Prints:

Date,Open,High,Low,Close,Adj Close,Volume
2019-09-27,1242.829956,1244.989990,1215.199951,1225.949951,1225.949951,1706100
2019-09-30,1220.599976,1227.410034,1213.420044,1221.140015,1221.140015,1223500
2019-10-01,1222.489990,1232.859985,1205.550049,1206.000000,1206.000000,1225200
2019-10-02,1196.500000,1198.760010,1172.630005,1177.920044,1177.920044,1651500
2019-10-03,1183.339966,1191.000000,1163.140015,1189.430054,1189.430054,1418400
2019-10-04,1194.290039,1212.459961,1190.969971,1210.959961,1210.959961,1214100
2019-10-07,1207.000000,1218.910034,1204.359985,1208.250000,1208.250000,852000
2019-10-08,1198.770020,1206.869995,1189.479980,1190.130005,1190.130005,1004300
2019-10-09,1201.329956,1208.459961,1198.119995,1202.400024,1202.400024,797400
2019-10-10,1198.599976,1215.619995,1197.859985,1209.469971,1209.469971,642100
2019-10-11,1224.030029,1228.750000,1213.640015,1215.709961,1215.709961,1116500
2019-10-14,1213.890015,1225.880005,1211.880005,1217.770020,1217.770020,664800
2019-10-15,1221.500000,1247.130005,1220.920044,1242.239990,1242.239990,1379200
2019-10-16,1241.810059,1254.189941,1238.530029,1243.000000,1243.000000,1149300
2019-10-17,1251.400024,1263.750000,1249.869995,1252.800049,1252.800049,1047900
2019-10-18,1254.689941,1258.109985,1240.140015,1244.410034,1244.410034,1581200
2019-10-21,1248.699951,1253.510010,1239.989990,1244.280029,1244.280029,904700
2019-10-22,1244.479980,1248.729980,1239.849976,1241.199951,1241.199951,1143100
2019-10-23,1240.209961,1258.040039,1240.209961,1257.630005,1257.630005,1064100
2019-10-24,1259.109985,1262.900024,1252.349976,1259.109985,1259.109985,1011200

...and so on.
I figured it out. That link is generated by JavaScript, and the requests.get() method won't work on dynamic content, so I switched to Selenium to download that link.
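A minimal sketch of that Selenium route (the history-page URL and the "Download" link text are assumptions about Yahoo Finance's layout, not taken from this thread) might look like:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Render the page with a real browser so the JavaScript-generated link exists,
# then read its href. The URL and link text below are assumptions.
driver = webdriver.Chrome()
driver.get("https://finance.yahoo.com/quote/GOOGL/history")
time.sleep(5)  # crude wait for the dynamic content to render
link = driver.find_element(By.LINK_TEXT, "Download")
print(link.get_attribute("href"))  # the CSV download URL
driver.quit()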
Beautiful Soup unable to find elements from website
It's my first time working with web scraping, so cut me some slack. I'm trying to pull the "card_tag" from a website. I triple-checked that the card tag is inside its respective tags, as seen in the code.

import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")

urls = []
for div_tag in soup.find_all('div id="siteContainer"'):
    ul_tag = div_tag.find("ul class")
    li_tag = ul_tag.find("li")
    card_tag = li_tag.find("h3")
    urls.append(card_tag)
print(urls)

When I go to print the urls list, it outputs nothing. You can see the thing I'm looking for by visiting the link in the code and inspecting element on "Blood-C". As you can see, it's listed in the tag I'm trying to find, yet my code can't seem to find it. Any help would be much appreciated.
You just need to change some minor syntax in the tags and attributes:

import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")

urls = []
containers = soup.find_all('div', {'id': 'siteContainer'})
for div_tag in containers:
    ul_tag = div_tag.find("ul", {'data-type': 'anime'})
    li_tag = ul_tag.find_all("li")
    for each in li_tag:
        card_tag = each.find("h3")
        urls.append(card_tag)
        print(card_tag)

Also, you could just skip all that and go straight to those <h3> tags with the class attribute cardName:

import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")

urls = []
for card_tag in soup.find_all('h3', {'class': 'cardName'}):
    print(card_tag)
    urls.append(card_tag)

Output:

<h3 class="cardName">Black Butler</h3>
<h3 class="cardName">Blood-C</h3>
<h3 class="cardName">Place to Place</h3>
Having trouble finding Span tag (Python 3)
I'm trying to strip out the span tags from an HTML file. I am using a page which has a lot of span tags in it. I need to extract some numbers and add them together. However, I can't even get the lines I need out, so I am hoping someone can offer some advice. My code is below:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
# print(soup)

spans = soup.findAll('span')
for span in spans:
    print(span)

Thanks
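A minimal sketch of the usual way to finish the task described above, assuming each relevant span's text is a plain integer, is to loop over the spans, convert the text, and keep a running total:

# Sum the integer contents of every span tag, continuing from the soup object above.
total = 0
counted = 0
for span in soup.findAll('span'):
    text = span.text.strip()
    if text.lstrip('-').isdigit():  # skip spans that don't hold a plain number
        total += int(text)
        counted += 1
print('Count', counted)
print('Sum', total)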
Python3.5 BeautifulSoup4 get text from 'p' in div
I am trying to pull all the text from the div class 'caselawcontent searchable-content'. This code just prints the HTML, not the readable text from the web page. What am I missing to get the text? The following link is in the 'filteredcasesdoc.txt' file: http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html

import requests
from bs4 import BeautifulSoup

with open('filteredcasesdoc.txt', 'r') as openfile1:
    for line in openfile1:
        rulingpage = requests.get(line).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print(doctext)
from bs4 import BeautifulSoup
import requests

url = 'http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

I've added a much more reliable .find method (key: value):

whole_section = soup.find('div', {'class': 'caselawcontent searchable-content'})

the_title = whole_section.center.h2  # e.g. Missouri Court of Appeals, Southern District, Division Two.
second_title = whole_section.center.h3.p  # e.g. STATE of Missouri, Plaintiff-Appellant v....
number_text = whole_section.center.h3.next_sibling.next_sibling  # e.g.
the_date = number_text.next_sibling.next_sibling
# authors
authors = whole_section.center.next_sibling
para = whole_section.findAll('p')[1:]  # Because we don't want the paragraph h3.p.
# we could also do findAll('p', recursive=False), which doesn't pick up children

Basically, I've dissected this whole tree. As for the paragraphs (e.g. the main text, the var para), you'll have to loop:

print(authors)
# and you can add .text (e.g. print(authors.text)) to get the text without the tag.

# or a simple function that returns only the text
def rettext(something):
    return something.text

# Usage:
print(rettext(authors))
Try printing doctext.text. This will get rid of all the HTML tags for you.

from bs4 import BeautifulSoup
import requests

cases = []
with open('filteredcasesdoc.txt', 'r') as openfile1:
    for url in openfile1:
        # GET the HTML page as a string, with HTML tags
        rulingpage = requests.get(url).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        # find the part of the HTML page we want, as an HTML element
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print(doctext.text)
        # now we have the inner text as a string
        cases.append(doctext.text)  # do something useful with this!