How to write a css/xpath for dynamically changing element? - html

I am using beautiful soup and below is my selector to scrape href.
from bs4 import BeautifulSoup

# Sample anchor from the page. The class list is the part the site keeps
# changing, which is why matching on it is brittle.
html = (
    ''' <a data-testid="Link" class="sc-pciXn eUevWj '''
    '''JobTile___StyledJobLink-sc-1nulpkp-0 gkKKqP '''
    '''JobTile___StyledJobLink-sc-1nulpkp-0 gkKKqP" '''
    '''href="https://join.com/companies/talpasolutions/'''
    '''4978529-project-customer-success-manager-heavy-industries-d-f-m">'''
)
soup = BeautifulSoup(html, "lxml")

# A multi-class string passed to class_ is compared against the exact
# value of the class attribute, so it has to match character for character.
job_link_classes = (
    "sc-pciXn eUevWj JobTile___StyledJobLink-sc-1nulpkp-0 "
    "gkKKqP JobTile___StyledJobLink-sc-1nulpkp-0 gkKKqP"
)
jobs = soup.find_all("a", class_=job_link_classes)
for job in jobs:
    job_url = job.get("href")
I am using find_all because there is a total of 3 elements with hrefs.
Above method is working but the website keeps changing the classes on a daily basis. I need a different way to design CSS/XPath

Try:
import requests
from bs4 import BeautifulSoup

url = "https://join.com/companies/talpasolutions"
soup = BeautifulSoup(requests.get(url).content, "lxml")

# Select on structure rather than class names: every <a> that contains
# an <h3>. This does not depend on the class attribute at all.
for a in soup.select("a:has(h3)"):
    print(a.get("href"))
Prints:
https://join.com/companies/talpasolutions/4978529-project-customer-success-manager-heavy-industries-d-f-m
https://join.com/companies/talpasolutions/4925936-senior-data-engineer-d-f-m
https://join.com/companies/talpasolutions/4926107-senior-data-scientist-d-f-m

Related

Extract text from href (BeautifulSoup)

I want to extract the TEXT from this HTML element:
<a href="mailto:mail#1st-architects.com">mail#1st-architects.com</a>
all_profiles.find("a", {"???":"???"}).get_text(strip=True)
Consider that I have a list of 1000 companies, and each company has a different href (for example, href="mailto:mail#1st-architects.com").
You could combine attribute = value css selector using starts with ^ and ends with $ operators to match on hrefs with specified substrings
emails = [i.text for i in all_profiles.select("[href^=mailto][href$='#1st-architects.com']")]
You could try something like this.
This code will print the text of all <a> with href as an email.
import re
from bs4 import BeautifulSoup

# NOTE(review): the <a> markup in this sample was lost in extraction and
# has been reconstructed from the surrounding text; the href on the
# "Some Link" anchor is a placeholder for any non-mailto link.
s = '''
<a href="mailto:mail#1st-architects.com">mail#1st-architects.com</a>
<a href="mailto:second_mail#2nd-architects.com">second_mail#2nd-architects.com</a>
<a href="https://www.example.com">Some Link</a>
<a href="mailto:mail#example.com">mail#example.com</a>
'''
soup = BeautifulSoup(s, 'lxml')

# Keep only the anchors whose href starts with "mailto:".
a = soup.find_all('a', attrs={'href': re.compile(r'^mailto:')})
for i in a:
    print(i.text.strip())
# Output:
mail#1st-architects.com
second_mail#2nd-architects.com
mail#example.com

beautiful soup unable to find elements from website

It's my first time working with web scraping, so cut me some slack. I'm trying to pull the "card_tag" from a website. I triple-checked that the card tag is inside the respective tags, as seen in the code.
import requests
from bs4 import BeautifulSoup

# Asker's original attempt, kept as posted apart from restored indentation.
result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")
urls = []
for div_tag in soup.find_all('div id="siteContainer"'):
    ul_tag = div_tag.find("ul class")
    li_tag = ul_tag.find("li")
    card_tag = li_tag.find("h3")
    urls.append(card_tag)
print(urls)
When I go to print the url list it outputs nothing. You can see the thing I'm looking for by visiting the link as seen in the code and inspecting element on "Blood-C". As you can see it's listed in the tag I'm trying to find, yet my code can't seem to find it.
Any help would be much appreciated.
You just need to make some minor syntax changes to how the tags and attributes are passed.
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")
urls = []

# The tag name and the attribute filter are separate arguments.
containers = soup.find_all('div', {'id': 'siteContainer'})
for div_tag in containers:
    ul_tag = div_tag.find("ul", {'data-type': 'anime'})
    # find_all returns every <li>, not just the first one.
    li_tag = ul_tag.find_all("li")
    for each in li_tag:
        card_tag = each.find("h3")
        urls.append(card_tag)
        print(card_tag)
Also, you could just skip all that and go straight to those <h3> tags with the class attribute cardName:
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.anime-planet.com/users/mistersenpai/anime/dropped")
src = result.content
soup = BeautifulSoup(src, features="html.parser")
urls = []

# Go straight to the <h3 class="cardName"> tags instead of walking
# down from the container.
for card_tag in soup.find_all('h3', {'class': 'cardName'}):
    print(card_tag)
    urls.append(card_tag)
Output:
<h3 class="cardName">Black Butler</h3>
<h3 class="cardName">Blood-C</h3>
<h3 class="cardName">Place to Place</h3>

Having trouble finding Span tag (Python 3)

I'm trying to strip out the Span tags from a html file.
I am using a page which has a lot of Span tags in it. I need to extract some numbers and add them together. However, I can't even get the lines I need out, so I am hoping someone can offer some advice.
My code is below:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# The URL must be assigned before urlopen is called; with this line
# commented out the script stops with a NameError.
url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
# print(soup)

spans = soup.find_all('span')
for span in spans:
    # print is a function in Python 3; "print span" is a SyntaxError.
    print(span)
Thanks

Python3.5 BeautifulSoup4 get text from 'p' in div

I am trying to pull all the text from the div class 'caselawcontent searchable-content'. This code just prints the HTML without the text from the web page. What am I missing to get the text?
The following link is in the 'filteredcasesdoc.txt' file:
http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html
import requests
from bs4 import BeautifulSoup

# Asker's original attempt, kept as posted apart from restored indentation.
with open('filteredcasesdoc.txt', 'r') as openfile1:
    for line in openfile1:
        rulingpage = requests.get(line).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print (doctext)
from bs4 import BeautifulSoup
import requests

url = 'http://caselaw.findlaw.com/mo-court-of-appeals/1021163.html'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# I've added a much more reliable .find method (key: value)
whole_section = soup.find('div', {'class': 'caselawcontent searchable-content'})

the_title = whole_section.center.h2
# e.g. Missouri Court of Appeals,Southern District,Division Two.
second_title = whole_section.center.h3.p
# e.g. STATE of Missouri, Plaintiff-Appellant v....
number_text = whole_section.center.h3.next_sibling.next_sibling
# e.g.
the_date = number_text.next_sibling.next_sibling
# authors
authors = whole_section.center.next_sibling
para = whole_section.findAll('p')[1:]
# Because we don't want the paragraph h3.p.
# We could also do findAll('p', recursive=False), which doesn't pick up children.

# Basically, I've dissected this whole tree;
# as for the paragraphs (e.g. main text, the var para), you'll have to loop.
print(authors)
# You can add .text (e.g. print(authors.text)) to get the text without the tag,
# or use a simple function that returns only the text:


def rettext(something):
    """Return the text content of *something* without its tags."""
    return something.text

# Usage: print(rettext(authors))
Try printing doctext.text. This will get rid of all the HTML tags for you.
import requests
from bs4 import BeautifulSoup

cases = []
with open('filteredcasesdoc.txt', 'r') as openfile1:
    for url in openfile1:
        # Lines read from a file keep their trailing newline; strip it so
        # it is not sent as part of the URL, and skip blank lines.
        url = url.strip()
        if not url:
            continue
        # GET the HTML page as a string, with HTML tags
        rulingpage = requests.get(url).text
        soup = BeautifulSoup(rulingpage, 'html.parser')
        # find the part of the HTML page we want, as an HTML element
        doctext = soup.find('div', class_='caselawcontent searchable-content')
        print(doctext.text)  # .text is the element's text with the tags removed
        cases.append(doctext.text)  # do something useful with this !

Can't seem to scrape the website "Forbes" properly

I'm trying to scrape the links and titles of the articles on the frontpage of the website https://www.forbes.com/ .
I'm not proficient in HTML, but I've been following some Beautiful Soup tutorials and have been getting by with the knowledge I'm picking up along the way.
Here is what I have so far:
# Asker's original attempt, kept as posted apart from restored indentation.
source = urllib.request.urlopen('https://www.forbes.com').read()
soup = bs.BeautifulSoup(source,'lxml') # Tried 'html.parser' as well
##print(soup.findAll('div',{'class':"c-entry-box--compact c-entry-box--compact--article"}))
for url in soup.findAll('a',{'class':"exit_trigger_set"}):
    print (url.get('href'))
Inspecting the site's html, I seem to have the class and 'a' (not sure what you call 'a' in this case) correct.
However, instead of getting all the links of the articles on the frontpage, I'm only getting one.
https://www.amazon.com/Intelligent-REIT-Investor-Wealth-Investment/dp/1119252717
Not sure what I'm doing wrong.
Thank you.
EDIT:
This seems to find some of the top stories but I don't know how to pull out the links only
# Print every <h4> whose class attribute is "editable editable-hed".
for i in soup.findAll('h4', {'class': "editable editable-hed"}):
    print (i)
Here's how I would do it:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

# urllib2 exists only on Python 2; urllib.request is the Python 3
# equivalent and matches what the question already uses.
source = urllib.request.urlopen('https://www.forbes.com')
soup = BeautifulSoup(source, 'lxml')

lst = []
for i in soup.findAll('h4', {'class': "editable editable-hed"}):
    title = i.text
    # Drop the first two characters of the href -- presumably the leading
    # "//" of a protocol-relative URL; verify against the live markup.
    link = i.find('a')['href'][2:]
    # Remove the layout whitespace around and inside the headline text.
    title = title.replace('\t', '')
    title = title.replace('\n', '')
    title = title.strip()
    lst.append({'title': title, 'link': link})

# One row per article, with "title" and "link" columns.
df = pd.DataFrame.from_dict(lst)
And you get 15 articles and their links.