I have a website in the following format:
<html lang="en">
<head>
#anything
</head>
<body>
<div id="div1">
<div id="div2">
<div class="class1">
#something
</div>
<div class="class2">
#something
</div>
<div class="class3">
<div class="sub-class1">
<div id="statHolder">
<div class="Class 1 of 15">
"Name"
<b>Bob</b>
</div>
<div class="Class 2 of 15">
"Age"
<b>24</b>
</div>
# Here are 15 of these kinds
</div>
</div>
</div>
</div>
</div>
</body>
</html>
I want to retrieve all the content in those 15 classes. How do I do that?
Edit:
My Current Approach:
import requests
from bs4 import BeautifulSoup
url = 'my-url-here'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
name_box = soup.findAll('div', {"id": "div1"}) #I dont know what to do after this
Expected Output:
Name: Bob
Age: 24
#All 15 entries like this
I am using BeautifulSoup4 for this.
Is there any direct way to get all the contents in <div id="stats">?
Based on the HTML above, you can try it this way:
import requests
from bs4 import BeautifulSoup

# Collect the label/value pairs held in <div id="statHolder"> into one dict.
result = {}
url = 'my-url-here'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

stats = soup.find('div', {'id': 'statHolder'})
for data in stats.find_all('div'):
    # Each div's text is the quoted label followed by the <b> value. Split on
    # the first whitespace run only, so a multi-word value stays in one piece
    # instead of raising "too many values to unpack".
    key, value = data.text.strip().split(maxsplit=1)
    result[key.replace('"', '')] = value

print(result)
# Prints:
# {'Name': 'Bob', 'Age': '24'}

for key, value in result.items():
    print(f'{key}: {value}')
# Prints:
# Name: Bob
# Age: 24
This finds the div with the id of statHolder.
Then, we find all divs inside that div, and extract the two lines of text (using split) -- the first line being the key, and the second line being the value. We also remove the double quotes from the key using replace.
Then, we add the key-value pair to our result dictionary.
Iterating through this, you can get the desired output as shown.
If you do it according to the actual html of the webpage the following will give you the stats as a dictionary. It takes each element with class pSt as the key and then moves to the following strong tag to get the associated value.
from bs4 import BeautifulSoup as bs
#html is response.content assuming not dynamic
soup = bs(html, 'html.parser')
stats = {i.text:i.strong.text for i in soup.select('.pSt')}
For your shown html you can use stripped_strings to get the first sibling
from bs4 import BeautifulSoup as bs
html = '''
<html lang="en">
<head>
#anything
</head>
<body>
<div id="div1">
<div id="div2">
<div class="class1">
#something
</div>
<div class="class2">
#something
</div>
<div class="class3">
<div class="sub-class1">
<div id="statHolder">
<div class="Class 1 of 15">
"Name"
<b>Bob</b>
</div>
<div class="Class 2 of 15">
"Age"
<b>24</b>
</div>
# Here are 15 of these kinds
</div>
</div>
</div>
</div>
</div>
</body>
</html>
'''
soup = bs(html, 'html.parser')
stats = {[s for s in i.stripped_strings][0]:i.b.text for i in soup.select('#statHolder [class^=Class]')}
print(stats)
Related
I'm scraping some information off MyAnimeList using BeautifulSoup on Python 3 and am trying to get information about a show's 'Status', but am having trouble accessing it.
Here is the html:
<h2>Information</h2>
<div>
<span class="dark_text">Type:</span>
Movie
</div>
<div class="spaceit">
<span class="dark_text">Episodes:</span>
1
</div>
<div>
<span class="dark_text">Status:</span>
Finished Airing
</div>
All of this is also contained within another div tag but I only included the portion of the html that I want to scrape. To clarify, I want to obtain the text 'Finished Airing' contained within 'Status'.
Here's the code I have so far but I'm not really sure if this is the best approach or where to go from here:
Page_soup = soup(Page_html, "html.parser")
extra_info = Page_soup.find('td', attrs={'class': 'borderClass'})
span_html = extra_info.select('span')
for i in range(len(span_html)):
if 'Status:' in span_html[i].getText():
Any help would be appreciated, thanks!
To get the text next to the <span> with "Status:", you can use:
from bs4 import BeautifulSoup
html_doc = """
<h2>Information</h2>
<div>
<span class="dark_text">Type:</span>
Movie
</div>
<div class="spaceit">
<span class="dark_text">Episodes:</span>
1
</div>
<div>
<span class="dark_text">Status:</span>
Finished Airing
</div>
"""
soup = BeautifulSoup(html_doc, "html.parser")
# Locate the <span> containing "Status:", then take the text node right after it.
# `string=True` replaces the deprecated `text=True` keyword.
txt = soup.select_one('span:-soup-contains("Status:")').find_next_sibling(string=True)
print(txt.strip())
Prints:
Finished Airing
Or:
# Same lookup with find(); `string=` replaces the deprecated `text=` keyword.
txt = soup.find("span", string="Status:").find_next_sibling(string=True)
print(txt.strip())
Another solution (maybe):
f = soup.find_all('span',attrs={'class':'dark_text'})
for i in f:
if i.text == 'Status:':
print(i.parent.text)
And change 'Status:' to whatever other thing you want to find.
Hope I helped!
I want to extract HTML between two HTML tags with identical id
html = '''<div id="note">
<div id="seccion">
<a name="title">Title of the seccion 1</a>
</div>
<div id="content">
<div id="col1">xxx</div>
<div id="col2">xxx</div>
</div>
<div id="content">
<div id="col1">xxx</div>
<div id="col2">xxx</div>
</div>
<div id="seccion">
<a name="title">Title of the seccion 2</a>
</div>
<div id="block">
<div id="col1">xxx</div>
<div id="col2">xxx</div>
</div>
<div id="block">
<div id="col1">xxx</div>
<div id="col2">xxx</div>
</div>
<div id="seccion">
<a name="title">Title of the seccion 3</a>
</div>
<div id="block">
<div id="col1">xxx</div>
<div id="col2">xxx</div>
</div>
</div>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
seccion= soup.find_all("div", {"id": "seccion"})
for item in seccion:
print([a.text for a in item.find_all("a", {"name": "title"})])
Unfortunately, the sections are not each wrapped in their own div whose children I could iterate over.
I also don't know in advance how many blocks each section contains.
I am not sure if it is possible to extract the HTML between two divs when their ids are identical.
You can separate sections by using .find_all() with parameter recursive=False and then check if the <div> contains id="seccion" attribute.
For example:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

# Walk only the direct children of <div id="note">. Every <div id="seccion">
# starts a new group; every other div is appended to the current group.
sections = []
for div in soup.select_one('div#note').find_all('div', recursive=False):
    # `not sections` covers a div that appears before the first "seccion":
    # it opens an untitled group instead of raising IndexError on sections[-1].
    if div.get('id') == 'seccion' or not sections:
        sections.append([div])
    else:
        sections[-1].append(div)

for section in sections:
    for div in section:
        print(div.get_text(strip=True, separator='\n'))
    print('-' * 80)
Prints the three sections separated:
Title of the seccion 1
xxx
xxx
xxx
xxx
--------------------------------------------------------------------------------
Title of the seccion 2
xxx
xxx
xxx
xxx
--------------------------------------------------------------------------------
Title of the seccion 3
xxx
xxx
--------------------------------------------------------------------------------
One option is to use selenium
Download driver for google Chrome here
To get 'xpath' right click on the element then 'copy' and select 'Copy XPATH' or 'Copy Full XPATH'
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # Run Chrome in the background, without a window

# NOTE(review): recent Selenium 4 releases no longer accept `executable_path`;
# there the driver path is passed through a `Service` object instead - confirm
# against the installed Selenium version.
driver = webdriver.Chrome(executable_path='Path_to_chromedriver.exe', options=options)
try:
    driver.get('url')  # Webpage url
    # WebElement exposes the visible text as the lowercase `.text` attribute;
    # `.Text` does not exist and raises AttributeError.
    Text = driver.find_element("xpath", "Element_xpath").text  # Get the label text
finally:
    # quit() ends the whole session (browser and driver process), even if the
    # page load or element lookup above failed.
    driver.quit()
I'm trying to get the text inside class="hardfact", but I'm also getting the text of class="hardfactlabel color_f_03" because that element is nested inside hardfact.
.text.strip() returns the text of both classes because they are nested.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import requests
import lxml
my_url = 'https://www.immowelt.de/expose/2QC5D4A?npv=52'
page = requests.get(my_url)
ct = soup(page.text, 'lxml')
specs = ct.find('div', class_="hardfacts clear").findAll('div', class_="hardfact")
for items in specs:
e = items.text.strip()
print(e)
I'm getting this
82.500 €
Kaufpreis
47 m²
Wohnfläche (ca.)
1
Zimmer
and i want this
82.500 €
47 m²
1
Here is the html content you are trying to crawl:
<div class="hardfact ">
<strong>82.500 € </strong>
<div class="hardfactlabel color_f_03">
Kaufpreis
</div>
</div>
<div class="hardfact ">
47 m²
<div class="hardfactlabel color_f_03">
Wohnfläche (ca.)
</div>
</div>
<div class="hardfact rooms">
1
<div class="hardfactlabel color_f_03">
Zimmer
</div>
</div>
What you want to achieve is to remove the div tags within, so you can just decompose the div:
for items in specs:
    # Drop the nested label <div> so only the value text remains.
    # Guard against a "hardfact" that has no nested div at all.
    label = items.div
    if label is not None:
        label.decompose()
    e = items.text.strip()
    print(e)
If your first "hardfact" class doesn't contain the "strong" tag, you can just find the first element like so
e = items.find().text.strip()
but we can't do this so you have to decompose the div tag.
You can use stripped strings. You probably want to add a condition to ensure at least length of 3 before attempting to slice list.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.immowelt.de/expose/2QC5D4A?npv=52')
soup = bs(r.content, 'lxml')
items = soup.select('.hardfact')[:3]
for item in items:
strings = [string for string in item.stripped_strings]
print(strings[0])
Hi I'm new at Python and getting stuck with a couple of things. Firstly, I've got my script to do what I want, but can't get it to loop through all items (prod_image_wrap) items in the url.
from __future__ import print_function
import requests
import re
import csv
import asana
from bs4 import BeautifulSoup as bs
request = requests.get('url')
content = request.content
soup = bs(content, 'html.parser')
i = soup.find_all('div', attrs = {'class': 'prod_image_wrap'})
for i in soup:
title = soup.find('h1')
product = soup.find('div', attrs = {'class': 'prod_name'})
links = soup.find('div', 'prod_image')
if links:
for a in links.find_all('a', href=True):
url = a['href']
elements = ("name:", title.string, "notes:", product.string, "PLink:", url)
dict = elements
import json
json_string = json.dumps(elements)
print (json_string)
The html is:
<title>TWEU-08.01.18-[4]-Lacoste_shoes</title>
</head>
<body>
<h1>TWEU-08.01.18-[4]-Lacoste_shoes</h1>
<div class="prod_image_wrap">
<div class="prod_image">
<img src="http://img.url.com/watermark/rs.php?path=WMK2SCB-1.jpg&nw=130" alt="Wilson Kaos 2.0 SFT Camo Bk M's Shoe 9.0" />
</div>
<div class="prod_name">Wilson Kaos 2.0 SFT Camo Bk M's Shoe 9.0</div>
<div class="prod_msrp">MSRP: $140,00</div>
<div class="prod_price">$104,92</div>
<div class="prod_price">360 View</div>
</div>
<div class="prod_image_wrap">
<div class="prod_image">
<img src="http://img.url.com/watermark/rs.php?path=LWSCEGW-1.jpg&nw=130" alt="Lacoste Carnab Evo Gry/Wht Wom's Shoes 38.0" />
</div>
<div class="prod_name">Lacoste Carnab Evo Gry/Wht Wom's Shoes 38.0</div>
<div class="prod_msrp">MSRP: $99,90</div>
<div class="prod_price">$74,92</div>
<div class="prod_price">360 View</div>
</div>
<div class="prod_image_wrap">
<div class="prod_image">
<img src="http://img.url.com/watermark/rs.php?path=LWCESWW-1.jpg&nw=130" alt="Lacoste Carnaby Evo White Wom's Shoes 38.0" />
</div>
<div class="prod_name">Lacoste Carnaby Evo White Wom's Shoes 38.0</div>
<div class="prod_msrp">MSRP: $99,90</div>
<div class="prod_price">$74,92</div>
<div class="prod_price">360 View</div>
</div>
Output looks like:
["name:", "TWEU-08.01.18-[4]-Lacoste_shoes", "notes:", "Wilson Kaos 2.0 SFT Camo Bk M's Shoe 9.0", "PLink:", "http://www.url.com/zzz/invisdescpage.html?pcode=WMK2SCB"]
This is what I want, but I just need it to loop, so in this case there would be 3 lines. The idea is to then assign the JSON variables to fields and send them to Asana to create a task, but I'm struggling with how to assign the values properly. Please let me know if I'm on the right path here.
Code is:
import asana
client = asana.Client.basic_auth('key')
new_task = client.tasks.create_in_workspace("10568240677771",{"name":(dict['name:']), "notes":"This is a test created via API to begin auto transferring data\n\nhttps://example.com\n\nProduct Name 10.90\nhttps://example.com\n\nProduct Name 10.90\nhttps://example.com\n\n", "projects":"21543812277118", "due_on":"2018-09-23"})
The error I get is:
TypeError: tuple indices must be integers or slices, not str
I have a HTML file with following format:
<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
</div>
Suppose there is only one div tag with class value "entry". I want to print all text inside those p tags which are inside div tag with class value "entry" , except those p tags which are followed by div or script tag. So here I want to print "para1" and "para2" but not "Ignore this part1" and "Ignore this part2"
How do I achieve this using beautiful soup?
Use a lambda expression to filter what you don't need.
Example:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
example = """<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
<p>example para</p>
</div>"""
soup = BeautifulSoup(example, 'html.parser')
entry = soup.find('div', class_="entry")
p = entry.find_all(lambda tag: tag.name == "p" and not (tag.find("div")
or tag.find("script")))
for content in p:
print (content.get_text(strip=True))
Outputs:
para1
para2
example para
Alternative solution with CSS selector
A simple solution could be the use of modern css selectors:
soup.select('div.entry p:not(:empty,:has(div,script))')
Example
from bs4 import BeautifulSoup
example = '''<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
<p>example para</p>
</div>'''
soup = BeautifulSoup(example, 'html.parser')
for e in soup.select('div.entry p:not(:empty,:has(div,script))'):
print(e.text)
Output
para1
para2
example para