Remove height and width from inline styles - html

I'm using BeautifulSoup to remove inline heights and widths from my elements. Solving it for images was simple:
def remove_dimension_tags(tag):
    """Strip the ``width`` and ``height`` attributes from *tag* in place.

    ``tag`` is any object that supports ``del tag[name]`` (in the
    surrounding discussion, a BeautifulSoup element).  An attribute that
    is not present is skipped.  The same object is returned so the call
    can be chained.
    """
    for attribute in ("width", "height"):
        try:
            del tag[attribute]
        except KeyError:
            # Not every element carries both attributes.
            pass
    return tag
But I'm not sure how to go about processing something like this:
<div id="attachment_9565" class="wp-caption aligncenter" style="width: 2010px;background-color:red">
when I would want to leave the background-color (for example) or any other style attributes other than height or width.
The only way I can think of doing it is with a regex but last time I suggested something like that the spirit of StackOverflow came out of my computer and murdered my first-born.

A full walk-through would be:
from bs4 import BeautifulSoup
import re

string = """
<div id="attachment_9565" class="wp-caption aligncenter" style="width: 2010px;background-color:red">
<p>Some line here</p>
<hr/>
<p>Some other beautiful text over here</p>
</div>
"""

# Match a whole `width` or `height` declaration, including its trailing `;`.
# The lookbehind keeps properties such as `max-width` or `line-height` intact.
rx = re.compile(r'(?<![\w-])(?:width|height)\s*:[^;]*;?\s*', re.IGNORECASE)

soup = BeautifulSoup(string, "html5lib")
# Only visit <div>s that actually carry an inline style.
for div in soup.find_all('div', style=True):
    # Rewrite this element's own style value, not the whole document string.
    cleaned = rx.sub("", div['style']).strip()
    if cleaned:
        div['style'] = cleaned
    else:
        # Nothing but dimensions was declared; drop the now-empty attribute.
        del div['style']
As stated by others, using regular expressions on the actual value is not a problem.

You could use regex if you want, but there is a simpler way.
Use cssutils for a simpler css parsing
A simple example:
from bs4 import BeautifulSoup
import cssutils

s = '<div id="attachment_9565" class="wp-caption aligncenter" style="width: 2010px;background-color:red">'
soup = BeautifulSoup(s, "html.parser")
div = soup.find("div")
# Parse the inline style into a declaration object so individual
# properties can be removed without touching the others.
div_style = cssutils.parseStyle(div["style"])
# Remove both dimensions; removeProperty() simply returns an empty string
# when the property was never set, so a missing height is not an error.
for prop in ("width", "height"):
    div_style.removeProperty(prop)
div["style"] = div_style.cssText
print(div)
Outputs:
>>><div class="wp-caption aligncenter" id="attachment_9565" style="background-color: red"></div>

import bs4
html = '''<div id="attachment_9565" class="wp-caption aligncenter" style="width: 2010px;background-color:red">'''
soup = bs4.BeautifulSoup(html, 'lxml')
A Tag's attributes are stored in a dict (tag.attrs), so you can read and modify them like any other dict:
get item:
soup.div.attrs
{'class': ['wp-caption', 'aligncenter'],
'id': 'attachment_9565',
'style': 'width: 2010px;background-color:red'}
set item:
# Keep every declaration except width/height, wherever it sits in the list,
# instead of assuming the one to keep is always last.
soup.div.attrs['style'] = ';'.join(
    decl.strip()
    for decl in soup.div.attrs['style'].split(';')
    if decl.strip()
    and decl.split(':', 1)[0].strip().lower() not in ('width', 'height')
)
{'class': ['wp-caption', 'aligncenter'],
'id': 'attachment_9565',
'style': 'background-color:red'}
Use Regex:
import re  # not imported earlier in this snippet

# Pull out just the declaration to keep.  `[^;]+` also accepts values such as
# `#fff` or `rgb(0, 0, 0)`, which `\w+` would cut short or miss entirely.
match = re.search(r'background-color:\s*[^;]+', soup.div.attrs['style'])
# search() returns None when the style has no background-color.
if match:
    soup.div.attrs['style'] = match.group()

Related

How do I search for an attribute using BeautifulSoup?

I am trying to scrape a page that contains the following HTML.
<div class="FeedCard urn:publicid:ap.org:db2b278b7e4f9fea9a2df48b8508ed14 Component-wireStory-0-2-116 card-0-2-117" data-key="feed-card-wire-story-with-image" data-tb-region-item="true">
<div class="FeedCard urn:publicid:ap.org:2f23aa3df0f2f6916ad458785dd52c59 Component-wireStory-0-2-116 card-0-2-117" data-key="feed-card-wire-story-with-image" data-tb-region-item="true">
As you can see, "FeedCard " is something they have in common. Therefore, I am trying to use a regular expression in conjunction with BeautifulSoup. Here is the code I've tried.
pattern = r"\AFeedCard"
# NOTE(review): `'class'==re.compile(pattern)` compares a str with a compiled
# pattern, so it evaluates to False before find() is ever called -- the regex
# is never used as a filter.
# NOTE(review): find() returns a single element rather than a list of matches;
# looping over it presumably walks that element's children -- confirm.
for card in soup.find('div', 'class'==re.compile(pattern)):
print(card)
print('**********')
I'm expecting it to give me each one of the divs from above, with the asterisks separating them. Instead, it is giving me the entire HTML of the page in a single instance.
Thank you,
No need to use regular expression here. Just use CSS selector or BS4 Api:
from bs4 import BeautifulSoup

html = """\
<div class="FeedCard urn:publicid:ap.org:db2b278b7e4f9fea9a2df48b8508ed14 Component-wireStory-0-2-116 card-0-2-117" data-key="feed-card-wire-story-with-image" data-tb-region-item="true">
Item 1
</div>
<div class="FeedCard urn:publicid:ap.org:2f23aa3df0f2f6916ad458785dd52c59 Component-wireStory-0-2-116 card-0-2-117" data-key="feed-card-wire-story-with-image" data-tb-region-item="true">
Item 2
</div>
"""

soup = BeautifulSoup(html, "html.parser")
# Every element carrying the FeedCard class, in document order.
cards = soup.select(".FeedCard")
for card in cards:
    label = card.text.strip()
    print(label)
Prints:
Item 1
Item 2

Use BeautifulSoup to find specific tag based on the text the tag covers?

Say I have the following HTML:
<div class="test">
<a class="someclass" href="somesite.com"> LINK </a>
<a class="someclass" href="othersite.com"> IMAGE</a>
</div>
Is there a way to get the href from all a-tags enclosing the text "LINK", i.e. in this example somesite.com?
The issue is that you're trying to find text which has whitespaces. You can use Regular Expressions to ignore the whitespaces and do a find on the text
from bs4 import BeautifulSoup
import re

html = '''<div class="test">
<a class="someclass" href="somesite.com"> LINK </a>
<a class="someclass" href="othersite.com"> IMAGE</a>
</div>'''
soup = BeautifulSoup(html, 'html.parser')
# Anchor the pattern so only text that is exactly "LINK" (ignoring surrounding
# whitespace) matches; unanchored, it would also accept e.g. "LINKS".
regex = re.compile(r'^\s*%s\s*$' % re.escape('LINK'))
# find_all() returns every matching <a>, and an empty list when none match,
# so there is no None to subscript.  `string=` is the current name of the
# deprecated `text=` argument.
for link in soup.find_all("a", string=regex):
    print(link['href'])
Output:
somesite.com
An alternative way is to perform find_all and then loop through the results, comparing the text using text.strip()
from bs4 import BeautifulSoup

html = '''<div class="test">
<a class="someclass" href="somesite.com"> LINK </a>
<a class="someclass" href="othersite.com"> IMAGE</a>
</div>'''

# Find href by text 'link'
soup = BeautifulSoup(html, 'html.parser')
results = soup.find_all('a')
hrefs = []
for anchor in results:
    # Compare against the link text with surrounding whitespace removed.
    if anchor.text.strip() == 'LINK':
        hrefs.append(anchor['href'])
print(hrefs)
Output
['somesite.com']

How do i get the text inside a class while ignoring the text of the next class that is inside

I'm trying to get the text inside the element with class="hardfact", but I'm also getting the text of the element with class="hardfactlabel color_f_03" because it is nested inside hardfact.
.text.strip() returns the text of both elements because they are nested.
from bs4 import BeautifulSoup as soup
# NOTE(review): uReq is imported but not used anywhere in this snippet.
from urllib.request import urlopen as uReq
import requests
# NOTE(review): lxml is only referred to by name, as the parser string below.
import lxml
my_url = 'https://www.immowelt.de/expose/2QC5D4A?npv=52'
page = requests.get(my_url)
ct = soup(page.text, 'lxml')
# Collect every "hardfact" div inside the "hardfacts clear" container.
specs = ct.find('div', class_="hardfacts clear").findAll('div', class_="hardfact")
# .text includes the text of nested elements, so each nested label is printed too.
for items in specs:
e = items.text.strip()
print(e)
I'm getting this
82.500 € 
Kaufpreis
47 m²
Wohnfläche (ca.)
1
Zimmer
and i want this
82.500 €
47 m²
1
Here is the html content you are trying to crawl:
<div class="hardfact ">
<strong>82.500 € </strong>
<div class="hardfactlabel color_f_03">
Kaufpreis
</div>
</div>
<div class="hardfact ">
47 m²
<div class="hardfactlabel color_f_03">
Wohnfläche (ca.)
</div>
</div>
<div class="hardfact rooms">
1
<div class="hardfactlabel color_f_03">
Zimmer
</div>
</div>
What you want to achieve is to remove the div tags within, so you can just decompose the div:
for items in specs:
    # Drop the nested label <div> so only the value text remains.
    label = items.div
    if label is not None:  # an entry without a nested <div> has nothing to drop
        label.decompose()
    e = items.text.strip()
    print(e)
If every "hardfact" div wrapped its value in its own tag (as the first one does with "strong"), you could just take the first child element like so
e = items.find().text.strip()
but the other "hardfact" divs hold their value as bare text, so we can't do this here and you have to decompose the nested div tag instead.
You can use stripped_strings. Slicing with [:3] is safe even when fewer than three elements match, but an element with no text yields no strings at all, so guard against that before taking the first one.
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.immowelt.de/expose/2QC5D4A?npv=52')
soup = bs(r.content, 'lxml')
items = soup.select('.hardfact')[:3]
for item in items:
    # stripped_strings is lazy; take only the first non-blank string and
    # fall back to None when the element contains no text at all.
    first = next(item.stripped_strings, None)
    if first is not None:
        print(first)

removing elements from html using BeautifulSoup and Python 3

I'm scraping data from the web and trying to remove all elements that have tag 'div' and class 'notes module' like this html below:
<div class="notes module" role="complementary">
<h3 class="heading">Notes:</h3>
<ul class="associations">
<li>
Translation into Русский available:
Два-два-один Браво Бейкер by <a rel="author" href="/users/dzenka/pseuds/dzenka">dzenka</a>, <a rel="author" href="/users/La_Ardilla/pseuds/La_Ardilla">La_Ardilla</a>
</li>
</ul>
<blockquote class="userstuff">
<p>
<i>Warnings: numerous references to and glancing depictions of combat, injury, murder, and mutilation of the dead; deaths of minor and major original characters. Numerous explicit depictions of sex between two men.</i>
</p>
</blockquote>
<p class="jump">(See the end of the work for other works inspired by this one.)</p>
</div>
source is here: view-source:http://archiveofourown.org/works/180121?view_full_work=true
I'm struggling to even find and print the elements I want to delete. So far I have:
import urllib.request, urllib.parse, urllib.error
# NOTE(review): the name `html` is rebound by the assignment below, so this
# lxml import is shadowed before it is ever used.
from lxml import html
from bs4 import BeautifulSoup
url = 'http://archiveofourown.org/works/180121?view_full_work=true'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')
# NOTE(review): this filters on the id attribute, while the markup shown
# earlier carries "notes module" in its class attribute.
removals = soup.find_all('div', {'id':'notes module'})
# Remove each matched element from the tree.
for match in removals:
match.decompose()
but removals returns an empty list. Can you help me select the entire div element that I've shown above so that I can select and remove all such elements from the html?
Thank you.
The div you are trying to find has class="notes module", yet in your code you are trying to find those divs by id="notes module".
Change this line:
removals = soup.find_all('div', {'id':'notes module'})
To this:
removals = soup.find_all('div', {'class':'notes module'})
Give it a go. It will kick out all available divs from that webpage under class='wrapper'.
import requests
from bs4 import BeautifulSoup

html = requests.get('http://archiveofourown.org/works/180121?view_full_work=true')
soup = BeautifulSoup(html.text, 'lxml')
for item in soup.select(".wrapper"):
    # Calling a tag is shorthand for find_all(); detach every nested <div>.
    # A plain loop is used because extract() is called only for its side
    # effect, so there is no point building a throwaway list.
    for elem in item("div"):
        elem.extract()
    print(item)

following a hierarchy in beautiful soup

I have a HTML file with following format:
<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
</div>
Suppose there is only one div tag with class value "entry". I want to print all text inside those p tags which are inside the div tag with class value "entry", except those p tags which contain a div or script tag. So here I want to print "para1" and "para2" but not "Ignore this part1" and "Ignore this part2".
How do I achieve this using beautiful soup?
Use a lambda expression to filter what you don't need.
Example:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

example = """<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
<p>example para</p>
</div>"""


def _is_plain_paragraph(tag):
    """Return True for a <p> that contains neither a <div> nor a <script>."""
    if tag.name != "p":
        return False
    return not (tag.find("div") or tag.find("script"))


soup = BeautifulSoup(example, 'html.parser')
entry = soup.find('div', class_="entry")
# Pass the predicate itself; find_all() calls it once per candidate tag.
p = entry.find_all(_is_plain_paragraph)
for content in p:
    print(content.get_text(strip=True))
Outputs:
para1
para2
example para
Alternative solution with a CSS selector
A simple solution could be the use of modern css selectors:
soup.select('div.entry p:not(:empty,:has(div,script))')
Example
from bs4 import BeautifulSoup

example = '''<div class="entry">
<p>para1</p>
<p>para2</p>
<p><div class="abc"> Ignore this part1</div> </p>
<p><script class="xyz">Ignore this part2 </script></p>
<p>example para</p>
</div>'''

soup = BeautifulSoup(example, 'html.parser')
# Paragraphs under div.entry that are non-empty and hold no <div> or <script>.
selector = 'div.entry p:not(:empty,:has(div,script))'
paragraphs = soup.select(selector)
for e in paragraphs:
    print(e.text)
Output
para1
para2
example para