Convert dataframe into a nested html file with R - html

I am trying to convert a CSV file (in this example the tibble tree) into a nested HTML file like the one below. I previously did this by expressing the CSV file in Markdown and then running it through pandoc.
What is the best way to do it with R? Is there an adequate package (or packages) to use? Is it also possible, still in R, to transform the HTML result by inserting class attributes and span elements into certain HTML elements?
library(tidyverse)
tree <- tibble::tribble(
  ~level1,     ~level2,  ~level3,     ~level4,
  "Beverages", "Water",  "",          "",
  "Beverages", "Coffee", "",          "",
  "Beverages", "Tea",    "Black tea", "",
  "Beverages", "Tea",    "White tea", "",
  "Beverages", "Tea",    "Green tea", "Sencha",
  "Beverages", "Tea",    "Green tea", "Gyokuro",
  "Beverages", "Tea",    "Green tea", "Matcha",
  "Beverages", "Tea",    "Green tea", "Pi Lo Chun"
)
Created on 2021-04-23 by the reprex package (v1.0.0)
This is the nested HTML file that I want to obtain.
<ul>
  <li>
    <p>Beverages</p>
    <ul>
      <li>
        <p>Water</p>
      </li>
      <li>
        <p>Coffee</p>
      </li>
      <li>
        <p>Tea</p>
        <ul>
          <li>
            <p>Black Tea</p>
          </li>
          <li>
            <p>White Tea</p>
          </li>
          <li>
            <p>Green Tea</p>
            <ul>
              <li>Sencha</li>
              <li>Gyokuro</li>
              <li>Matcha</li>
              <li>Pi Lo Chun</li>
            </ul>
          </li>
        </ul>
      </li>
    </ul>
  </li>
</ul>

dat <- tibble::tribble(
  ~level1,     ~level2,  ~level3,     ~level4,
  "Beverages", "Water",  "",          "",
  "Beverages", "Coffee", "",          "",
  "Beverages", "Tea",    "Black tea", "",
  "Beverages", "Tea",    "White tea", "",
  "Beverages", "Tea",    "Green tea", "Sencha",
  "Beverages", "Tea",    "Green tea", "Gyokuro",
  "Beverages", "Tea",    "Green tea", "Matcha",
  "Beverages", "Tea",    "Green tea", "Pi Lo Chun"
)
# build one path string per row, e.g. "Beverages/Tea/Green tea/Sencha"
paths <- data.frame(pathString = apply(dat, 1, paste0, collapse = "/"))
library(data.tree)
tree <- as.Node(paths)  # turn the path strings into a data.tree structure
LL <- as.list(tree)     # nested list; the first element holds the root name
L <- LL[-1]             # drop the root name, keep the children
library(htmltools)
f <- function(node, nodeName){
  if(all(lengths(node) == 0) && length(names(node))){
    # every child is a leaf: one <li> with the name and a flat <ul> of the children
    tags$li(
      tags$p(nodeName),
      do.call(tags$ul, unname(lapply(names(node), tags$li)))
    )
  }else if(length(names(node))){
    # at least one child has children of its own: recurse
    tags$li(
      tags$p(nodeName),
      do.call(tags$ul, mapply(f, node, names(node), SIMPLIFY = FALSE, USE.NAMES = FALSE))
    )
  }else{
    # leaf node: just a <li> with the name
    tags$li(
      tags$p(nodeName)
    )
  }
}
lis <- mapply(f, L, names(L), SIMPLIFY = FALSE, USE.NAMES = FALSE)  # one <li> per top-level child
ul <- do.call(tags$ul, lis)
html <- as.character(tagList(tags$p(LL$name), ul))  # root name as a <p>, followed by the nested list
> cat(html)
<p>Beverages</p>
<ul>
  <li>
    <p>Water</p>
  </li>
  <li>
    <p>Coffee</p>
  </li>
  <li>
    <p>Tea</p>
    <ul>
      <li>
        <p>Black tea</p>
      </li>
      <li>
        <p>White tea</p>
      </li>
      <li>
        <p>Green tea</p>
        <ul>
          <li>Sencha</li>
          <li>Gyokuro</li>
          <li>Matcha</li>
          <li>Pi Lo Chun</li>
        </ul>
      </li>
    </ul>
  </li>
</ul>
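As for inserting class and span: htmltools tags accept attributes as named arguments, existing tags can be modified with tagAppendAttributes(), and the finished markup can be written to a file with save_html() or writeLines(). A minimal sketch, assuming placeholder class names of my own choosing:
library(htmltools)

# named arguments become HTML attributes, unnamed arguments become children
leaf <- tags$li(class = "leaf", tags$span(class = "tea-name", "Sencha"))

# attributes can also be added to an already-built tag
leaf2 <- tagAppendAttributes(tags$li("Matcha"), class = "leaf")

page <- tagList(tags$p("Beverages"), tags$ul(leaf, leaf2))

# write the generated HTML out to a file
save_html(page, "tree.html")
# or: writeLines(as.character(page), "tree.html")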

Related

Not able to open pages through navbar, getting 'page not found' error

<!--navigation bar-->
<header>
<nav>
<ul>
<li> About </li>
<li> Home </li>
<li>Login/Register </li>
<li> Create your closet </li>
</ul>
</nav>
</header>
<!--app.py-->
from flask import Flask,render_template
import os
prerna = os.path.join('static','pic.png')
prerna2 = os.path.join('static','Untitled.png')
prerna3 = os.path.join('static','wardrobe.png')
video = os.path.join('static','DigitalWardrobe.mp4')
app = Flask(__name__)
@app.route('/index')
def index():
    return render_template("index.html", img=prerna2, img1=prerna, img2=prerna3, vid=video)

@app.route('/home')
def home():
    return render_template("home.html")

if __name__ == "__main__":
    app.run(debug=True, port=8000)
I have only added routes for the home and index pages to app.py so far, not for the other options in the navbar, and still I am not able to access even those two pages.
Mention the route in the anchor tag instead of the filename, for example (assuming an /about route exists):
<li> <a href="/about"> About </a> </li>
You can also try the url_for() method:
<li> <a href="{{ url_for('about') }}"> About </a> </li>

CSS children selector (not being able to select all children)

The image in the original post shows the HTML I'm trying to scrape with Beautiful Soup. Whenever I use the code shown below, I only get access to the first child; I am never able to get all of the children. Can someone help me with this?
item = soup.select("ul.items > li")
print(len(item))
The problem can be fixed in 2 steps as follows:
Use select_one on soup to get the ul
Use find_all on ul to fetch all the li items.
Working solution:
# File name: soup-demo.py
inputHTML = """
<ul class="items">
<li class="class1">item 1</li>
<li class="class1">item 3</li>
<li class="class1">item 3</li>
</ul>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(inputHTML, 'html.parser')
itemList = soup.select_one("ul.items")
items = itemList.find_all("li")
print("Found ", len(items), " items")
for item in items:
    print(item)
Output:
$ python3 soup-demo.py
Found 3 items
<li class="class1">item 1</li>
<li class="class1">item 3</li>
<li class="class1">item 3</li>
Maybe your BeautifulSoup version is the problem. This works for me:
from bs4 import BeautifulSoup
html = '''
<ul class="items">
<li>1</li>
<li>2</li>
</ul>
'''
soup = BeautifulSoup(html,features="lxml")
item = soup.select('ul.items>li')
print (len(item))
Here's another solution, using simplified_scrapy:
from simplified_scrapy.simplified_doc import SimplifiedDoc
html = '''
<ul class="items">
<li>1</li>
<li>2</li>
</ul>
'''
doc = SimplifiedDoc(html)
item = doc.selects('ul.items>li')
print(len(item))
There are more examples here.

Extracting content of next and different tag using Beautifulsoup

I want to scrape a particular piece of HTML code.
My Python code:
soup = '''
<p>
<strong> abc </strong>
</p>
<ul>
<li> 123 </li>
<li> 456 </li>
</ul>
'''
import bs4
soup = bs4.BeautifulSoup(soup, 'html.parser')
for link in soup.find_all('strong'):
    k = link.next_sibling
    print(link.text)
    print(k)
    print(k.text)
And the output:
abc
AttributeError: 'NavigableString' object has no attribute 'text'
How can I extract "123" and "456" using the above tags?
Thanks.
There are many solutions, for example, you can combine find_next() and find_next_sibling() methods:
soup = '''
<p>
<strong> abc </strong>
</p>
<ul>
<li> 123 </li>
<li> 456 </li>
</ul>
'''
import bs4
soup = bs4.BeautifulSoup(soup, 'html.parser')
for link in soup.find_all('strong'):
    li1 = link.find_next().li
    li2 = li1.find_next_sibling()
    print(link.text)
    print(li1.text)
    print(li2.text)
Prints:
abc
123
456
You wanted 123 and 456, so you can use :has and :contains (bs4 4.7.1+) to target the parent p that has a child strong containing the text 'abc'. Then use an adjacent sibling combinator with a type selector to get the adjacent ul, and finally a child combinator with an li type selector to get the child li elements.
from bs4 import BeautifulSoup as bs
html = '''
<p>
<strong> abc </strong>
</p>
<ul>
<li> 123 </li>
<li> 456 </li>
</ul>
'''
soup = bs(html, 'lxml')
print([i.text for i in soup.select('p:has(>strong:contains("abc")) + ul > li')])
Read about css selectors here.
from simplified_scrapy.simplified_doc import SimplifiedDoc
html = '''<div><p>
<strong> abc </strong>
</p>
<ul>
<li> 123 </li>
<li> 456 </li>
</ul></div>'''
doc = SimplifiedDoc(html)
s = doc.strong # doc.getElementByTag('strong')
lis = s.parent.next.children
print(s.text)
print(lis[0].text)
print(lis[1].text)
result:
abc
123
456

BeautifulSoup - Adding attributes on Resultset

Here's my html structure to scrape:
<div class='schedule-lists'>
<ul>
<li>...</li>
<ul>
<li>...</li>
<ul class='showtime-lists'>
<li>...</li>
<li><a auditype="N" cinema="0100" href="javascript:void(0);" >12:45</a></li>
<li>...</li> -- (same structure as above)
<li>...</li> -- (same structure as above)
<li>...</li> -- (same structure as above)
<li>...</li> -- (same structure as above)
Here's my code:
from requests import get
from bs4 import BeautifulSoup
response = get('https://www.example.com')
response_html = BeautifulSoup(response.text, 'html.parser')
containers = response_html.find_all('ul', class_='showtime-lists')
#print(containers)
[<ul class="showtime-lists">
<li><a auditype="N" cinema="0100" href="javascript:void(0);" >12:45</a></li>
How can I add attributes to my ResultSet containers? For example, adding movietitle="Logan" so it becomes:
<li><a movietitle="Logan" auditype="N" cinema="0100" href="javascript:void(0);" >12:45</a></li>
My best attempt was using the .append method, but it can't be done because the ResultSet acts like a dictionary.
You can try this:
...
a = response_html.find_all('a')
i = 0
for tag in a:
    a[i]['movietitle'] = 'Logan'
    i += 1
print(str(a))

Parse HTML data using R

I have an HTML data set, as below, which I want to parse and convert into a tabular format that I can use.
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | <a href='http://www.google.com/maps/place/317 Springtown Rd++New Paltz+NY+United States' target='_blank'>Map</a> </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.raybradleyfarm.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | <a href='http://www.google.com/maps/place/1716 Topeka St++Norman+OK+United States' target='_blank'>Map</a> </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.405brewing.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>
Below is the code I have used. The issue I am facing is that it converts the page into text using rvest, but I can't seem to get it into any useful format.
library(dplyr)
library(rvest)
url<-html("beer.html")
selector_name<-".brewery"
fnames<-html_nodes(x = url, css = selector_name) %>%
html_text()
head(fnames)
fnames
Would this be a correct approach, or should I be using some other package to go through each div and the inner elements?
The output I would like to see is:
No. Name Address Type Website
Thank you.
library(rvest)
library(dplyr)
html_file <- '<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | Map </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.raybradleyfarm.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | Map </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.405brewing.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>'
page <- read_html(html_file)
tibble(
name = page %>% html_nodes(".vcard .name") %>% html_text(),
address = page %>% html_nodes(".vcard .address") %>% html_text(),
type = page %>% html_nodes(".vcard .brewery_type") %>% html_text() %>% stringr::str_replace_all("^Type: ", ""),
website = page %>% html_nodes(".vcard .url a") %>% html_attr("href")
)
#> # A tibble: 2 x 4
#> name address type website
#> <chr> <chr> <chr> <chr>
#> 1 Bradley Farm / RB Brew, LLC 317 Springtown Rd Micro http://www.raybradleyfarm.com
#> 2 (405) Brewing Co 1716 Topeka St Micro http://www.405brewing.com
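The vectorised selectors above line up correctly because every brewery block contains every field. If some fields could be missing, a more defensive sketch (my own variant, reusing the page object from above together with purrr) walks each .brewery node and looks the fields up inside it, so a missing field becomes NA instead of shifting the columns:
library(rvest)
library(purrr)

breweries <- page %>% html_nodes("div.brewery")

map_dfr(breweries, function(node) {
  tibble::tibble(
    # html_node() (singular) returns a missing node when nothing matches,
    # so html_text()/html_attr() then give NA for that brewery
    name    = node %>% html_node(".name") %>% html_text(trim = TRUE),
    address = node %>% html_node(".address") %>% html_text(trim = TRUE),
    type    = node %>% html_node(".brewery_type") %>% html_text(trim = TRUE) %>% sub("^Type: ", "", .),
    website = node %>% html_node(".url a") %>% html_attr("href")
  )
})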
The problem is that it's not a table, so it's not super easy to parse. It's just two lists, which the below code concatenates into one list. Also FYI, try looking into the xml2 package for parsing html/xml.
library(dplyr)
library(rvest)
library(xml2)
vcard <-
'<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | <a href=\'http://www.google.com/maps/place/317 Springtown Rd++New Paltz+NY+United States\' target=\'_blank\'>Map</a> </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.raybradleyfarm.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | <a href=\'http://www.google.com/maps/place/1716 Topeka St++Norman+OK+United States\' target=\'_blank\'>Map</a> </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.405brewing.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>' %>%
  read_html() %>%
  xml_find_all("//ul[@class = 'vcard simple']")
two_children <- sapply(vcard, function(x) xml2::xml_children(x))
data.frame(
class = sapply(two_children, function(x) xml2::xml_attrs(x)),
value = sapply(two_children, function(x) xml2::xml_text(x)),
stringsAsFactors = FALSE
)
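Along the same xml2 lines, here is a sketch (my own extension of the code above, reusing its vcard nodeset) that keeps each ul together and pulls the fields out by class with xml_find_first(), giving one row per brewery rather than one long class/value list:
library(xml2)

rows <- lapply(vcard, function(ul) {
  # helper: text of the <li> with the given class inside this brewery's <ul>
  grab <- function(cls) xml_text(xml_find_first(ul, paste0("./li[@class='", cls, "']")))
  data.frame(
    name    = trimws(grab("name")),
    address = trimws(grab("address")),
    type    = sub("^Type: ", "", trimws(grab("brewery_type"))),
    website = trimws(grab("url")),
    stringsAsFactors = FALSE
  )
})
do.call(rbind, rows)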