Parse HTML data using R - html

I have an HTML data set, shown below, that I want to parse and convert into a tabular format that I can work with.
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | <a href='http://www.google.com/maps/place/317 Springtown Rd++New Paltz+NY+United States' target='_blank'>Map</a> </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.raybradleyfarm.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | <a href='http://www.google.com/maps/place/1716 Topeka St++Norman+OK+United States' target='_blank'>Map</a> </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.405brewing.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>
Below is the code I have used. The issue I am facing is that rvest converts everything into plain text, and I can't seem to get it into any useful format.
library(dplyr)
library(rvest)
# Parse the saved page. read_html() is the replacement for the deprecated
# rvest::html().
url <- read_html("beer.html")
selector_name <- ".brewery"
# One string per <div class="brewery">: html_text() flattens every <li> inside
# the div into a single text blob, which is why the fields come out unseparated.
fnames <- html_nodes(x = url, css = selector_name) %>%
  html_text()
head(fnames)
fnames
Would this be the correct approach, or should I use some other package to go through each div and its inner elements?
The output I would like to see is:
No. Name Address Type Website
Thank You.

library(rvest)
library(dplyr)
# Sample page: two <div class="brewery"> cards, each holding one populated
# <ul class="vcard simple"> list. The website is read from the href of the
# <a> inside <li class="url">, so that anchor must be present in the markup.
html_file <- '<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | Map </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url"><a href="http://www.raybradleyfarm.com">www.raybradleyfarm.com</a> </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | Map </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url"><a href="http://www.405brewing.com">www.405brewing.com</a> </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>'
page <- read_html(html_file)

# Extract field by field *within each brewery*. html_node() (singular) returns
# exactly one result per brewery and a missing node (-> NA) when a field is
# absent, so the columns always line up row by row. Selecting each field across
# the whole page instead would silently shift values if one card lacked a field.
breweries <- page %>% html_nodes(".brewery")
tibble(
  name = breweries %>%
    html_node(".vcard .name") %>%
    html_text(trim = TRUE),
  address = breweries %>%
    html_node(".vcard .address") %>%
    html_text(trim = TRUE),
  # Drop the literal "Type: " label that precedes the value.
  type = breweries %>%
    html_node(".vcard .brewery_type") %>%
    html_text(trim = TRUE) %>%
    stringr::str_replace("^Type: ", ""),
  website = breweries %>%
    html_node(".vcard .url a") %>%
    html_attr("href")
)
#> # A tibble: 2 x 4
#> name address type website
#> <chr> <chr> <chr> <chr>
#> 1 Bradley Farm / RB Brew, LLC 317 Springtown Rd Micro http://www.raybradleyfarm.com
#> 2 (405) Brewing Co 1716 Topeka St Micro http://www.405brewing.com

The problem is that it's not a table, so it's not super easy to parse. It's just two lists, which the below code concatenates into one list. Also FYI, try looking into the xml2 package for parsing html/xml.
library(dplyr)
library(rvest)
library(xml2)
# Parse the inline HTML and keep the populated <ul class="vcard simple"> lists.
# The XPath compares the whole class attribute, so the empty
# "vcard simple col2" lists are not selected.
vcard <-
'<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<div class="brewery" id="brewery">
<ul class="vcard simple">
<li class="name"> Bradley Farm / RB Brew, LLC</li>
<li class="address">317 Springtown Rd </li>
<li class="address_2">New Paltz, NY 12561-3020 | <a href=\'http://www.google.com/maps/place/317 Springtown Rd++New Paltz+NY+United States\' target=\'_blank\'>Map</a> </li>
<li class="telephone">Phone: (845) 255-8769</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.raybradleyfarm.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
<div class="brewery">
<ul class="vcard simple">
<li class="name">(405) Brewing Co</li>
<li class="address">1716 Topeka St </li>
<li class="address_2">Norman, OK 73069-8224 | <a href=\'http://www.google.com/maps/place/1716 Topeka St++Norman+OK+United States\' target=\'_blank\'>Map</a> </li>
<li class="telephone">Phone: (405) 816-0490</li>
<li class="brewery_type">Type: Micro</li>
<li class="url">www.405brewing.com </li>
</ul>
<ul class="vcard simple col2"></ul>
</div>
</body>' %>%
  read_html() %>%
  xml_find_all("//ul[@class = 'vcard simple']")

# All <li> children of the selected lists as one node set. The xml2 accessors
# are vectorized over node sets, so no sapply() is needed.
two_children <- xml_children(vcard)

# Long format: one row per <li>, with its class name and its text
# (surrounding whitespace trimmed).
data.frame(
  class = xml_attr(two_children, "class"),
  value = xml_text(two_children, trim = TRUE),
  stringsAsFactors = FALSE
)

Related

Convert dataframe into a nested html file with R

I am trying to convert a csv file (in this example the tibble tree) into a nested html file like the one below. So far I have done it by expressing the csv file in Markdown and then using pandoc.
What is the best way to do it with R? Is there an adequate package (or packages) to use? Is it also possible in R to transform the HTML result by inserting class attributes and span elements into certain HTML elements?
library(tidyverse)
tree <- tibble::tribble(
~level1,~level2,~level3,~level4,
"Beverages","Water","","",
"Beverages","Coffee","","",
"Beverages","Tea","Black tea","",
"Beverages","Tea","White tea","",
"Beverages","Tea","Green tea","Sencha",
"Beverages","Tea","Green tea","Gyokuro",
"Beverages","Tea","Green tea","Matcha",
"Beverages","Tea","Green tea","Pi Lo Chun"
)
Created on 2021-04-23 by the reprex package (v1.0.0)
This is the nested html file that I want to obtain.
<ul>
<li>
<p>Beverages</p>
<ul>
<li>
<p>Water</p>
</li>
<li>
<p>Coffee</p>
</li>
<li>
<p>Tea</p>
<ul>
<li>
<p>Black Tea</p>
</li>
<li>
<p>White Tea</p>
</li>
<li>
<p>Green Tea</p>
<ul>
<li>Sencha</li>
<li>Gyokuro</li>
<li>Matcha</li>
<li>Pi Lo Chun</li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
dat <- tibble::tribble(
~level1,~level2,~level3,~level4,
"Beverages","Water","","",
"Beverages","Coffee","","",
"Beverages","Tea","Black tea","",
"Beverages","Tea","White tea","",
"Beverages","Tea","Green tea","Sencha",
"Beverages","Tea","Green tea","Gyokuro",
"Beverages","Tea","Green tea","Matcha",
"Beverages","Tea","Green tea","Pi Lo Chun"
)
# Build one "level1/level2/..." path per row; data.tree turns the paths into
# a tree.
paths <- data.frame(pathString = apply(dat, 1, paste0, collapse = "/"))
library(data.tree)
tree <- as.Node(paths)
# Nested-list view of the tree. LL$name is used below as the root label;
# dropping the first element leaves the list that is rendered as child nodes.
LL <- as.list(tree)
L <- LL[-1]
library(htmltools)

#' Render one tree node as an HTML list item
#'
#' Every node becomes exactly one `<li>`, so the result is always a valid
#' child of a `<ul>`:
#' * a node without children        -> `<li><p>name</p></li>`
#' * a node whose children are all leaves
#'                                  -> `<li><p>name</p><ul><li>child</li>...</ul></li>`
#' * any other node                 -> `<li><p>name</p><ul>` recursive items `</ul></li>`
#'
#' @param node A named list of child nodes (an empty list for a leaf).
#' @param nodeName The label of this node.
#' @return An htmltools `<li>` tag.
f <- function(node, nodeName) {
  child_names <- names(node)

  # Leaf: nothing below this node.
  if (length(child_names) == 0) {
    return(tags$li(tags$p(nodeName)))
  }

  if (all(lengths(node) == 0)) {
    # Last level: children are plain <li>label</li> items.
    items <- lapply(child_names, tags$li)
  } else {
    items <- mapply(
      f, node, child_names,
      SIMPLIFY = FALSE, USE.NAMES = FALSE
    )
  }

  # The wrapping <li> is required in both cases: without it the <p>/<ul> pair
  # would sit directly inside the parent <ul>, which is invalid HTML.
  tags$li(
    tags$p(nodeName),
    do.call(tags$ul, items)
  )
}

lis <- mapply(f, L, names(L), SIMPLIFY = FALSE, USE.NAMES = FALSE)
ul <- do.call(tags$ul, lis)
html <- as.character(tagList(tags$p(LL$name), ul))
> cat(html)
<p>Beverages</p>
<ul>
<li>
<p>Water</p>
</li>
<li>
<p>Coffee</p>
</li>
<li>
<p>Tea</p>
<ul>
<li>
<p>Black tea</p>
</li>
<li>
<p>White tea</p>
</li>
<li>
<p>Green tea</p>
<ul>
<li>Sencha</li>
<li>Gyokuro</li>
<li>Matcha</li>
<li>Pi Lo Chun</li>
</ul>
</li>
</ul>
</li>
</ul>

Angular 2 routing to specific part of a webpage from one component link to another component

I have a link in the footer that takes the user to the terms page, but I want that link to go to a specific part of the terms page (the 3rd paragraph in terms).
Here is the code for footer.component.html
<a routerLink="terms">Terms & conditions</a> |
<a routerLink="terms">Privacy policy</a> |
<a routerLink="terms">Cancellation policy</a>
terms.component.html is where I want the Privacy policy link in the footer to land when it is clicked:
<ol start="3">
<li>INFORMATION SUBMITTED THROUGH OR TO OUR SERVICES</li>
</ol>
What do we need to use for these to work? Any help is appreciated. Thanks.
Here is the answer.
Instead of plain links such as
<a routerLink="terms">Terms & conditions</a> | <a routerLink="terms">Privacy policy</a> | <a routerLink="terms">Cancellation policy</a>
add a fragment to each link, like this:
<a [routerLink]="['/terms']" fragment="terms"> Terms & conditions </a>
<a [routerLink]="['/terms']" fragment="cancel"> Cancellation policy </a>
<a [routerLink]="['/terms']" fragment="privacy"> Privacy policy </a>
page should be like
<ol start="3" id="privacy" >
<li>INFORMATION SUBMITTED THROUGH OR TO OUR SERVICES</li>
</ol>
<ol start="3" id="terms" >
<li>INFORMATION SUBMITTED THROUGH OR TO OUR SERVICES</li>
</ol>
<ol start="3" id="cancel" >
<li>INFORMATION SUBMITTED THROUGH OR TO OUR SERVICES</li>
</ol>
Refer to the API for detailed descriptions and usage. - https://angular.io/api/router/RouterLink
To add to @Rahul VV's answer, I have edited the component so that it scrolls the target into view with scrollIntoView, like this:
footer.component.ts
import { Router, NavigationEnd } from '@angular/router';

// After every completed navigation, scroll the element named by the URL
// fragment (e.g. /terms#privacy -> the element with id="privacy") into view.
constructor(router: Router) {
  router.events.subscribe(event => {
    // Only react once the navigation has finished.
    if (!(event instanceof NavigationEnd)) {
      return;
    }
    const fragment = router.parseUrl(router.url).fragment;
    if (!fragment) {
      return;
    }
    // getElementById takes the id verbatim, so fragments containing characters
    // that are special in CSS selectors (".", ":", a leading digit) still
    // match; building "#" + fragment for querySelector would throw on those.
    const element = document.getElementById(fragment);
    if (element) {
      element.scrollIntoView();
    }
  });
}

how to simplify template?

How can I simplify a template in AngularJS? There are three variants of the same block for the loaded data objects; which one is rendered depends on $ctrl.Name, and each variant reads a different property of the object. Is it possible to merge them into a single, simpler template?
<div class="suggest" ng-show="showSuggest" ng-if="$ctrl.Name == 'A'">
<ul class="height-list">
<li ng-repeat="node in $ctrl.Searched()" ng-mousedown="add(this)">
<span ng-attr-title="{{node.a}}">{{node.a}}</span>
</li>
</ul>
</div>
<div class="suggest" ng-show="showSuggest" ng-if="$ctrl.Name == 'B'">
<ul class="height-list">
<li ng-repeat="node in $ctrl.Searched() " ng-mousedown="add(this)">
<span ng-attr-title="{{node.b}}">{{node.b}}</span>
</li>
</ul>
</div>
<div class="suggest" ng-show="showSuggest" ng-if="$ctrl.Name == 'C'">
<ul class="height-list">
<li ng-repeat="node in $ctrl.Searched()" ng-mousedown="add(this)">
<span ng-attr-title="{{node.C}}">{{node.C}}</span>
</li>
</ul>
</div>
Let's say you normalize your node and name to be 1 to 1. So if your controller name is "a" or "Foo", then your node will have a property named a or Foo.
Then you will be able to reduce your template to the following:
<div class="suggest" ng-show="showSuggest">
<ul class="height-list">
<li ng-repeat="node in $ctrl.Searched()" ng-mousedown="add(this)">
<span ng-attr-title="{{node[$ctrl.Name]}}">{{node[$ctrl.Name]}}</span>
</li>
</ul>
</div>

Click a link in IE through VBA

I am trying to click a link, but it is different from the other links I have handled: it has no "id" attribute or anything similar.
Here is the HTML. I need to click the "pibmunic super" button/link. The FirePath XPath shows .//*[@id='informacoes_estatisticas']/ul/li[20]/span
<div id="informacoes_estatisticas">
<h3 class="titulo">Informações Estatísticas</h3>
<ul class="links">
<li class="item Censo Agropecuário_2006">
<li class="censo2010 super">
<li class="educa super">
<li class="empresas super">
<li class="vida super">
<li class="item Estimativa da População 2014_">
<li class="prodext2013 super">
<li class="financas super">
<li class="frota super">
<li class="item Fundações Privadas e Associações sem Fins Lucrativos no Brasil 2010_">
<li class="item Índice de Desenvolvimento Humano Municipal - IDHM_">
<li class="instfin super">
<li class="item Mapa de Pobreza e Desigualdade - Municípios Brasileiros_2003">
<li class="morbid super">
<li class="prodpec2013 super">
<li class="item Pesquisa Nacional de Saneamento Básico_2008">
<li class="item Produção Agrícola Municipal - Cereais, Leguminosas e Oleaginosas_2007">
<li class="lavperm2013 super">
<li class="lavtemp2013 super">
<li class="pibmunic super">
<span class="super">Produto Interno Bruto dos Municípios</span>
<ul class="pibmunic sub">
</li>
<li class="partpol super">
<li class="assismed super">
<li class="snig_censo2010 super">
</ul>
</div>
I am trying to write my code as something like this
Set m = html.getElementById("something")
Set a = m.getElementsByTagName("something")(something)
a.Click
I am not sure this is right, I have tried lots of combinations and it does not work. I don't know exactly which "getelementsby" I should use or what to give the command inside ().
Hope you can help me !! thanks
All of your classes seem to be unique. If that's the case you should change all the class attributes to id attributes ex:
class="something"
to
id="something"
Once you do that getElementById("something") will work. getElementsByTagName() returns an array of elements with a specific tag. ex:
getElementsByTagName("li")
retrieves all the li elements under a node.
If the newest VBA supports querySelectorAll() that will allow you to use CSS selectors to find elements. ex:
querySelectorAll(".something")
This will find all elements with the class something. However, since there can be multiple elements with the same class, you will get an array as a result. You can either loop over these elements to process them, narrow your selector, or get the first index of the returned array.
If VBA still does not support querySelectorAll(), you will need to combine getElementsByTagName() with a loop over the matching tags to find elements with the something class. ex:
' Click every <li> under the container whose class list contains "something".
' html is assumed to be the loaded HTMLDocument object (the same object the
' question calls html.getElementById on).
Dim el As Object
Dim cl As Variant
For Each el In html.getElementById("container").getElementsByTagName("li")
    ' className is a space-separated list of classes. VBA strings have no
    ' .Split method, so use the Split() function with a double-quoted separator.
    For Each cl In Split(el.className, " ")
        If cl = "something" Then
            ' .Click simulates the user's click on the element.
            el.Click
            Exit For ' one click per element, even if the class is repeated
        End If
    Next cl
Next el
I would recommend assigning ids or, if at all possible, using JavaScript or jQuery. They are better suited to handling these tasks (more abstraction). jQuery would be as simple as:
$(".something").trigger("click");

Passing the id of an HTML <li> tag to a VBScript subroutine when clicked on

I have loads of VBScripts that I plan to give a GUI front end (using an HTA). At the top of the HTA's window I have a drop-down menu bar created as follows (I haven't included the CSS code, as I don't think it adds any relevant information).
I have been trying for some time but have been unable to work out how I can pass the value of the id of the clicked li to the MenuClicked subroutine.
My previous experience with HTML was using it to create static documents, so I might have got this all wrong. If that is the case then please let me know.
<script type="text/VBScript">
Sub MenuClicked()
Select Case WhatDoIPutHere
Case "#Option1A" : Sub_Option1A
Case "#Option1B" : Sub_Option1B
Case "#Option2A" : Sub_Option2A
Case "#Option2B" : Sub_Option2B
Case "#Option3" : Sub_Option3
End Select
End Sub
</script>
<div>
<ul id="nav" class="drop" onClick="MenuClicked()">
<li><a>Option1A</a>
<ul>
<li id="#Option1A"><a>Option 1A</a></li>
<li id="#Option1B"><a>Option 1B</a></li>
</ul>
</li>
<li><a>Option 2</a>
<ul>
<li id="#Option2A"><a>Option 2A</a></li>
<li id="#Option2B"><a>Option 2B</a></li>
</ul>
</li>
<li id="#Option3"><a>Option 3</a></li>
</ul>
</div>
Since HTAs are run by an Internet Explorer engine, you should use the srcElement property
<script type="text/VBScript">
' Click handler attached to the whole menu (<ul id="nav">): runs the routine
' that matches the id of the <li> that was clicked.
Sub MenuClicked()
    Dim target
    ' srcElement is the element the click originated from. Usually that is the
    ' <a>, whose parent <li> carries the id. A click that lands on the <li>
    ' itself must not be moved up to the enclosing <ul>, so only step up when
    ' the source really is an <a>.
    Set target = window.event.srcElement
    If UCase(target.tagName) = "A" Then
        Set target = target.parentNode
    End If
    ' Entries without a matching id (e.g. the top-level headings) fall through
    ' and do nothing.
    Select Case target.id
        Case "#Option1A" : Sub_Option1A
        Case "#Option1B" : Sub_Option1B
        Case "#Option2A" : Sub_Option2A
        Case "#Option2B" : Sub_Option2B
        Case "#Option3" : Sub_Option3
    End Select
End Sub
</script>
<div>
<ul id="nav" class="drop" onClick="MenuClicked()">
<li><a>Option1A</a>
<ul>
<li id="#Option1A"><a>Option 1A</a></li>
<li id="#Option1B"><a>Option 1B</a></li>
</ul>
</li>
<li><a>Option 2</a>
<ul>
<li id="#Option2A"><a>Option 2A</a></li>
<li id="#Option2B"><a>Option 2B</a></li>
</ul>
</li>
<li id="#Option3"><a>Option 3</a></li>
</ul>
</div>
Here's how I would do it:
<script type="text/VBScript">
' Runs the routine that belongs to the clicked menu entry.
' obj: the element whose onClick fired (each <li> passes itself as Me).
Sub MenuClicked(obj)
    Dim entryId
    entryId = obj.id
    If entryId = "#Option1A" Then
        Sub_Option1A
    ElseIf entryId = "#Option1B" Then
        Sub_Option1B
    ElseIf entryId = "#Option2A" Then
        Sub_Option2A
    ElseIf entryId = "#Option2B" Then
        Sub_Option2B
    ElseIf entryId = "#Option3" Then
        Sub_Option3
    End If
End Sub
</script>
<div>
<ul id="nav" class="drop">
<li><a>Option1A</a>
<ul>
<li id="#Option1A" onClick="MenuClicked Me"><a>Option 1A</a></li>
<li id="#Option1B" onClick="MenuClicked Me"><a>Option 1B</a></li>
</ul>
</li>
<li><a>Option 2</a>
<ul>
<li id="#Option2A" onClick="MenuClicked Me"><a>Option 2A</a></li>
<li id="#Option2B" onClick="MenuClicked Me"><a>Option 2B</a></li>
</ul>
</li>
<li id="#Option3" onClick="MenuClicked Me"><a>Option 3</a></li>
</ul>
</div>