How to scrape headings and content from HTML files

I've been given a pile of about 100 HTML files that I want to put into rectangular form. Here's an example: http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html . I would like to extract the headings (h3) as column names and the content in between as strings, each taking up one row.
I've managed to extract the column names with rvest in R, but I'm stuck at extracting the content, and I expect to get stuck again when trying to bind everything together into one dataframe.
This is what I've done for extracting the variable names:
variable.names <- map(LIST.html, ~ read_html(.x) %>%
  html_nodes("h3") %>%
  html_text(trim = TRUE) %>%
  tolower())
Here's the code I used to get all the files: system( "wget -r -np -nH --cut-dirs=3 -R index.html http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/" )
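For a single file, here is a minimal sketch of pulling the content that accompanies each heading, assuming (as the full answer below also does) that each section on these pages is a div under #mittvagn holding one h3 plus its <p> paragraphs:
library(rvest)
library(purrr)

# minimal per-file sketch; the '#mittvagn > div' selector comes from the example page's structure
pg <- read_html("http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html")
sections <- html_nodes(pg, '#mittvagn > div')
contents <- map_chr(sections,
                    ~ paste(html_text(html_nodes(.x, 'p'), trim = TRUE), collapse = "\n"))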

library(rvest)
library(stringr)
library(data.table)

malformed_documents <- character(0)

parse_profile_page <- function(pg, nm) {
  # extract section divs, omit byline
  divs <- html_nodes(pg, '#mittvagn > div')[ -1 ]
  idx <- which((lapply(divs, html_attr, 'id') |> unlist()) == 'bakvagn')
  if (length(idx) > 0) {
    divs <- divs[ -idx ]
  }

  # extract section headers
  titles <- html_nodes(divs, 'h3') |>
    html_text(trim = TRUE) |>
    tolower() |>
    unlist()

  # extract section contents
  paragraphs <- lapply(divs, html_nodes, 'p') |>
    lapply(html_text, trim = TRUE) |>
    lapply(paste0, collapse = '\n') |>
    lapply(str_squish) |>
    unlist()

  if (length(paragraphs) != length(titles)) {
    message(sprintf('%s is malformed, not parsing', nm))
    malformed_documents <<- c(malformed_documents, nm)
    return(data.frame())
  }

  data.frame(title = titles,
             contents = paragraphs,
             url = rep(nm, length(paragraphs)))
}

# obtain list of files to download
url <- 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/'
pg <- read_html(url)
file_urls <- html_nodes(pg, '#mittvagn > ol > li > a:nth-child(1)') |>
  html_attr('href') |>
  str_replace_all('^\\.\\./', '') |>
  sprintf(fmt = 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/%s')
# file_urls <- sample(file_urls, 10) # uncomment to run on a small sample of pages

file_contents <- lapply(file_urls, function(x) {
  message('downloading: ', x)
  read_html(x)
})
names(file_contents) <- file_urls

parsed_contents <- lapply(file_urls, function(x) {
  message('parsing: ', x)
  parse_profile_page(file_contents[[ x ]], x)
})
parsed_contents_df <- rbindlist(parsed_contents) |> as.data.frame()

if (length(malformed_documents) > 0) {
  # warning() pastes its arguments together rather than doing sprintf-style
  # substitution, so build the message with sprintf() first
  warning(sprintf('the following documents were malformed and not parsed: %s',
                  paste0(malformed_documents, collapse = ', ')))
}

View(parsed_contents_df)
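To finish with the "rectangular form" the question asks for (one row per document, headings as columns), the long output can be pivoted; a minimal sketch with tidyr, assuming heading names are unique within each page:
library(tidyr)

wide_df <- pivot_wider(parsed_contents_df,
                       id_cols = url,
                       names_from = title,
                       values_from = contents)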

Related

R: Inferring a Common Merge Key

I am trying to webscrape a site to get addresses for a set of names (part A) along with their longitudes and latitudes (part B). I don't know how to do this all together, so I did it in two parts:
# part A
library(tidyverse)
library(rvest)
library(httr)
library(XML)
# Define function to scrape 1 page
get_info <- function(page_n) {
  cat("Scraping page ", page_n, "\n")
  page <- paste0("https://www.mywebsite/", page_n, "?extension") %>%
    read_html()
  tibble(title = page %>%
           html_elements(".title a") %>%
           html_text2(),
         adress = page %>%
           html_elements(".marker") %>%
           html_text2(),
         page = page_n)
}
# Apply function to pages 1:10
df_1 <- map_dfr(1:10, get_info)
# Check dimensions
dim(df_1)
[1] 90  3
Here is part B:
# Recognize pattern in websites
part1 = "https://www.mywebsite/"
part2 = c(0:55)
part3 = "extension"
temp = data.frame(part1, part2, part3)
# Create list of websites
temp$all_websites = paste0(temp$part1, temp$part2, temp$part3)
# Scrape
df_2 <- list()
for (i in 1:10) {
  tryCatch({
    url_i <- temp$all_websites[i]
    page_i <- read_html(url_i)
    b_i <- page_i %>% html_nodes("head")
    listanswer_i <- b_i %>% html_text() %>% strsplit("\\n")
    df_2[[i]] <- listanswer_i
    print(listanswer_i)
  }, error = function(e) {})
}
# Extract long/lat from results
lat_long = grep("LatLng", unlist(df_2[]), value = TRUE)
df_2 = data.frame(str_match(lat_long, "LatLng(\\s*(.*?)\\s*);"))
df_2 = df_2 %>% filter(X1 != "LatLngBounds();")
dim(df_2)
[1] 86 3
We can see that df_1 and df_2 have different numbers of rows, and there is also no common merge key between them. How can I rewrite my code to create a merge key between df_1 and df_2 so that I can merge the common records of these two files together?
I am not sure multiple requests to the same URIs are needed. Some lat/long values are not listed either on the results pages or on the result-specific linked webpage, e.g. Toronto Beaches Dentist from the current page 2 results has no lat/long shown on either page 2 or its own page. In those cases you may choose to fill in the blanks using another service which returns a lat/long for a given address (see the geocoding sketch after the code below).
You can re-write your function and alter your regex patterns to produce 2 dataframes which can be joined and the resultant dataframe returned. With the appropriate regex changes, as given below, you can use the address column to join the 2 dataframes. I dislike a key which is an address but it does appear to be internally consistent across the result page. I have used a left join to return all rows from the dentist listings i.e. the practice business names.
library(tidyverse)
library(rvest)
urls <- sprintf("https://www.dentistsearch.ca/search-doctor/%i?category=0&services=0&province=55&city=&k=", 1:10)
pages <- lapply(urls, read_html)
get_dentist_info <- function(page) {
  page_text <- page %>% html_text()

  # address strings the page injects as map-marker content
  address_keys <- page_text %>%
    str_match_all('marker_\\d+\\.set\\("content", "(.*?)"\\);') %>%
    .[[1]] %>%
    .[, 2]

  # lat/long pairs; the lookahead drops the map-setup LatLng call
  lat_long <- page_text %>%
    str_match_all("LatLng\\((.*)\\);(?![\\s\\S]+myOptions)") %>%
    .[[1]] %>%
    .[, 2]

  lat_lon <- tibble(address = address_keys, lat_long = lat_long) %>%
    separate(lat_long, into = c("lat", "long"), sep = ", ") %>%
    mutate(lat = as.numeric(lat), long = as.numeric(long))

  practice_info <- tibble(
    title = page %>% html_elements(".title > a") %>% html_text(trim = TRUE),
    address = page %>% html_elements(".marker") %>% html_text()
  )

  dentist_info <- left_join(practice_info, lat_lon, by = "address")
  return(dentist_info)
}
all_dentist_info <- map_dfr(pages, get_dentist_info)
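As mentioned above, some listings never expose a LatLng on the site at all. A hedged sketch of filling those rows via a geocoding service, assuming the tidygeocoder package (the lat_osm/long_osm column names are my own, not part of the answer's code):
library(tidygeocoder)

# geocode only the rows whose coordinates were missing from the page
missing_rows <- all_dentist_info %>%
  filter(is.na(lat)) %>%
  geocode(address = address, method = "osm", lat = lat_osm, long = long_osm)

all_dentist_info <- all_dentist_info %>%
  left_join(select(missing_rows, address, lat_osm, long_osm), by = "address") %>%
  mutate(lat = coalesce(lat, lat_osm), long = coalesce(long, long_osm)) %>%
  select(-lat_osm, -long_osm)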

Web scraping in R with SelectorGadget

I was running the simple code below to scrape the employee number from this Fortune 500 page. I used Chrome's SelectorGadget extension to identify that the number I want matches ".info__row--7f9lE:nth-child(13) .info__value--2AHH7".
library(rvest)
library(dplyr)
#download google chrome extention: SelectorGadget
link = "https://fortune.com/company/walmart/"
page = read_html(link)
Employees = page %>% html_nodes(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") %>% html_text()
Employees
However, it returned "character(0)". Does anyone know what is the cause? I feel it must be a simple mistake somewhere. Thanks in advance!
Update
Here is the code I modified based on Jon's comments.
library(rvest)
library(jsonlite) # for fromJSON()

a <- c("https://fortune.com/company/walmart/",
       "https://fortune.com/company/amazon-com/",
       "https://fortune.com/company/apple/",
       "https://fortune.com/company/cvs-health/",
       "https://fortune.com/company/unitedhealth-group/",
       "https://fortune.com/company/berkshire-hathaway/",
       "https://fortune.com/company/mckesson/",
       "https://fortune.com/company/amerisourcebergen/",
       "https://fortune.com/company/alphabet/",
       "https://fortune.com/company/exxon-mobil/",
       "https://fortune.com/company/att/",
       "https://fortune.com/company/costco/",
       "https://fortune.com/company/cigna/",
       "https://fortune.com/company/cardinal-health/",
       "https://fortune.com/company/microsoft/",
       "https://fortune.com/company/walgreens-boots-alliance/",
       "https://fortune.com/company/kroger/",
       "https://fortune.com/company/home-depot/",
       "https://fortune.com/company/jpmorgan-chase/",
       "https://fortune.com/company/verizon/",
       "https://fortune.com/company/ford-motor/",
       "https://fortune.com/company/general-motors/",
       "https://fortune.com/company/anthem/",
       "https://fortune.com/company/centene/",
       "https://fortune.com/company/fannie-mae/",
       "https://fortune.com/company/comcast/",
       "https://fortune.com/company/chevron/",
       "https://fortune.com/company/dell-technologies/",
       "https://fortune.com/company/bank-of-america-corp/",
       "https://fortune.com/company/target/")
find_by_name <- function(list_data, name, elem = NULL) {
  idx <- which(sapply(list_data, \(x) x$name) == name, arr.ind = TRUE)
  stopifnot(length(idx) > 0)
  if (length(idx) > 1) { idx <- idx[1] }
  dat <- list_data[[idx]]
  if (is.null(elem)) dat else dat[[elem]]
}

numEmp <- numeric()
for (i in 1:length(a)) {
  json_data <- read_html(a[i]) |>
    html_element("script#preload") |>
    html_text() |>
    sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |>
    sub(";\\s*$", "", x = _, perl = TRUE) |>
    fromJSON(simplifyVector = FALSE)
  temp <- gsub(".*https://fortune.com", "", a[i])
  page_data <- json_data$components$page[[temp]]
  info_data <- page_data |>
    find_by_name("body", "children") |>
    find_by_name("company-about-wrapper", "children") |>
    find_by_name("company-information", "config")
  numEmp[i] <- info_data$employees # results will be fed into this numEmp variable
}
numEmp
An error says:
Error in find_by_name(page_data, "body", "children") :
  length(idx) > 0 is not TRUE
Should I somehow change the code stopifnot(length(idx) > 0)?
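One hedged way to do that, assuming some company pages simply carry a differently shaped JSON payload: let find_by_name() signal a miss instead of stopping, and record NA for that company. This is my own tweak, not the original answer's code:
# return NULL on a miss so one oddly shaped page doesn't abort the whole loop
find_by_name <- function(list_data, name, elem = NULL) {
  idx <- which(sapply(list_data, \(x) x$name) == name)
  if (length(idx) == 0) return(NULL) # miss: propagate NULL instead of erroring
  dat <- list_data[[ idx[1] ]]
  if (is.null(elem)) dat else dat[[elem]]
}

# then guard the assignment inside the loop:
# numEmp[i] <- if (is.null(info_data)) NA else info_data$employees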
When I do document.querySelectorAll(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") I see you want to scrape the # of employees. Maurits is right, looks like the data is downloaded as (inline) JSON and then rendered later. You can use Selenium to save the rendered page then apply your CSS selector there. Or you can extract the inline JSON and scrape it from there.
After some manual work, you can do the 2nd option like below (variants for both R 4.2.x and R 4.1.x are shown; the native pipe's _ placeholder needs R 4.2.0+).
library(rvest)
library(jsonlite)

# R 4.1.x helper: the base pipe placeholder _ needs R 4.2.x, so wrap sub() instead
sub2 <- function(x, pattern, replacement) sub(pattern, replacement, x = x, perl = TRUE)

url <- "https://fortune.com/company/walmart/"
json_data <- read_html(url) |>
  html_element("script#preload") |>
  html_text() |>
  ## sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |> # R 4.2.x
  sub2("\\s*window\\.__PRELOADED_STATE__ = ", "") |> # R 4.1.x
  ## sub(";\\s*$", "", x = _, perl = TRUE) |> # R 4.2.x
  sub2(";\\s*$", "") |> # R 4.1.x
  fromJSON(simplifyVector = FALSE)

page_data <- json_data$components$page[["/company/walmart/"]]

find_by_name <- function(list_data, name, elem = NULL) {
  idx <- which(sapply(list_data, \(x) x$name) == name, arr.ind = TRUE)
  stopifnot(length(idx) > 0)
  if (length(idx) > 1) { idx <- idx[1] }
  dat <- list_data[[idx]]
  if (is.null(elem)) dat else dat[[elem]]
}

info_data <- page_data |>
  find_by_name("body", "children") |>
  find_by_name("company-about-wrapper", "children") |>
  find_by_name("company-information", "config")

info_data$employees
#> [1] "2300000"
# Extra code to scrape company-data-table segments
library(purrr)

data_tables <- page_data |>
  find_by_name("body", "children") |>
  find_by_name("company-about-wrapper", "children") |>
  find_by_name("company-table-wrapper", "children")

rows <- data_tables |>
  lapply(\(x) c(x$config$data, x$config$change)) |>
  purrr::flatten() |>
  discard(~ is.null(.$key))

df <- data.frame(
  key = rows |> map_chr(~ .$key),
  title = rows |> map_chr(~ .$fieldMeta$title),
  type = rows |> map_chr(~ .$fieldMeta$type),
  value = rows |> map_chr(~ .$value)
)
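For completeness, the first option (render the page, then apply the original CSS selector) might look like the sketch below with RSelenium. This is a hedged sketch, assuming a working local driver setup and that Fortune's hashed class names are unchanged; it is not code from the original answer:
library(RSelenium)
library(rvest)

rD <- rsDriver(browser = "firefox", verbose = FALSE)
remDr <- rD$client
remDr$navigate("https://fortune.com/company/walmart/")
Sys.sleep(5) # crude wait for the client-side render to finish

read_html(remDr$getPageSource()[[1]]) %>%
  html_nodes(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") %>%
  html_text()

remDr$close()
rD$server$stop()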

Scrape page content after option tag is selected

I'd like to scrape the content of a page once the province (and the commune) are selected.
The following code correctly outputs the provinces and their values.
library(rvest)
page <- read_html(x = "https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati/")
text <- page %>% html_nodes(xpath = '//select[@name="provincia"]/option') %>% html_text()
values <- page %>% html_nodes(xpath = '//select[@name="provincia"]/option') %>% html_attr("value")
Res <- data.frame(text = text, values = values, stringsAsFactors = FALSE)
Res
Now, I'd like to access the page for each value, e.g. this might be helpful for getting access to value=19.
text <- page %>% html_nodes(xpath = "//*/option[@value = '19']") %>% html_text()
text
The source code is the following
<div class="row results_form_search">
  <form role="search" method="POST" class="search-form" action="/progetto-torelli/progetto-torelli-risultati/" id="search_location">
    <input type="hidden" name="comune_from" value="" />
    <div class="form-row">
      <input type="text" name="cognome" placeholder="Cognome" autocomplete="off" value="">
      <select name="provincia">
        <option value="0" selected>Seleziona Provincia</option>
        <option value="74">-</option>
        <option value="75">AGRIGENTO</option>
        <option value="19">ALESSANDRIA</option>
This is where the content that I want to scrape might be.
<div class="row">
  <ul class="listing_search">
  </ul>
</div>
Thank you so much for your advice!
RSelenium may end up being the way to go. However, if you can insert some judicious waits, or chunk your requests so the server isn't swamped, you can use rvest and make the same requests the page does.
You first need to generate all the combinations of provincia and comune (filtering out unwanted values). This can be done by making xmlhttp requests, using the value attribute of the options within the provincia select, to gather back each comune dropdown's options and their associated values.
You then make further requests, one per combination pair, to get the page content you would see when making those selections manually and pushing CERCA.
Pauses are needed because there are 10,389 valid combinations by my reckoning, and if you attempt to make all those requests one after the other, on top of the initial requests, the server will cut off the connection.
Another option would be to chunk up combined into smaller dataframes and make requests for those at timed intervals and then combine the results.
library(rvest)
library(dplyr)
library(purrr)

get_provincias <- function(link) {
  nodes <- read_html(link) %>%
    html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
  df <- data.frame(
    Provincia = nodes %>% html_text(trim = TRUE),
    id0 = nodes %>% html_attr("value")
  )
  return(df)
}

get_comunes <- function(id) {
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
    id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  # print(link)
  nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')
  df <- data.frame(
    id0 = id, # id1
    Comune = nodes %>% html_text(trim = TRUE),
    id3 = nodes %>% html_attr("value")
  )
  return(df)
}

get_page <- function(prov_id, com_id) {
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
    prov_id,
    com_id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  page <- read_html(link)
  # print(page %>% html_node(".listing_name") %>% html_text(trim = TRUE))
  return(tibble(id3 = com_id, page = page))
}

provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")
comunes <- map_df(provincias$id0, get_comunes) %>% filter(Comune != "-")
combined <- dplyr::right_join(provincias, comunes, by = "id0")
# length(combined$Comune) -> 10389
results <- map2_dfr(combined$id0, combined$id3, .f = get_page)
final <- dplyr::inner_join(combined, results, by = "id3")
Below is a longer version, with the additional info you requested, where I played around with adding pauses. I still found that I could run everything up to and including
combined <- dplyr::right_join(provincias, comunes, by = "id0")
in one go. But after that I needed to chunk requests into batches of about 2,000 with 20-30 minutes in between. You can try tweaking the timings below. I ended up using the commented-out section to run each batch, leaving a pause of 30 minutes in between.
Some things to consider:
It seems that you can have comunes values like ... which still return listings. With that in mind you may wish to remove the :not parts of this:
html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
as I assumed that was filtering out invalid results.
Next, you might consider writing a helper function with httr and RETRY, to make the requests with backoff/retry rather than fixed pauses. The core call might look like this:
httr::RETRY(
  "GET",
  <request url>,
  times = 3,
  pause_min = 20 * 60,
  pause_base = 20 * 60
)
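Expanding that into a full helper might look like the sketch below; the function name, status check, and pause_cap are my own assumptions rather than part of the original answer (httr caps the exponential backoff at pause_cap, which defaults to 60 seconds, so it needs raising for 20-minute pauses):
library(httr)
library(rvest)

# a hedged sketch of a backoff/retry fetch helper built on httr::RETRY
read_html_retry <- function(url) {
  resp <- httr::RETRY("GET", url,
                      times = 3,
                      pause_min = 20 * 60,
                      pause_base = 20 * 60,
                      pause_cap = 20 * 60)
  httr::stop_for_status(resp) # error out if all retries failed
  read_html(httr::content(resp, as = "text", encoding = "UTF-8"))
}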
Anyway, those are some ideas. Even when waits kept the server from cutting the connection, I still found it started to throttle requests, meaning some took quite a long time to complete. Optimizing this could take a lot of time and effort; I spent a good few days playing around with chunk sizes and waits.
library(rvest)
library(dplyr)
library(purrr)

get_provincias <- function(link) {
  nodes <- read_html(link) %>%
    html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
  df <- data.frame(
    Provincia = nodes %>% html_text(trim = TRUE),
    id0 = nodes %>% html_attr("value")
  )
  return(df)
}

get_comunes <- function(id) {
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
    id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  # print(link)
  nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')
  df <- data.frame(
    id0 = id, # id1
    Comune = nodes %>% html_text(trim = TRUE),
    id3 = nodes %>% html_attr("value")
  )
  return(df)
}

get_data <- function(prov_id, com_id) {
  link <- sprintf(
    "https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
    prov_id,
    com_id,
    as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
  )
  # print(link)
  page <- read_html(link)
  df <- data.frame(
    cognome = page %>% html_nodes(".listing_name") %>% html_text(trim = TRUE),
    livello = page %>% html_nodes(".listing_level") %>% html_text(trim = TRUE),
    id3 = com_id, # for later join back on comune
    id0 = prov_id
  )
  Sys.sleep(.25) # pause for .25 sec between requests
  return(df)
}

get_chunks <- function(df, chunk_size) { # adapted from @BenBolker https://stackoverflow.com/a/7060331
  n <- nrow(df)
  r <- rep(1:ceiling(n / chunk_size), each = chunk_size)[1:n]
  d <- split(df, r)
  return(d)
}

write_rows <- function(df, filename) {
  flag <- file.exists(filename)
  df2 <- purrr::map2_dfr(df$id0, df$id3, .f = get_data)
  write.table(df2,
              file = filename, sep = ",",
              append = flag,
              quote = FALSE, col.names = !flag,
              row.names = FALSE)
  Sys.sleep(60 * 10)
}

provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")
Sys.sleep(60 * 5)

comunes <- map_df(provincias$id0, get_comunes) %>% filter(Comune != "-")
Sys.sleep(60 * 10)

combined <- dplyr::right_join(provincias, comunes, by = "id0")
Sys.sleep(60 * 10)

chunked <- get_chunks(combined, 2000) # https://stackoverflow.com/questions/7060272/split-up-a-dataframe-by-number-of-rows
filename <- "prov_com_cog_liv.csv"

map(chunked, ~ write_rows(.x, filename))

## #### test case: run a single chunk by hand ####
# df <- chunked[[6]]
# flag <- file.exists(filename)
# df2 <- map2_dfr(df$id0, df$id3, .f = get_data)
# write.table(df2,
#             file = filename, sep = ",",
#             append = flag,
#             quote = FALSE, col.names = !flag,
#             row.names = FALSE)
#################################################

results <- read.csv(filename)
final <- dplyr::right_join(combined, results, by = "id3")

Web scraping a PDF file using R

I've been web scraping articles in R from the Oxford journals and want to grab the full text of specific articles. All articles have a PDF link, so I've been trying to pull the PDF link and write the entire text to a CSV. The full text should all fit into one row; however, the output CSV shows one article spread over 11 rows. How can I fix this issue?
The code is below:
#### install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# for Fulltext to read pdf
#### install.packages("pdftools")
library(pdftools)

fullText <- function(parsedDocument) {
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>%
    html_attr('href')
  frontLink <- "https://academic.oup.com"
  # link of pdf
  pdfLink <- paste(frontLink, endLink, sep = "")
  # extract full text from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, sep = "\n")
  return(fulltext)
}

#############################################
# main function with input as parameter year
testFullText <- function(DOIurl) {
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}

testFullText("https://doi.org/10.1093/dnares/dsm026")
This is how I would approach this task.
library(tidyverse)
library(rvest)
library(pdftools) # for pdf_text()

df <- data.frame(
  # a data.frame with a column of links to html research articles
  links_to_articles = c("https://doi.org/10.1093/dnares/dsm026",
                        "https://doi.org/10.1093/dnares/dsm027")
) %>%
  # process each row separately (read_html takes a single link, not a vector of links)
  rowwise() %>%
  mutate(
    pdf_link = read_html(links_to_articles) %>%
      html_node('.article-pdfLink') %>%
      html_attr('href') %>%
      paste0("https://academic.oup.com", .),
    articles_txt = pdf_text(pdf_link) %>%
      paste0(collapse = " ")
  ) %>%
  ungroup()

# writing the csv
df %>%
  write_csv(file = "DNAresearch.csv")
Using your code, I would do:
#### install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# for Fulltext to read pdf
#### install.packages("pdftools")
library(pdftools)

fullText <- function(parsedDocument) {
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>%
    html_attr('href')
  frontLink <- "https://academic.oup.com"
  # link of pdf
  pdfLink <- paste(frontLink, endLink, sep = "")
  # extract full text from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, collapse = " ") # here I changed sep to collapse
  return(fulltext)
}

#############################################
# main function with input as parameter year
testFullText <- function(DOIurl) {
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument) %>% str_squish(), # str_squish removes extra spaces
                        stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}

testFullText("https://doi.org/10.1093/dnares/dsm026")
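The sep/collapse distinction is the crux of the question's 11-row output: pdf_text() returns a character vector with one element per page, and paste() with sep leaves the vector's length unchanged, so each page became its own CSV row. A quick illustration:
pages <- c("page one", "page two", "page three")
paste(pages, sep = "\n")      # still length 3: sep only separates multiple arguments
paste(pages, collapse = "\n") # length 1: collapse joins the elements of one vector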

Scraping a table with multiple headers in R using any package? (XML, RCurl, rlist, htmltab, rvest, etc.)

I am attempting to scrape this table:
http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1
Here are all my attempts. None of them get even close to extracting any information. Am I missing something?
library("rvest")
library("tidyverse")
# METHOD 1
url <- "http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1"
data <- url %>%
read_html() %>%
html_nodes(xpath='//*[#id="t1"]/tbody/tr[1]') %>%
html_table()
data <- data[[1]]
# METHOD 2
library(XML)
library(RCurl)
library(rlist)
theurl <- getURL("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",.opts = list(ssl.verifypeer = FALSE) )
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
tables[[which.max(n.rows)]]
# METHOD 3
library(htmltab)
tab <- htmltab("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",
               which = '//*[@id="t1"]/tbody/tr[4]',
               header = '//*[@id="t1"]/tbody/tr[3]',
               rm_nodata_cols = TRUE)
# METHOD 4
website <- read_html("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1")
scraped <- website %>%
  html_nodes("table") %>%
  .[(2)] %>%
  html_table(fill = TRUE) %>%
  `[[`(1)
# METHOD 5
getHrefs <- function(node, encoding) {
  if (!is.null(xmlChildren(node)$a)) {
    paste(xpathSApply(node, './a', xmlGetAttr, "href"), collapse = ",")
  } else {
    return(xmlValue(xmlChildren(node)$text))
  }
}
data <- readHTMLTable("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",
                      which = 1, elFun = getHrefs)
The expected result is the table's 12 column names and the data below them.
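A likely cause, though I have not verified it against the live site: the daily extract is injected by JavaScript, so the static HTML that read_html() and readHTMLTable() fetch contains no table rows at all, and every selector comes back empty. One hedged option is to render the page first, e.g. with rvest's read_html_live() (rvest >= 1.0.4, which drives headless Chrome via the chromote package):
library(rvest)

# a hedged sketch, assuming the table is rendered client-side into #t1
page <- read_html_live("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1")
tab <- page %>%
  html_element("#t1") %>%
  html_table(fill = TRUE)
head(tab)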