web scraping in r with SelectorGadget - html

I was running this simple code below to scrape the employee number from this Fortune 500 page. I used the Chrome's extention: SelectorGadget to identify that the number I want matches with ".info__row--7f9lE:nth-child(13) .info__value--2AHH7"
library(rvest)
library(dplyr)
#download google chrome extention: SelectorGadget
link = "https://fortune.com/company/walmart/"
page = read_html(link)
Employees = page %>% html_nodes(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") %>% html_text()
Employees
However, it returned "character(0)". Does anyone know what is the cause? I feel it must be a simple mistake somewhere. Thanks in advance!
Update
Here is the code I modified based on Jon's comments.
a <- c("https://fortune.com/company/walmart/", "https://fortune.com/company/amazon-com/"
,"https://fortune.com/company/apple/"
,"https://fortune.com/company/cvs-health/"
,"https://fortune.com/company/unitedhealth-group/"
, "https://fortune.com/company/berkshire-hathaway/"
, "https://fortune.com/company/mckesson/"
,"https://fortune.com/company/amerisourcebergen/"
, "https://fortune.com/company/alphabet/"
, "https://fortune.com/company/exxon-mobil/"
,"https://fortune.com/company/att/"
,"https://fortune.com/company/costco/"
,"https://fortune.com/company/cigna/"
, "https://fortune.com/company/cardinal-health/"
,"https://fortune.com/company/microsoft/"
,"https://fortune.com/company/walgreens-boots-alliance/"
,"https://fortune.com/company/kroger/"
, "https://fortune.com/company/home-depot/"
,"https://fortune.com/company/jpmorgan-chase/"
,"https://fortune.com/company/verizon/"
,"https://fortune.com/company/ford-motor/"
, "https://fortune.com/company/general-motors/"
,"https://fortune.com/company/anthem/"
, "https://fortune.com/company/centene/"
,"https://fortune.com/company/fannie-mae/"
, "https://fortune.com/company/comcast/"
, "https://fortune.com/company/chevron/"
,"https://fortune.com/company/dell-technologies/"
,"https://fortune.com/company/bank-of-america-corp/"
,"https://fortune.com/company/target/")
find_by_name <- function(list_data, name, elem = NULL) {
idx <- which(sapply(list_data, \(x) x$name) == name, arr.ind = TRUE)
stopifnot(length(idx) > 0)
if (length(idx) > 1) { idx <- idx[1] }
dat <- list_data[[idx]]
if (is.null(elem)) dat else dat[[elem]]
}
numEmp <- numeric()
for (i in 1:length(a)){
json_data <- read_html(a[i]) |>
html_element("script#preload") |>
html_text() |>
sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |>
sub(";\\s*$", "", x = _, perl = TRUE) |>
fromJSON(simplifyVector = FALSE)
temp<-gsub(".*https://fortune.com", "", a[i])
page_data <- json_data$components$page[[temp]]
info_data <- page_data |>
find_by_name("body", "children") |>
find_by_name("company-about-wrapper", "children") |>
find_by_name("company-information", "config")
numEmp[i] <- info_data$employees # Results will be fed into this numEmp variable.
}
numEmp
An error says
Error in find_by_name(page_data, "body", "children") :
length(idx) > 0 is not TRUE
Should I somehow change the code stopifnot(length(idx) > 0)?

When I do document.querySelectorAll(".info__row--7f9lE:nth-child(13) .info__value--2AHH7") I see you want to scrape the # of employees. Maurits is right, looks like the data is downloaded as (inline) JSON and then rendered later. You can use Selenium to save the rendered page then apply your CSS selector there. Or you can extract the inline JSON and scrape it from there.
After some manual work, you can do the 2nd option like below in R 4.2.x
library(rvest)
library(jsonlite)
# R 4.1.x
sub2 <- function(x, pattern, replacement) sub(pattern, replacement, x = x, perl = TRUE)
url <- "https://fortune.com/company/walmart/"
json_data <- read_html(url) |>
html_element("script#preload") |>
html_text() |>
## sub("\\s*window\\.__PRELOADED_STATE__ = ", "", x = _, perl = TRUE) |> # R 4.2.x
sub2("\\s*window\\.__PRELOADED_STATE__ = ", "") |> # R 4.1.x
## sub(";\\s*$", "", x = _, perl = TRUE) |> # R 4.2.x
sub2(";\\s*$", "") |> # R 4.1.x
fromJSON(simplifyVector = FALSE)
page_data <- json_data$components$page[["/company/walmart/"]]
find_by_name <- function(list_data, name, elem = NULL) {
idx <- which(sapply(list_data, \(x) x$name) == name, arr.ind = TRUE)
stopifnot(length(idx) > 0)
if (length(idx) > 1) { idx <- idx[1] }
dat <- list_data[[idx]]
if (is.null(elem)) dat else dat[[elem]]
}
info_data <- page_data |>
find_by_name("body", "children") |>
find_by_name("company-about-wrapper", "children") |>
find_by_name("company-information", "config")
info_data$employees
#> [1] "2300000"
# Extra code to scrape company-data-table segments
library(purrr)
data_tables <- page_data |>
find_by_name("body", "children") |>
find_by_name("company-about-wrapper", "children") |>
find_by_name("company-table-wrapper", "children")
rows <- data_tables |>
lapply(\(x) c(x$config$data, x$config$change)) |>
purrr::flatten() |>
discard(~ is.null(.$key))
df <- data.frame(
key = rows |> map_chr(~ .$key),
title = rows |> map_chr(~ .$fieldMeta$title),
type = rows |> map_chr(~ .$fieldMeta$type),
value = rows |> map_chr(~ .$value)
)

Related

How to scrape from headings and content

I've been given a pile of about 100 html files that I want to put into rectangular form. Here's an example: http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html . I would like to extract headings (h3) as column names and the content in between as strings taking up one row each.
I've managed to extract the column names with Rvest in R, but I'm stuck at extracting the content. I'm sure I'll get stuck again when trying to bind everything together in one dataframe.
This is what I've done for extracting the variable names:
variable.names <- map(LIST.html, ~read_html(.x) %>%
html_nodes("h3") %>%
html_text(trim = TRUE) %>%
tolower())
Here's the code I used to get all the files: system( "wget -r -np -nH --cut-dirs=3 -R index.html http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/" )
library(rvest)
library(stringr)
library(data.table)
malformed_documents <- character(0)
parse_profile_page <- function(pg, nm) {
# extract section divs, omit byline
divs <- html_nodes(pg, '#mittvagn > div')[ -1 ]
idx <- which((lapply(divs, html_attr, 'id') |> unlist()) == 'bakvagn')
if (length(idx) > 0) {
divs <- divs[ -idx ]
}
# extract section headers
titles <- html_nodes(divs, 'h3') |>
html_text(trim = TRUE) |>
tolower() |>
unlist()
# extract section contents
paragraphs <- lapply(divs, html_nodes, 'p') |>
lapply(html_text, trim = TRUE) |>
lapply(paste0, collapse = '\n') |>
lapply(str_squish) |>
unlist()
if (length(paragraphs) != length(titles)) {
message(sprintf('%s is malformed, not parsing', nm))
malformed_documents <<- c(malformed_documents, nm)
return(data.frame())
}
df <- data.frame(title = titles,
contents = paragraphs,
url = rep(nm, length(paragraphs)))
df
}
# obtain list of files to download
url <- 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/'
pg <- read_html(url)
file_urls <- html_nodes(pg, '#mittvagn > ol > li > a:nth-child(1)') |>
html_attr('href') |>
str_replace_all('^\\.\\./', '') |>
sprintf(fmt= 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/%s')
# file_urls <- sample(file_urls, 10) # uncomment to run on a small sample of pages
file_contents <- lapply(file_urls, function(x) {
message('downloading: ', x)
fl <- read_html(x)
})
names(file_contents) <- file_urls
parsed_contents <- lapply(file_urls, function(x) {
message('parsing: ', x)
pg <- file_contents[[ x ]]
parse_profile_page(pg, x)
})
parsed_contents_df <- rbindlist(parsed_contents) |> as.data.frame()
if (length(malformed_documents) > 0) {
warning('the following documents were malformed and not parsed: %s', paste0(malformed_documents, collapse = ', '))
}
View(parsed_contents_df)

Tips for Increasing the Effectiveness of this R Code

I am writing a loop (in R) to webscrape Reddit posts - using Reddit's API ("Pushshift").
Essentially, I would like to get every comment that contains the word "Trump" between now and until 20,000 hours ago at an hourly basis. The API stores the comments in a JSON frame - I wrote the following code in R to obtain these comments (note - I made it so that the results are saved after every 200 iterations in case of a crash):
library(jsonlite)
part1 = "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 = "h&before="
part3 = "h&size=500"
results = list()
for (i in 1:20000)
{tryCatch({
{
url_i<- paste0(part1, i+1, part2, i, part3)
r_i <- data.frame(fromJSON(url_i))
results[[i]] <- r_i
myvec_i <- sapply(results, NROW)
print(c(i, sum(myvec_i)))
ifelse(i %% 200 == 0, saveRDS(results, "results_index.RDS"), "" )
}
}, error = function(e){})
}
final = do.call(rbind.data.frame, results)
saveRDS(final, "final.RDS")
The code runs - but I am looking for tips to increase the speed and efficiency of this code. For example, I have noticed that:
Sometimes this code seems to take a really long time on certain iterations
I also have a feeling that as the "list" grows in size and the global environment with R becomes more full, things are also slowing down.
Sometimes, the webscraping stops collecting new results (i.e. I added a statement which shows the cumulative number of results that have been collected at each iteration - sometimes, this number stops updating)
I used "tryCatch()" to skip errors to prevent the loop from crashing - but perhaps there might have been some way around this that could have potentially resulted in more Reddit comments being scraped?
Could someone please recommend some tips on how to optimize and speed this code up? Perhaps someone could try running this code and let me know what they think?
Thank you!
There are two things you can do : 1) save the data.frame into a ".RData file" at each iteration. You need less memory when you do this because you do not store data in the RAM 2) use parallel calculations. Here is an example :
library(parallel)
library(doParallel)
library(RSelenium)
fn_Par <- function(core_Id, all_Index, list_remDr, nb_Core)
{
library(jsonlite)
library(RSelenium)
remDr <- list_remDr[[core_Id]]
remDr$open()
setwd("D:\\")
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "h&before="
part3 <- "h&size=500"
nb_Index_All <- length(all_Index)
nb_Id_Per_Core <- floor(nb_Index_All / nb_Core)
index_To_Extract <- all_Index[(1 + (core_Id - 1) * nb_Id_Per_Core) : min((core_Id * nb_Id_Per_Core), nb_Index_All)]
for(i in index_To_Extract)
{
url_i <- paste0(part1, i + 1, part2, i, part3)
remDr$navigate(url_i)
Sys.sleep(0.5)
web_Obj <- remDr$findElement("css selector", 'body > pre')
r_i <- tryCatch(data.frame(fromJSON(web_Obj$getElementText()[[1]])), error = function(e) NA)
if(is.null(dim(r_i)) == FALSE)
{
Sys.sleep(10)
remDr$navigate(url_i)
web_Obj <- remDr$findElement("css selector", 'body > pre')
r_i <- tryCatch(data.frame(fromJSON(web_Obj$getElementText()[[1]])), error = function(e) NA)
}
save(r_i, file = paste0(i, "_core_Id_", core_Id, ".RData"))
Sys.sleep(0.5)
}
}
nb_CPU <- 4
cluster <- parallel::makeCluster(nb_CPU)
doParallel::registerDoParallel(cl = cluster)
list_remDr <- list()
list_rd <- list()
for(i in 1 : nb_CPU)
{
print(i)
port <- as.integer(4444L + rpois(lambda = 1000, 1))
list_rd[[i]] <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)
list_remDr[[i]] <- list_rd[[i]]$client
}
parLapply(cluster, X = 1 : nb_CPU, fun = fn_Par, all_Index = 1 : 2000, list_remDr = list_remDr, nb_Core = nb_CPU)
Here is another approach that can be considered :
library(parallel)
library(doParallel)
fn_Par <- function(core_Id, all_Index, nb_Core)
{
library(jsonlite)
setwd("D:\\")
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "h&before="
part3 <- "h&size=500"
nb_Index_All <- length(all_Index)
nb_Id_Per_Core <- floor(nb_Index_All / nb_Core)
index_To_Extract <- all_Index[(1 + (core_Id - 1) * nb_Id_Per_Core) : min((core_Id * nb_Id_Per_Core), nb_Index_All)]
for(i in index_To_Extract)
{
url_i <- paste0(part1, i + 1, part2, i, part3)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
if(is.null(dim(r_i)) == TRUE)
{
Sys.sleep(5)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
}
if(is.null(dim(r_i)) == TRUE)
{
Sys.sleep(5)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
}
if(is.null(dim(r_i)) == TRUE)
{
Sys.sleep(5)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
}
save(r_i, file = paste0(i, "_core_Id_", core_Id, ".RData"))
}
}
nb_CPU <- 4
cluster <- parallel::makeCluster(nb_CPU)
doParallel::registerDoParallel(cl = cluster)
parLapply(cluster, X = 1 : nb_CPU, fun = fn_Par, all_Index = 1 : 2000, nb_Core = nb_CPU)

Scrape page content after option tag is selected

I'd like to scrape the content of a page once the province (and the commune) are selected.
The following code correctly outputs the provinces and their values.
library(rvest)
page <- read_html(x = "https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati/")
text <- page %>% html_nodes(xpath='//select[#name="provincia"]/option')%>% html_text()
values <- page %>% html_nodes(xpath='//select[#name="provincia"]/option')%>% html_attr("value")
Res <- data.frame(text = text, values = values, stringsAsFactors = FALSE)
Res
Now, I'd like to access the page for each value, e.g. this might be helpful for getting access to value=19.
text <- page %>% html_nodes(xpath="//*/option[#value = '19']")%>% html_text()
text
The source code is the following
<div class="row results_form_search">
<form role="search" method="POST" class="search-form" action="/progetto-torelli/progetto-torelli-risultati/" id="search_location">
<input type="hidden" name="comune_from" value="" />
<div class="form-row">
<input type="text" name="cognome" placeholder="Cognome" autocomplete="off" value="">
<select name="provincia">
<option value="0" selected>Seleziona Provincia</option>
<option value="74"
>-
</option>
<option value="75"
>AGRIGENTO
</option>
<option value="19"
>ALESSANDRIA
This is where the content that I want to scrape might be.
<div class="row">
<ul class="listing_search">
</ul>
</div>
Thank you so much for your advice!
RSelenium may end up being the way to go. However, if you can insert some judicious waits, or chunk your requests, so server isn't swamped with requests, you can use rvest and make the same requests the page does.
You first need to generate all the combinations of province and comune (filtering out unwanted values); this can be done by making xmlhttp requests, using the value attribute for the options within the select for province, to gather back the comune dropdown options and their associated values.
You then make further requests, for each combination pair, to get the page content, which you would get when making selections from each of those dropdowns manually and pushing CERCA.
Pauses are needed as there are 10,389 valid combinations, by my reckoning, and, if you attempt to make all those requests one after the other, following the initial requests as well, the server will cut-off the connection.
Another option would be to chunk up combined into smaller dataframes and make requests for those at timed intervals and then combine the results.
library(rvest)
library(dplyr)
library(purrr)
get_provincias <- function(link) {
nodes <- read_html(link) %>%
html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
df <- data.frame(
Provincia = nodes %>% html_text(trim = T),
id0 = nodes %>% html_attr("value")
)
return(df)
}
get_comunes <- function(id) {
link <- sprintf(
"https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
id,
as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
)
# print(link)
nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')
df <- data.frame(
id0 = id, # id1
Comune = nodes %>% html_text(trim = T),
id3 = nodes %>% html_attr("value")
)
return(df)
}
get_page <- function(prov_id, com_id) {
link <- sprintf(
"https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
prov_id,
com_id,
as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
)
page <- read_html(link)
# print(page %>% html_node(".listing_name") %>% html_text(trim = T))
# print(tibble(id3 = com_id, page = page))
return(tibble(id3 = com_id, page = page))
}
provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")
comunes <- map_df(provincias$id0, get_comunes) %>% filter(Comune != "-")
combined <- dplyr::right_join(provincias, comunes, by = "id0")
# length(combined$Comune) -> 10389
results <- map2_dfr(combined$id0, combined$id3, .f = get_page)
final <- dplyr::inner_join(combined, results, by = "id3")
Below is a longer version, with the additional info you requested, where I played around with adding pauses. I still found that I could run up to, and including
combined <- dplyr::right_join(provincias, comunes, by = "id0")
in one go. But after that I needed to chunk requests into about 2000 requests batches with 20-30 minutes in between. You can try tweaking the timings below. I ended up using the commented out section to run each batch and then left a pause of 30 mins in between.
Some things to consider:
It seems that you can have comunes values like ... which still return listings. With that in mind you may wish to remove the :not parts of this:
html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
as I assumed that was filtering out invalid results.
Next, you might consider writing a helper function with httr and retry,
to make the requests with backoff/retry, rather than use pauses.
Such a function might look like this:
httr::RETRY(
"GET",
<request url>,
times = 3,
pause_min = 20*60,
pause_base = 20*60)
Anyway, those are some ideas. Even without the server cutting the connection, via uses of waits, I still found it started to throttle requests, meaning some requests took quite a long time to complete. Optimizing this could potentially take a lot of time and effort. I spent a good few days playing around looking at chunk size and waits.
library(rvest)
library(dplyr)
library(purrr)
get_provincias <- function(link) {
nodes <- read_html(link) %>%
html_nodes('[name="provincia"] > option:not([selected]):not(:contains("-")):not(:contains("\u0085"))')
df <- data.frame(
Provincia = nodes %>% html_text(trim = T),
id0 = nodes %>% html_attr("value")
)
return(df)
}
get_comunes <- function(id) {
link <- sprintf(
"https://www.solferinoesanmartino.it/db-torelli/_get_comuni.php?id0=%s&id1=0&_=%i",
id,
as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
)
# print(link)
nodes <- read_html(link) %>% html_nodes('option:not([value="0"])')
df <- data.frame(
id0 = id, # id1
Comune = nodes %>% html_text(trim = T),
id3 = nodes %>% html_attr("value")
)
return(df)
}
get_data <- function(prov_id, com_id) {
link <- sprintf(
"https://www.solferinoesanmartino.it/db-torelli/_get_soldati.php?id0=1&id1=&id2=%s&id3=%s&_=%i",
prov_id,
com_id,
as.numeric(as.POSIXct(Sys.Date(), format = "%Y-%m-%d"))
)
# print(link)
page <- read_html(link)
df <- data.frame(
cognome = page %>% html_nodes(".listing_name") %>% html_text(trim = T),
livello = page %>% html_nodes(".listing_level") %>% html_text(trim = T),
id3 = com_id,# for later join back on comune
id0 = prov_id
)
Sys.sleep(.25) # pause for . sec
return(df)
}
get_chunks <- function(df, chunk_size) { # adapted from #BenBolker https://stackoverflow.com/a/7060331
n <- nrow(df)
r <- rep(1:ceiling(n / chunk_size), each = chunk_size)[1:n]
d <- split(df, r)
return(d)
}
write_rows <- function(df, filename) {
flag <- file.exists(filename)
df2 <- purrr::map2_dfr(df$id0, df$id3, .f = get_data)
write.table(df2,
file = filename, sep = ",",
append = flag,
quote = F, col.names = !flag,
row.names = F
)
Sys.sleep(60*10)
}
provincias <- get_provincias("https://www.solferinoesanmartino.it/progetto-torelli/progetto-torelli-risultati")
Sys.sleep(60*5)
comunes <- map_df(provincias$id0, get_comunes) %>% filter(Comune != "-")
Sys.sleep(60*10)
combined <- dplyr::right_join(provincias, comunes, by = "id0")
Sys.sleep(60*10)
chunked <- get_chunks(combined, 2000) # https://stackoverflow.com/questions/7060272/split-up-a-dataframe-by-number-of-rows
filename <- "prov_com_cog_liv.csv"
map(chunked, ~ write_rows(.x, filename))
## #### test case #####################
# df <- chunked[[6]]
#
# flag <- file.exists(filename)
#
# df2 <- map2_dfr(df$id0, df$id3, .f = get_data)
#
# write.table(df2,
# file = filename, sep = ",",
# append = flag,
# quote = F, col.names = !flag,
# row.names = F
# )
####################################
results <- read.csv(filename)
final <- dplyr::right_join(combined, results, by = "id3")

How can I get the data of the second web page?

I am trying to get the data from a web in R using rvest package: https://etfdb.com/stock/AAPL/
But no matter how I tried, I can only get the table of the first page. Could anybody help me do this? Thank you so much.
See code below. tb1 and tb2 are the same!! That's wired.
url1 <- "https://etfdb.com/stock/AAPL/#etfs&sort_name=weighting&sort_order=desc&page=1"
url2 <- "https://etfdb.com/stock/AAPL/#etfs&sort_name=weighting&sort_order=desc&page=2"
tbs1 <- rvest::html_nodes(xml2::read_html(url1), "table")
tbs2 <- rvest::html_nodes(xml2::read_html(url2), "table")
tb1 <- rvest::html_table(tbs1[1])[[1]]
tb2 <- rvest::html_table(tbs2[1])[[1]]
This website post GET requests to update JSON data to the table. After some attempts, this is the code I came up with to deal with JSON data: (not a beautiful code but it works)
library(rjson)
library(rvest)
library(writexl)
lastpage <- 9;
df <- data.frame();
for (i in 1:lastpage){
x <- fromJSON(file = paste("https://etfdb.com/data_set/?tm=40274&cond={%22by_stock%22:25}&no_null_sort=&count_by_id=true&limit=25&sort=weighting&order=desc&limit=25&offset=", 25 * (i-1), sep = ""));
x <- x[2][[1]];
pg_df <- data.frame(matrix(unlist(x), nrow=length(x), byrow=T),stringsAsFactors=FALSE);
df <- rbind(df, pg_df);
}
for (i in 1:nrow(df)){
df$X1[i] <- read_html(df$X1[i]) %>% html_text(trim = TRUE);
df$X3[i] <- read_html(df$X3[i]) %>% html_text(trim = TRUE);
df$X5[i] <- read_html(df$X5[i]) %>% html_text(trim = TRUE);
}
df <- data.frame(df$X1, df$X3, df$X5, df$X7, df$X9);
colnames(df) <- c("Ticker", "ETF", "ETFdb.com Category", "Expense Ratio", "Weighting");
write_xlsx(
df,
path = "stock.xlsx",
col_names = TRUE,
format_headers = TRUE,
use_zip64 = FALSE
)
Update:
You can see the data source at the attribute data-url of the table here:
I'll update the code that makes it easier for you:
library(rjson)
library(rvest)
library(writexl)
stock_ticket <- "AAPL";
url <- paste("https://etfdb.com/stock/", stock_ticket, sep = "");
lastpage <- 9;
df <- data.frame();
data_url <- read_html(url) %>% html_node(xpath = "//table[#id='etfs']") %>% html_attr("data-url");
for (i in 1:lastpage){
x <- fromJSON(file = paste("https://etfdb.com", data_url, "&offset=", 25 * (i-1), sep = ""));
x <- x[2][[1]];
pg_df <- data.frame(matrix(unlist(x), nrow=length(x), byrow=T),stringsAsFactors=FALSE);
df <- rbind(df, pg_df);
}
for (i in 1:nrow(df)){
df$X1[i] <- read_html(df$X1[i]) %>% html_text(trim = TRUE);
df$X3[i] <- read_html(df$X3[i]) %>% html_text(trim = TRUE);
df$X5[i] <- read_html(df$X5[i]) %>% html_text(trim = TRUE);
}
df <- data.frame(df$X1, df$X3, df$X5, df$X7, df$X9);
colnames(df) <- c("Ticker", "ETF", "ETFdb.com Category", "Expense Ratio", "Weighting");
write_xlsx(
df,
path = "stock.xlsx",
col_names = TRUE,
format_headers = TRUE,
use_zip64 = FALSE
)

Scraping table with multiple headers in R using any package? (XML, rCurl, rlist htmltab, rvest etc)

I am attempting to scrape this table
http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1
Here are all my attempts. None of them get even close to extracting any information. Am i missing something?
library("rvest")
library("tidyverse")
# METHOD 1
url <- "http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1"
data <- url %>%
read_html() %>%
html_nodes(xpath='//*[#id="t1"]/tbody/tr[1]') %>%
html_table()
data <- data[[1]]
# METHOD 2
library(XML)
library(RCurl)
library(rlist)
theurl <- getURL("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",.opts = list(ssl.verifypeer = FALSE) )
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
tables[[which.max(n.rows)]]
# METHOD 3
library(htmltab)
tab <- htmltab("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",
which = '//*[#id="t1"]/tbody/tr[4]',
header = '//*[#id="t1"]/tbody/tr[3]',
rm_nodata_cols = TRUE)
# METHOD 4
website <-read_html("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1")
scraped <- website %>%
html_nodes("table") %>%
.[(2)] %>%
html_table(fill = TRUE) %>%
`[[`(1)
# METHOD 5
getHrefs <- function(node, encoding)
if (!is.null(xmlChildren(node)$a)) {
paste(xpathSApply(node, './a', xmlGetAttr, "href"), collapse = ",")
} else {
return(xmlValue(xmlChildren(node)$text))
}
data <- ( readHTMLTable("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1", which = 1, elFun = getHrefs) )
The expected results should be the 12 colnames in the table & the data below