Web Scraping using R - non-table content/list - html

I am trying to scrape the list of compound synonyms from this website:
https://pubchem.ncbi.nlm.nih.gov/compound/440917#section=Depositor-Supplied-Synonyms&fullscreen=true
My current code looks like this:
dl <- read_html("https://pubchem.ncbi.nlm.nih.gov/compound/440917#section=Depositor-Supplied-Synonyms&fullscreen=true")
# Attempt to pull the synonym text out of a parsed PubChem page.
# NOTE(review): this returns nothing because the synonym section is rendered
# client-side by JavaScript (see the answer below); the ".section-content-item"
# nodes are not present in the static HTML that read_html() downloads.
get_synonyms <- function(x){
x %>%
html_nodes(".section-content-item") %>%
html_text()
}
get_synonyms(dl)
I want to be able to do this for multiple compounds from PubChem so I am using a function. I am unsure what to put in the html_nodes() function based on the website's structure. The following did not work:
section-content-item, section-content, Depositor-Supplied-Synonyms all which seem to be classes holding the table of synonyms.
Thank you for any help

The following function will return all the synonyms for a given compound from the site, as long as you know the ID number:
library(httr)
library(dplyr)
# Fetch all depositor-supplied synonyms for a PubChem compound.
#
# @param compound A PubChem CID (number or string), e.g. 440917.
# @return A character vector of synonym strings.
get_synonyms <- function(compound) {
  info <- GET(paste0(
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/",
    compound, "/JSON/?heading=Depositor+Supplied+Synonyms"
  )) %>%
    content("parsed") %>%
    {.$Record$Section[[1]]$Section[[1]]$Section[[1]]$Information} %>%
    unlist()
  # unlist() keeps names such as "ReferenceNumber" and
  # "Value.StringWithMarkup.String". Returning everything is why a stray
  # ReferenceNumber (e.g. "84") showed up as the first "synonym"; keep only
  # the elements that are actual synonym strings.
  unname(info[endsWith(names(info), "String")])
}
For example, with your Limonene case:
get_synonyms(440917)
#> [1] "84"
#> [2] "D-Limonene"
#> [3] "5989-27-5"
#> [4] "(R)-(+)-Limonene"
#> [5] "(+)-Limonene"
#> [6] "(D)-Limonene"
#> [7] "(+)-(4R)-Limonene"
#> [8] "(+)-carvene"
#> [9] "(4R)-Limonene"
#> [10] "D-(+)-Limonene"
#> [11] "D-Limonen"
#> [12] "(R)-Limonene"
#> [13] "(R)-p-Mentha-1,8-diene"
#> [14] "Citrene"
#> [15] "(+)-p-Mentha-1,8-diene"
#> [16] "(R)-4-Isopropenyl-1-methyl-1-cyclohexene"
#> [17] "Limonene, D-"
#> [18] "(+)-R-Limonene"
#> [19] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (4R)-"
#> [20] "d-p-Mentha-1,8-diene"
#> [21] "(+)-4-Isopropenyl-1-methylcyclohexene"
#> [22] "(4R)-4-isopropenyl-1-methylcyclohexene"
#> [23] "(R)-(+)-p-Mentha-1,8-diene"
#> [24] "FEMA No. 2633"
#> [25] "(R)-1-Methyl-4-(1-methylethenyl)cyclohexene"
#> [26] "(+)-(R)-Limonene"
#> [27] "UNII-GFD7C86Q1W"
#> [28] "r-(+)-limonene"
#> [29] "(R)-1-Methyl-4-(prop-1-en-2-yl)cyclohex-1-ene"
#> [30] "(4R)-1-methyl-4-prop-1-en-2-ylcyclohexene"
#> [31] "MFCD00062991"
#> [32] "GFD7C86Q1W"
#> [33] "4betaH-p-mentha-1,8-diene"
#> [34] "CHEBI:15382"
#> [35] "(+) Limonene"
#> [36] "(+)-Dipentene"
#> [37] "Carvene"
#> [38] "Glidesafe"
#> [39] "Glidsafe"
#> [40] "Kautschiin"
#> [41] "Refchole"
#> [42] "(4R)-1-methyl-4-isopropenylcyclohex-1-ene"
#> [43] "(4R)-1-methyl-4-(1-methylethenyl)cyclohexene"
#> [44] "Biogenic SE 374"
#> [45] "(+)-alpha-Limonene"
#> [46] "d-Limonene (natural)"
#> [47] "d-Limoneno [Spanish]"
#> [48] "Limonene, (+)-"
#> [49] "Limonene, dl-"
#> [50] "d-Limoneno"
#> [51] "Hemo-sol"
#> [52] "(4R)-(+)-Limonene"
#> [53] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (R)-"
#> [54] "D-limonene [JAN]"
#> [55] "(4R)-4-isopropenyl-1-methyl-cyclohexene"
#> [56] "Citrus stripper oil"
#> [57] "CCRIS 671"
#> [58] "EC 7"
#> [59] "HSDB 4186"
#> [60] "D-1,8-p-Menthadiene"
#> [61] "NCI-C55572"
#> [62] "EINECS 227-813-5"
#> [63] "p-Mentha-1,8-diene, (R)-(+)-"
#> [64] "NSC-844"
#> [65] "Sulfate turpentine, distilled"
#> [66] "(+)-1,8-para-Menthadiene"
#> [67] "Dextro-limonene"
#> [68] "d limonene"
#> [69] "AI3-15191"
#> [70] "NSC-21446"
#> [71] "Orange x"
#> [72] "NSC-757069"
#> [73] "1-Methyl-4-(1-methylethenyl)cyclohexene, (R)-"
#> [74] "EINECS 266-034-5"
#> [75] "(4R)-1-methyl-4-(prop-1-en-2-yl)cyclohex-1-ene"
#> [76] "Dipentene no. 122"
#> [77] "D-Limonene Reagent Grade"
#> [78] "DSSTox_CID_778"
#> [79] "EC 227-813-5"
#> [80] "DSSTox_RID_75785"
#> [81] "(+)-Limonene, stabilized with 0.03% tocopherol"
#> [82] "DSSTox_GSID_20778"
#> [83] "CHEMBL449062"
#> [84] "Cyclohexene, 1-methyl-4-(1-methylethenyl)-, (theta)-"
#> [85] "DTXSID1020778"
#> [86] "(R)-(+)-Limonene, 95%"
#> [87] "(R)-(+)-Limonene, 97%"
#> [88] "ZINC967513"
#> [89] "CS-M3273"
#> [90] "(R)-(+)-Limonene, >=93%"
#> [91] "Tox21_200400"
#> [92] "6458AE"
#> [93] "AKOS015899935"
#> [94] "CCG-266134"
#> [95] "DB08921"
#> [96] "LMPR0102090013"
#> [97] "NSC 757069"
#> [98] "(R)-(+)-Limonene, analytical standard"
#> [99] "NCGC00248591-01"
#> [100] "NCGC00248591-02"
#> [101] "NCGC00257954-01"
#> [102] "BS-22387"
#> [103] "CAS-5989-27-5"
#> [104] "(R)-(+)-4-Isopropenyl-1-methylcyclohexene"
#> [105] "L0047"
#> [106] "L0105"
#> [107] "(R)-Limonene 2000 microg/mL in Acetonitrile"
#> [108] "C06099"
#> [109] "D91245"
#> [110] "(4R)-1-Methyl-4-(prop-1-en-2-yl)cyclohexene"
#> [111] "J-502148"
#> [112] "W-105295"
#> [113] "Q27888324"
#> [114] "(R)-(+)-Limonene, primary pharmaceutical reference standard"
#> [115] "UNII-9MC3I34447 component XMGQYMWWDOXHJM-JTQLQIEISA-N"
#> [116] "(R)-(+)-Limonene, purum, >=96.0% (sum of enantiomers, GC)"
#> [117] "(R)-(+)-Limonene, technical, ~90% (sum of enantiomers, GC)"
Or hydrochloric acid (313)
get_synonyms(313)
#> [1] "74"
#> [2] "hydrochloric acid"
#> [3] "hydrogen chloride"
#> [4] "7647-01-0"
#> [5] "Muriatic acid"
#> [6] "Chlorohydric acid"
#> [7] "chlorane"
#> [8] "Acide chlorhydrique"
#> [9] "Chlorwasserstoff"
#> [10] "Anhydrous hydrochloric acid"
#> [11] "Spirits of salt"
#> [12] "Hydrogen chloride (HCl)"
#> [13] "Chloorwaterstof"
#> [14] "Chlorowodor"
#> [15] "Acido cloridrico"
#> [16] "Muriaticum acidum"
#> [17] "Aqueous hydrogen chloride"
#> [18] "chlorure d'hydrogene"
#> [19] "Hydrochloric acid gas"
#> [20] "Marine acid"
#> [21] "monohydrochloride"
#> [22] "Spirit of salt"
#> [23] "UNII-QTT17582CB"
#> [24] "NSC 77365"
#> [25] "CHEBI:17883"
#> [26] "Hydrogen chloride (acid)"
#> [27] "[HCl]"
#> [28] "HCl"
#> [29] "QTT17582CB"
#> [30] "MFCD00011324"
#> [31] "NSC-77365"
#> [32] "E507"
#> [33] "Bowl Cleaner"
#> [34] "4-D Bowl Sanitizer"
#> [35] "Chlorowodor [Polish]"
#> [36] "Hydrochloric Acid Solution, 1N"
#> [37] "Emulsion Bowl Cleaner"
#> [38] "Caswell No. 486"
#> [39] "Hydrogenchlorid"
#> [40] "Chloorwaterstof [Dutch]"
#> [41] "o-Tolidine Dihydrochloride Solution"
#> [42] "Hydrochloric acid [JAN]"
#> [43] "Chlorwasserstoff [German]"
#> [44] "Hydrogen Chloride - Methanol Reagent"
#> [45] "Titanium, Reference Standard Solution"
#> [46] "Vanadium, Reference Standard Solution"
#> [47] "Acido clorhidrico"
#> [48] "UN 1789 (solution)"
#> [49] "Hydrochloric acid, ACS reagent, 37%"
#> [50] "UN 1050 (anhydrous)"
#> [51] "mono hydrochloride"
#> [52] "Acido cloridrico [Italian]"
#> [53] "Platinum Cobalt Color Standard Solution"
#> [54] "White Emulsion Bowl Cleaner"
#> [55] "Acido clorhidrico [Spanish]"
#> [56] "Varley Poly-Pak Bowl Creme"
#> [57] "Acide chlorhydrique [French]"
#> [58] "Hydrogen chloride (gas only)"
#> [59] "Hydrochloric Acid Solution, 0.2N (N/5)"
#> [60] "Hydrochloric Acid Solution, 0.5N (N/2)"
#> [61] "Chlorure d'hydrogene [French]"
#> [62] "Chloruro de hidrogeno"
#> [63] "HSDB 545"
#> [64] "Hydrochloric Acid Solution, 0.1N (N/10)"
#> [65] "Chloruro de hidrogeno [Spanish]"
#> [66] "Hygeia Creme Magic Bowl Cleaner"
#> [67] "Percleen Bowl and Urinal Cleaner"
#> [68] "Hydrogen chloride solution 1.0M in ethyl acetate"
#> [69] "EINECS 231-595-7"
#> [70] "UN1050"
#> [71] "UN1789"
#> [72] "UN2186"
#> [73] "Anhydrous hydrogen chloride"
#> [74] "Wuest Bowl Cleaner Super Concentrated"
#> [75] "Chlorure d'hydrogene anhydre [French]"
#> [76] "Cloruro de hidrogeno anhidro [Spanish]"
#> [77] "EPA Pesticide Chemical Code 045901"
#> [78] "Chlorure d'hydrogene anhydre"
#> [79] "Cloruro de hidrogeno anhidro"
#> [80] "UN 2186 (refrigerated liquefied gas)"
#> [81] "chloro"
#> [82] "chlorum"
#> [83] "hydochloride"
#> [84] "hydrochlorie"
#> [85] "hydrochoride"
#> [86] "hydrocloride"
#> [87] "Salzsaeure"
#> [88] "Hydrochloric acid [JAN:NF]"
#> [89] "chloridohydrogen"
#> [90] "hydro chloride"
#> [91] "hydro-chloride"
#> [92] "hydrogenchloride"
#> [93] "Chloro radical"
#> [94] "Soldering acid"
#> [95] "chlorhydric acid"
#> [96] "hydochloric acid"
#> [97] "hydogen chloride"
#> [98] "hydrochoric acid"
#> [99] "hydrocloric acid"
#> [100] "hydrogen chlorid"
Created on 2022-07-21 by the reprex package (v2.0.1)

The text is rendered in JavaScript, so it is easier to scrape the API as JSON and proceed by parsing it. You would need something like Selenium to scrape the rendered page directly. I tried with citric acid (311), and the same procedure works by substituting 311 for 440917 in the URL. Let me know if this works.
Additionally, I have added a procedure for a column of multiple chemicals.
library(tidyverse)
library(jsonlite)
data <- jsonlite::fromJSON("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/511/JSON/?heading=Depositor+Supplied+Synonyms")
data$Record$Section$Section[[1]]$Section[[1]]$Information[[1]][[2]][[1,1]]
#For multiple chemicals
# Look up depositor-supplied synonyms for one PubChem CID.
# Pulled out of the mutate() call so the pipeline stays readable;
# paste0() replaces the paste(..., sep = "") anti-pattern.
fetch_synonyms <- function(id) {
  url <- paste0(
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/",
    id, "/JSON/?heading=Depositor+Supplied+Synonyms"
  )
  # Drill down to the synonym strings; [[1, 1]] extracts the character
  # vector stored in the first cell of the Information data frame.
  jsonlite::fromJSON(url)$Record$Section$Section[[1]]$Section[[1]]$Information[[1]][[2]][[1, 1]]
}
df <- as_tibble_col(c(311, 440917, 5280450, 16129778, 1175), "IDs") %>%
  rowwise() %>%  # evaluate fetch_synonyms() once per row (one API hit per ID)
  mutate(synonyms = list(fetch_synonyms(IDs))) %>%
  ungroup()

Related

Can't update role of mutated variables

Background
I'm using the R package {recipes} for data preprocessing. Assume that I would like to transform some variable and then declare the transformed variable as the outcome variable for modeling.
Problem and minimal example:
However, an error is thrown:
library(tidymodels)
rec <- recipe( ~ ., data = mtcars) |>
step_mutate(mpg2 = mpg * 2) |>
update_role(mpg2, new_role = "outcome")
#> Error in `update_role()`:
#> ! Can't subset columns that don't exist.
#> ✖ Column `mpg2` doesn't exist.
Created on 2023-01-15 with reprex v2.0.2
What I've tried
The help pages of step_mutate() and update_role() do not mention the case of updating the role of a mutated variable. When I update the role of a variable without having mutated it, no error is thrown.
There are SO questions around with a similar error message (such as here, here, or here), but those questions seem to tap into different aspects.
Sessioninfo
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] yardstick_1.1.0 workflowsets_1.0.0 workflows_1.1.2 tune_1.0.1
#> [5] tidyr_1.2.1 tibble_3.1.8 rsample_1.1.1 recipes_1.0.4
#> [9] purrr_1.0.1 parsnip_1.0.3 modeldata_1.0.1 infer_1.0.4
#> [13] ggplot2_3.4.0 dplyr_1.0.10 dials_1.1.0 scales_1.2.1
#> [17] broom_1.0.2 tidymodels_1.0.0
#>
#> loaded via a namespace (and not attached):
#> [1] foreach_1.5.2 splines_4.2.1 R.utils_2.12.2
#> [4] prodlim_2019.11.13 assertthat_0.2.1 highr_0.10
#> [7] GPfit_1.0-8 yaml_2.3.6 globals_0.16.2
#> [10] ipred_0.9-13 pillar_1.8.1 backports_1.4.1
#> [13] lattice_0.20-45 glue_1.6.2 digest_0.6.31
#> [16] hardhat_1.2.0 colorspace_2.0-3 htmltools_0.5.4
#> [19] Matrix_1.5-3 R.oo_1.25.0 timeDate_4022.108
#> [22] pkgconfig_2.0.3 lhs_1.1.6 DiceDesign_1.9
#> [25] listenv_0.9.0 gower_1.0.1 lava_1.7.1
#> [28] timechange_0.2.0 styler_1.8.1 generics_0.1.3
#> [31] ellipsis_0.3.2 furrr_0.3.1 withr_2.5.0
#> [34] nnet_7.3-18 cli_3.6.0 survival_3.5-0
#> [37] magrittr_2.0.3 evaluate_0.19 R.methodsS3_1.8.2
#> [40] fs_1.5.2 fansi_1.0.3 future_1.30.0
#> [43] parallelly_1.34.0 R.cache_0.16.0 MASS_7.3-58.1
#> [46] class_7.3-20 tools_4.2.1 lifecycle_1.0.3
#> [49] stringr_1.5.0 munsell_0.5.0 reprex_2.0.2
#> [52] compiler_4.2.1 rlang_1.0.6 grid_4.2.1
#> [55] iterators_1.0.14 rstudioapi_0.14 rmarkdown_2.19
#> [58] gtable_0.3.1 codetools_0.2-18 DBI_1.1.3
#> [61] R6_2.5.1 lubridate_1.9.0 knitr_1.41
#> [64] fastmap_1.1.0 future.apply_1.10.0 utf8_1.2.2
#> [67] stringi_1.7.12 parallel_4.2.1 Rcpp_1.0.9
#> [70] vctrs_0.5.1 rpart_4.1.19 tidyselect_1.2.0
#> [73] xfun_0.36
```
This behavior is currently not properly documented.
The reason why you are having problems is because add_role(), update_role() and remove_role() can only be applied to the variables passed to recipe(), and they are all executed before the step functions.
This means that the following two snippets of code return the same result
recipe( ~ ., data = mtcars) |>
step_mutate(mpg2 = mpg * 2) |>
update_role(mpg2, new_role = "outcome")
recipe( ~ ., data = mtcars) |>
update_role(mpg2, new_role = "outcome") |>
step_mutate(mpg2 = mpg * 2)
Reference here https://github.com/tidymodels/recipes/blob/ab2405a0393bba06d9d7a52b4dbba6659a6dfcbd/R/roles.R#L132 :
Roles can only be changed on the original data supplied to recipe()
More talk here https://github.com/tidymodels/recipes/issues/437.
The role argument of step_mutate() allows you to specify the role of the variables it creates
library(recipes)
recipe( ~ ., data = mtcars) |>
step_mutate(mpg2 = mpg * 2, role = "outcome") |>
prep() |>
summary()
#> # A tibble: 12 × 4
#> variable type role source
#> <chr> <list> <chr> <chr>
#> 1 mpg <chr [2]> predictor original
#> 2 cyl <chr [2]> predictor original
#> 3 disp <chr [2]> predictor original
#> 4 hp <chr [2]> predictor original
#> 5 drat <chr [2]> predictor original
#> 6 wt <chr [2]> predictor original
#> 7 qsec <chr [2]> predictor original
#> 8 vs <chr [2]> predictor original
#> 9 am <chr [2]> predictor original
#> 10 gear <chr [2]> predictor original
#> 11 carb <chr [2]> predictor original
#> 12 mpg2 <chr [2]> outcome derived
Additionally, it is not recommended that you try to create/modify the outcome inside a recipe. Such modifications should happen before, preferably before data splitting.

How to scrape HTML table with nested column with Rvest?

I have encountered a big problem when scraping an HTML table with nested columns.
The table is from the immigration department of Hong Kong.
A screenshot is shown here:
I tried to do it with rvest, but the result is messy.
library(rvest)
library(tidyverse)
library(stringr)
library(dplyr)
url_data <- "https://www.immd.gov.hk/eng/stat_20220901.html"
url_data %>%
read_html()
css_selector <- "body > section:nth-child(7) > div > div > div > div > table"
immiTable <- url_data %>%
read_html() %>% html_element(css = css_selector) %>% html_table()
immiTable
My goal is to extract the first row (i.e. Airport) and plot it to a pie chart, and produce a dataframe of the whole table and save it to excel.
I realize that teaching material on unnesting tables and scraping nested tables is rather scarce. Therefore I need your guidance. Thank you very much for your help.
Here is a way. The headers format complicates things but the code below works. It extracts the entire table, not just the first row.
suppressPackageStartupMessages({
  library(rvest)
  library(dplyr)
  library(ggplot2)
})

url_data <- "https://www.immd.gov.hk/eng/stat_20220901.html"
page <- url_data %>% read_html()

# Build column names from the two header groups ("Arrival"/"Departure").
# Left-assignment (<-) replaces the original right-assignment (->) style.
arrival_names <- page %>%
  html_elements("[headers='Arrival']") %>%
  html_text() %>%
  paste("Arrival", .)
departure_names <- page %>%
  html_elements("[headers='Departure']") %>%
  html_text() %>%
  paste("Departure", .)
col_names <- c(arrival_names, departure_names)

# First column of the table: control-point names, reused as row names.
row_names <- page %>%
  html_elements("[headers='Control_Point']") %>%
  html_text()

# Numeric cells carry class 'hRight'. Use gsub(), not sub(): sub() removes
# only the FIRST comma, so values like "1,234,567" would fail to parse.
final <- page %>%
  html_elements("[class='hRight']") %>%
  html_text() %>%
  gsub(",", "", ., fixed = TRUE) %>%
  as.numeric() %>%
  matrix(nrow = length(row_names), byrow = TRUE) %>%
  as.data.frame() %>%
  setNames(col_names) %>%
  `row.names<-`(row_names)
final
#> Arrival Hong Kong Residents
#> Airport 4258
#> Express Rail Link West Kowloon 0
#> Hung Hom 0
#> Lo Wu 0
#> Lok Ma Chau Spur Line 0
#> Heung Yuen Wai 0
#> Hong Kong-Zhuhai-Macao Bridge 333
#> Lok Ma Chau 0
#> Man Kam To 0
#> Sha Tau Kok 0
#> Shenzhen Bay 3404
#> China Ferry Terminal 0
#> Harbour Control 0
#> Kai Tak Cruise Terminal 0
#> Macau Ferry Terminal 0
#> Total 7995
#> Arrival Mainland Visitors Arrival Other Visitors
#> Airport 1488 422
#> Express Rail Link West Kowloon 0 0
#> Hung Hom 0 0
#> Lo Wu 0 0
#> Lok Ma Chau Spur Line 0 0
#> Heung Yuen Wai 0 0
#> Hong Kong-Zhuhai-Macao Bridge 28 39
#> Lok Ma Chau 0 0
#> Man Kam To 0 0
#> Sha Tau Kok 0 0
#> Shenzhen Bay 348 37
#> China Ferry Terminal 0 0
#> Harbour Control 0 0
#> Kai Tak Cruise Terminal 0 0
#> Macau Ferry Terminal 0 0
#> Total 1864 498
#> Arrival Total Departure Hong Kong Residents
#> Airport 6168 3775
#> Express Rail Link West Kowloon 0 0
#> Hung Hom 0 0
#> Lo Wu 0 0
#> Lok Ma Chau Spur Line 0 0
#> Heung Yuen Wai 0 0
#> Hong Kong-Zhuhai-Macao Bridge 400 243
#> Lok Ma Chau 0 0
#> Man Kam To 0 0
#> Sha Tau Kok 0 0
#> Shenzhen Bay 3789 1301
#> China Ferry Terminal 0 0
#> Harbour Control 0 0
#> Kai Tak Cruise Terminal 0 0
#> Macau Ferry Terminal 0 0
#> Total 10357 5319
#> Departure Mainland Visitors
#> Airport 1154
#> Express Rail Link West Kowloon 0
#> Hung Hom 0
#> Lo Wu 0
#> Lok Ma Chau Spur Line 0
#> Heung Yuen Wai 0
#> Hong Kong-Zhuhai-Macao Bridge 194
#> Lok Ma Chau 0
#> Man Kam To 0
#> Sha Tau Kok 0
#> Shenzhen Bay 524
#> China Ferry Terminal 0
#> Harbour Control 0
#> Kai Tak Cruise Terminal 0
#> Macau Ferry Terminal 0
#> Total 1872
#> Departure Other Visitors Departure Total
#> Airport 315 5244
#> Express Rail Link West Kowloon 0 0
#> Hung Hom 0 0
#> Lo Wu 0 0
#> Lok Ma Chau Spur Line 0 0
#> Heung Yuen Wai 0 0
#> Hong Kong-Zhuhai-Macao Bridge 15 452
#> Lok Ma Chau 0 0
#> Man Kam To 0 0
#> Sha Tau Kok 0 0
#> Shenzhen Bay 28 1853
#> China Ferry Terminal 0 0
#> Harbour Control 0 0
#> Kai Tak Cruise Terminal 0 0
#> Macau Ferry Terminal 0 0
#> Total 358 7549
Created on 2022-09-18 with reprex v2.0.2
To plot the pie chart in ggplot plot a bar chart then change to polar coordinates.
# First row of the scraped table = the Airport control point.
Airport <- final[1,,]
# A pie chart is just a stacked bar chart drawn in polar coordinates.
Airport %>%
t() %>%  # transpose: one row per arrival/departure category
as.data.frame() %>%
mutate(`Arrival/Departure` = row.names(.)) %>%
ggplot(aes("", Airport, fill = `Arrival/Departure`)) +
geom_col(width = 1) +
scale_fill_manual(values = RColorBrewer::brewer.pal(n = 8, name = "Spectral")) +
coord_polar(theta = "y", start = 0) +  # wrap the bar around the y axis
theme_void()
Created on 2022-09-18 with reprex v2.0.2
An alternative would be to select the tbody rows, filtering out the hidden items by attribute, then add in the headers later.
library(rvest)
library(tidyverse)

# Grab every body row of the passenger-traffic table; selecting tbody
# skips the header rows.
rows <- read_html("https://www.immd.gov.hk/eng/stat_20220901.html") %>%
  html_elements(".table-passengerTrafficStat tbody tr")

# Build the 1 + 2*4 column names: Control_Point, then every
# arrival/departure x visitor-type combination.
prefixes <- c("arr", "dep")
cols <- c("Hong Kong Residents", "Mainland Visitors", "Other Visitors", "Total")
headers <- c(
  "Control_Point",
  crossing(prefixes, cols) %>%
    unite("headers", 1:2, remove = TRUE) %>%  # TRUE, not T (T is reassignable)
    unlist() %>%
    unname()
)

# One named character vector per table row -> one tibble row each.
df <- map_dfr(
  rows,
  function(x) {
    x %>%
      html_elements("td[headers]") %>%
      set_names(headers) %>%
      html_text()
  }
) %>%
  # str_replace_all (not str_replace) strips EVERY thousands separator,
  # so counts of a million or more still parse as integers.
  mutate(across(c(-1), ~ str_replace_all(.x, ",", "") %>% as.integer()))
Or somewhat condensed,
library(rvest)
library(tidyverse)

# Condensed variant: same tbody-row scrape, one formula per row.
rows <- read_html("https://www.immd.gov.hk/eng/stat_20220901.html") %>%
  html_elements(".table-passengerTrafficStat tbody tr")
prefixes <- c("arr", "dep")
cols <- c("Hong Kong Residents", "Mainland Visitors", "Other Visitors", "Total")
headers <- c(
  "Control_Point",
  crossing(prefixes, cols) %>%
    unite("headers", 1:2, remove = TRUE) %>%  # TRUE, not the reassignable T
    unlist() %>%
    unname()
)
# str_replace_all removes every comma; str_replace would only drop the first,
# corrupting values of 1,000,000 and above.
df <- map_dfr(rows, ~ set_names(.x %>% html_elements("td[headers]") %>% html_text(), headers)) %>%
  mutate(across(c(-1), ~ str_replace_all(.x, ",", "") %>% as.integer()))

Tidymodels prediction methods giving different results

I'm a bit confused about getting metrics from resamples using tidymodels.
I seem to be getting 3 different metrics from the same set of resamples, depending on if I use collect_predictions() %>% metrics() or simply collect_metrics()
Here is a simple example...
library(tidyverse)
library(tidymodels)
starwars_df <- starwars %>% select(name:sex) %>% drop_na()
lasso_linear_reg_glmnet_spec <-
linear_reg(penalty = .1, mixture = 1) %>%
set_engine('glmnet')
basic_rec <-
recipe(mass ~ height + sex + skin_color,
data = starwars_df) %>%
step_novel(all_nominal_predictors()) %>%
step_other(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_nzv(all_predictors())
sw_wf <- workflow() %>%
add_recipe(basic_rec) %>%
add_model(lasso_linear_reg_glmnet_spec)
sw_boots <- bootstraps(starwars_df, times = 50)
resampd <- fit_resamples(
sw_wf,
sw_boots,
control = control_resamples(save_pred = TRUE)
)
The following three lines give different results
resampd %>% collect_predictions(resampd, summarize = T) %>% metrics(mass, .pred)
resampd %>% collect_predictions(resampd, summarize = F) %>% metrics(mass, .pred)
resampd %>% collect_metrics()
As an additional question, what would be the best/correct way to get confidence intervals for the rmse in the above example. Here is one way...
individ_metrics <- resampd %>% collect_predictions() %>% group_by(id) %>% rmse(mass, .pred)
confintr::ci_mean(individ_metrics$.estimate)
mean(individ_metrics$.estimate)
Thanks!
The reason that none of those are the same is that they are not aggregated in the same way. It turns out that taking a mean of a set of means doesn't give you the same (right) result as taking the mean of the whole underlying set. If you were to do something like resampd %>% collect_predictions(summarize = TRUE) %>% metrics(mass, .pred), that is like taking a mean of a set of means.
It turns out that these two things are the same:
## these are the same:
resampd %>%
collect_predictions(summarize = FALSE) %>%
group_by(id) %>%
metrics(mass, .pred)
#> # A tibble: 150 × 4
#> id .metric .estimator .estimate
#> <chr> <chr> <chr> <dbl>
#> 1 Bootstrap01 rmse standard 16.4
#> 2 Bootstrap02 rmse standard 23.1
#> 3 Bootstrap03 rmse standard 31.6
#> 4 Bootstrap04 rmse standard 17.6
#> 5 Bootstrap05 rmse standard 9.59
#> 6 Bootstrap06 rmse standard 25.0
#> 7 Bootstrap07 rmse standard 16.3
#> 8 Bootstrap08 rmse standard 35.1
#> 9 Bootstrap09 rmse standard 25.7
#> 10 Bootstrap10 rmse standard 25.3
#> # … with 140 more rows
resampd %>% collect_metrics(summarize = FALSE)
#> # A tibble: 100 × 5
#> id .metric .estimator .estimate .config
#> <chr> <chr> <chr> <dbl> <chr>
#> 1 Bootstrap01 rmse standard 16.4 Preprocessor1_Model1
#> 2 Bootstrap01 rsq standard 0.799 Preprocessor1_Model1
#> 3 Bootstrap02 rmse standard 23.1 Preprocessor1_Model1
#> 4 Bootstrap02 rsq standard 0.193 Preprocessor1_Model1
#> 5 Bootstrap03 rmse standard 31.6 Preprocessor1_Model1
#> 6 Bootstrap03 rsq standard 0.608 Preprocessor1_Model1
#> 7 Bootstrap04 rmse standard 17.6 Preprocessor1_Model1
#> 8 Bootstrap04 rsq standard 0.836 Preprocessor1_Model1
#> 9 Bootstrap05 rmse standard 9.59 Preprocessor1_Model1
#> 10 Bootstrap05 rsq standard 0.860 Preprocessor1_Model1
#> # … with 90 more rows
Created on 2022-08-23 with reprex v2.0.2

tidy() function can't process last_fit() objects

Functions like last_fit() from the tune package produces last_fit objects which are large nested lists containing the fit results. I tried to transform them into data.frames using the tidy() function from the broom package but this resulted in the following error:
MRE :
library(tidymodels)
library(tidyverse)
data <- mtcars
model_default<-
parsnip::boost_tree(
mode = "regression"
) %>%
set_engine('xgboost',objective = 'reg:squarederror')
wf <- workflow() %>%
add_model(model_default) %>%
add_recipe(recipe(mpg~.,data))
lf <- last_fit(wf,split)
tidy_lf <- tidy(lf)
Error in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm = na.rm) :
is.atomic(x) is not TRUE
In addition: Warning messages:
1: Data frame tidiers are deprecated and will be removed in an upcoming release of broom.
2: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
3: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
4: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
5: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
6: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
7: In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
Question : How can I use tidy() with an last_fit() output?
The object that last_fit() creates is a tibble (containing metrics, predictions, etc), not a model that can be tidied. You can use extract_workflow() to extract out the fitted workflow from the object created by last_fit(), and this object can be tidied:
library(tidymodels)
car_split <- initial_split(mtcars)
wf <- workflow() %>%
add_model(linear_reg()) %>%
add_recipe(recipe(mpg ~ ., mtcars))
lf <- last_fit(wf, car_split)
lf
#> # Resampling results
#> # Manual resampling
#> # A tibble: 1 × 6
#> splits id .metrics .notes .predictions .workflow
#> <list> <chr> <list> <list> <list> <list>
#> 1 <split [24/8]> train/test split <tibble> <tibble> <tibble [8 × 4]> <workflow>
lf %>%
extract_workflow() %>%
tidy()
#> # A tibble: 11 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -33.6 36.0 -0.935 0.367
#> 2 cyl -0.0296 1.34 -0.0221 0.983
#> 3 disp 0.0252 0.0269 0.934 0.367
#> 4 hp -0.00539 0.0319 -0.169 0.868
#> 5 drat -0.167 2.54 -0.0659 0.948
#> 6 wt -5.69 2.79 -2.04 0.0623
#> 7 qsec 3.32 1.76 1.89 0.0820
#> 8 vs -4.40 3.80 -1.16 0.268
#> 9 am 2.54 2.67 0.950 0.360
#> 10 gear 2.69 2.28 1.18 0.259
#> 11 carb -0.0486 1.11 -0.0439 0.966
Created on 2022-03-23 by the reprex package (v2.0.1)

How to unpack nested JSON type column in a dataframe with R(plus RegEx issue)

I'm very new to R,and I'm currently stuck on this problem:
so I imported a JSON file and have already ***converted it to a dataframe***; now I need to return rows under a condition:
As you can see in the picture, I have a column recording hours(payload.hours)
My GOAL is to find the hours that meet: 1. Sunday 2. time earlier than 10 AM.
I tried several ways but somehow it doesn't even come close at all... I haven't dealt with such a nested form before... so I have to seek your ideas & help...
e.g. one element in payload.hours column
payload.hours
...
[530] "{\"monday\":[[\"10:30\",\"16:00\"]],\"tuesday\":[[\"10:30\",\"16:00\"]],\"wednesday\":[[\"10:30\",\"16:00\"]],\"thursday\":[[\"10:30\",\"16:00\"]],\"friday\":[[\"10:30\",\"16:00\"]],\"saturday\":[[\"10:30\",\"16:00\"]],\"sunday\":[[\"10:30\",\"16:00\"]]}"
this is what I used for unpacking the nested lists in "hours" column...but it doesn't work...
library(ndjson)
json<- ndjson::stream_in("#localpath")
#successfully converted json to a dataframe...but each element in payload.hours column remains nested like above.
lapply(json$payload.hours, jsonlite::fromJSON)
#continue unwarp nested jason column BUT RESULT Error in if (is.character(txt) && length(txt) == 1 && nchar(txt, type = "bytes") < :missing value where TRUE/FALSE needed
Another approach I tried(FOR A LONG TIME) is RegEx
hrs<-json1$payload.hours #select column hours into hrs
tme<-"sunday{1}.{8}[0-9]{1}\"" # ???(not sure about this...seruously)...? match string with sunday and after 8characters..aka find preceding digit{1} when meet ":"
iftme<-grepl(tme,hrs) #set logical factor T/F if matches
checkhrs<-hrs[iftme] #check if open hours are correct
checkhrs
And this seems to work...but I am not sure why...(YES.IDK WHY)...so if anyone could explain to me that would be great!
This is the original json file:
https://drive.google.com/open?id=0B-jU6pp4pjS4Smg2RGpHSTlvN2c
This is RegEx output...seems right...but I am not sure about my expression..LOL
Unpacking JSON can be a lot of work, particularly if it is deeply nested. Most JSON reading packages (jsonlite, RJSONIO, etc.) can turn data into something close to a data.frame, but fixing the structure requires an understanding that the reader functions don't have. Since JSON most nearly corresponds to R's lists, cleaning up data coming from JSON typically involves a lot of lapply and its variants. Here I'll use purrr, which has many useful variants and helper functions and works neatly with dplyr.
library(tidyverse)
# Read data
# Stream the NDJSON file in: one data.frame row per JSON line.
json <- jsonlite::stream_in(file('~/Downloads/jsondata.json'))
# Initial cleanup to proper data.frame
# NOTE(review): dmap() and as_data_frame() are from older purrr/tibble
# releases; in current versions dmap() lives in purrrlyr and as_data_frame()
# is superseded by as_tibble() — confirm package versions before re-running.
json <- json$payload %>% map_df(simplify_all) %>% dmap(simplify) %>%
mutate(uuid = json$uuid, # re-add uuid subset out at beginning
# Convert hours to a list column of data.frames
hours = hours %>% map_if(negate(is.na), jsonlite::fromJSON) %>%
map(~map_df(.x, as_data_frame, .id = 'day')),
# Add Boolean variable for whether Sunday opening hours are before 10a. Subset,
open_sun_before_10 = hours %>% map(~.x %>% filter(day == 'sunday') %>% .[[2]]) %>%
map(as.POSIXct, format = '%H:%M') %>% # convert to datetime,
map(~.x < as.POSIXct('10:00', format = '%H:%M')) %>% # compare to 10a
map_lgl(~ifelse(length(.x) == 0, NA, .x))) # and cleanup.
Whereas stream_in returned a data.frame with two columns (one very deeply nested), the columns are now less nested. There are still JSON structures in some of the untouched columns, though, which will have to be addressed if you want to use the data.
json
#> # A tibble: 538 × 42
#> existence_full geo_virtual latitude
#> <dbl> <chr> <chr>
#> 1 1.000000 ["56.9459720|-2.1971226|20|within_50m|4"] 56.945972
#> 2 1.000000 ["56.237480|-5.073578|20|within_50m|4"] 56.237480
#> 3 1.000000 ["51.483872|-0.606820|100|rooftop|2"] 51.483872
#> 4 1.000000 ["57.343233|-2.191955|100|rooftop|4"] 57.343233
#> 5 1.000000 ["53.225815|-4.094775|20|within_50m|4"] 53.225815
#> 6 1.000000 ["58.9965740|-3.1882195|20|within_50m|4"] 58.996574
#> 7 1.000000 ["57.661419|-2.520144|100|rooftop|4"] 57.661419
#> 8 1.000000 ["51.642727|-3.934845|20|within_50m|4"] 51.642727
#> 9 0.908251 <NA> <NA>
#> 10 1.000000 ["56.510558|-5.401638|100|rooftop|2"] 56.510558
#> # ... with 528 more rows, and 39 more variables: locality <chr>,
#> # `_records_touched` <chr>, address <chr>, email <chr>,
#> # existence_ml <dbl>, domain_aggregate <chr>, name <chr>,
#> # search_tags <list>, admin_region <chr>, existence <dbl>,
#> # category_labels <list>, post_town <chr>, region <chr>,
#> # review_count <chr>, geocode_level <chr>, tel <chr>, placerank <int>,
#> # longitude <chr>, placerank_ml <dbl>, fax <chr>,
#> # category_ids_text_search <chr>, website <chr>, status <chr>,
#> # geocode_confidence <chr>, postcode <chr>, category_ids <list>,
#> # country <chr>, `_geocode_quality` <chr>, hours_display <chr>,
#> # hours <list>, neighborhood <list>, languages <chr>,
#> # address_extended <chr>, status_closed <chr>, po_box <chr>,
#> # name_variants <list>, yext_id <chr>, uuid <chr>,
#> # open_sun_before_10 <lgl>
And the columns created:
json %>% select(hours, open_sun_before_10)
#> # A tibble: 538 × 2
#> hours open_sun_before_10
#> <list> <lgl>
#> 1 <tibble [1 × 2]> NA
#> 2 <tibble [1 × 2]> NA
#> 3 <tibble [7 × 3]> FALSE
#> 4 <tibble [1 × 2]> NA
#> 5 <tibble [7 × 3]> FALSE
#> 6 <tibble [1 × 2]> NA
#> 7 <tibble [1 × 2]> NA
#> 8 <tibble [6 × 3]> NA
#> 9 <tibble [1 × 2]> NA
#> 10 <tibble [7 × 3]> TRUE
#> # ... with 528 more rows