Extract nested JSON from R dataframe without knowing keys

I am trying to extract JSON from a TSV column. The difficulty is that the JSON is shallowly nested, and the keys may not be present in every row.
I have a minimal example to illustrate my point.
library(tibble)

df <- tibble(index = c(1, 2),
             data = c('{"json_char":"alpha", "json_list1":["x","y"]}',
                      '{"json_char":"beta", "json_list1":["x","y","z"], "json_list2":["a","b","c"]}'))
The desired result:
df <- tibble::tibble(index = list(1, 2),
                     json_char = list("alpha", "beta"),
                     json_list1 = list(list("x","y"), list("x","y","z")),
                     json_list2 = list(NA, list("a","b","c")))
After a fair amount of experimentation, I have this function:
extract_json_column <- function(df) {
  df %>%
    magrittr::use_series(data) %>%
    purrr::map(jsonlite::fromJSON) %>%
    purrr::map(purrr::simplify) %>%
    tibble::enframe() %>%
    tidyr::spread("name", "value") %>%
    purrr::flatten_dfr()
}
Which gives me the following error: Error in bind_rows_(x, .id) : Argument 2 must be length 3, not 7.
The first row seems to set the number of columns for the rest of the dataframe. Is there any way to avoid that behavior?

I modified your function to the following. I hope this helps.
library(tidyverse)
library(rjson)
extract_json_column <- function(df) {
  df %>%
    rowwise() %>%
    mutate(data = map(data, fromJSON)) %>%                  # parse each JSON string
    split(.$index) %>%                                      # one list element per row
    map(~ .$data[[1]]) %>%
    map(~ map_if(., function(x) length(x) != 1, list)) %>%  # wrap vectors so they survive as list-columns
    map(as_data_frame) %>%
    bind_rows(.id = "index")                                # missing keys come back as NULL
}
extract_json_column(df)
# A tibble: 2 x 4
  index json_char json_list1 json_list2
  <chr> <chr>     <list>     <list>
1 1     alpha     <chr [2]>  <NULL>
2 2     beta      <chr [3]>  <chr [3]>
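For what it's worth, on newer tidyr (>= 1.0, an assumption about your setup), the same reshape can be sketched with unnest_wider(), which spreads the ragged keys into columns and leaves NULL where a key is absent:

library(tidyverse)
library(jsonlite)

df %>%
  # parse each row without simplification so ragged keys survive as lists
  mutate(data = map(data, ~ fromJSON(.x, simplifyVector = FALSE))) %>%
  unnest_wider(data)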

Related

Web scraping with rvest not working on HTML page, table showing NAs - McDonald's

I am trying to scrape data from https://www.mcdonalds.com/de/de-de/product/grand-cheese-n-beef-classic-5642.html to build a dataframe with all the nutrition values and the allergens drop-down menu (Further information, per 100g, per portion, contained allergens); however, rvest cannot detect the information as a table.
It doesn't even return any of the required values.
library(rvest)
url4 <- "https://www.mcdonalds.com/de/de-de/product/grand-cheese-n-beef-classic-5642.html"
test <- url4 %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="collapseOne"]/div/div/div/div[1]') %>%
  html_table()
test <- as.data.frame(test)
I also tried this
library(rvest)
library(stringr)
library(tidyr)
url <- "https://www.mcdonalds.com/de/de-de/product/grand-cheese-n-beef-classic-5642.html"
webpage <- read_html(url)
sb_table <- html_nodes(webpage, 'table')
sb <- html_table(sb_table)[[1]]
head(sb)
How could that be done? I'm very new to web scraping and don't know whether the HTML tags are correct.
(Screenshot of the data I want to scrape omitted.)
You can request the information from their JSON API:
library(tidyverse)
library(httr2)
"https://www.mcdonalds.com/dnaapp/itemDetails?country=de&language=de&showLiveData=true&item=201799" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
.$item %>%
.$nutrient_facts %>%
.$nutrient %>%
as_tibble %>%
select(4:9)
# A tibble: 10 x 6
id name nutrient_~1 uom uom_d~2 value
<int> <chr> <chr> <chr> <chr> <chr>
1 1 Serving Size primary_se~ g grams 302
2 2 Brennwert energy_kJ kJ kiloJo~ 2992
3 3 Brennwert energy_kcal kcal kilo c~ 716
4 4 Fett fat g grams 40
5 5 davon gesättigte Fettsäuren saturated_~ g grams 16
6 6 Kohlenhydrate carbohydra~ g grams 44
7 7 davon Zucker sugar g grams 11
8 8 Ballaststoffe fiber g grams 3.3
9 9 Eiweiß protein g grams 40
10 10 Salz salt g grams 2.4
# ... with abbreviated variable names 1: nutrient_name_id,
# 2: uom_description
Information on the allergens:
"https://www.mcdonalds.com/dnaapp/itemDetails?country=de&language=de&showLiveData=true&item=201799" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
.$item %>%
.$item_allergen %>%
str_split(pattern = ", ") %>%
getElement(1)
[1] "Milch (einschl. Laktose)"
[2] "Eier"
[3] "Glutenhaltiges Getreide: Weizen (wie Dinkel und Khorasan-Weizen)"
[4] "Senf"
[5] "Sesamsamen"

Error when web scraping in R: Error in UseMethod("xml_find_all") :

I wrote some code to web-scrape air quality data in R. It worked perfectly fine and I had no issues. But when I reran it recently, I started getting an error from the html_nodes() function.
Here is my code:
library(rvest)
library(tidyverse)
library(lubridate)

## Download MOE Location Data
# https://stackoverflow.com/questions/25677035/how-to-create-a-range-of-dates-in-r
## Create a vector of dates
start_date <- "2021/1/1"
end_date <- "2021/12/31"
dates <- seq(as.Date(start_date), as.Date(end_date), "days")

df <- NULL
for (datex in dates) {
  datef <- as.Date(datex, origin = "1970-01-01")
  Day <- day(datef)
  Month <- month(datef)
  Year <- year(datef)
  for (hour in 1:24) {
    url.new <-
      paste(
        "http://www.airqualityontario.com/aqhi/locations.php?start_day=",
        Day,
        "&start_month=",
        Month,
        "&start_year=",
        Year,
        "&my_hour=",
        hour,
        "&pol=36&text_only=1&Submit=Update",
        sep = ""
      )
    download.file(url.new, destfile = "scrapedpage.html", quiet = TRUE)
    simple <- read_html("scrapedpage.html")
    test <- simple %>%
      html_nodes("td") %>%
      html_text()
    test <- as_tibble(test)
    df.temp <-
      as.data.frame(matrix(
        unlist(test, use.names = FALSE),
        ncol = 3,
        byrow = TRUE
      )) %>%
      mutate(date = paste(datef)) %>%
      mutate(hour = hour)
    df <- rbind(df, df.temp)
  }
}
df <- as_tibble(df)
colnames(df) <- c("Station", "Address", "SurfaceConc", "SurfaceDate", "Hour")
MOE_data <- df %>%
  filter(Address != "Bay St. Wellesley St. W.") %>%
  select(-Address) %>%
  mutate(Station = trimws(Station)) %>%
  # filter(str_detect(Station, 'Toronto')) %>%
  mutate(Hour = paste(Hour, ":00:00", sep = "")) %>%
  mutate(Hour = hms::as_hms(Hour)) %>%
  mutate(SurfaceDate = paste(SurfaceDate, Hour)) %>%
  mutate(SurfaceDate = as_datetime(SurfaceDate)) %>%
  select(-Hour)
MOE_data <- as_tibble(MOE_data)
rm(list = setdiff(ls(), "MOE_data"))
# save.image(file='Jan2019_Dec2021.RData')
This is the error I get:
Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "xml_document"
What I don't understand is why it happens only for some values, some of the time. For example, I get an error when hour = 16, but when I rerun it, it may work; it's just not consistent.
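One workaround for this kind of intermittent failure is to retry the download and parse. A minimal sketch, assuming the errors come from occasional bad or incomplete downloads (the function name is mine, not from the original code):

library(rvest)

read_td_with_retry <- function(url, attempts = 3) {
  for (i in seq_len(attempts)) {
    result <- tryCatch({
      download.file(url, destfile = "scrapedpage.html", quiet = TRUE)
      read_html("scrapedpage.html") %>%
        html_nodes("td") %>%
        html_text()
    }, error = function(e) NULL)
    if (!is.null(result)) return(result)
    Sys.sleep(1)  # brief pause before the next attempt
  }
  stop("Failed after ", attempts, " attempts: ", url)
}

Calling test <- read_td_with_retry(url.new) in place of the download/read/extract lines keeps the rest of the loop unchanged.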

Is there a reason the xgboost code snippet from the usemodels package has one_hot set to TRUE?

Is there a reason the recipe code snippet for the xgboost classifier has one_hot = TRUE? This creates n dummy variables instead of n - 1. I usually set it to FALSE, but I just want to make sure I'm not missing something.
Code -
library(dplyr)

data <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl))

usemodels::use_xgboost(mpg ~ cyl, data = data)
Output -
xgboost_recipe <-
  recipe(formula = mpg ~ cyl, data = data) %>%
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_zv(all_predictors())

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("regression") %>%
  set_engine("xgboost")

xgboost_workflow <-
  workflow() %>%
  add_recipe(xgboost_recipe) %>%
  add_model(xgboost_spec)

set.seed(28278)
xgboost_tune <-
  tune_grid(xgboost_workflow, resamples = stop("add your rsample object"),
            grid = stop("add number of candidate points"))
The idea there is that, as a tree-based model, xgboost can handle all the levels (unlike a linear model) and can actually require more splits to fit well if you don't include all the categories.
You don't see the same for the ranger random forest because it can handle factors natively.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
cars <- as_tibble(mtcars) %>%
mutate(cyl = cyl %>% as.factor)
usemodels::use_ranger(mpg ~ cyl, data = cars)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
#> ranger_recipe <-
#> recipe(formula = mpg ~ cyl, data = cars)
#>
#> ranger_spec <-
#> rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
#> set_mode("regression") %>%
#> set_engine("ranger")
#>
#> ranger_workflow <-
#> workflow() %>%
#> add_recipe(ranger_recipe) %>%
#> add_model(ranger_spec)
#>
#> set.seed(54153)
#> ranger_tune <-
#> tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
Created on 2021-04-07 by the reprex package (v2.0.0)
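To see concretely what one_hot changes, here is a small sketch (assuming a recent recipes version, where bake(new_data = NULL) returns the prepped training set) comparing the two settings on the cyl factor:

library(recipes)
library(dplyr)

data <- mtcars %>% as_tibble() %>% mutate(cyl = as.factor(cyl))

# one_hot = TRUE: one indicator column per level (3 columns for 3 cyl levels)
recipe(mpg ~ cyl, data = data) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  prep() %>%
  bake(new_data = NULL) %>%
  names()

# one_hot = FALSE (the default): n - 1 indicators, reference level dropped
recipe(mpg ~ cyl, data = data) %>%
  step_dummy(all_nominal(), one_hot = FALSE) %>%
  prep() %>%
  bake(new_data = NULL) %>%
  names()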

Trying to append new rows to an ongoing spreadsheet using a function in R

I've been using the write.xlsx function to append new rows of data that I pull in via an HTML scraper. For some reason, though, instead of pulling the information from one URL, pasting it into the sheet, and moving on to the next one, it just pastes the data from the last URL I pass to the function.
I've tried writing a for loop in the actual code, getting rid of the for loop and calling the function for each individual URL, and putting the URLs into a vector and using lapply() on it. All of these methods "work" but give the same result.
library(rvest)
library(tidyverse)  # dplyr, purrr, tidyr
library(lubridate)
library(xlsx)

urlpull <- function(site) {
  url <- site
  webpage <- read_html(url)
  tbls <- webpage %>% html_nodes("table") %>% html_table(header = FALSE, fill = TRUE)
  tbls <- tbls %>%
    lmap(~ set_names(.x, nm = pluck(.x, 1, 1, 1))) %>%
    map(~ set_names(.x, nm = .x[2, ]))
  abbr <- as.data.frame(webpage %>% html_nodes('strong') %>% html_text() %>% .[5:6])
  rec  <- as.data.frame(webpage %>% html_nodes('div') %>% html_text() %>% .[c(26, 33)])
  date <- as.data.frame(webpage %>% html_nodes('div') %>% html_text() %>% .[36])
  awaybas <- tbls %>% .[1]
  awayadv <- tbls %>% .[2]
  homebas <- tbls %>% .[3]
  homeadv <- tbls %>% .[4]
  ab1 <- as.data.frame(awaybas)
  aa1 <- as.data.frame(awayadv)
  hb1 <- as.data.frame(homebas)
  ha1 <- as.data.frame(homeadv)
  ab <- ab1[-c(1, 2, 8), ]
  aa <- aa1[-c(1, 2, 8), ]
  hb <- hb1[-c(1, 2, 8), ]
  ha <- ha1[-c(1, 2, 8), ]
  ab[, c(3:21)] <- sapply(ab[, c(3:21)], as.numeric)
  aa[, c(3:16)] <- sapply(aa[, c(3:16)], as.numeric)
  hb[, c(3:21)] <- sapply(hb[, c(3:21)], as.numeric)
  ha[, c(3:16)] <- sapply(ha[, c(3:16)], as.numeric)
  aa <- cbind(aa, abbr[1, ], abbr[2, ])
  ab <- cbind(ab, abbr[1, ], abbr[2, ])
  hb <- cbind(hb, abbr[2, ], abbr[1, ])
  ha <- cbind(ha, abbr[2, ], abbr[1, ])
  aa <- cbind(aa, rec[1, ])
  ab <- cbind(ab, rec[1, ])
  hb <- cbind(hb, rec[2, ])
  ha <- cbind(ha, rec[2, ])
  aa <- cbind(aa, date)
  ab <- cbind(ab, date)
  hb <- cbind(hb, date)
  ha <- cbind(ha, date)
  names(aa)[17:20] <- c("TEAM", "OPP", "RCRD", "DT")
  names(ab)[22:25] <- c("TEAM", "OPP", "RCRD", "DT")
  names(hb)[22:25] <- c("TEAM", "OPP", "RCRD", "DT")
  names(ha)[17:20] <- c("TEAM", "OPP", "RCRD", "DT")
  aa <- aa %>% separate("MP", c("min", "sec"), sep = ":") %>% separate("RCRD", c("W", "L"), sep = "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
  ab <- ab %>% separate("MP", c("min", "sec"), sep = ":") %>% separate("RCRD", c("W", "L"), sep = "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
  hb <- hb %>% separate("MP", c("min", "sec"), sep = ":") %>% separate("RCRD", c("W", "L"), sep = "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
  ha <- ha %>% separate("MP", c("min", "sec"), sep = ":") %>% separate("RCRD", c("W", "L"), sep = "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
  aa[, c(20:21)] <- sapply(aa[, c(20:21)], as.numeric)
  ab[, c(25:26)] <- sapply(ab[, c(25:26)], as.numeric)
  hb[, c(25:26)] <- sapply(hb[, c(25:26)], as.numeric)
  ha[, c(20:21)] <- sapply(ha[, c(20:21)], as.numeric)
  aa <- aa %>% mutate(GAME = W + L)
  ab <- ab %>% mutate(GAME = W + L)
  hb <- hb %>% mutate(GAME = W + L)
  ha <- ha %>% mutate(GAME = W + L)
  aac <- aa[, -c(1:3)]
  hac <- ha[, -c(1:3)]
  am <- cbind(ab[, -c(23:28)], aac)
  hm <- cbind(hb[, -c(23:28)], hac)
  am <- am %>% mutate(LOCAL = "away")
  hm <- hm %>% mutate(LOCAL = "home")
  final <- rbind(am, hm)
  print(final)
  write.xlsx(final, "Book1.xlsx", sheetName = "Sheet1", col.names = TRUE,
             row.names = FALSE, append = TRUE, showNA = TRUE)
}
x <- c("https://www.basketball-reference.com/boxscores/201410280LAL.html", "https://www.basketball-reference.com/boxscores/201811140BRK.html")
lapply(x, urlpull)
I just want each final table to be appended starting at the first row after where the last table was placed.
To get around this problem I kept the for loop and had the function row-bind all the dataframes into one before exporting. I did this by initializing an empty dataframe outside the for loop, like this:
oldtable <- data.frame()
for (i in x) {
  # generic lines of code
  final  # table from one iteration of the loop
  oldtable <- rbind.data.frame(oldtable, final)
}
Using the rbind.data.frame function from base R got me the results I wanted.
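An equivalent sketch without the explicit loop: have the function return final instead of writing, collect with lapply(), and write the workbook once at the end (urlpull_df below is a hypothetical variant of the question's urlpull() with the write.xlsx() call removed):

library(xlsx)

results <- lapply(x, urlpull_df)  # one data frame per URL
alltables <- do.call(rbind.data.frame, results)
write.xlsx(alltables, "Book1.xlsx", sheetName = "Sheet1",
           col.names = TRUE, row.names = FALSE, showNA = TRUE)

Writing once also sidesteps the append behavior entirely.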

How to clean and split HTML tags in R?

My parser creates a data frame, which looks like:
name html
1 John <span class="incident-icon" data-minute="68" data-second="37" data-id="8028"></span><span class="name-meta-data">68</span>
2 Steve <span class="incident-icon" data-minute="69" data-second="4" data-id="132205"></span><span class="name-meta-data">69</span>
So how can I extract useful information from the HTML? For example, I want to use some HTML attributes as features:
name minute second id
1 John 68 37 8028
2 Steve 69 4 132205
If you already have the data frame in your question, you can try the following. Your data frame is called mydf here. You can extract all numbers with stri_extract_all_regex(). Then you follow the classic method of converting a list to a data frame (note this assumes every row yields exactly four numbers). Finally, you assign new column names and bind the result to the name column of the original data frame.
library(stringi)
library(dplyr)
stri_extract_all_regex(str = mydf$url, pattern = "[0-9]+") %>%
  unlist %>%
  matrix(ncol = 4, byrow = TRUE) %>%
  data.frame %>%
  setNames(c("minute", "second", "ID", "data")) %>%
  bind_cols(mydf["name"], .)
# name minute second ID data
#1 John 68 37 8028 68
#2 Steve 69 4 132205 69
DATA
mydf <- structure(list(name = c("John", "Steve"), url = c("<span class=\"incident-icon\" data-minute=\"68\" data-second=\"37\" data-id=\"8028\"></span><span class=\"name-meta-data\">68</span>",
"<span class=\"incident-icon\" data-minute=\"69\" data-second=\"4\" data-id=\"132205\"></span><span class=\"name-meta-data\">69</span>"
)), .Names = c("name", "url"), row.names = c(NA, -2L), class = "data.frame")
An alternate rvest approach using purrr and dplyr (note: by_row() has since moved from purrr to the purrrlyr package):
library(rvest)
library(purrr)
library(dplyr)
df <- read.table(stringsAsFactors=FALSE, header=TRUE, sep=",", text='name,html
John,<span class="incident-icon" data-minute="68" data-second="37" data-id="8028"></span><span class="name-meta-data">68</span>
Steve,<span class="incident-icon" data-minute="69" data-second="4" data-id="132205"></span><span class="name-meta-data">69</span>')
by_row(df, .collate = "cols",
       ~ read_html(.$html) %>%
         html_nodes("span:first-of-type") %>%
         html_attrs() %>%
         flatten_chr() %>%
         as.list() %>%
         flatten_df()) %>%
  select(-html, -class1) %>%
  setNames(gsub("^data-|1$", "", colnames(.)))
## # A tibble: 2 × 4
## name minute second id
## <chr> <chr> <chr> <chr>
## 1 John 68 37 8028
## 2 Steve 69 4 132205
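Since by_row() is no longer in purrr itself, here is a sketch of the same idea with map_dfr() and html_element() (both assume a current purrr and rvest >= 1.0):

library(rvest)
library(purrr)
library(dplyr)

# One row of attributes per html string, bound together
attrs <- map_dfr(
  df$html,
  ~ read_html(.x) %>%
    html_element("span") %>%   # first span, as in the original
    html_attrs() %>%
    as.list() %>%
    as_tibble()
)
bind_cols(df["name"], attrs)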
Regex is possible, but I prefer the rvest package for this. It would also be easier with data.table or dplyr, but let's do it in base R (on the off-chance that those are new concepts).
# Example data
df <- structure(list(name = c("John", "Steve"), html = c("<span class=\"incident-icon\" data-minute=\"68\" data-second=\"37\" data-id=\"8028\"></span><span class=\"name-meta-data\">68</span>",
"<span class=\"incident-icon\" data-minute=\"69\" data-second=\"4\" data-id=\"132205\"></span><span class=\"name-meta-data\">69</span>"
)), .Names = c("name", "html"), row.names = c(NA, -2L), class = "data.frame")
rvest lets us split this up using the DOM, which can be a lot nicer than working with regex for the same thing.
library(rvest)
# Get span attributes from each row:
spanattrs <-
  lapply(df$html,
         function(y) read_html(y) %>% html_node('span') %>% html_attrs())
# rbind to get a data.frame with all attributes
final <- data.frame(df, do.call(rbind,spanattrs))
> final
name html class
1 John <span class="incident-icon" data-minute="68" data-second="37" data-id="8028"></span><span class="name-meta-data">68</span> incident-icon
2 Steve <span class="incident-icon" data-minute="69" data-second="4" data-id="132205"></span><span class="name-meta-data">69</span> incident-icon
data.minute data.second data.id
1 68 37 8028
2 69 4 132205
Let's remove the html so it's a little nicer in the viewer here:
> final$html <- NULL
> final
name class data.minute data.second data.id
1 John incident-icon 68 37 8028
2 Steve incident-icon 69 4 132205
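One last touch: the attribute columns come back as character. A small base R sketch to convert the numeric-looking ones (assuming R >= 4.0, so the columns are character rather than factors):

# type.convert() turns "68" into 68 but leaves "incident-icon" as character
final[-1] <- lapply(final[-1], type.convert, as.is = TRUE)
str(final)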