Trying to append new rows to an ongoing spreadsheet using a function in R - html

I've been using the write.xlsx function to append new rows of data that I pull in via a HTML scrapper. For some reason though instead of pulling in the information from one url pasting it in the sheet and moving on to the next one it just paste the last url I put in the function.
I've tried writing a for loop in the actual code, getting rid of the for loop and calling the function for each individual url and putting the urls into a vector and using the lapply function on the vector. All of these methods "work" but have the same result.
urlpull <- function(site){
url <- site
webpage <- read_html(url)
tbls <- webpage %>% html_nodes("table") %>% html_table(header = FALSE, fill = TRUE)
tbls <- tbls %>% lmap( ~ set_names(.x, nm = pluck(.x, 1, 1, 1))) %>% map(~ set_names(.x, nm = .x[2, ]))
abbr <- as.data.frame(webpage %>% html_nodes('strong') %>% html_text() %>% .[5:6])
rec <- as.data.frame(webpage %>% html_nodes('div') %>% html_text() %>% .[c(26,33)])
date <- as.data.frame(webpage %>% html_nodes('div') %>% html_text() %>% .[36])
awaybas <- tbls %>% .[1]
awayadv <- tbls %>% .[2]
homebas <- tbls %>% .[3]
homeadv <- tbls %>% .[4]
ab1 <- as.data.frame(awaybas)
aa1 <- as.data.frame(awayadv)
hb1 <- as.data.frame(homebas)
ha1 <- as.data.frame(homeadv)
ab <- ab1[-c(1,2,8),]
aa <- aa1[-c(1,2,8),]
hb <- hb1[-c(1,2,8),]
ha <- ha1[-c(1,2,8),]
ab[,c(3:21)] <- sapply(ab[,c(3:21)], as.numeric)
aa[,c(3:16)] <- sapply(aa[,c(3:16)], as.numeric)
hb[,c(3:21)] <- sapply(hb[,c(3:21)], as.numeric)
ha[,c(3:16)] <- sapply(ha[,c(3:16)], as.numeric)
aa <- cbind(aa, abbr[1,], abbr[2,])
ab <- cbind(ab, abbr[1,], abbr[2,])
hb <- cbind(hb, abbr[2,], abbr[1,])
ha <- cbind(ha, abbr[2,], abbr[1,])
aa <- cbind(aa, rec[1,])
ab <- cbind(ab, rec[1,])
hb <- cbind(hb, rec[2,])
ha <- cbind(ha, rec[2,])
aa <- cbind(aa, date)
ab <- cbind(ab, date)
hb <- cbind(hb, date)
ha <- cbind(ha, date)
names(aa)[17:20]<-c("TEAM", "OPP", "RCRD", "DT")
names(ab)[22:25]<-c("TEAM", "OPP", "RCRD", "DT")
names(hb)[22:25]<-c("TEAM", "OPP", "RCRD", "DT")
names(ha)[17:20]<-c("TEAM", "OPP", "RCRD", "DT")
aa <- aa %>% separate("MP", c("min","sec"), sep = ":") %>% separate("RCRD", c("W","L"), sep= "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
ab <- ab %>% separate("MP", c("min","sec"), sep = ":") %>% separate("RCRD", c("W","L"), sep= "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
hb <- hb %>% separate("MP", c("min","sec"), sep = ":") %>% separate("RCRD", c("W","L"), sep= "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
ha <- ha %>% separate("MP", c("min","sec"), sep = ":") %>% separate("RCRD", c("W","L"), sep= "-") %>% separate("DT", c("time", "day", "year"), sep = ",") %>% unite(DT, c("day", "year", "time"), sep = ",") %>% mutate(DT = mdy_hm(DT))
aa[,c(20:21)] <- sapply(aa[,c(20:21)], as.numeric)
ab[,c(25:26)] <- sapply(ab[,c(25:26)], as.numeric)
hb[,c(25:26)] <- sapply(hb[,c(25:26)], as.numeric)
ha[,c(20:21)] <- sapply(ha[,c(20:21)], as.numeric)
aa <- aa %>% mutate(GAME = W + L)
ab <- ab %>% mutate(GAME = W + L)
hb <- hb %>% mutate(GAME = W + L)
ha <- ha %>% mutate(GAME = W + L)
aac <- aa[,-c(1:3)]
hac <- ha[,-c(1:3)]
am <- cbind(ab[,-c(23:28)],aac)
hm <- cbind(hb[,-c(23:28)],hac)
am <- am %>% mutate(LOCAL = "away")
hm <- hm %>% mutate(LOCAL = "home")
final <- rbind(am,hm)
print(final)
write.xlsx(final, "Book1.xlsx", sheetName= "Sheet1", col.names=TRUE, row.names=FALSE, append=TRUE, showNA= TRUE)
}
x <- c("https://www.basketball-reference.com/boxscores/201410280LAL.html", "https://www.basketball-reference.com/boxscores/201811140BRK.html")
lapply(x, urlpull)
I just want the final table from the output to be placed on the first row after the last table was placed there.

To get around this problem I kept the for loop and had the function row bind all the dataframes into one, before exporting it out. I did this by initiating a empty dataframe outside the for loop like this:
oldtable <- data.frame()
for (i in x) {
#Generic lines of code
final #table from one iteration of loop
oldtable <- rbind.data.frame(oldtable, final)
}
Using the rbind.data.frame function from base R got the results that I previously wanted.

Related

Error when web scraping in R: Error in UseMethod("xml_find_all") :

I wrote some code to webscrape air quality data in R. It worked perfectly fine and I had no issues. But now, when I recently reran it, I'm getting an error when using the html_nodes() function.
Here is my code:
library(rvest)
library(tidyverse)
library(lubridate)
## Download MOE Location Data
# https://stackoverflow.com/questions/25677035/how-to-create-a-range-of-dates-in-r
## Create a tibble of dates
start_date <- "2021/1/1"
end_date <- "2021/12/31"
dates <- seq(as.Date(start_date), as.Date(end_date), "days")
df <- NULL
for (datex in dates) {
datef = as.Date(datex, origin = "1970-01-01")
Day = day(datef)
Month = month(datef)
Year = year(datef)
for (hour in 1:24) {
url.new <-
paste(
"http://www.airqualityontario.com/aqhi/locations.php?start_day=",
Day,
"&start_month=",
Month,
"&start_year=",
Year,
"&my_hour=",
hour,
"&pol=36&text_only=1&Submit=Update",
sep = ""
)
download.file(url.new, destfile = "scrapedpage.html", quiet=TRUE)
simple <- read_html("scrapedpage.html")
test <- simple %>%
html_nodes("td") %>%
html_text()
test <- as_tibble(test)
df.temp <-
as.data.frame(matrix(
unlist(test, use.names = FALSE),
ncol = 3,
byrow = TRUE
)) %>%
mutate(date = paste(datef)) %>%
mutate(hour = hour)
df <- rbind(df, df.temp)
}
}
df <- as_tibble(df)
colnames(df) <- c("Station","Address","SurfaceConc","SurfaceDate","Hour")
MOE_data <- df %>%
filter(Address != "Bay St. Wellesley St. W.") %>%
select(-Address) %>%
mutate(Station = trimws(Station)) %>%
# filter(str_detect(Station, 'Toronto')) %>%
mutate(Hour = paste(Hour, ":00:00", sep = "")) %>%
mutate(Hour = hms::as_hms(Hour)) %>%
mutate(SurfaceDate = paste(SurfaceDate, Hour)) %>%
mutate(SurfaceDate = as_datetime(SurfaceDate)) %>%
select(-Hour)
MOE_data <- as_tibble(MOE_data)
rm(list=setdiff(ls(), "MOE_data_2021"))
# save.image(file='Jan2019_Dec2021.RData')
This is the error I get:
Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "xml_document"
What I don't understand is why it happens for some values, some of the time. For example, I get an error when the hour = 16. But when I rerun it, it may work, it's just not consistent.

delete_part deletes the top border when outputting pdf

I am using the following rmarkdown code, using xelatex engine:
access <- function(x, ...) {
x <- delete_part(x)
x <- colformat_double(x, big.mark = "'", decimal.mark = ",")
x <- set_table_properties(x, layout = "autofit")
x <- border_remove(x)
std_border <- fp_border_default(width = 1, color = "black")
x <- border_outer(x, part="all", border = std_border )
x <- border_inner_h(x, border = std_border, part="all")
x <- border_inner_v(x, border = std_border, part="all")
autofit(x)
}
firstc <- c("Field:","Table:","Sort:","Show:","Criteria:","Or:")
secondc <- c("Field:","Table:","Sort:","Show:","Criteria:","Or:")
```
```{r echo=FALSE}
tabela <- data.frame(firstc,secondc)
ft <- flextable(tabela)
ft <- access(ft)
ft <- hline_top(ft)
ft <- fit_to_width(ft, max_width = 4)
ft <- set_table_properties(ft, layout = "autofit", width = 1)
ft
```
However, the top hline does not show up in the PDF output.
Any ideas?

Is there a reason the xgboost code snippet from the usemodels package has one_hot set to TRUE?

Is there a reason the recipe code snippet for xgboost classifier has one_hot = TRUE? This creates "n" dummy variables instead of "n-1". I usually set it to FALSE but just want to make sure I'm not missing something.
Code -
data <- mtcars %>%
as_tibble() %>%
mutate(cyl = cyl %>% as.factor)
usemodels::use_xgboost(mpg ~ cyl, data = data)
Output -
xgboost_recipe <-
recipe(formula = mpg ~ cyl, data = data) %>%
step_novel(all_nominal(), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_zv(all_predictors())
xgboost_spec <-
boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune(), sample_size = tune()) %>%
set_mode("regression") %>%
set_engine("xgboost")
xgboost_workflow <-
workflow() %>%
add_recipe(xgboost_recipe) %>%
add_model(xgboost_spec)
set.seed(28278)
xgboost_tune <-
tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
The idea there is that, as a tree-based model, xgboost can handle all the levels (unlike a linear model) and can actually require more splits to fit well if you don't include all the categories. Read more about this here.
You don't see the same for the ranger random forest because it can handle factors natively.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
cars <- as_tibble(mtcars) %>%
mutate(cyl = cyl %>% as.factor)
usemodels::use_ranger(mpg ~ cyl, data = cars)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
#> ranger_recipe <-
#> recipe(formula = mpg ~ cyl, data = cars)
#>
#> ranger_spec <-
#> rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
#> set_mode("regression") %>%
#> set_engine("ranger")
#>
#> ranger_workflow <-
#> workflow() %>%
#> add_recipe(ranger_recipe) %>%
#> add_model(ranger_spec)
#>
#> set.seed(54153)
#> ranger_tune <-
#> tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
Created on 2021-04-07 by the reprex package (v2.0.0)

Multiple Flextables in RMarkdown chunk (with echo = F) do not render

I am using R 3.5.1 on OS Windows 7 x64.
My package versions are:
flextable: 0.5.1
officer: 0.3.2
tidyverse: 1.2.1
tidyr: 0.8.2
I am trying to create and then print multiple flextables in a single chunk to Word. I also do not want the actual R code in the Word Doc, so I've set echo = FALSE. However, I've run into something strange.
If I use the following code (echo = T):
```{r, echo = F, warning=FALSE, message=FALSE}
library(tidyverse)
library(flextable)
library(officer)
```
```{r, echo = T}
iris_data <- head(iris, n = 10)
iris_data_ft <- iris_data %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data_ft
iris_data2 <- tail(iris, n = 10)
iris_data2_ft <- iris_data2 %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data2_ft
```
I get the following output of proper flextables in Word:
If I use the exact same code, but with echo = F:
```{r, echo = F}
iris_data <- head(iris, n = 10)
iris_data_ft <- iris_data %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data_ft
iris_data2 <- tail(iris, n = 10)
iris_data2_ft <- iris_data2 %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data2_ft
```
I get the following in Word (this goes on for 58 pages):
Finally, if I split the one chunk for the two flextables into two separate chunks and set echo = F:
```{r, echo = F}
iris_data <- head(iris, n = 10)
iris_data_ft <- iris_data %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data_ft
```
```{r, echo = F}
iris_data2 <- tail(iris, n = 10)
iris_data2_ft <- iris_data2 %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data2_ft
```
The tables render just fine:
While I don't mind separating the flextables into different chunks, I am wondering why this is happening. Any guidance would be greatly appreciated. Thanks!
Here is a workaround:
---
title: "Untitled"
output: word_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(flextable)
```
```{r echo=FALSE, results='asis'}
library(htmltools)
ft <- flextable(head(iris))
for(i in 1:3){
cat("```{=openxml}\n")
cat(format(ft, type = "wml"), "\n")
cat("```\n")
}
```
I guess this issue is solved with newer versions of flextable already. If I put the code following two chunks into a .Rmd document and knit to word it compiles without problems and shows tables, even if put into one chunk with echo = F. Just wanted to add: In some cases it might be helpful to add results='asis' - the 'as is'-option (raw Markdown parsing) - to your chunk header to prevent errors when parsing tables, check: bookdown.org/results-as-is.
```{r, echo = F, warning=FALSE, message=FALSE}
library(tidyverse)
library(flextable)
library(officer)
```
```{r, echo = F}
iris_data <- head(iris, n = 10)
iris_data_ft <- iris_data %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data_ft
iris_data2 <- tail(iris, n = 10)
iris_data2_ft <- iris_data2 %>% flextable() %>%
hline(part = 'header', border = fp_border(color = "black", width = 3)) %>%
align(align ='center', part = 'all') %>%
align(j = 1, align ='left', part = 'all') %>%
add_header_lines('this is a test')
iris_data2_ft
```

Extract nested JSON from R dataframe without knowing keys

I am trying to extract JSON from a TSV column. The difficulty is the JSON is shallowly nested, and the key values may not be present in every row.
I have a minimal example to illustrate my point.
df <- tibble(index = c(1, 2),
data = c('{"json_char":"alpha", "json_list1":["x","y"]}',
'{"json_char":"beta", "json_list1":["x","y","z"], "json_list2":["a","b","c"]}'))
The desired result:
df <- tibble::tibble(index = list(1, 2),
json_char = list("alpha", "beta"),
json_list1 = list(list("x","y"), list("x","y","z")),
json_list2 = list(NA, list("a","b","c")))
After a fair amount of experimentation, I have this function:
extract_json_column <- function(df) {
df %>%
magrittr::use_series(data) %>%
purrr::map(jsonlite::fromJSON) %>%
purrr::map(purrr::simplify) %>%
tibble::enframe() %>%
tidyr::spread("name", "value") %>%
purrr::flatten_dfr()
}
Which gives me the following error: Error in bind_rows_(x, .id) : Argument 2 must be length 3, not 7.
The first row sets the number of parameters for the rest of dataframe. Is there anyway to avoid that behavior?
I modified your function to the following. I hope this helps.
library(tidyverse)
library(rjson)
extract_json_column <- function(df){
df %>%
rowwise() %>%
mutate(data = map(data, fromJSON)) %>%
split(.$index) %>%
map(~.$data[[1]]) %>%
map(~map_if(., function(x) length(x) != 1, list)) %>%
map(as_data_frame) %>%
bind_rows(.id = "index")
}
extract_json_column(df)
# A tibble: 2 x 4
index json_char json_list1 json_list2
<chr> <chr> <list> <list>
1 1 alpha <chr [2]> <NULL>
2 2 beta <chr [3]> <chr [3]>