save html from data.frames to pdf using wkhtmltopdf or Markdown - html

I have a df with a column htmltext containing html text that I would like to print (as a batch if possible) as single PDFs with doc_id as filename.
Can I do that directly within R?
I thought about something like
> system("wkhtmltopdf --javascript-delay 1 in.html out.pdf")
how can I implement that in R?
or is there another easy way to to so using markdown for example.
# df
doc_id <- c("doc1","doc2","doc3")
htmltext <- c("<b>good morning</b>","<b>This text is bold</b>","<b>good evening</b>")
df <- data.frame(doc_id,htmltext, stringsAsFactors = FALSE)
# save htmltext single pdfs with doc_id as filename
filenames = filenames = df$doc_id
...?

See if one of these is acceptable:
library(rmarkdown)
library(decapitated) # devtools::install_github("hrbrmstr/decapitated") # requires Chrome
data.frame(
doc_id = c("doc1", "doc2", "doc3"),
htmltext = c("<b>good morning</b>", "<b>This text is bold</b>", "<b>good evening</b>"),
stringsAsFactors = FALSE
) -> xdf
# hackish pandoc way
for(i in 1:nrow(xdf)) {
message(sprintf("Processing %s", xdf$doc_id[i]))
tf <- tempfile(fileext=".html")
writeLines(xdf$htmltext[i], tf)
pandoc_convert(
input = tf,
to = "latex",
output = sprintf("%s.pdf", xdf$doc_id[i]),
wd = getwd()
)
unlink(tf)
}
# using headless chrome
for(i in 1:nrow(xdf)) {
message(sprintf("Processing %s", xdf$doc_id[i]))
tf <- tempfile(fileext=".html")
writeLines(xdf$htmltext[i], tf)
chrome_dump_pdf(sprintf("file://%s", tf), path=sprintf("%s.pdf", xdf$doc[i]))
unlink(tf)
}

Related

Shiny - assigning argument to function

I am trying to create a shiny app that applies a self-made function to an uploaded dataset, then allows to download the modified results. Here is my code:
library(shiny)
library(tidyverse)
namkurz <- function(data, a_spalte) {
kuerzel <- vector(length = length(data$a_spalte))
for (i in 1:length(data$a_spalte)){
spez = data$Art[i]
s = unlist(strsplit(spez, " ", fixed = TRUE))
s = substr(s, 1, 2)
s = paste(s, collapse = ' ')
kuerzel[[i]] = s
}
data <- data %>%
mutate(kurz = kuerzel)
}
ui <- fluidPage(
fileInput('upload','Deine Kartierungsdaten'),
textInput('art', 'Wie heißt die Spalte mit Artnamen?'),
downloadButton('analyse','Artenkürzel hinzufügen')
)
server <- function(input, output, session) {
data <- reactive({
req(input$upload)
ext <- tools::file_ext(input$upload$name)
switch(ext,
csv = vroom::vroom(input$upload$datapath, delim = ";"),
validate("Invalid file; Please upload a .csv file")
)
})
art <- reactive(input$art)
output$analyse <- downloadHandler(
filename = function() {
paste0('mit_kuerzel', ".csv")
},
content = function(file) {
ergebnis <- reactive(namkurz(data(), art()))
vroom::vroom_write(ergebnis(), file)
}
)
}
shinyApp(ui, server)
When trying to save the output I get a 'Warning: Unknown or uninitialised column:' error. I think my problem is in the assignment of argument 'art' to the 'ergebnis' object, but I can't find the way to fix it.
I recommend a few things:
(Required) In your function, a_spalte is a character vector and not the literal name of column in the frame, so you need to use [[ instead of $, see The difference between bracket [ ] and double bracket [[ ]] for accessing the elements of a list or dataframe and Dynamically select data frame columns using $ and a character value.
Change all references of data$a_spalte to data[[a_spalte]].
namkurz <- function(data, a_spalte) {
kuerzel <- vector(length = length(data[[a_spalte]]))
for (i in 1:length(data[[a_spalte]])){
spez = data$Art[i]
s = unlist(strsplit(spez, " ", fixed = TRUE))
s = substr(s, 1, 2)
s = paste(s, collapse = ' ')
kuerzel[[i]] = s
}
data <- data %>%
mutate(kurz = kuerzel)
}
Your function is a bit inefficient doing things row-wise, we can vectorize that operation.
namkurz <- function(data, a_spalte) {
spez <- strsplit(data$Art, " ", fixed = TRUE)
data$kurz <- sapply(spez, function(z) paste(substr(z, 1, 2), collapse = " "))
data
}
(Optional) The content= portion of downloadHandler is already reactive, you do not need to wrap namkurz in reactive. Because of this, you also don't need to treat ergebnis as reactive.
output$analyse <- downloadHandler(
filename = ...,
content = function(file) {
ergebnis <- namkurz(data(), art())
vroom::vroom_write(ergebnis, file)
}
)
(Optional) Your output filename is fixed, so two things here: if it's always going to be "mit_kuerzel.csv", then there's no need for paste0, just use function() "mit_kuerzel.csv".
However, if you are intending to return a file named something based on the original input filename, one could do something like:
filename = function() {
paste0(tools::file_path_sans_ext(basename(input$upload$name)),
"_mit_kuerzel.",
tools::file_ext(input$upload$name))
},
to add _mit_kuerzel to the base portion of the uploaded filename. Note that the file in the content= section is never this name, the new_mit_kuerzel.csv is the filename offered to the downloading browser as a suggestion, that is all.
(Optional) You are using a .csv file extension in the downloadHandler, but the default for vroom::vroom_write is to use delim = "\t", which is not a CSV. I suggest either adding delim = ";" (or similar), or changing the returned filename extension to .tsv instead.

kableExtra::text_spec - Rotate Text - Unwanted Commas

Using RSTudio > Blogdown > Hugo to create a blog
Inserting this R in a post. When the HTML is rendered there are commas between the rotated letters. Why is that?
library("knitr")
library("kableExtra")
library("dplyr")
library("formattable")
library("stringr")
library("tidyverse")
p1 <- c("R Markdown is pretty neat. You can do things like this. I wonder why more people don't")
p1 <- c("Hello World!")
p2 <- c("do this. It's so much easier to read. NOTE: Those people live here.")
p_text <- unlist(strsplit(p1, "")) # strsplit returns a list. Make it a vector.
num_char <- length(p_text)
p_angle <- seq(30, 360, 30)
num_ang <- length(p_angle)
p_angle_long <- rep(p_angle, ceiling(num_char / num_ang)) # Repeat the angles for the length of the string
p_angle_long <- p_angle_long[1:num_char]
rtext <- text_spec(p_text, "html", bold = TRUE, angle = p_angle_long)
The output of text_spec is a vector with each letter (+ accompanying HTML tags) as a separate element. You can combine into a single string with paste0:
# Example RMarkdown chunk that produces rotated text:
```{r txt, results='asis'}
library("knitr")
library("kableExtra")
library("tidyverse")
p1 <- c("Hello World!")
p2 <- c("do this. It's so much easier to read. NOTE: Those people live here.")
p_text <- unlist(strsplit(p1, "")) # strsplit returns a list. Make it a vector.
num_char <- length(p_text)
p_angle <- seq(30, 360, 30)
num_ang <- length(p_angle)
p_angle_long <- rep(p_angle, ceiling(num_char / num_ang))
# Repeat the angles for the length of the string
p_angle_long <- p_angle_long[1:num_char]
rtext <- text_spec(p_text, "html", bold = TRUE, angle = p_angle_long)
cat(paste0(rtext, collapse=""))
```

Read HTML into R

I would like R to take a word in a column in a dataset, and return a value from a website. The code I have so far is below. So, for each word in the data frame column, it will go to the website and return the pronunciation (for example, the pronunciation on http://www.speech.cs.cmu.edu/cgi-bin/cmudict?in=word&stress=-s is "W ER1 D"). I have looked at the HTML of the website, and it's unclear what I would need to enter to return this value - it's between <tt> and </tt> but there are many of these. I'm also not sure how to then get that value into R. Thank you.
library(xml2)
for (word in df$word) {
result <- read_html("http://www.speech.cs.cmu.edu/cgi-bin/cmudict?in="word"&stress=-s")
}
Parsing HTML is a tricky task in R. There are a couple ways though. If the HTML converts well to XML and the website/API always returns the same structure then you can use tools to parse XML. Otherwise you could use regex and call stringr::str_extract() on the HTML.
For your case, it is fairly easy to get the value you're looking for using XML tools. It's true that there are a lot of <tt> tags but the one you want is always in the second instance so you can just pull out that one.
#load packages. dplyr is just to use the pipe %>% function
library(httr)
library(XML)
library(dplyr)
#test words
wordlist = c('happy', 'sad')
for (word in wordlist){
#build the url and GET the result
url <- paste0("http://www.speech.cs.cmu.edu/cgi-bin/cmudict?in=",word,"&stress=-s")
h <- handle(url)
res <- GET(handle = h)
#parse the HTML
resXML <- htmlParse(content(res, as = "text"))
#retrieve second <tt>
print(getNodeSet(resXML, '//tt[2]') %>% sapply(., xmlValue))
#don't abuse your API
Sys.sleep(0.1)
}
>[1] "HH AE1 P IY0 ."
>[1] "S AE1 D ."
Good luck!
EDIT: This code will return a dataframe:
#load packages. dplyr is just to use the pipe %>% function
library(httr)
library(XML)
library(dplyr)
#test words
wordlist = c('happy', 'sad')
#initializae the dataframe with pronunciation field
pronunciation_list <- data.frame(pronunciation=character(),stringsAsFactors = F)
#loop over the words
for (word in wordlist){
#build the url and GET the result
url <- paste0("http://www.speech.cs.cmu.edu/cgi-bin/cmudict?in=",word,"&stress=-s")
h <- handle(url)
res <- GET(handle = h)
#parse the HTML
resXML <- htmlParse(content(res, as = "text"))
#retrieve second <tt>
to_add <- data.frame(pronunciation=(getNodeSet(resXML, '//tt[2]') %>% sapply(., xmlValue)))
#bind the data
pronunciation_list<- rbind(pronunciation_list, to_add)
#don't abuse your API
Sys.sleep(0.1)
}

R highcharter get data from plots saved as html

I plot data with highcharter package in R, and save them as html to keep interactive features. In most cases I plot more than one graph, therefore bring them together as a canvas.
require(highcharter)
hc_list <- lapply(list(sin,cos,tan,tanh),mapply,seq(1,5,by = 0.1)) %>%
lapply(function(x) highchart() %>% hc_add_series(x))
hc_grid <- hw_grid(hc_list,ncol = 2)
htmltools::browsable(hc_grid) # print
htmltools::save_html(hc_grid,"test_grid.html") # save
I want to extract the data from plots that I have saved as html in the past, just like these. Normally I would do hc_list[[1]]$x$hc_opts$series, but when I import html into R and try to do the same, I get an error. It won't do the job.
> hc_imported <- htmltools::includeHTML("test_grid.html")
> hc_imported[[1]]$x$hc_opts$series
Error in hc_imported$x : $ operator is invalid for atomic vectors
If I would be able to write a function like
get_my_data(my_imported_highcharter,3) # get data from 3rd plot
it would be the best. Regards.
You can use below code
require(highcharter)
hc_list <- lapply(list(sin,cos,tan,tanh),mapply,seq(1,5,by = 0.1)) %>%
lapply(function(x) highchart() %>% hc_add_series(x))
hc_grid <- hw_grid(hc_list,ncol = 2)
htmltools::browsable(hc_grid) # print
htmltools::save_html(hc_grid,"test_grid.html") # save
# hc_imported <- htmltools::includeHTML("test_grid.html")
# hc_imported[[1]]$x$hc_opts$series
library(jsonlite)
library(RCurl)
library(XML)
get_my_data<-function(my_imported_highcharter,n){
webpage <- readLines(my_imported_highcharter)
pagetree <- htmlTreeParse(webpage, error=function(...){})
body <- pagetree$children$html$children$body
divbodyContent <- body$children$div$children[[n]]
script<-divbodyContent$children[[2]]
data<-as.character(script$children[[1]])[6]
data<-fromJSON(data,simplifyVector = FALSE)
data<-data$x$hc_opts$series[[1]]$data
return(data)
}
get_my_data("test_grid.html",3)
get_my_data("test_grid.html",1)

Loading json encoded log files into R for analysis

I have a log file with each line a json-encoded entry:
{"requestId":"5550d","partnerId":false,"ip":"170.158.3.1", ... }
I tried reading each line, json_decode and then append to the dataframe:
loadLogs <- function(fileName) {
conn <- file(fileName, "r", blocking = FALSE)
linn <- readLines(conn)
long <- length(linn)
df = data.frame(requestId = character(0),
partnerId = character(0),
ip = character(0)
)
for (i in 1:long) {
jsonRow <- fromJSON(linn[i])
df <- rbind(df, data.frame(requestId = jsonRow$requestId,
partnerId = as.character(jsonRow$partnerId),
ip = jsonRow$ip
}
close(conn)
return(df)
}
The above code is extremely slow for large files though. Is there any way to speed this up? A few options I can think of at the moment:
pre-allocate the dataframe as its copying the entire data for a new append
using apply function on the json_decode
???
How would I do (1) and (2) in R? I'm new.
Thanks for looking at my question.