Replacing NA columns with Median in R - regression

I keep getting errors with the codes, which would be correct?

The first one is correct, but generally you don't want to use for loops in R. Try using apply.
Answer
v <- runif(10000)
v[sample(1000,120)] <- NA
mydata <- as.data.frame(matrix(v,10,100))
for (i in 1:ncol(mydata)){
mydata[is.na(mydata[,i]), i] <- median(mydata[,1], na.rm = TRUE)
}
Apply Solution
v <- runif(10000)
v[sample(1000,120)] <- NA
mydata <- as.data.frame(matrix(v,10,100))
medrep <- function(x){
x[is.na(x)] <- median(x, na.rm=TRUE)
x
}
dat <- data.frame(apply(mydata, 2, medrep))

Related

R Shiny: How can we store values computed in an output, to be reused in an other output?

I'm developing at the moment my first Shiny app and I have a little issue:
In the "server part" of the app, I have 3 output objects (renderTables) named:
output$results
output$results2
output$results3
In the two first outputs, I compute some parameters using information comming from inputs. These computations are done in a conditionnal loop (if/if else) and for some of them stock in a variable "Resultats".
What I want to do is to reuse these parameters in the third output ("output$results3").
Is there a way to store these parameters when computing them and then to reuse them ?
As an example: I want to be able to use the value of "SP" (computed in out$results2) in the output$results3.
If I don't store them, the output$results3 just can't find the parameters...
I have already tried reactivevalues() function but it didn't work.
Do you have an idea how to do it?
# Define server logic to summarize and view selected dataset
server <- function(input, output) {
# 1. Calculs et formattage des résultats
#Demande
output$results <- renderTable({
if (input$shape == "QofPD") {
a <- input$a
b <- input$b
}
if (input$shape == "PofQD") {
a <- input$a/input$b
b <- 1/input$b
}
Q0 <- max(0,a - b*input$P0)
epd <- -b*input$P0/Q0
SC <- (a/b-input$P0)*Q0/2
DT0 <- input$P0*Q0
Variables <- c("Demande:", "Demande inverse:", "Prix (P*) = ","Quantité (Q*) = ",paste0("Elasticité-prix (",intToUtf8(949),") = "), "Dépense totale (DT*) = ", "Surplus du consommateur (SC) = ")
Resultats <- c(paste0("Q = ",round(a,2)," - ",round(b,2),"P"),paste0("P = ",round(a/b,2)," - ",round(1/b,2),"Q"),input$P0,round(Q0,2),round(epd,2),round(DT0,2),round(SC,2))
df.results <- data.frame(Variables, Resultats)
df.results
},
striped=TRUE, colnames=FALSE)
# Offre
output$results2 <- renderTable({
if (input$shape2 == "QofPO") {
a2 <- input$a2
b2 <- input$b2
inter <- -a2/b2
slope <- 1/b2
Q02 <- max(0,a2+b2*input$P0)
epo <- b2*input$P0/Q02
SP <- (Q02+a2)/2*input$P0
RT0 <- input$P0*Q02
Variables <- c("Offre:", "Offre inverse:", "Prix (P*) = ","Quantité (Q*) = ",paste0("Elasticité-prix (",intToUtf8(949),") = "), "Recette totale (RT*) = ", "Surplus du producteur (SP) = ")
Resultats <- c(paste0("Q = ",a2," + ",b2,"P"),paste0("P = ",round(slope,2),"Q"," ",round(inter,2)),input$P0,round(Q02,2),round(epo,2),round(RT0,2),round(SP,2))
df.results <- data.frame(Variables, Resultats)
df.results
}
else if (input$shape2 == "PofQO") {
a2 <- input$a2
b2 <- input$b2
inter <- a2
slope <- b2
Q02 <- max(0,(input$P0-a2)/b2)
epo <- (1/b2)*input$P0/Q02
SP <- (input$P0-inter)*Q02/2
RT0 <- input$P0*Q02
Variables <- c("Offre:", "Offre inverse:", "Prix (P*) = ","Quantité (Q*) = ",paste0("Elasticité-prix (",intToUtf8(949),") = "), "Recette totale (RT*) = ", "Surplus du producteur (SP) = ")
Resultats <- c(paste0("Q = ",round(-a2/b2,2)," + ",round(1/b2,2),"P"),paste0("P = ",round(inter,2)," + ",round(slope,2),"Q"),input$P0,round(Q02,2),round(epo,2),round(RT0,2),round(SP,2))
df.results <- data.frame(Variables, Resultats)
df.results
}
},
striped=TRUE, colnames=FALSE)
#Equilibre
output$results3 <- renderTable({
# Here I want to enter the parameters SP / SC...etc to be able to make new computation.
},
striped=TRUE, colnames=FALSE)
}
shinyApp(ui=ui, server=server)
I hope I was clear enough.
Thanks for your help.
And I stay available for more information if needed
Valentin

perform unite function using only dplyr commands?

Is there any way to concatenate two variables together using only dplyr commands?
For example:
df <- mtcars
df <- select(df, mpg, cyl)
df$mpg <- as.character(df$mpg)
df$cyl <- as.character(df$cyl)
df <- unite(df, "new_var", c(mpg, cyl), sep="", remove = FALSE)
view(df)
I realize that the unite function makes this very simple, but I can't use it as I'm trying to do this operation on an object of type tbl_MariaDBConnection, which, as far as I understand, can only be operated on with dplyr commands. Thanks!
Using mutate+paste0 will do the trick
df <- mtcars
df <- select(df, mpg, cyl)
df$mpg <- as.character(df$mpg)
df$cyl <- as.character(df$cyl)
df <- df %>%
mutate(new_var = paste0(mpg, cyl))
View(df)
For all dplyr : transmute = mutate, but deletes everything else.
df <- mtcars %>%
dplyr::transmute(
mpg = as.character(mpg),
cyl = as.character(cyl),
new_var = paste0(mpg, cyl)
)
df

Tabulizing data off website PDFs (w/ various formats) assigning each event values according to HTML link titles

I've been trying to automate the process of manually typing down the data from the ATF's trace data site (see "URL") but it's been a fairly big pain as I've only been able to collect the each URL link that holds the PDFs and assign it to its correct State/Territory and Year. The newer files 2017-2019 have data "tables that are relatively easier to pull data from compared to 2014-2016, e.g. I'm referring to page 10 of the Trace Data report 2019 and Trace Data report 2014.
It's the latter that I'm having trouble with the most as the data is not stored in something that looks like a table but surrounding a (!)pie-chart. There have been some promising R packages such as "pdftools" and "tesseract". But I'm very much an amateur when it comes to trouble-shooting advanced analytical packages such as these.
It's my guess that I'm still a ways off from where I want to be with the final product as I would need to mine the bottom text of page 10 to find how many "other" weapons were traced to a city, as well the number of weapons where a recovery city couldn't be determined. But if anyone has any suggestions on what I could try next or to even make the working code more efficient, I'd appreciate it.
URL <- "https://www.atf.gov/resource-center/data-statistics"
html <- paste(readLines(URL))
library(xml2)
library(tidyverse)
library(rvest)
library(stringr)
x <- c('\t\t\t<div>([^<]*)</div>','\t\t</tr><tr><td>([^<]*)</td>','\t\t\t<td>([^<]*)</td>')
r <- read_html(URL) %>% html_nodes("a") %>% map_df(~{
Link <- .x %>% html_attr("href")
Title <- .x %>% html_text()
data_frame(Link, Title)
}) %>%
dplyr::filter(grepl('node',Link, fixed = T))
r <- as.data.frame(r)
x <- c('<ul><li>([^<]*)</li>','\t<li>([^<]*)</li>')
states <- c('Alabama','Alaska','Arizona','Arkansas','California','Colorado','Connecticut','Delaware','District of Columbia','Florida','Georgia','Guam & Northern Mariana Islands','Hawaii','Idaho','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana','Maine','Maryland','Massachusetts','Michigan','Minnesota','Mississippi','Missouri','Montana','Nebraska','Nevada','New Hampshire','New Jersey','New Mexico','New York','North Carolina','North Dakota','Ohio','Oklahoma','Oregon','Pennsylvania','Puerto Rico','Rhode Island','South Carolina','South Dakota','Tennessee','Texas','Utah','Vermont','Virginia','Washington','West Virginia','Wisconsin','Wyoming')
s <- list()
for(i in 1:nrow(r)){
s[[i]] <- read_html(r$Link[i]) %>% html_nodes("a") %>% map_df(~{
Link <- .x %>% html_attr("href")
Title <- .x %>% html_text()
data_frame(Link, Title)
}) %>% mutate(Year <- r$Title[i]) %>%
dplyr::filter(Title %in% states | str_detect(Title, "Virgin Islands")) %>%
dplyr::filter(grepl('download',Link, fixed = T))
trace_list = do.call(rbind, s)
}
names(trace_list)[3] <- "Year"
Progress so far...
library(pdftools)
pdf_file <- "https://www.atf.gov/file/146951/download"
text <- pdf_text(pdf_file)
cat(text[10])
vtext <- as.list(str_split(text[10],"\n"))
x <- data.frame(matrix(unlist(vtext), nrow=length(vtext), byrow=TRUE),stringsAsFactors=FALSE)
x1 <- pivot_longer(x, cols = 1:length(x),names_to="X1",values_to="X2")
x1$X2 <- trimws(x1$X2)
x1 <- x1[c(8,12),]
x1[1,2] <- sub(" ","_",x1[1,2],fixed=T)
library(splitstackshape)
x1 <- as.data.frame(cSplit(x1, 'X2', sep=" ", type.convert=FALSE))
x1 <- x1[,c(2:length(x1))]
colnames(x1) <- x1[1,]
x1 <- x1[-1, ]
x2 <- pivot_longer(x1, cols = 1:length(x1),names_to="city",values_to="count")
mixing both pdftools amd tesseract...
library(tesseract)
img_file <- pdftools::pdf_convert("https://www.atf.gov/file/89621/download", format = 'tiff', dpi = 400)
text <- ocr(img_file)
strsplit(text[10],"\n")
Expected output:
year
state
city
count
2019
AL
Birmingham
100
2018
CA
Los Angeles
200
2017
CA
None
30
2017
CA
Other
400

R: Vector of JSONs to data.frame

I have a vector of JSONs (of the same structure) and transform it to a data.frame. Following example does exactly what I want.
require(jsonlite) # fromJSON()
require(magrittr) # for the pipeline only
require(data.table) # rbindlist()
jsons <- c('{"num":1,"char":"a","list":{"x":1,"y":2}}',
'{"num":2,"char":"b","list":{"x":1,"y":2}}',
'{"num":3,"char":"c","list":{"x":1,"y":2}}')
df <- jsons %>%
lapply(fromJSON) %>%
lapply(as.data.frame.list, stringsAsFactors = F) %>%
rbindlist(fill = T)
Some elements of the JSON are objects, i.e. if I transform it fromJSON() some elements of the list will be lists as well. I cannot use unlist() to each list because I have different variable types so I am using as.data.frame.list() function. This is however too slow to do for each JSON individually. Is there a way how can I do it more effectively?
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
system.time(
df <- json %>% rep(1000) %>%
lapply(fromJSON) %>%
lapply(as.data.frame.list, stringsAsFactors = F) %>%
rbindlist(fill = T)
) # 2.72
I know that there are plenty of similar questions but most of the answers I saw was about using as.data.frame() or data.frame(). Nobody mentioned the speed. Maybe there is no better solution to this.
I finally found the answer. It will be on CRAN soon.
devtools::install_github("jeremystan/tidyjson")
tidyjson::spread_all()
This function is about 10-times faster than my example above.
Try to collapse all JSONs in the one string. Let's show example of the solution:
require(jsonlite)
require(data.table)
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
n <- 1000
ex <- rep(json, 1000)
f1 <- function(x) {
res <- lapply(x, fromJSON)
res <- lapply(res, as.data.frame.list, stringsAsFactors = FALSE)
res <- rbindlist(res, fill = TRUE)
return(res)
}
f2 <- function(x) {
res <- fromJSON(paste0("[", paste(x, collapse = ","), "]"), flatten = TRUE)
lst <- sapply(res, is.list)
res[lst] <- lapply(res[lst], function(x) as.data.table(transpose(x)))
res <- flatten(res)
return(res)
}
bench::mark(
f1(ex), f2(ex), min_iterations = 100, check = FALSE
)
#> # A tibble: 2 x 14
#> expression min mean median max `itr/sec` mem_alloc n_gc n_itr #> total_time result memory time
#> <chr> <bch:t> <bch:t> <bch:t> <bch:tm> <dbl> <bch:byt> <dbl> <int> #> <bch:tm> <list> <list> <lis>
#> 1 f1(ex) 2.27s 2.35s 2.32s 2.49s 0.425 0B 5397 100 #> 3.92m <data… <Rpro… <bch…
#> 2 f2(ex) 48.85ms 63.78ms 57.88ms 116.19ms 15.7 0B 143 100 #> 6.38s <data… <Rpro… <bch…
#> # … with 1 more variable: gc <list>

r knit html How can I create a simple frequency table like the ones we see on books?

I need to create a simply frequency table, like those ones we see on statistical books.
I am working with this data:
Notas <- c(64,78,66,82,74,103,78,86,103,87,73,95,82,89,73,92,85,80,81,90,78,86,78,101,85,98,75,73,90,86,86,84,86,76,76,83,103,86,84,85,76,80,92,102,73,87,70,85,79,93,82,90,83,81,85,72,81,96,81,85,68,96,86,70,72,74,84,99,81,89,71,73,63,105,74,98,78,78,83,96,95,94,88,62,91,83,98,93,83,76)
And so far I did this:
library('dplyr')
tabela <- as.data.frame(Notas)
tabela <- tabela %>%
mutate(fr=round(prop.table(Notas),digits=2),
fr_perc = round(prop.table(Notas)*100,digits=2))
But I cannot believe there isn't an easier way, as its taking too much effort. So I believe I don't know a function which would do this in a very easier way.
My intended result is this:
My main pain is being to show column two and make data aggregate as suggested by it.
This table is to be used with statistics 101 class students, to show then a question's answer, that's why I need to show all those columns, as thats the way they can double check their results.
After some tries I finally come up with a solution. Isn't pretty but it worked like a charm, so I used it. Anyway if someone knows of a better solution, please, feel free to post it here.
library(knitr)
library(dplyr)
Notas <- c(64,78,66,82,74,103,78,86,103,87,73,95,82,89,73,92,85,80,81,90,78,86,78,101,85,98,75,73,90,86,86,84,86,76,76,83,103,86,84,85,76,80,92,102,73,87,70,85,79,93,82,90,83,81,85,72,81,96,81,85,68,96,86,70,72,74,84,99,81,89,71,73,63,105,74,98,78,78,83,96,95,94,88,62,91,83,98,93,83,76)
int_clas <- floor(1+3.3*log10(length(Notas)))
ampl_amos <- ceiling((max(Notas) - min(Notas)) / int_clas)
i <- c(1:int_clas)
tabela <- as.data.frame(i)
base <- array(0,int_clas)
topo <- array(0,int_clas)
notas <- array(0,int_clas)
xi <- array(0,int_clas)
fi <- array(0,int_clas)
Fi <- array(0,int_clas)
Fri <- array(0,int_clas)
Fri_perc <- array(0,int_clas)
for (z in 1:int_clas) {
base[z] <- cbind(min(Notas)+(ampl_amos*(z-1)))
topo[z] <- cbind(min(Notas)+(ampl_amos*z))
notas[z] <- cbind(paste(as.character(base[z])," → ", as.character(topo[z]), sep = " "))
xi[z] <- cbind(ceiling((base[z]+topo[z])/2))
fi[z] <- cbind(sum(Notas>base[z]-1 & Notas<topo[z]))
Fi[z] <- cbind(sum(fi[1:z]))
}
tabela <- tabela %>%
mutate(notas,xi,fi,xifi=xi*fi,fri=fi/sum(fi),fri_perc=fi/sum(fi)*100,Fi)
for (z in 1:int_clas) {
Fri[z] <- cbind(sum(tabela$fri[1:z]))
Fri_perc[z] <- cbind(sum(tabela$fri_perc[1:z]))
}
tabela <- tabela %>%
mutate(Fri,Fri_perc)
kable(tabela,digits = 2)
The "& ra rr;" you can see in code is a html code to insert an arrow like this (→). As I am knitting it to html, I used this code.