Rmarkdown - code run ok at script, but when knit gives error - html

I have one Rmarkdown document, that was given to me and worked fine with the person who given to me.
But when I do with my datas, I dont know what's going on, when I run the exactly code with script document works well, but when I try to knit to html, give me an error.
bhv_df <- plyr::ddply(bhv_df, ~segmentid, function(d){
d= bhv_df[bhv_df$segmentid == bhv_df$segmentid[1],
# predictions are made based on the mid time between start and end of the message
predObj <- crawl::crwPredict(object.crwFit = crawl_models_list[[d$segmentid[1]]], predTime = d$MidTime, speedEst=TRUE, flat=TRUE)
predObj_dives <- predObj[predObj$locType == "p",]
# reproject into lat/long, because the crawl models have been built in a mercator pacific centered CRS
coord_points <- predObj_dives
coordinates(coord_points) =~ mu.x + mu.y
proj4string(coord_points) <- CRS("+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
predObj_dives[c("lon", "lat")] <- coordinates(spTransform(coord_points, CRS("+proj=longlat +datum=WGS84")))
# calculate the lon360
return(cbind(d, predObj_dives[c("mu.x", "mu.y", "lon", "lat")]))
})
#load("./Outputs/crawl_argos_df.RData")
######### MERGE DIVE DATASET with CRAWL-derived MOUVEMENT DATA
bhv_df <- plyr::ddply(bhv_df, ~segmentid, function(d){ # for each tag dive data
cr <- crawl_argos_df[crawl_argos_df$segmentid == d$segmentid[1], ] # select the corresponding crawl track
fl <- filtered_argos_df_crawl[filtered_argos_df_crawl$segmentid == d$segmentid[1], ] # select the corresponding filtered track
d[c("lq_closest_filt","dt_closest_filt", "dist_closest_filt", "rel.angle","speed")] <- NA # add empty columns
for (i in 1:nrow(d)){ # for each dive...
fl$dist_diff <- as.vector(spDists(x = as.matrix(d[i,c("lon", "lat")]), y = as.matrix(fl[c("lon", "lat")]), longlat = T)) # distance calculated in km
fl$dt_diff <- as.numeric(difftime(d$MidTime[i], fl$time, units = "hours"))
d[i, "dt_closest_filt"] <- min(abs(fl$dt_diff)) # select the filtered position closest in time to the dive, time in hours
d[i, "dist_closest_filt"] <- fl[which(abs(fl$dt_diff) == min(abs(fl$dt_diff))), "dist_diff"][1] # retrieve the distance from this filtered position to the dive location (predicted by crawl)
d[i, "lq_closest_filt"] <- fl[which(abs(fl$dt_diff) == min(abs(fl$dt_diff))), "lq"][1] # argos quality of closest filtered argos position
cr$diff <- as.numeric(difftime(d$MidTime[i], cr$time, units = "hours")) # look at the time diff between this dive and all positions recorded in crawl for that same tag
d[i, c("rel.angle","speed")] <- cr[abs(cr$diff) == min(abs(cr$diff)), c("rel.angle", "speed")] # select closest position recorded when dive occurred
}
return(d)
})
bhv_df$depth_bin <- cut(bhv_df$DepthMean, seq(0, 700, 50))
ggplot(bhv_df[bhv_df$What == "Dive" & bhv_df$depth_range == "deep" & bhv_df$DepthMean < 1000,], aes(x = lon, y = lat)) +
stat_contour(data = bathyNOAA_df, aes(x, y, z=z), binwidth = 500, color = "grey60", size = 0.2) +
geom_tile(data = bathyNOAA_df_shallow[bathyNOAA_df_shallow$z >= 0, ], aes(x, y), fill = "grey10") +
geom_jitter(aes(fill = -DepthMean, size = DepthMean), col="black", alpha=0.8, pch = 21, width=0.1) +
scale_fill_viridis(option = "magma", name = "Dive Depth (m)", direction = 1, begin = 0.2) +
xlab("Longitude") +
ylab("Latitude") +
coord_fixed(xlim = c(-50, -26), ylim = c(-55, -15), expand = F)
The error (is it just at beginning of the code, at 4 line):
Erro: unexpected symbol in:
" predObj <- crawl::crwPredict(object.crwFit = crawl_models_list[[d$segmentid1]], predTime = d$MidTime, speedEst=TRUE, flat=TRUE)
predObj_dives"
And this:
I tryed to see if have conflicts, but apparently dont have
> conflicts()
[1] "lines" "cividis" "inferno" "magma" "plasma" "viridis" "viridis.map"
[8] "summary" "days" "hours" "minutes" "origin" "seconds" "show"
[15] "years" "hour" "isoweek" "mday" "minute" "month" "quarter"
[22] "second" "wday" "week" "yday" "year" "coerce" "coerce"
[29] "plot" "show" "summary" "%>%" "%>%" "%>%" "between"
[36] "count" "first" "intersect" "last" "setdiff" "union" "%>%"
[43] "flatten" "map" "transpose" "%>%" "add_row" "as_data_frame" "as_tibble"
[50] "data_frame" "data_frame_" "frame_data" "glimpse" "lst" "lst_" "tbl_sum"
[57] "tibble" "tribble" "trunc_mat" "type_sum" "enexpr" "enexprs" "enquo"
[64] "enquos" "ensym" "ensyms" "expr" "quo" "quo_name" "quos"
[71] "sym" "syms" "vars" "filter" "lag" "lines" "plot"
[78] "as.raster" "data" "Arith" "coerce" "Compare" "initialize" "show"
[85] "as.difftime" "body<-" "date" "intersect" "kronecker" "merge" "Position"
[92] "setdiff" "setequal" "split" "subset" "summary" "union"
>
Someone know what's going on?
Thanks!

After many tries, I removed one part of the code, and everything worked well
# I removed: d= bhv_df[bhv_df$segmentid == bhv_df$segmentid[1],
bhv_df <- plyr::ddply(bhv_df, ~segmentid, function(d){
# predictions are made based on the mid time between start and end of the message
predObj <- crawl::crwPredict(object.crwFit = crawl_models_list[[d$segmentid[1]]], predTime = d$MidTime, speedEst=TRUE, flat=TRUE)
predObj_dives <- predObj[predObj$locType == "p",]
# reproject into lat/long, because the crawl models have been built in a mercator pacific centered CRS
coord_points <- predObj_dives
coordinates(coord_points) =~ mu.x + mu.y
proj4string(coord_points) <- CRS("+proj=merc +lon_0=0 +k=1 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
predObj_dives[c("lon", "lat")] <- coordinates(spTransform(coord_points, CRS("+proj=longlat +datum=WGS84")))
# calculate the lon360
return(cbind(d, predObj_dives[c("mu.x", "mu.y", "lon", "lat")]))
})
######### MERGE DIVE DATASET with CRAWL-derived MOUVEMENT DATA
bhv_df <- plyr::ddply(bhv_df, ~segmentid, function(d){ # for each tag dive data
cr <- crawl_argos_df[crawl_argos_df$segmentid == d$segmentid[1], ] # select the corresponding crawl track
fl <- filtered_argos_df_crawl[filtered_argos_df_crawl$segmentid == d$segmentid[1], ] # select the corresponding filtered track
d[c("lq_closest_filt","dt_closest_filt", "dist_closest_filt", "rel.angle","speed")] <- NA # add empty columns
for (i in 1:nrow(d)){ # for each dive...
fl$dist_diff <- as.vector(spDists(x = as.matrix(d[i,c("lon", "lat")]), y = as.matrix(fl[c("lon", "lat")]), longlat = T)) # distance calculated in km
fl$dt_diff <- as.numeric(difftime(d$MidTime[i], fl$time, units = "hours"))
d[i, "dt_closest_filt"] <- min(abs(fl$dt_diff)) # select the filtered position closest in time to the dive, time in hours
d[i, "dist_closest_filt"] <- fl[which(abs(fl$dt_diff) == min(abs(fl$dt_diff))), "dist_diff"][1] # retrieve the distance from this filtered position to the dive location (predicted by crawl)
d[i, "lq_closest_filt"] <- fl[which(abs(fl$dt_diff) == min(abs(fl$dt_diff))), "lq"][1] # argos quality of closest filtered argos position
cr$diff <- as.numeric(difftime(d$MidTime[i], cr$time, units = "hours")) # look at the time diff between this dive and all positions recorded in crawl for that same tag
d[i, c("rel.angle","speed")] <- cr[abs(cr$diff) == min(abs(cr$diff)), c("rel.angle", "speed")] # select closest position recorded when dive occurred
}
return(d)
}) #warnings probably due to RT being equal to NA at beginning and end of the track
Thank you

Related

using user interface, nearpoint/renderprint/verbatimTextOutput('print') to show part of variables on user click

Below I attach the code I use for the Shiny application. I am wondering how I can specifically use click="plot_click"/near-point/render print, to show part of data(). If you look at the attached image, you see that bunch of info has appeared see image, but I need to take "name", and winning times only.
#code------------------------------------
server<- function(input, output, session) {
#wrangling data:
gslam<- as.data.frame(gslam1)
gslam$tournament <- sapply(gslam$tournament, function(val)
{agrep(val, c('Australian Open', 'U.S.
Open','Wimbledon','FrenchOpen'),
value = TRUE)})
#pivot plot based on Tournament
reactive_data<-reactive({
req(input$sel_tournament)
df<- gslam %>% filter(tournament %in%
input$sel_tournament)%>%group_by(winner,tournament)%>%
mutate(winningNumber=n())
df<-df %>% arrange(winner)
})
#dynamic list
observe({
updateSelectInput(session,"sel_tournament", choices =
gslam$tournament, select="U.S. Open")
})
# create the plot
output$WinnerPlot <- renderPlot({
g<- ggplot(reactive_data(),aes( y =winner, x
=winningNumber),decreasing=FALSE) +
theme(legend.position=none")
g+geom_bar(stat="identity",width=0.5, fill="black")})
output$print = renderPrint({
nearPoints(
reactive_data(), # the plotting data
input$plot_click, # input variable to get the x/y
maxpoints = 1, # only show the single nearest point
threshold = 1000 # basically a search radius. set this big
# to show at least one point per click
)})
output$Top5Plot <-renderPlot({
g<- ggplot(reactive_data(),aes( y =winner, x
=winningNumber),decreasing=FALSE) + theme(legend.position =
"none")
g+geom_bar(stat="identity",width=0.5, fill="black")
})
}
ui <- navbarPage("Grand Slam Shiny Application", id="cc",
tabPanel("Winners' Rank",
fluidRow(titlePanel("Grand Slam
ShinyApp"),
sidebarPanel(
selectInput(inputId =
"sel_tournament",label="Choose Tournament:","Names"))),
plotOutput(("WinnerPlot"), click="plot_click"),
verbatimTextOutput('print')
),
tabPanel("Top 5 Winners' Performance",
plotOutput("Top5Plot"))
)
# Run the App
shinyApp(ui, server)
You may take help of dplyr::select in nearPoints.
output$print = renderPrint({
nearPoints(
dplyr::select(reactive_data(), winner, winningNumber),
input$plot_click,
maxpoints = 1,
threshold = 1000
)})

R: Selecting certain from a JSON file

I've imported a JSON file into R from ( http://eric.clst.org/wupl/Stuff/gz_2010_us_040_00_20m.json ) and I'm trying to select only counties in Kansas.
Right now I have all the data into one variable and I'm trying to make subdata of this that is just counties of Kansas. I'm not sure how to go about this.
What you have there is geoJson, which can be read directly by library(sf), to give you an sf object, which is also data.frame. Then you can use the usual data.frame subsetting operations
library(sf)
sf <- sf::read_sf("http://eric.clst.org/wupl/Stuff/gz_2010_us_040_00_20m.json")
sf[sf$NAME == "Kansas", ]
# Simple feature collection with 1 feature and 5 fields
# geometry type: MULTIPOLYGON
# dimension: XY
# bbox: xmin: -102.0517 ymin: 36.99308 xmax: -94.58993 ymax: 40.00316
# epsg (SRID): 4326
# proj4string: +proj=longlat +datum=WGS84 +no_defs
# GEO_ID STATE NAME LSAD CENSUSAREA geometry
# 30 0400000US20 20 Kansas 81758.72 MULTIPOLYGON(((-99.541116 3...
And seeing as you want the individual counties, you need to use the counties data set
sf_counties <- sf::read_sf("http://eric.clst.org/wupl/Stuff/gz_2010_us_050_00_500k.json")
sf_counties[sf_counties$STATE == 20, ]
To stay with a JSON workflow, can try jqr
library(jqr)
url <- 'http://eric.clst.org/wupl/Stuff/gz_2010_us_040_00_20m.json'
download.file(url, (f <- tempfile(fileext = ".json")))
res <- paste0(readLines(f), collapse = " ")
out <- jq(res, '.features[] | select(.properties.NAME == "Kansas")')
can map easily like
library(leaflet)
leaflet() %>%
addTiles() %>%
addGeoJSON(out) %>%
setView(-98, 38, 6)
library(rjson)
lst=fromJSON(file = 'http://eric.clst.org/wupl/Stuff/gz_2010_us_040_00_20m.json')
index = which(sapply(lapply(lst$features,"[[",'properties'),'[[','NAME')=='Kansas')
subdata = lst$features[[index]]

Find Jaccard distance of tweets and cluster in Kmeans

This is a follow up question to a problem I've been working on for a while. I have two questions. One regards an algorithm that works on two tweets, that I revised to measure 10 tweets. I'm wondering what my revision is measuring. I get result, but I want it to measure several tweet's jaccard distances, not just return one value. Since it's returning one value, I think it's just adding everything up. The other question is about my attempt to create a For Loop and assign clusters.
I'm trying to find the Jaccard distance between a dataset of tweets, then cluster those tweets with the Kmeans algorithm.
This is where I'm retrieving the data from:
http://www3.nd.edu/~dwang5/courses/spring15/assignments/A2/Tweets.json
What I have so far is this
install.packages("rjson")
library("rjson")
#download JSON File and put into a dataframe
download.file("http://www3.nd.edu/~dwang5/courses/spring15/assignments/A2/Tweets.json", tf<-tempfile());library(jsonlite);json_alldata <- fromJSON(sprintf("[%s]", paste(readLines(file(tf)),collapse=",")))
# get rid of geo column
tweet.features = json_alldata
tweet.features$geo <- NULL
# *Works. Compares two tweets and measures Jaccard Distance
tweetText <- list(tweet1 = tweet.features$text[1]:tweet.features$text[2])
jaccard_i <- function(tw1, tw2){
tw1 <- unlist(strsplit(tw1, " |\\."))
tw2 <- unlist(strsplit(tw2, " |\\."))
i <- length(intersect(tw1, tw2))
u <- length(union(tw1, tw2))
list(i=i, u=u, j=i/u)
}
jaccard_i(tweetText[[1]], tweetText[[2]])
All of that measures the jaccard distance of two specified tweets. Which is great.
But now I'm trying to modify to compare the distances between several tweets. This time 10 random tweets which I retrieved from the Sample command in R.
# Generates two sets of 5 random tweets
tweetText <- list(sample(tweet.features$text, replace = FALSE, size = 5), sample(tweet.features$text, replace = FALSE, size = 5))
jaccard_i <- function(tw1, tw2){
tw1 <- unlist(strsplit(tw1, " |\\."))
tw2 <- unlist(strsplit(tw2, " |\\."))
i <- length(intersect(tw1, tw2))
u <- length(union(tw1, tw2))
list(i=i, u=u, j=i/u)
}
jaccard_i(tweetText[[1]], tweetText[[2]])
This gives me results, but it can't be correct.
I'm trying to build an algorithm that can measure all the tweets, compare their jaccard distance, then cluster based on the Jaccard distance with Kmeans.
So for another attempt, I thought to make a For Loop.
I decided to make 10 cluster centers with the 10 random tweets
c <- sample(tweet.features$text, replace = FALSE, size = 10)
Now I did a For Loop hoping to measure the tweets which I figured I could assign to an array and cluster
#Algorithm attempt
for(i in tweet.features$text){
for (j in c){
i <- length(intersect(i, j))
u <- length(union(i, j))
j = i/u
}
#assign(my.array)
}
I don't believe that's doing anything useful, but it's an attempt to create a loop to measure the Jaccard distance.
I'm sorry that this is a loaded question. Any help would be appreciated as I'm a bit lost.
In your first function, you are doing unlist of your lists of words in your tweets, thus in tw1 and tw2 have the global lists of words and you cannot use them for your tweet-by-tweet Jaccard. You could accomplish it with a removing the unlist, then tw1 and tw2 are lists of lists of terms, and you can compare them using mapply. Something as follows.
jaccard_i <- function(tw1, tw2){
tw1 <- strsplit(tw1, " |\\.")
tw2 <- strsplit(tw2, " |\\.")
i <- mapply(function(tw1, tw2) {
length(intersect(tw1, tw2))
}, tw1=tw1, tw2=tw2)
u <- mapply(function(tw1, tw2) {
length(union(tw1, tw2))
}, tw1=tw1, tw2=tw2)
list(i=i, u=u, j=i/u)
}
Silly example:
> tw1 = c("we yes you no", "we are the people")
> tw2= c("we are the people", "we yes you no")
> tweetText = list(tw1, tw2)
> jaccard_i(tweetText[[1]], tweetText[[2]])
$i
[1] 1 1
$u
[1] 7 7
$j
[1] 0.1428571 0.1428571
As for the second part of your question, the double loop, an easy way to start addressing it would be like this,
tw = c("we yes you no", "we are the people")
lapply(tw, function(tweet1) {
lapply(tw, function(tweet2) {
jaccard_i(tweet1, tweet2)
})
})
With results that look like this,
[[1]]
[[1]][[1]]
[[1]][[1]]$i
[1] 4
[[1]][[1]]$u
[1] 4
[[1]][[1]]$j
[1] 1
[[1]][[2]]
[[1]][[2]]$i
[1] 1
[[1]][[2]]$u
[1] 7
[[1]][[2]]$j
[1] 0.1428571
[[2]]
[[2]][[1]]
[[2]][[1]]$i
[1] 1
[[2]][[1]]$u
[1] 7
[[2]][[1]]$j
[1] 0.1428571
[[2]][[2]]
[[2]][[2]]$i
[1] 4
[[2]][[2]]$u
[1] 4
[[2]][[2]]$j
[1] 1
where you should skip the values in the diagonals of course -just a starting point as I said.
Hope it helps.

Unable to convert JSON to dataframe

I want to convert a json-file into a dataframe in R. With the following code:
link <- 'https://www.dropbox.com/s/ckfn1fpkcix1ccu/bevingenbag.json'
document <- fromJSON(file = link, method = 'C')
bev <- do.call("cbind", document)
i'm getting this:
type features
1 FeatureCollection list(type = "Feature", geometry = list(type = "Point", coordinates = c(6.54800000288927, 52.9920000044505)), properties = list(gid = "1496600", yymmdd = "19861226", lat = "52.992", lon = "6.548", mag = "2.8", depth = "1.0", knmilocatie = "Assen", baglocatie = "Assen", tijd = "74751"))
which is the first row of a matrix. All the other rows have the same structure. I'm interested in the properties = list(gid = "1496600", yymmdd = "19861226", lat = "52.992", lon = "6.548", mag = "2.8", depth = "1.0", knmilocatie = "Assen", baglocatie = "Assen", tijd = "74751") part, which should be converted into a dataframe with the columns gid, yymmdd, lat, lon, mag, depth, knmilocatie, baglocatie, tijd.
I searched for and tryed several solutions but none of them worked. I used the rjson package for this. I also tryed the RJSONIO & jsonlite package, but was unable to extract the desired information.
Anyone an idea how to solve this problem?
Here's a way to obtain the data frame:
library(rjson)
document <- fromJSON(file = "bevingenbag.json", method = 'C')
dat <- do.call(rbind, lapply(document$features,
function(x) data.frame(x$properties)))
Edit: How to replace empty values with NA:
dat$baglocatie[dat$baglocatie == ""] <- NA
The result:
head(dat)
gid yymmdd lat lon mag depth knmilocatie baglocatie tijd
1 1496600 19861226 52.992 6.548 2.8 1.0 Assen Assen 74751
2 1496601 19871214 52.928 6.552 2.5 1.5 Hooghalen Hooghalen 204951
3 1496602 19891201 52.529 4.971 2.7 1.2 Purmerend Kwadijk 200914
4 1496603 19910215 52.771 6.914 2.2 3.0 Emmen Emmen 21116
5 1496604 19910425 52.952 6.575 2.6 3.0 Geelbroek Ekehaar 102631
6 1496605 19910808 52.965 6.573 2.7 3.0 Eleveld Assen 40114
This is just another, quite similar, approach.
#SvenHohenstein's approach creates a dataframe at each step, an expensive process. It's much faster to create vectors and re-type the whole result at the end. Also, Sven's approach makes each column a factor, which might or might not be what you want. The approach below runs about 200 times faster. This can be important if you intend to do this repeatedly. Finally, you will need to convert columns lon, lat, mag, and depth to numeric.
library(microbenchmark)
library(rjson)
document <- fromJSON(file = "bevingenbag.json", method = 'C')
json2df.1 <- function(json){ # #SvenHohenstein approach
df <- do.call(rbind, lapply(json$features,
function(x) data.frame(x$properties, stringsAsFactors=F)))
return(df)
}
json2df.2 <- function(json){
df <- do.call(rbind,lapply(json[["features"]],function(x){c(x$properties)}))
df <- data.frame(apply(result,2,as.character), stringsAsFactors=F)
return(df)
}
microbenchmark(x<-json2df.1(document), y<-json2df.2(document), times=10)
# Unit: milliseconds
# expr min lq median uq max neval
# x <- json2df.1(document) 2304.34378 2654.95927 2822.73224 2977.75666 3227.30996 10
# y <- json2df.2(document) 13.44385 15.27091 16.78201 18.53474 19.70797 10
identical(x,y)
# [1] TRUE

R: calling rq() within a function and defining the linear predictor

I am trying to call rq() of the package quantreg within a function. Herebelow is a simplified explanation of my problem.
If I follow the recommendations found at
http://developer.r-project.org/model-fitting-functions.txt, I have a design matrix after the line
x <- model.matrix(mt, mf, contrasts)
with the first column full of 1's to create an intercept.
Now, when I call rq(), I am obliged to use something like
fit <- rq (y ~ x [,2], tau = 0.5, ...)
My problem happens if there is more than 1 explanatory variable. I don't know how to find an automatic way to write:
x [,2] + x [,3] + x [,4] + ...
Here is the complete simplified code:
ao_qr <- function (formula, data, method = "br",...) {
cl <- match.call ()
## keep only the arguments which should go into the model
## frame
mf <- match.call (expand.dots = FALSE)
m <- match (c ("formula", "data"), names (mf), 0)
mf <- mf[c (1, m)]
mf$drop.unused.levels <- TRUE
mf[[1]] <- as.name ("model.frame")
mf <- eval.parent (mf)
if (method == "model.frame") return (mf)
## allow model.frame to update the terms object before
## saving it
mt <- attr (mf, "terms")
y <- model.response (mf, "numeric")
x <- model.matrix (mt, mf, contrasts)
## proceed with the quantile regression
fit <- rq (y ~ x[,2], tau = 0.5, ...)
print (summary (fit, se = "boot", R = 100))
}
I call the function with:
ao_qr(pain ~ treatment + extra, data = data.subset)
And here is how to get the data:
require (lqmm)
data(labor)
data <- labor
data.subset <- subset (data, time == 90)
data.subset$extra <- rnorm (65)
In this case, with this code, my linear predictor only includes "treatment". If I want "extra", I have to manually add x[,3] in the linear predictor of rq() in the code. This is not automatic and will not work on other datasets with unknown number of variables.
Does anyone know how to tackle this ?
Any help would be greatly appreciated !!!
I found a simple solution:
x[,2:ncol(x)]