Related
There are a bunch of files in a directory, each containing one JSON-formatted entry per line. The size of the files varies from 5k to 200MB. I have this code to go through each file, parse the data I am looking for in the JSON, and finally form a data frame. This script is taking a very long time to finish; in fact, it never finishes.
Is there any way to speed it up so that I can read the files faster?
Code:
library(jsonlite)
library(data.table)
setwd("C:/Files/")
#data <- lapply(readLines("test.txt"), fromJSON)
# Empty accumulator; each file's rows are appended to it inside the loop below.
df<-data.frame(Timestamp=factor(),Source=factor(),Host=factor(),Status=factor())
# Full paths of the files under "Json_files" whose names match the pattern.
# NOTE(review): list.files() treats `pattern` as a regular expression, not a
# glob, so "*.txt" may match more than names ending in ".txt" - confirm intent.
filenames <- list.files("Json_files", pattern="*.txt", full.names=TRUE)
for(i in filenames){
# Show which file is being processed.
print(i)
# Parse every line of the file as its own JSON document (one fromJSON call per line).
data <- lapply(readLines(i), fromJSON)
# Build a one-row data frame from each entry's payloadData fields, then stack
# all of those one-row frames with a single rbind call.
myDf <- do.call("rbind", lapply(data, function(d) {
data.frame(TimeStamp = d$payloadData$timestamp,
Source = d$payloadData$source,
Host = d$payloadData$host,
Status = d$payloadData$status)}))
# Append this file's rows to the accumulator; df is rebuilt by rbind on every
# iteration, so it is copied again each time it grows.
df<-rbind(df,myDf)
}
This is a sample entry but there are thousands of entries like this in the file:
{"senderDateTimeStamp":"2016/04/08 10:53:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB01","servermember":"test"},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:54:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
{"senderDateTimeStamp":"2016/04/08 10:55:18","senderHost":null,"senderAppcode":"app","senderUsecase":"appinternalstats_prod","destinationTopic":"app_appinternalstats_realtimedata_topic","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460127623591,"payloadData":{"timestamp":"2016-04-08T10:53:18.169","status":"get","source":"STREAM","fund":"JVV","client":"","region":"","evetid":"","osareqid":"","basis":"","pricingdate":"","content":"","msgname":"","recipient":"","objid":"","idlreqno":"","host":"WEB02","servermember":""},"payloadDataText":"","key":"app:appinternalstats_prod","destinationTopicName":"app_appinternalstats_realtimedata_topic","hdfsPath":"app/appinternalstats_prod","esindex":"app","estype":"appinternalstats_prod","useCase":"appinternalstats_prod","appCode":"app"}
With your example data in "c:/tmp.txt":
> df <- jsonlite::fromJSON(paste0("[",paste0(readLines("c:/tmp.txt"),collapse=","),"]"))$payloadData[c("timestamp","source","host","status")]
> df
timestamp source host status
1 2016-04-08T10:53:18.169 STREAM WEB01 get
2 2016-04-08T10:53:18.169 STREAM WEB02 get
3 2016-04-08T10:53:18.169 STREAM WEB02 get
So to adapt your code to get a list of dataframes:
# One data frame per file: the file's lines are joined into a single JSON array
# so that the whole file is parsed by one fromJSON call.
dflist <- lapply(filenames, function(path) {
  wanted <- c("timestamp","source","host","status")
  json_array <- paste0("[", paste0(readLines(path), collapse=","), "]")
  records <- jsonlite::fromJSON(json_array)
  # Keep only the payload fields of interest.
  records$payloadData[wanted]
})
The idea is to transform your lines (from readLines) into a big json array and then create the dataframe by parsing it as json.
As lmo already showcased, using lapply on your filenames list provides you with a list of dataframes. If you really want only one dataframe at the end, you can load the data.table package and then use rbindlist on dflist to get a single dataframe.
Or if you're short in memory this thread may help you.
One speed-up is to replace your for loop with lapply and then drop the final rbind. The speed-up comes from R not having to repeatedly copy an increasingly large object, df, as it works through your "bunch" of files. The result is stored in a convenient list that you can either use as is or convert to a data.frame in one go:
# create processing function
getData <- function(i) {
  # Echo the path of the file being processed.
  print(i)
  # One parsed JSON document per line of the file.
  entries <- lapply(readLines(i), fromJSON)
  # Turn a single parsed entry into a one-row data frame of its payload fields.
  to_row <- function(entry) {
    payload <- entry$payloadData
    data.frame(TimeStamp = payload$timestamp,
               Source = payload$source,
               Host = payload$host,
               Status = payload$status)
  }
  # The assignment is the last expression, so the stacked data frame is the
  # function's (invisible) return value.
  stacked <- do.call("rbind", lapply(entries, to_row))
}
# lapply over files
myDataList <- lapply(filenames, getData)
I am trying to save about 300 HTML objects to disk using R.
# Page to download.
str_url <- "https://www.holidayhouses.co.nz/Browse/List.aspx?page=1"
# Fetch the page and parse it into an xml2 document.
read_html_test1 <- xml2::read_html(str_url)
# Write the parsed document to an HTML file on disk.
xml2::write_xml(read_html_test1, "testwrite.html")
# Parse the saved file back in.
read_html <- xml2::read_html("testwrite.html")
But this will eventually save about 300 separate files to disk. Ideally, what I would like is to save a single R object to disk that contains these 300 documents.
Converting each document to text before saving does not work for some reason. For example, the following will produce a weird (unhelpful) error:
# Convert the parsed document to a character string.
str_html <- as.character(read_html_test1)
# Try to parse that string back into a document.
xml2::read_html(str_html)
If I try to use the output of xml2::read_html() directly, it is a pointer to a C structure and therefore will not persist to disk.
Any suggestions for a hack to make this work...?
I managed it with the httr package, whose content function can take an as = "text" argument, which stops it from parsing the HTML.
library(xml2)
library(httr)
str_url <- "https://www.holidayhouses.co.nz/Browse/List.aspx?page=1"
# request the page with `GET`; `content(..., as = 'text')` hands back the body
# as an unparsed text string
x <- content(GET(url = str_url), as = 'text')
# a two-element list of html documents to save (the same page twice)
list_xs <- rep(list(x), 2)
# write the whole list to a single file with `saveRDS`
saveRDS(list_xs, file = 'test.rds')
Now to see if it works:
# read the saved list of html text strings back from the rds file
saved_html <- readRDS('test.rds')
# parse the second element of that list with `xml2::read_html`
saved_x_parsed <- read_html(saved_html[[2]])
# print the parsed document to check that it survived the round trip
saved_x_parsed
# {xml_document}
# <html>
# [1] <head><title>\n\tNew Zealand holiday homes, baches and vacation homes for rent.\ ...
# [2] <body id="ctl00_Body" class="Page-List">\n <div class="SatNavBarPlaceholder"/> ...
How to save R objects to disk:
Save R Objects
I took your example code and produced working, human readable, R-loadable output as follows:
str_url <- "https://www.holidayhouses.co.nz/Browse/List.aspx?page=1"
# Fetch and parse the page.
read_html_test1 <- xml2::read_html(str_url)
# Serialise the parsed document to a plain character string.
str_html <- as.character(read_html_test1)
# Round-trip check: the string can be parsed back into a document.
x <- xml2::read_html(str_html)
# Save the text, not `x`: a parsed document is a pointer to a C structure and
# is no longer valid once it has been written to disk and loaded again.
save(str_html, file="c:\\temp\\text.txt",compress=FALSE,ascii=TRUE)
# After load("c:\\temp\\text.txt"), rebuild the document with
# xml2::read_html(str_html).
I have a data frame where the values of column Parameters are Json data:
# Parameters
#1 {"a":0,"b":[10.2,11.5,22.1]}
#2 {"a":3,"b":[4.0,6.2,-3.3]}
...
I want to extract the parameters of each row and append them to the data frame as columns A, B1, B2 and B3.
How can I do it?
I would rather use dplyr if it is possible and efficient.
In your example data, each row contains a json object. This format is called jsonlines aka ndjson, and the jsonlite package has a special function stream_in to parse such data into a data frame:
# Example data
# Example data: one JSON object per row.
mydata <- data.frame(parameters = c(
  '{"a":0,"b":[10.2,11.5,22.1]}',
  '{"a":3,"b":[4.0,6.2,-3.3]}'
), stringsAsFactors = FALSE)
# Parse json lines. textConnection() returns an already-open connection, so it
# is closed explicitly here, whether or not parsing succeeds.
con <- textConnection(mydata$parameters)
res <- tryCatch(jsonlite::stream_in(con), finally = close(con))
# Extract columns. vapply() fixes the result type to one number per row, so a
# malformed `b` fails loudly instead of silently changing the result's shape.
a <- res$a
b1 <- vapply(res$b, "[", numeric(1), 1)
b2 <- vapply(res$b, "[", numeric(1), 2)
b3 <- vapply(res$b, "[", numeric(1), 3)
In your example, the json structure is fairly simple so the other suggestions work as well, but this solution will generalize to more complex json structures.
I actually had a similar problem where I had multiple variables in a data frame which were JSON objects and a lot of them were NA's, but I did not want to remove the rows where NA's existed. I wrote a function which is passed a data frame, id within the data frame(usually a record ID), and the variable name in quotes to parse. The function will create two subsets, one for records which contain JSON objects and another to keep track of NA value records for the same variable then it joins those data frames and joins their combination to the original data frame thereby replacing the former variable. Perhaps it will help you or someone else as it has worked for me in a few cases now. I also haven't really cleaned it up too much so I apologize if my variable names are a bit confusing as well as this was a very ad-hoc function I wrote for work. I also should state that I did use another poster's idea for replacing the former variable with the new variables created from the JSON object. You can find that here : Add (insert) a column between two columns in a data.frame
One last note: there is a package called tidyjson which would've had a simpler solution but apparently cannot work with list type JSON objects. At least that's my interpretation.
library(jsonlite)
library(stringr)
library(dplyr)
#' Expand a JSON-valued column of a data frame into one column per JSON field.
#'
#' Each non-NA value of `df[[var]]` is parsed with `fromJSON()` and turned into
#' a one-row data frame; those rows are stacked with `bind_rows()`. The new
#' columns are inserted at the position of `var`, and `var` itself is removed.
#' Rows whose `var` is NA get NA in every new column.
#'
#' @param df  A data frame.
#' @param id  Name (string) of the record-id column. It is only checked for
#'   existence: parsed values are aligned to `df` by row position, so ids do
#'   not need to be sorted or unique.
#' @param var Name (string) of the column holding the JSON text.
#' @return `df` with `var` replaced by the parsed columns, rows in their
#'   original order. If every value of `var` is NA, `var` is simply dropped.
parse_var <- function(df, id, var) {
  stopifnot(is.data.frame(df), id %in% names(df), var %in% names(df))

  values <- df[[var]]
  # Logical mask rather than `-which(is.na(.))`: the negative-index form selects
  # nothing at all when the column contains no NA.
  has_json <- !is.na(values)

  # Parse JSON values and bind them together into a data frame.
  parsed <- bind_rows(lapply(as.character(values[has_json]), function(x) {
    data.frame(fromJSON(x), stringsAsFactors = FALSE)
  }))
  if (nrow(parsed) != sum(has_json)) {
    stop("every JSON value in '", var, "' must parse to exactly one row",
         call. = FALSE)
  }

  # Auto-generated column names begin with a capital 'X'; swap that leading
  # 'X' (and only a leading one) for "<var>.".
  lead_x <- startsWith(names(parsed), "X")
  names(parsed)[lead_x] <- paste0(var, ".", substring(names(parsed)[lead_x], 2L))

  # Stretch each parsed column to nrow(df), leaving NA where `var` was NA.
  # Aligning by row position keeps parsed values on the rows they came from.
  row_of <- rep(NA_integer_, nrow(df))
  row_of[has_json] <- seq_len(nrow(parsed))
  expanded <- lapply(parsed, function(column) column[row_of])

  # Insert the new columns right after `var`, then drop `var` by position.
  # List-style `[` keeps the result a data frame even if one column remains.
  pos <- match(var, names(df))
  new_df <- data.frame(append(df, expanded, after = pos),
                       stringsAsFactors = FALSE)
  new_df[-pos]
}
I have data from GPS log like this : (this data in rows of data frame columns)
{"mAccuracy":20.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":21677339000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"wifi","noGPSLocation":{"mAccuracy":20.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":21677339000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811956,"mLongitude":126.9104909,"mProvider":"network","mSpeed":0.0,"mTime":1402801381486},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811956,"mLongitude":126.9104909,"mProvider":"network","mSpeed":0.0,"mTime":1402801381486,"timestamp":1402801665.512}
The problem is that I only need the latitude and longitude values, so I think I can use substring and sapply to apply it to all the data in the data frame.
But I am not sure this approach is elegant, because when I use substring, e.g. substr("abcdef", 2, 4), I need to count how many characters there are from the beginning up to "mLatitude". Can anybody suggest a fast way to process it?
Thank you to #mnel for answering question, it's work , but i still have problem
From mnel answer I've created function like this :
fgps <- function(x) {
  # Parse the JSON text and go straight to the mExtras$noGPSLocation entry.
  location <- fromJSON(x)$mExtras$noGPSLocation
  latitude <- location$mLatitude
  longitude <- location$mLongitude
  # Latitude first, then longitude.
  c(latitude, longitude)
}
and then this is my data :
gpsdata <- head(dfallgps[,4],2)
[1] "{\"mAccuracy\":23.128,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":76437488000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"wifi\",\"noGPSLocation\":{\"mAccuracy\":23.128,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":76437488000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1779956,\"mLongitude\":126.9089661,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894224187},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1779956,\"mLongitude\":126.9089661,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894224187,\"timestamp\":1402894517.425}"
[2] "{\"mAccuracy\":1625.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":77069916000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"cell\",\"noGPSLocation\":{\"mAccuracy\":1625.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":77069916000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811881,\"mLongitude\":126.9084072,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894857416},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811881,\"mLongitude\":126.9084072,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894857416,\"timestamp\":1402894857.519}"
When run sapply why the data still shows in the result not just the results values.
sapply(gpsdata, function(gpsdata) fgps(gpsdata))
{"mAccuracy":23.128,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":76437488000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"wifi","noGPSLocation":{"mAccuracy":23.128,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":76437488000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1779956,"mLongitude":126.9089661,"mProvider":"network","mSpeed":0.0,"mTime":1402894224187},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1779956,"mLongitude":126.9089661,"mProvider":"network","mSpeed":0.0,"mTime":1402894224187,"timestamp":1402894517.425}
[1,] 35.178
[2,] 126.909
{"mAccuracy":1625.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":77069916000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"cell","noGPSLocation":{"mAccuracy":1625.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":77069916000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811881,"mLongitude":126.9084072,"mProvider":"network","mSpeed":0.0,"mTime":1402894857416},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811881,"mLongitude":126.9084072,"mProvider":"network","mSpeed":0.0,"mTime":1402894857416,"timestamp":1402894857.519}
[1,] 35.18119
[2,] 126.90841
I want the result looks like :
[1] 35.178 126.909
[2] 35.18119 126.90841
Thank you
It would appear that your data is in JSON format. Therefore, use a RJSONIO::fromJSON to read the file.
E.g.:
txt <- "{\"mAccuracy\":20.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":21677339000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"wifi\",\"noGPSLocation\":{\"mAccuracy\":20.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":21677339000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811956,\"mLongitude\":126.9104909,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402801381486},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811956,\"mLongitude\":126.9104909,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402801381486,\"timestamp\":1402801665.512}"
Then process:
library(RJSONIO)
# Parse the JSON text held in `txt`.
out <- fromJSON(txt)
out$mLongitude
#[1] 126.9105
out$mLatitude
#[1] 35.1812
# to process multiple values
tt <- rep(txt,2)
myData <- lapply(tt, fromJSON)
# keep only the two coordinate fields of each record, then stack the records
latlong <- do.call(rbind,lapply(myData, `[` ,c('mLatitude','mLongitude')))
# or using rbindlist from data.table
library(data.table)
latlong <- rbindlist(lapply(myData, `[` ,c('mLatitude','mLongitude')))
I am trying to use R's rjson library to upload a 200MB JSON file into R , but I got the Cannot fit vector over 1KB error.
Here's the code I used to load the JSON file into R:
# Read a file, parse each of its lines with fromJSON, and return the parsed
# lines as a data frame.
# NOTE(review): the reshaping below appears to rely on sapply() simplifying
# its result, i.e. on every line parsing to the same set of fields - confirm.
UnpackJSON <- function(filePath)
{
  con <- file(filePath, "r")
  # Close the connection however the function exits, so a read or parse error
  # no longer leaves it open.
  on.exit(close(con), add = TRUE)
  input <- readLines(con, -1L)
  # jsonData <- fromJSON(paste(input, collapse=""))
  jsonData <- sapply(input, fromJSON)
  df <- data.frame(jsonData)
  # Remember the row names so they can be reapplied as column names after
  # the transpose.
  temp <- rownames(df)
  df <- as.data.frame(t(df))
  colnames(df) <- temp
  rownames(df) <- NULL
  df
}
Is there a way to optimize this code or another way to load such a large file into R? I appreciate any input.
Why would you process the JSON data line per line using sapply? Couldn't you just say fromJSON(input)? I suspect a problem could be that the garbage collector waits for the entire sapply loop to finish before deleting any copies that are made in the analysis process (see also the comments here). So when vectorization does not work (which is greatly preferred), you could try and change the sapply loop to a for loop and see if that helps.