Substring in Data Frame R - json

I have data from GPS log like this : (this data in rows of data frame columns)
{"mAccuracy":20.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":21677339000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"wifi","noGPSLocation":{"mAccuracy":20.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":21677339000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811956,"mLongitude":126.9104909,"mProvider":"network","mSpeed":0.0,"mTime":1402801381486},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811956,"mLongitude":126.9104909,"mProvider":"network","mSpeed":0.0,"mTime":1402801381486,"timestamp":1402801665.512}
The problem is I only need Latitude and longitude value, so I think i can use substring and sappy for applying to all data in dataframe.
But I am not sure this way is handsome because when i use substring ex: substr("abcdef", 2, 4) so I need to count who many chars from beginning until "mLatitude" , so anybody can give suggestion the fast way to processing it?
Thank you to #mnel for answering question, it's work , but i still have problem
From mnel answer I've created function like this :
fgps <- function(x) {
out <- fromJSON(x)
c(out$mExtras$noGPSLocation$mLatitude,
out$mExtras$noGPSLocation$mLongitude)
}
and then this is my data :
gpsdata <- head(dfallgps[,4],2)
[1] "{\"mAccuracy\":23.128,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":76437488000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"wifi\",\"noGPSLocation\":{\"mAccuracy\":23.128,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":76437488000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1779956,\"mLongitude\":126.9089661,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894224187},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1779956,\"mLongitude\":126.9089661,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894224187,\"timestamp\":1402894517.425}"
[2] "{\"mAccuracy\":1625.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":77069916000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"cell\",\"noGPSLocation\":{\"mAccuracy\":1625.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":77069916000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811881,\"mLongitude\":126.9084072,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894857416},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811881,\"mLongitude\":126.9084072,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402894857416,\"timestamp\":1402894857.519}"
When run sapply why the data still shows in the result not just the results values.
sapply(gpsdata, function(gpsdata) fgps(gpsdata))
{"mAccuracy":23.128,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":76437488000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"wifi","noGPSLocation":{"mAccuracy":23.128,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":76437488000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1779956,"mLongitude":126.9089661,"mProvider":"network","mSpeed":0.0,"mTime":1402894224187},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1779956,"mLongitude":126.9089661,"mProvider":"network","mSpeed":0.0,"mTime":1402894224187,"timestamp":1402894517.425}
[1,] 35.178
[2,] 126.909
{"mAccuracy":1625.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":77069916000000,"mExtras":{"networkLocationSource":"cached","networkLocationType":"cell","noGPSLocation":{"mAccuracy":1625.0,"mAltitude":0.0,"mBearing":0.0,"mElapsedRealtimeNanos":77069916000000,"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811881,"mLongitude":126.9084072,"mProvider":"network","mSpeed":0.0,"mTime":1402894857416},"travelState":"stationary"},"mHasAccuracy":true,"mHasAltitude":false,"mHasBearing":false,"mHasSpeed":false,"mIsFromMockProvider":false,"mLatitude":35.1811881,"mLongitude":126.9084072,"mProvider":"network","mSpeed":0.0,"mTime":1402894857416,"timestamp":1402894857.519}
[1,] 35.18119
[2,] 126.90841
I want the result looks like :
[1] 35.178 126.909
[2] 35.18119 126.90841
Thank you

It would appear that your data is in JSON format. Therefore, use a RJSONIO::fromJSON to read the file.
E.g.:
txt <- "{\"mAccuracy\":20.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":21677339000000,\"mExtras\":{\"networkLocationSource\":\"cached\",\"networkLocationType\":\"wifi\",\"noGPSLocation\":{\"mAccuracy\":20.0,\"mAltitude\":0.0,\"mBearing\":0.0,\"mElapsedRealtimeNanos\":21677339000000,\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811956,\"mLongitude\":126.9104909,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402801381486},\"travelState\":\"stationary\"},\"mHasAccuracy\":true,\"mHasAltitude\":false,\"mHasBearing\":false,\"mHasSpeed\":false,\"mIsFromMockProvider\":false,\"mLatitude\":35.1811956,\"mLongitude\":126.9104909,\"mProvider\":\"network\",\"mSpeed\":0.0,\"mTime\":1402801381486,\"timestamp\":1402801665.512}"
Then process:
library(RJSONIO)
out <- fromJSON(txt)
out$$mLongitude
#[1] 126.9105
out$mLatitude
#[1] 35.1812
# to process multiple values
tt <- rep(txt,2)
myData <- lapply(tt, fromJSON)
latlong <- do.call(rbind,lapply(myData, `[` ,c('mLatitude','mLongitude')))
# or using rbind list
library(data.table)
latlong <- rbindlist(lapply(myData, `[` ,c('mLatitude','mLongitude')))

Related

rbind fromJSON page: duplicate rowname error

I was trying to rbind some json data scraped from api
library(jsonlite)
pop_dat <- data.frame()
for (i in 1:3) {
# Generate url for each page
url <- paste0('http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json&page=',i)
# Get json data from each page and transform it into dataframe
dat <- as.data.frame(fromJSON(url)[2],flatten = TRUE, row.names = NULL)
pop_dat <- rbind(pop_dat, dat)
}
However, it returns the following error:
Error in row.names<-.data.frame(*tmp*, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘1’, ‘10’, ‘11’, ‘12’, ‘13’, ‘14’, ‘15’, ‘16’, ‘17’, ‘18’, ‘19’, ‘2’, ‘20’, ‘21’, ‘22’, ‘23’, ‘24’, ‘25’, ‘26’, ‘27’, ‘28’, ‘29’, ‘3’, ‘30’, ‘31’, ‘32’, ‘33’, ‘34’, ‘35’, ‘36’, ‘37’, ‘38’, ‘39’, ‘4’, ‘40’, ‘41’, ‘42’, ‘43’, ‘44’, ‘45’, ‘46’, ‘47’, ‘48’, ‘49’, ‘5’, ‘50’, ‘6’, ‘7’, ‘8’, ‘9’
Changing the row.names to null doesn't work. I heard from someone it is due to the fact that some data are stored as lists here, which I don't quite understand.
I understand that there is an alternative package WDI to access this data and it works well, but I want to know how to resolve the duplicates row name problem here in general so that I can deal with similar situation where no alternative package is available.
I heard from someone it is due to the fact that some data are stored as lists...
This is correct. The solution is fairly simple, but I find it really easy to get tripped up by this. Right now you're using:
dat <- as.data.frame(fromJSON(url)[2],flatten = TRUE, row.names = NULL)
The problem comes from fromJSON(url)[2]. This should be fromJSON(url)[[2]] instead. According to the documentation, the key difference between [ and [[ is a single bracket can select multiple elements whereas [[ selects only one.
You can see how this works with some fake data.
foo <- list(
a = rnorm(100),
b = rnorm(100),
c = rnorm(100)
)
With [, you can select multiple values inside this list.
foo[c("a", "b")]
length(foo["a"]) # Result is 1 not 100 like you might expect.
With [[ the results are different.
foo[[c("a", "b")]] # Raises a subscript error.
foo[["a"]] #This works.
length(foo[["a"]]) # Result is 100.
So, your answer will depend on which subset operator you're using. For your problem, you'll want to use [[ to select a single data.frame inside of the list. Then, you should be able to use rbind correctly.
final <- data.frame()
for (i in 1:10) {
url <- paste0(
'http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json&page=',
i
)
res <- jsonlite::fromJSON(url, flatten = TRUE)[[2]]
final <- rbind(final, res)
}
Alternative solution with lapply:
urls <- sprintf(
'http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json&page=%s',
1:10
)
resl <- lapply(urls, jsonlite::fromJSON, flatten = TRUE)
resl <- lapply(resl, "[[", 2) # Use lapply to select the 2 element from each list element.
resl <- do.call(rbind, resl) # This takes all the elements of the list and uses those elements as the arguments for rbind.

Convert JSON into CSV in R programming

I have JSON of the form:
{"abc":
{
"123":[45600],
"378":[78689],
"343":[23456]
}
}
I need to convert above format JSON to CSV file in R.
CSV format :
ds y
123 45600
378 78689
343 23456
I'm using R library rjson to do so. I'm doing something like this:
jsonFile <- fromJSON(file=fileName)
json_data_frame <- as.data.frame(jsonFile)
but it's not doing the way I need it.
You can use jsonlite::fromJSON to read the data into a list, though you'll need to pull it apart to assemble it into a data.frame:
abc <- jsonlite::fromJSON('{"abc":
{
"123":[45600],
"378":[78689],
"343":[23456]
}
}')
abc <- data.frame(ds = names(abc[[1]]),
y = unlist(abc[[1]]), stringsAsFactors = FALSE)
abc
#> ds y
#> 123 123 45600
#> 378 378 78689
#> 343 343 23456
I believe you got the json file reader - fromJSON function right.
df <- data.frame( do.call(rbind, rjson::fromJSON( '{"a":true, "b":false, "c":null}' )) )
The code below gets me Google's Location History (json) archive from https://takeout.google.com. This is if you have enabled a 'Timeline' (location tracking) in Google Maps on your cell. Credit to http://rpubs.com/jsmanij/131030 for the original code. Note that json files like this can be quite large and plyr::llply is so much more efficient than lapply in parsing a list. Data.table gives me the more efficient 'rbindlist' to take the list to a data.table. Google logs between 350 to 800 GPS calls each day for me! A multi-year location history is converted to quite a sizeable list by 'fromJSON':
format(object.size(doc1),units="MB")
[1] "962.5 Mb"
I found 'do.call(rbind..)' un-optimized. The timestamp, lat, and long needed some work to be useful to Google Earth Pro, but I am getting carried away. At the end, I use 'write.csv' to take a data.table to CSV. That is all the original OP wanted here.
ts lat long latitude longitude
1: 1416680531900 487716717 -1224893214 48.77167 -122.4893
2: 1416680591911 487716757 -1224892938 48.77168 -122.4893
3: 1416680668812 487716933 -1224893231 48.77169 -122.4893
4: 1416680728947 487716468 -1224893275 48.77165 -122.4893
5: 1416680791884 487716554 -1224893232 48.77166 -122.4893
library(data.table)
library(rjson)
library(plyr)
doc1 <- fromJSON(file="LocationHistory.json", method="C")
object.size(doc1)
timestamp <- function(x) {as.list(x$timestampMs)}
timestamps <- as.list(plyr::llply(doc1$locations,timestamp))
timestamps <- rbindlist(timestamps)
latitude <- function(x) {as.list(x$latitudeE7)}
latitudes <- as.list(plyr::llply(doc1$locations,latitude))
latitudes <- rbindlist(latitudes)
longitude <- function(x) {as.list(x$longitudeE7)}
longitudes <- as.list(plyr::llply(doc1$locations,longitude))
longitudes <- rbindlist(longitudes)
datageoms <- setnames(cbind(timestamps,latitudes,longitudes),c("ts","lat","long")) [order(ts)]
write.csv(datageoms,"datageoms.csv",row.names=FALSE)

json parsing in r

I am loading a heavily nested JSON in R -- the seed data on league of legends games. Thanks to another question I was able to open and get a flat data frame (100 x 14167).
library(json)
library(plyr)
data.json <- fromJSON(file = "data/matches1.json")
data.unlist <- lapply(data.json$matches, unlist)
funct <- function(x){
do.call("data.frame", as.list(x))
}
data.match <- rbind.fill(lapply(data.unlist, funct)) # takes ~15 min
data.frame <- as.data.frame(data.match)
However, most columns have the wrong type, and I run into anomalies when converting. Is there a way of converting the columns automatically to characters/factors or numerics? Or is this wishful thinking? :)

Extract JSON data from the rows of an R data frame

I have a data frame where the values of column Parameters are Json data:
# Parameters
#1 {"a":0,"b":[10.2,11.5,22.1]}
#2 {"a":3,"b":[4.0,6.2,-3.3]}
...
I want to extract the parameters of each row and append them to the data frame as columns A, B1, B2 and B3.
How can I do it?
I would rather use dplyr if it is possible and efficient.
In your example data, each row contains a json object. This format is called jsonlines aka ndjson, and the jsonlite package has a special function stream_in to parse such data into a data frame:
# Example data
mydata <- data.frame(parameters = c(
'{"a":0,"b":[10.2,11.5,22.1]}',
'{"a":3,"b":[4.0,6.2,-3.3]}'
), stringsAsFactors = FALSE)
# Parse json lines
res <- jsonlite::stream_in(textConnection(mydata$parameters))
# Extract columns
a <- res$a
b1 <- sapply(res$b, "[", 1)
b2 <- sapply(res$b, "[", 2)
b3 <- sapply(res$b, "[", 3)
In your example, the json structure is fairly simple so the other suggestions work as well, but this solution will generalize to more complex json structures.
I actually had a similar problem where I had multiple variables in a data frame which were JSON objects and a lot of them were NA's, but I did not want to remove the rows where NA's existed. I wrote a function which is passed a data frame, id within the data frame(usually a record ID), and the variable name in quotes to parse. The function will create two subsets, one for records which contain JSON objects and another to keep track of NA value records for the same variable then it joins those data frames and joins their combination to the original data frame thereby replacing the former variable. Perhaps it will help you or someone else as it has worked for me in a few cases now. I also haven't really cleaned it up too much so I apologize if my variable names are a bit confusing as well as this was a very ad-hoc function I wrote for work. I also should state that I did use another poster's idea for replacing the former variable with the new variables created from the JSON object. You can find that here : Add (insert) a column between two columns in a data.frame
One last note: there is a package called tidyjson which would've had a simpler solution but apparently cannot work with list type JSON objects. At least that's my interpretation.
library(jsonlite)
library(stringr)
library(dplyr)
parse_var <- function(df,id, var) {
m <- df[,var]
p <- m[-which(is.na(m))]
n <- df[,id]
key <- n[-which(is.na(df[,var]))]
#create df for rows which are NA
key_na <- n[which(is.na(df[,var]))]
q <- m[which(is.na(m))]
parse_df_na <- data.frame(key_na,q,stringsAsFactors = FALSE)
#Parse JSON values and bind them together into a dataframe.
p <- lapply(p,function(x){
fromJSON(x) %>% data.frame(stringsAsFactors = FALSE)}) %>% bind_rows()
#bind the record id's of the JSON values to the above JSON parsed dataframe and name the columns appropriately.
parse_df <- data.frame(key,p,stringsAsFactors = FALSE)
## The new variables begin with a capital 'x' so I replace those with my former variables name
n <- names(parse_df) %>% str_replace('X',paste(var,".",sep = ""))
n <- n[2:length(n)]
colnames(parse_df) <- c(id,n)
#join the dataframe for NA JSON values and the dataframe containing parsed JSON values, then remove the NA column,q.
parse_df <- merge(parse_df,parse_df_na,by.x = id,by.y = 'key_na',all = TRUE)
#Remove the new column formed by the NA values#
parse_df <- parse_df[,-which(names(parse_df) =='q')]
####Replace variable that is being parsed in dataframe with the new parsed and names values.######
new_df <- data.frame(append(df,parse_df[,-which(names(parse_df) == id)],after = which(names(df) == var)),stringsAsFactors = FALSE)
new_df <- new_df[,-which(names(new_df) == var)]
return(new_df)
}

convert json into data frame in R?

I need to convert json file into a data frame. Each line in the json file, may have different number of entries. For example
{"timestamp":"2016-12-13T04:04:06.394-0500",
"test101":"2016-12-13T04:04:06.382-0500",
"error":"false","from":"xon","event":"DAT","BT":"work","cd":"E","id":"IBM",
"key":"20161213040330617511","begin_work":"2016-12-13T04:04:06.383-0500"","#version:"1","#timestamp":"2016-12-14T20:04:29.502Z"}
{"timestamp":"2016-12-13T04:04:05.318-0500","test101":"2016-12-13T04:03:46.074-0500","error":"false","from":"de","event":"cp","BT":"work","cd":"dsh","id":"appl",
"key":"142314089",
"begin_work":"2016-12-13T04:03:46.074-0500",
"refresh":"2016-12-13T04:03:45.920-0500",
"co_refresh":"2016-12-13T04:03:45.769-0500",
"test104":"2016-12-13T04:03:45.832-0500",
"test104":"2016-12-13T04:03:45.832-0500",
"test105":"2016-12-13T04:03:46.031-0500",
"test7":"2016-12-13T04:03:46.032-0500",
"t-test9":"2016-12-13T04:03:45.704-0500",
"test10_StartDateTimeStamp":"2016-12-13T04:03:45.704-0500",
"stop":"2016-12-13T04:03:50.772-0500",
"stop_again":"2016-12-13T04:03:46.091-0500",
"#version":"1","#timestamp":"2016-12-14T20:04:29.503Z"}
{"timestamp":"2016-12-13T04:04:07.113-0500","test101":"2016-12-13T04:04:07.068-0500","error":"false","from":"xon","event":"DAT","BT":"work","cd":"E","id":"3YPS","key":"20161213040318326935","begin_work":"2016-12-13T04:04:07.069-0500","#version":"1","#timestamp":"2016-12-14T20:04:29.505Z"}
I need to start parsing the file form a keyword called "key" until a keyword called #version.
Data frame need to look something like this:
key group time
20161213040330617511 begin_work 2016-12-13T04:04:06.383-0500
142314089 begin_work 2016-12-13T04:03:46.074-0500
142314089 refresh 2016-12-13T04:03:45.920-0500
142314089 co_refresh 2016-12-13T04:03:45.769-0500
142314089 test104 2016-12-13T04:03:45.832-0500
etc
I have tried something like this:
library(jsonlite)
library(data.table)
setwd("C:/file/")
filenames <- list.files("system", pattern="*json*", full.names=TRUE)
dflist <- lapply(filenames, function(i) {
jsonlite::fromJSON(
paste0("[",
paste0(readLines(i),collapse=","),
"]"),flatten=TRUE
)
})
d<-rbindlist(dflist, use.names=TRUE, fill=TRUE)
I need to put key value pairs into a 3 column data frame
I am getting field names after key as columns and NA as the values. Any ideas how could I convert json to df frame in R?
This is something you can try, a combination of dplyr and tidyr :
library(dplyr)
library(tidyr)
library(jsonlite)
data <- jsonlite::fromJSON("data.json")
lapply(data, function(d) as_data_frame(d)) %>%
bind_rows() %>%
gather(groups, val, -timestamp, -key) %>%
select(key, group, timestamp)
BTW I had to change your json example a little bit.
Here's the json file I use:
{"x":{"timestamp":"2016-12-13T04:04:06.394-0500",
"test101":"2016-12-13T04:04:06.382-0500",
"error":"false","from":"xon","event":"DAT","BT":"work","cd":"E","id":"IBM",
"key":"20161213040330617511","begin_work":"2016-12-13T04:04:06.383-0500","#version":"1","#timestamp":"2016-12-14T20:04:29.502Z"},
"y":{"timestamp":"2016-12-13T04:04:05.318-0500","test101":"2016-12-13T04:03:46.074-0500","error":"false","from":"de","event":"cp","BT":"work","cd":"dsh","id":"appl",
"key":"142314089",
"begin_work":"2016-12-13T04:03:46.074-0500",
"refresh":"2016-12-13T04:03:45.920-0500",
"co_refresh":"2016-12-13T04:03:45.769-0500",
"test104":"2016-12-13T04:03:45.832-0500",
"test105":"2016-12-13T04:03:46.031-0500",
"test7":"2016-12-13T04:03:46.032-0500",
"t-test9":"2016-12-13T04:03:45.704-0500",
"test10_StartDateTimeStamp":"2016-12-13T04:03:45.704-0500",
"stop":"2016-12-13T04:03:50.772-0500",
"stop_again":"2016-12-13T04:03:46.091-0500",
"#version":"1","#timestamp":"2016-12-14T20:04:29.503Z"},
"z":{"timestamp":"2016-12-13T04:04:07.113-0500","test101":"2016-12-13T04:04:07.068-0500","error":"false","from":"xon","event":"DAT","BT":"work","cd":"E","id":"3YPS","key":"20161213040318326935","begin_work":"2016-12-13T04:04:07.069-0500","#version":"1","#timestamp":"2016-12-14T20:04:29.505Z"}}