Reading complex json data as dataframe in R - json
I have the following json data:
json_data <- data.frame(changedContent=c('{"documents":[],"images":[],"profileCommunications":[],"shortListedProfiles":[],"matrimonyUser":{"createdBy":null,"parentMatrimonyUserId":0,"userSalutationVal":"Mr.","matrimonyUserCode":"173773","matrimonyUserName":"SUDIPTO DEB BARMAN","emailAddress":"sudipto06#yahoo.com","contactNumber":"9434944429","emailOTP":"","mobilePhoneOTP":"","isEmailOTPVerified":1,"isMobilePhoneOTPverified":1,"isHideContact":null,"isHideEmail":null,"lastLogInTime":null,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133028,"isDeleted":null,"isActive":1,"isAllowedLogin":null,"numberOfChildProfile":null,"matrimonyUserTypeId":100000006,"matrimonyUserTypeVal":"Online Customer","onlineStatusFlag":null,"lastSystemTransactionDateTime":null,"isLive":null,"mobileCountryCode":0,"userStatusIdValue":"Registered and Verified","crmUserStatusIdValue":null,"deactivateReasonIdValue":null,"deactivateReason":null,"matrimonyUserId":165614,"userSalutationId":100001617,"userStatusId":100002760,"crmUserStatusId":null,"deactivateReasonId":null,"createdOn":null},"aboutMes":[],"partnerPreference":{"isSubcastDealbreaker":null,"isOccupationDealbreaker":null,"isIndustryDealbreaker":null,"isIncomeDealbreaker":null,"isHeightDealbreaker":null,"isBodyTypeDealbreaker":null,"isHivDealbreaker":null,"isFamilyTypeDealbreaker":null,"isFamilyIncomeDealbreaker":null,"isDrinkingDealbreaker":null,"locationTypeIds":null,"isLocationTypeDealbreaker":null,"isLocationNameDealbreaker":null,"locationNameOthers":"","isMaritalStatusDealbreaker":null,"isSmokingDealbreaker":null,"isFoodHabitsDealbreaker":null,"isGothraDealbreaker":null,"isManglikDealbreaker":null,"isProfileCreatedbyDealbreaker":null,"religionIdsValues":"","casteIdsValues":null,"motherTongueIdsValues":"","minimumEducationValues":"","occupationIdsValues":"","industryIdsValues":"","bodyTypeIdsValues":"","hivIdValue":null,"familyTypeIdsValues":"","familyIncomeValues":"","drinkingIdValues":"","locationNameIdsValues":null,"mari
talStatusIdsValues":"","smokingIdsValues":"","foodHabitsIdsValues":"","gothraIdsValues":"","manglikIdValue":null,"profileCreatedbyValues":"","heightFrom":null,"heightTo":null,"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133115,"isDeleted":null,"isActive":1,"partnerPreferenceId":2757,"isReligionDealbreaker":null,"casteIds":null,"isCasteDealbreaker":null,"isMotherTongueDealbreaker":null,"subcaste":"","religionIds":null,"motherTongueIds":null,"minimumEducation":null,"occupationIds":null,"industryIds":null,"bodyTypeIds":null,"income":null,"incomeValues":"","familyIncome":null,"hivId":0,"familyTypeIds":null,"drinkingId":null,"locationNameIds":"","maritalStatusIds":null,"smokingIds":null,"foodHabitsIds":null,"gothraIds":null,"manglikId":0,"profileCreatedby":null,"adbCount":0,"fifCount":0,"ageFrom":null,"ageTo":null,"isAgeDealbreaker":null,"isminimumEducationDealbreaker":null,"userId":165614,"createdOn":1440167133115,"height":null},"profileAgentDtl":{"campaignId":"","acquirerCode":0,"createdBy":4444,"modifiedBy":4444,"modifiedOn":1440167133110,"isDeleted":null,"isActive":1,"relationshipMangerId":0,"sourceCode":100000004,"userId":165614,"createdOn":1440167133110,"idOdNo":"","relationshipMangerName":null,"relationshipMangerContact":"","profileAgentDtlId":2757,"dateOfEntry":1437935400000,"formSerialNo":"3661","sourceCodeVal":null,"agentCode":null,"acquirerCodeVal":null,"agentName":"","agentMobileNo":"","adBookingNo":""},"profileBasicRegistrationDtl":{"sourceId":null,"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133109,"isDeleted":null,"isActive":1,"genderId":100000596,"priorityId":100001671,"profileCreatedById":100000590,"webSourceId":100001672,"dob":null,"genderVal":"Male","userId":165614,"profileCompleteness":null,"createdOn":1440167133109,"profileDtlId":2757,"nickName":null,"relation":null,"regViewersCount":null,"guestViewersCount":null,"trustScore":20,"webSourceVal":"Newspaper 
","priorityVal":"Medium","profileCreatedByval":"Self","fieldContentModerationStatusId":null,"photoModerationStatusId":null,"documentModerationStatusId":null,"isPhotoHide":null,"isHoroscopeHide":null},"profileAstrologyDtl":{"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133111,"isDeleted":null,"isActive":1,"userId":165614,"createdOn":1440167133111,"profileAstrologyDtlId":2757,"gothraId":0,"gaanId":0,"nakshatraId":0,"sunSignId":0,"moonSignId":0,"manglikFlagId":0,"placeOfBirth":"0","timeOfBirth":null,"isPreferredPartnerDtl":null,"gothraVal":"","gaanVal":"","nakshatraVal":"","sunSignVal":"","moonSignVal":"","manglikFlagVal":""},"profileFamilyDtl":{"permanentAddress":null,"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133111,"isDeleted":null,"isActive":1,"familyIncome":0.0,"fathersStatusId":0,"mothersStatusId":0,"fathersOccupationId":0,"mothersOccupationId":0,"mothersIndustryId":null,"fathersIndustryId":null,"familyTypeId":0,"familyValueId":0,"familyKindId":0,"familyStatusId":0,"userId":165614,"createdOn":1440167133111,"moderatedOn":null,"profileFamilyDtlId":2757,"fathersName":"","fathersStatusVal":null,"motherName":"","mothersStatusVal":null,"numberOfSibling":0,"shortRefModerationStatus":null,"fathersOccupationVal":null,"mothersOccupationVal":null,"familyTypeVal":null,"familyValueVal":null,"familyKindVal":null,"familyStatusVal":null,"mothersIndustryVal":null,"fathersIndustryVal":null,"familyIncomeVal":"","moderatedBy":null,"moderatorRemarks":null,"ref1fullName":null,"ref1relationship":null,"ref1emailId":null,"ref1phoneNo":null,"ref1remarks":null,"ref2fullName":null,"ref2relationship":null,"ref2emailId":null,"ref2phoneNo":null,"ref2remarks":null},"profileLifestyleDtl":{"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133110,"isDeleted":null,"isActive":1,"favouriteBooksTypeIds":null,"favouriteHobbiesTypeIds":null,"favouriteMoviesTypeIds":null,"favouriteMusicTypeIds":null,"favouri
teSportsTypeIds":null,"livingInHouseTypeId":0,"vehicleTypeOwnedId":0,"petsId":0,"drinkingStatusId":0,"numberOfKids":0,"userId":165614,"createdOn":1440167133110,"moderatedOn":null,"moderatedBy":null,"isModerated":null,"moderatorRemarks":null,"profileLifestyleDtlId":2757,"smokingStatusId":0,"foodHabitsId":0,"financialPlansId":0,"retirementPlansId":0,"vehicleDescription":null,"vehicleNumber":0,"childrenDesiredId":null,"isReligionImportantFlagId":null,"religiousBeliefs":0,"smokingStatusVal":"","drinkingStatusVal":null,"foodHabitsVal":"","financialPlansVal":null,"retirementPlansVal":null,"vehicleTypeOwnedVal":null,"livingInHouseTypeVal":null,"petsVal":null,"childrenDesiredVal":null,"favouriteBooksTypeVals":"","favouriteMoviesTypeVals":"","favouriteMusicTypeVals":"","favouriteSportsTypeVals":"","favouriteHobbiesTypeVals":"","isReligionImportantFlagVal":null,"religiousBeliefsVal":"","favouriteHobbiesRating":null,"favouriteHobbiesDescription":null,"noOfKidsVal":null},"profileOccupationEducationDtl":{"highestSpecializationVal":null,"highestSpecializationOthersVal":"","createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133110,"isDeleted":null,"isActive":1,"highestEducationId":null,"occupationId":null,"designationId":null,"incomeCurrencyId":null,"education2id":0,"education3id":0,"specialization2id":0,"specialization3id":0,"highestSpecializationId":null,"industryId":null,"annualIncome":null,"userId":165614,"createdOn":1440167133110,"moderatedOn":null,"moderatedBy":null,"isModerated":null,"moderatorRemarks":null,"highestEducationVal":null,"occupationVal":null,"industryVal":null,"incomeCurrencyVal":null,"designationVal":null,"education3val":null,"education2val":null,"specialization2val":null,"specialization2othersVal":"","specialization3val":null,"specialization3othersVal":"","additionalQualification":null,"professionalQualification":null,"occupationOthersVal":"","departmentId":null,"employmentSectorId":null,"companyName":"","highestEducationInstituteVa
l":null,"education2instituteVal":"0","education3instituteVal":"","professionalQualificationVal":null,"departmentVal":null,"employmentSectorVal":null,"annualIncomeVal":null,"profileOccupationEducationDtlId":2757,"schoolName2":"","schoolName1":"","education2instituteId":null,"education3instituteId":null},"profilePersonalDtl":{"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133110,"isDeleted":null,"isActive":1,"familyOriginId":0,"stateId":null,"countryId":null,"numChildrenProspect":0,"countryVal":null,"stateVal":null,"landmark":null,"locationVal":null,"userId":165614,"locationId":null,"religionId":100000598,"createdOn":1440167133110,"isPreferredPartnerDtl":null,"maritalStatusId":null,"maritalStatusVal":null,"subCaste":"","profilePersonalDtlId":2757,"motherTongueId":100000618,"casteId":null,"marryOutsideCasteId":0,"familyOriginVal":null,"facebookHandle":"","linkedInHandle":"","twiterHandle":null,"googlePlus":null,"casteText":"Kshatriya","homeTownText":"0","religionVal":"Hindu","motherTongueVal":"Bengali","marryOutsideCasteVal":"","isSocialMediaVerified":null,"numChildrenProspectVal":null,"locality":null},"profilePhysicalAttributesDtl":{"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133110,"isDeleted":null,"isActive":1,"hivId":0,"bodyTypeId":0,"complexionId":0,"bloodGroupId":0,"userId":165614,"createdOn":1440167133110,"height":null,"isPreferredPartnerDtl":null,"hairColourId":0,"eyeColourId":0,"hairLengthId":0,"physicalStatusId":null,"disabilitiesVal":"","hivVal":"","knownAilmentVal":"","bodyTypeVal":null,"complexionVal":null,"hairColourVal":"","eyeColourVal":"","hairLengthVal":"","physicalStatusVal":null,"bloodGroupVal":null,"profilePhysicalAttributesDtlId":2757,"weight":null},"profileSiblingsDtl":null,"profileImageDtl":null,"notes":[{"createdBy":4444,"modifiedBy":4444,"modifiedOn":1440167133115,"isDeleted":null,"isActive":1,"userId":165614,"createdOn":1440167133115,"profileNotesDtlId":3499,"notesDescription"
:""}],"references":[],"relationOthers":[],"photoIdentificationDetails":null,"preModAboutMes":[{"answer":"null ","preModerationAboutMeId":1439283144614540579,"moderationStatus":1,"createdBy":4444,"questionVal":null,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133092,"isActive":1,"isAnswerChange":0,"userId":165614,"questionId":1,"createdOn":1440167133092},{"answer":"null ","preModerationAboutMeId":1439283144614540580,"moderationStatus":1,"createdBy":4444,"questionVal":null,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133093,"isActive":1,"isAnswerChange":0,"userId":165614,"questionId":2,"createdOn":1440167133093},{"answer":"null ","preModerationAboutMeId":1439283144614540581,"moderationStatus":1,"createdBy":4444,"questionVal":null,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133094,"isActive":1,"isAnswerChange":0,"userId":165614,"questionId":3,"createdOn":1440167133094},{"answer":"null ","preModerationAboutMeId":1439283144614540582,"moderationStatus":1,"createdBy":4444,"questionVal":null,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133094,"isActive":1,"isAnswerChange":0,"userId":165614,"questionId":4,"createdOn":1440167133094}],"preModContent":[{"preModerationContentId":1439307323336466240,"isChangeMatrimonyUserName":null,"isChangeLocality":0,"isChangeLandmark":0,"permanentAddress":"Dev Barman,Mayapur,PO-Talbagicha,Kharadpur-721306","isChangePermanentAddress":1,"nameOfInstitutionHighestEducation":"0","highestSpecializationVal":null,"highestSpecializationOthersVal":null,"createdBy":4444,"matrimonyUserName":"SUDIPTO DEB 
BARMAN","userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133104,"isDeleted":null,"isActive":1,"highestEducationId":0,"occupationId":0,"designationId":0,"incomeCurrencyId":null,"highestSpecializationId":0,"industryId":0,"annualIncome":0.0,"stateId":100000269,"countryId":100000101,"dob":520972200000,"countryVal":null,"stateVal":null,"landmark":"","userId":165614,"moderationStatusId":null,"createdOn":1440167133104,"isChangeNameOfInstitutionHighestEducation":0,"isChangeHighestSpecialization":0,"highestEducationVal":null,"isChangeHighestEducation":0,"occupationVal":null,"isChangeOccupation":0,"industryVal":"","isChangeIndustry":0,"incomeCurrencyVal":null,"isChangeIncomeCurrency":0,"customerTypeId":null,"customerTypeVal":null,"isChangeCustomerType":1,"isChangeDob":1,"maritalStatusId":100000900,"maritalStatusVal":"Never Married","isChangeMaritalStatus":1,"isChangeCountry":1,"isChangeState":1,"cityId":0,"isChangeCity":0,"cityVal":null,"isChangeAnnualIncome":0,"designationVal":null,"isChangeDesignation":0,"subCaste":null,"hometown":null,"isChangeSubCaste":null,"isChangeHometown":null,"ref1fullName":null,"isChangeRef1fullName":null,"ref1relationship":null,"isChangeRef1relationship":null,"ref1emailId":null,"isChangeRef1emailId":null,"ref1phoneNo":null,"isChangeRef1phoneNo":null,"ref1remarks":null,"isChangeRef1remarks":null,"ref2fullName":null,"isChangeRef2fullName":null,"ref2relationship":null,"isChangeRef2relationship":null,"ref2emailId":null,"isChangeRef2emailId":null,"ref2phoneNo":null,"isChangeRef2phoneNo":null,"ref2remarks":null,"isChangeRef2remarks":null,"typeOfCustomer":null,"isChangeTypeOfCustomer":null,"highestEducationInstituteId":null,"typeOfCustomerId":100000006,"locality":""}],"preModReferences":[],"preModShortReferences":[{"moderationStatus":null,"createdBy":4444,"userSessionDtlId":null,"modifiedBy":4444,"modifiedOn":1440167133099,"isDeleted":null,"isActive":1,"userId":165614,"createdOn":1440167133099,"isModerated":null,"premoderationprofileImageDtlI
d":1772,"ref1fullName":"","isChangeRef1fullName":0,"ref1relationship":"","isChangeRef1relationship":0,"ref1emailId":"","isChangeRef1emailId":0,"ref1phoneNo":null,"isChangeRef1phoneNo":0,"ref1remarks":null,"ref2fullName":"","isChangeRef2fullName":0,"ref2relationship":"","isChangeRef2relationship":0,"ref2emailId":"","isChangeRef2emailId":0,"ref2phoneNo":null,"isChangeRef2phoneNo":0,"ref2remarks":null}],"paymentTransactions":[],"userPlanMappings":[],"userFeatureMappings":[],"userPlanMapping":null,"blockedProfiles":[],"notMyTypeProfiles":[]}')
I want to convert the above into a convenient data frame with one row per matrimonyUserId. I have tried a few things but have been unable to get it into the desired format.
Assuming you can wrangle the json data into a nested list....
# Parse the JSON text with jsonlite into a nested R structure.
# NOTE(review): `jsontext` is not defined in this snippet; presumably it is the
# JSON string from the question (e.g. the `changedContent` value) - confirm.
x <- jsonlite::fromJSON(jsontext)
I've found it's easiest to parse complex list structures by using the pipe operator and frequently checking the structure (limited to 1 or 2 levels).
# Compactly display the structure of `x`, showing only the top level of
# nesting. Output is printed by str(); intended for interactive inspection.
str1 <- function(x) {
  str(x, max.level = 1)
}

# Same as str1(), but expands nested elements down to the second level.
str2 <- function(x) {
  str(x, max.level = 2)
}
# Load magrittr, which provides the %>% pipe operator used below.
library("magrittr")
# Top-level overview of the parsed object.
x %>% str1
# Drill into the first element and show two levels of its structure.
x %>% .[[1]] %>% str2
Etc.
Related
R highcharter get data from plots saved as html
I plot data with highcharter package in R, and save them as html to keep interactive features. In most cases I plot more than one graph, therefore bring them together as a canvas. require(highcharter) hc_list <- lapply(list(sin,cos,tan,tanh),mapply,seq(1,5,by = 0.1)) %>% lapply(function(x) highchart() %>% hc_add_series(x)) hc_grid <- hw_grid(hc_list,ncol = 2) htmltools::browsable(hc_grid) # print htmltools::save_html(hc_grid,"test_grid.html") # save I want to extract the data from plots that I have saved as html in the past, just like these. Normally I would do hc_list[[1]]$x$hc_opts$series, but when I import html into R and try to do the same, I get an error. It won't do the job. > hc_imported <- htmltools::includeHTML("test_grid.html") > hc_imported[[1]]$x$hc_opts$series Error in hc_imported$x : $ operator is invalid for atomic vectors If I would be able to write a function like get_my_data(my_imported_highcharter,3) # get data from 3rd plot it would be the best. Regards.
You can use below code require(highcharter) hc_list <- lapply(list(sin,cos,tan,tanh),mapply,seq(1,5,by = 0.1)) %>% lapply(function(x) highchart() %>% hc_add_series(x)) hc_grid <- hw_grid(hc_list,ncol = 2) htmltools::browsable(hc_grid) # print htmltools::save_html(hc_grid,"test_grid.html") # save # hc_imported <- htmltools::includeHTML("test_grid.html") # hc_imported[[1]]$x$hc_opts$series library(jsonlite) library(RCurl) library(XML) get_my_data<-function(my_imported_highcharter,n){ webpage <- readLines(my_imported_highcharter) pagetree <- htmlTreeParse(webpage, error=function(...){}) body <- pagetree$children$html$children$body divbodyContent <- body$children$div$children[[n]] script<-divbodyContent$children[[2]] data<-as.character(script$children[[1]])[6] data<-fromJSON(data,simplifyVector = FALSE) data<-data$x$hc_opts$series[[1]]$data return(data) } get_my_data("test_grid.html",3) get_my_data("test_grid.html",1)
Creating Dataframe from a json file
I want to create a proper data frame reading from a json file. I am able to view the created data frame properly, but dplyr function group_by does not work on it. It is probably because when I do the str() of the data frame created it gives every column as a list of strings as opposed to a vector of strings. I am trying the following: require(jsonlite) train_file = 'train.json' train_data <- fromJSON(train_file) rb = data.frame(sapply(train_data,c), stringsAsFactors = FALSE) rbs = rb %>% slice(1:10) rbsg = rbs %>% group_by(colname) This gives the following error: Error: cannot group column colname, of class 'list' Very specifically, the file that I am trying to read is the train.json file in this kaggle competition: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data
You need to unnest() the column of interest before operating on it (e.g. before using group_by() or other dplyr verbs): library(jsonlite) library(tidyverse) rbs <- fromJSON("train.json") %>% bind_rows() rbsg <- rbs %>% unnest(bedrooms) %>% group_by(bedrooms) rbs_filtered <- rbs %>% unnest(bathrooms) %>% filter(bathrooms > 5)
jsonlite's fromJSON is returning a list of 2 lists instead of a df
I'm following the FCC's documentation to download some metadata about proceedings. I don't believe I can post the data but you can get a free API key. My code results in a listed list of 2 lists instead of a structured df from the JSON format. My goal is to have a dataframe where each json element is its own column, like a normal df. library(httr) library(jsonlite) datahere = "C:/fcc/" setwd(datahere) URL <- "https://publicapi.fcc.gov/ecfs/filings?api_key=<KEY HERE>&proceedings.name=14-28&sort=date_disseminated,DESC" dataDF <- GET(URL) dataJSON <- content(dataDF, as="text") dataJSON <- fromJSON(dataJSON) # NAs dataJSON2 <- lapply(dataJSON, function(x) { x[sapply(x, is.null)] <- NA unlist(x) }) x <- do.call("rbind", dataJSON2) x <- as.data.frame(x)
The JSON is really deeply nested, so you need to put a little more thought into converting between list and data.frame. The logic below pulls out a data.frame of 25 filings (102 variables) and 10 aggregations (25 variables). # tackle the filings object filings_df <- ldply(dataJSON$filings, function(x) { # removes null list elements x[sapply(x, is.null)] <- NA # converts to a named character vector unlisted_x <- unlist(x) # converts the named character vector to data.frame # with 1 column and rows for each element d <- as.data.frame(unlisted_x) # we need to transpose this data.frame because # the rows should be columns, and don't check names when converting d <- as.data.frame(t(d), check.names=F) # now assign the actual names based on that original # unlisted character vector colnames(d) <- names(unlisted_x) # now return to ldply function, which will automatically stack them together return(d) }) # tackle the aggregations object # same exact logic to create the data.frame aggregations_df <- ldply(dataJSON$aggregations, function(x) { # removes null list elements x[sapply(x, is.null)] <- NA # converts to a named character vector unlisted_x <- unlist(x) # converts the named character vector to data.frame # with 1 column and rows for each element d <- as.data.frame(unlisted_x) # we need to transpose this data.frame because # the rows should be columns, and don't check names when converting d <- as.data.frame(t(d), check.names=F) # now assign the actual names based on that original # unlisted character vector colnames(d) <- names(unlisted_x) # now return to ldply function, which will automatically stack them together return(d) })
How to read nested JSON structure?
I have some JSON that looks like this: "total_rows":141,"offset":0,"rows":[ {"id":"1","key":"a","value":{"SP$Sale_Price":"240000","CONTRACTDATE$Contract_Date":"2006-10-26T05:00:00"}}, {"id":"2","key":"b","value":{"SP$Sale_Price":"2000000","CONTRACTDATE$Contract_Date":"2006-08-22T05:00:00"}}, {"id":"3","key":"c","value":{"SP$Sale_Price":"780000","CONTRACTDATE$Contract_Date":"2007-01-18T06:00:00"}}, ... In R, what would be the easiest way to produce a scatter-plot of SP$Sale_Price versus CONTRACTDATE$Contract_Date? I got this far: install.packages("rjson") library("rjson") json_file <- "http://localhost:5984/testdb/_design/sold/_view/sold?limit=100" json_data <- fromJSON(file=json_file) install.packages("plyr") library(plyr) asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame)) but now I'm stuck... > plot(CONTRACTDATE$Contract_Date, SP$Sale_Price) Error in plot(CONTRACTDATE$Contract_Date, SP$Sale_Price) : object 'CONTRACTDATE' not found How to make this work?
Suppose you have the following JSON-file: txt <- '{"total_rows":141,"offset":0,"rows":[ {"id":"1","key":"a","value":{"SP$Sale_Price":"240000","CONTRACTDATE$Contract_Date":"2006-10-26T05:00:00"}}, {"id":"2","key":"b","value":{"SP$Sale_Price":"2000000","CONTRACTDATE$Contract_Date":"2006-08-22T05:00:00"}}, {"id":"3","key":"c","value":{"SP$Sale_Price":"780000","CONTRACTDATE$Contract_Date":"2007-01-18T06:00:00"}}]}' Then you can read it as follows with the jsonlite package: library(jsonlite) json_data <- fromJSON(txt, flatten = TRUE) # get the needed dataframe dat <- json_data$rows # set convenient names for the columns # this step is optional, it just gives you nicer columnnames names(dat) <- c("id","key","sale_price","contract_date") # convert the 'contract_date' column to a datetime format dat$contract_date <- strptime(dat$contract_date, format="%Y-%m-%dT%H:%M:%S", tz="GMT") Now you can plot: plot(dat$contract_date, dat$sale_price) Which gives: If you choose not to flatten the JSON, you can do: json_data <- fromJSON(txt) dat <- json_data$rows$value sp <- strtoi(dat$`SP$Sale_Price`) cd <- strptime(dat$`CONTRACTDATE$Contract_Date`, format="%Y-%m-%dT%H:%M:%S", tz="GMT") plot(cd,sp) Which gives the same plot:
I found a way that doesn't discard the field names: install.packages("jsonlite") install.packages("curl") json <- fromJSON(json_file) r <- json$rows At this point r looks like this: > class(r) [1] "data.frame" > colnames(r) [1] "id" "key" "value" After some more Googling and trial-and-error I landed on this: f <- r$value sp <- strtoi(f[["SP$Sale_Price"]]) cd <- strptime(f[["CONTRACTDATE$Contract_Date"]], format="%Y-%m-%dT%H:%M:%S", tz="GMT") plot(cd,sp) And the result on my full data-set...
JSON to R for Data Mining
I am trying to grab tweets using the Topsy Otter api, so I can perform some data mining on it for my dissertation. So far, I have got: library(RJSONIO) library(RCurl) tweet_data <- getURL("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx") fromJSON(tweet_data) Which works fine. Now however, I want to return just a couple details from this file, 'content' and 'trackback_date'. I cannot seem to figure out how - I have tried cobbling a couple of examples together, but unable to extract what I want. Here is what I've tried so far: trackback_date <- lapply(tweet_data$result, function(x){x$trackback_date}) content <- lapply(tweet_data$result, function(x){x$content}) Any help would be greatly appreciated, thank you. edit I have also tried: library("rjson") # use rjson tweet_data <- fromJSON(paste(readLines("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx"), collapse="")) # get a data from Topsy Otter API # convert JSON data into R object using fromJSON() trackback_date <- lapply(tweet_data$result, function(x){x$trackback_date}) content <- lapply(tweet_data$result, function(x){x$content})
Basic processing of Topsy Otter API response: library(RJSONIO) library(RCurl) tweet_data <- getURL("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx") # # Addition to your code # tweets <- fromJSON(tweet_data)$response$list content <- sapply(tweets, function(x) x$content) trackback_date <- sapply(tweets, function(x) x$trackback_date) EDIT: Processing multiple pages Function gets 100 items from specified page: pagetweets <- function(page){ url <- paste("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&page=",page, "&perpage=100&maxtime=1304226000&apikey=xxx", collapse="", sep="") tweet_data <- getURL(url) fromJSON(tweet_data)$response$list } Now we can apply it to multiple pages: tweets <- unlist(lapply(1:10, pagetweets), recursive=F) And, voila, this code: content <- sapply(tweets, function(x) x$content) trackback_date <- sapply(tweets, function(x) x$trackback_date) returns you 1000 records.