I am writing a loop in R to scrape Reddit comments using the Pushshift API.
Essentially, I would like to get every comment that contains the word "Trump" posted between now and 20,000 hours ago, in hourly windows. The API returns the comments as JSON - I wrote the following code in R to obtain them (note: I made it so that the results are saved after every 200 iterations in case of a crash):
library(jsonlite)
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "h&before="
part3 <- "h&size=500"
results <- list()
for (i in 1:20000) {
  tryCatch({
    url_i <- paste0(part1, i + 1, part2, i, part3)
    r_i <- data.frame(fromJSON(url_i))
    results[[i]] <- r_i
    myvec_i <- sapply(results, NROW)
    print(c(i, sum(myvec_i)))  # iteration number and cumulative row count
    if (i %% 200 == 0) saveRDS(results, "results_index.RDS")  # periodic checkpoint
  }, error = function(e) {})
}
final <- do.call(rbind.data.frame, results)
saveRDS(final, "final.RDS")
The code runs - but I am looking for tips to increase the speed and efficiency of this code. For example, I have noticed that:
Sometimes this code seems to take a really long time on certain iterations
I also suspect that as the results list grows and the R global environment fills up, things slow down.
Sometimes the scraping stops collecting new results (i.e. I print the cumulative number of results collected at each iteration, and sometimes this number stops updating).
I used tryCatch() to skip errors so the loop does not crash - but perhaps there is a way of handling these errors (e.g. retrying a failed request) that would have resulted in more Reddit comments being scraped?
Could someone please recommend some tips on how to optimize and speed this code up? Perhaps someone could try running this code and let me know what they think?
Thank you!
There are two things you can do: 1) save each iteration's data.frame to its own .RData file - you need less memory this way because you are not accumulating the data in RAM; 2) use parallel computation. Here is an example:
library(parallel)
library(doParallel)
library(RSelenium)
fn_Par <- function(core_Id, all_Index, list_remDr, nb_Core)
{
library(jsonlite)
library(RSelenium)
remDr <- list_remDr[[core_Id]]
remDr$open()
setwd("D:\\")
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "h&before="
part3 <- "h&size=500"
nb_Index_All <- length(all_Index)
nb_Id_Per_Core <- floor(nb_Index_All / nb_Core)
index_To_Extract <- all_Index[(1 + (core_Id - 1) * nb_Id_Per_Core) : min((core_Id * nb_Id_Per_Core), nb_Index_All)]
for(i in index_To_Extract)
{
url_i <- paste0(part1, i + 1, part2, i, part3)
remDr$navigate(url_i)
Sys.sleep(0.5)
web_Obj <- remDr$findElement("css selector", 'body > pre')
r_i <- tryCatch(data.frame(fromJSON(web_Obj$getElementText()[[1]])), error = function(e) NA)
if(is.null(dim(r_i)) == TRUE) # retry once if the first attempt did not return a data.frame
{
Sys.sleep(10)
remDr$navigate(url_i)
web_Obj <- remDr$findElement("css selector", 'body > pre')
r_i <- tryCatch(data.frame(fromJSON(web_Obj$getElementText()[[1]])), error = function(e) NA)
}
save(r_i, file = paste0(i, "_core_Id_", core_Id, ".RData"))
Sys.sleep(0.5)
}
}
nb_CPU <- 4
cluster <- parallel::makeCluster(nb_CPU)
doParallel::registerDoParallel(cl = cluster)
list_remDr <- list()
list_rd <- list()
for(i in 1 : nb_CPU)
{
print(i)
port <- as.integer(4444L + rpois(1, lambda = 1000)) # random offset so each browser session gets its own port
list_rd[[i]] <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)
list_remDr[[i]] <- list_rd[[i]]$client
}
parLapply(cluster, X = 1 : nb_CPU, fun = fn_Par, all_Index = 1 : 2000, list_remDr = list_remDr, nb_Core = nb_CPU)
Here is another approach that can be considered:
library(parallel)
library(doParallel)
fn_Par <- function(core_Id, all_Index, nb_Core)
{
library(jsonlite)
setwd("D:\\")
part1 <- "https://api.pushshift.io/reddit/search/comment/?q=trump&after="
part2 <- "h&before="
part3 <- "h&size=500"
nb_Index_All <- length(all_Index)
nb_Id_Per_Core <- floor(nb_Index_All / nb_Core)
index_To_Extract <- all_Index[(1 + (core_Id - 1) * nb_Id_Per_Core) : min((core_Id * nb_Id_Per_Core), nb_Index_All)]
for(i in index_To_Extract)
{
url_i <- paste0(part1, i + 1, part2, i, part3)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
# retry up to three more times, waiting 5 seconds between attempts,
# when the call did not return a data.frame
attempt <- 0
while(is.null(dim(r_i)) && attempt < 3)
{
Sys.sleep(5)
r_i <- tryCatch(data.frame(fromJSON(url_i)), error = function(e) NA)
attempt <- attempt + 1
}
save(r_i, file = paste0(i, "_core_Id_", core_Id, ".RData"))
}
}
nb_CPU <- 4
cluster <- parallel::makeCluster(nb_CPU)
doParallel::registerDoParallel(cl = cluster)
parLapply(cluster, X = 1 : nb_CPU, fun = fn_Par, all_Index = 1 : 2000, nb_Core = nb_CPU)
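Whichever approach is used, the per-iteration .RData files still have to be combined at the end. A minimal sketch of that step, assuming the files were written to D:\ by fn_Par and that each one contains a single object named r_i (NA when the call failed):
library(data.table)
setwd("D:\\")
files <- list.files(pattern = "_core_Id_.*\\.RData$")
chunks <- lapply(files, function(f) {
  e <- new.env()
  load(f, envir = e)                      # restores the object r_i into e
  r_i <- get("r_i", envir = e)
  if (is.data.frame(r_i)) r_i else NULL   # drop failed (NA) iterations
})
final <- rbindlist(chunks, use.names = TRUE, fill = TRUE)  # fill = TRUE guards against responses whose columns differ
saveRDS(final, "final.RDS")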
Here is a big web document: https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut . I know how to extract the text with:
library(rvest)
library(magrittr)
page_url<- "https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut"
page_html<- read_html(page_url)
document <- page_html %>%
html_nodes("p") %>%
html_text()
document
[1] "Rappel de votre demande:"
[2] "Format de téléchargement: : Texte"
[3] "Vues 1 à 544 sur 544"
[4] "Nombre de pages: 544"
[5] "Notice complète:"
[6] "Titre : Oeuvres complètes de Molière : accompagnées de notes tirées de tous les commentateurs avec des remarques nouvelles. Monsieur de Pourceaugnac / par M. Félix Lemaistre"
[7] "Auteur : Molière (1622-1673). Auteur du texte"
[8] "Auteur : Voltaire (1694-1778). Auteur du texte"
[9] "Auteur : La Harpe, Jean François de (1739-1803). Auteur du texte"
[10] "Auteur : Auger, Louis-Simon (1772-1829). Auteur du texte"
However, it's important for me to track the page from which the text was extracted. The start and end of a page are actually represented by a horizontal line, as you can see here https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut. So instead of retrieving a vector in which each element represents a line of the document, I want a list in which each element is a page, and each page is a vector in which each element is a line of the document. Something like
[[1]]
[1] "avurrbbihevyupsexvgymphjhdiqtfxzlwrbzpuqqpcxtlyrmyfxewydqnwqpinafaajvhylgaerlqilsvlwnscbiwoyinwjoudu"
[2] "gcgyuizpzznacdnrucvcjajjkbfahvlqqcoudbhpvuuvgrefpglnweznrimuzuydbzjzvhqezmjqtndzdhvvvbnhyipujusjmbhf"
[3] "caugvpyabksaqgktlrcoghkgjaqglpicgcngovvecesasevcdsmimysvrojvpwhbewxfwhdysvdcwmgxlziajwhilclecnkobmnc"
[4] "vuskqpyfqvqexilxqbhviqbdhhldprgdhifwzvhhvcclmljdgqmzsjrvlosftjshpuhxyjfsmfkqsxhaafysgesxwtoechrtekhy"
[[2]]
[1] "muvahkvftgglaphbzfehpnzvemhzixawlvadoxncmtmtzhqjlciozhgspnrusbkycgoqovxslusonmgqehbajbwpcldjquxchsvx"
[2] "pnhpzpbhjvqhehmlchncmgnhapaoqncvezaphilrpqguetutczpydrqthgdhwjtmlfhgvqvofdcylefrmergbkkwnsxlojgyaagw"
[3] "okjhxdpliykzbmdaghtgnsqftxhgpmkpsmiknuugejnrqmzaxqdljnbroxensegyxpikhzwkfzrqairvdhcvglcelnexvcypjkrx"
[4] "ftrbacjpwgmiuwbprvdkfpplycthukvycsyrjwsrokrrvcylzaxxdsgwlctglqaylegeflnlodttkiincavtncxttegstkgvvqgo"
[[3]]
[1] "ndnsdtqxpatoigobldauekhqdbcgvyqmcwyvmcvaredlrfjafiidwvcczqmufvufwjtdhordkaauukjezkyaodffohbzrnhwvioi"
[2] "ywryphperpsnbuspbfengmlllevavpbebfquiguvahshxdleyutvknsfiqcvrsirajqkzppbutsfbspjoirnqacoipcfxisugrto"
[3] "ivuzuxpflzqyphbnsdwvrqwcblxfagdflhqpgldnxkpuhzlhapueowofcgnakgwajgnaaqcvqxzwmorcmjybljsioulscnnntbmx"
[4] "cpbjxincbyrdasbrgrfdzxdzlmogfjmezgdkswpmcjrrlonsvgsaccrjvpbholodgsdcwslpsylslhoxliarkbighsmffoxprffb"
library(stringi)
library(rvest)
library(tidyverse)
Cache the page since it's big and loads really slowly:
if (!file.exists("~/Data/forso.html")) {
read_html(
"https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut"
) -> pg
write_lines(as.character(pg), "~/Data/forso.html")
}
Read it in as lines. This is usually a really bad idea for working with HTML, but it's better for this process since the XPath required for dealing with text between sequences of tags is gnarly and slow (even just finding the <hr> elements felt kinda slow using html_nodes()):
doc <- read_lines("~/Data/forso.html")
Now, find all the <hr> elements, ignoring the first two since they are after the intro/metadata section:
pos <- which(doc == "<hr>")[-(1:2)]
Create start/end index marker positions for the text:
starts <- head(pos, -1)
ends <- tail(pos, -1)
Iterate along the start/end positions, extract the text, split it into lines and make a data frame:
map_df(seq_along(starts), ~{
start <- starts[.x]
end <- ends[.x]
data_frame(
pg = .x,
txt = read_html(paste0(doc[start:end], collapse="\n")) %>%
html_children() %>%
html_text() %>%
stri_split_lines() %>%
flatten_chr() %>%
list()
)
}) -> xdf
Take a look:
xdf
## # A tibble: 542 x 2
## pg txt
## <int> <list>
## 1 1 <chr [4]>
## 2 2 <chr [2]>
## 3 3 <chr [13]>
## 4 4 <chr [1]>
## 5 5 <chr [35]>
## 6 6 <chr [19]>
## 7 7 <chr [22]>
## 8 8 <chr [18]>
## 9 9 <chr [16]>
## 10 10 <chr [36]>
## # ... with 532 more rows
Another look:
glimpse(xdf)
## Observations: 542
## Variables: 2
## $ pg <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, ...
## $ txt <list> [<"OEUVRES COMPLETES ", "DE MOLIERE ", "TOMI: III ", "">, <"PARIS. — I1IP. SIMON RAÇON ET COUP., RUE D...
One more:
str(head(xdf))
## Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 2 variables:
## $ pg : int 1 2 3 4 5 6
## $ txt:List of 6
## ..$ : chr "OEUVRES COMPLETES " "DE MOLIERE " "TOMI: III " ""
## ..$ : chr "PARIS. — I1IP. SIMON RAÇON ET COUP., RUE D'ERFURTH, 1. " ""
## ..$ : chr "OEUVRES COMPLETES " "DE MOLIERE " "NOUVELLE ÉDITION " "ACe-OJIPAfi NEES DE NOTES TIRÉES DE TOUS L, E S COMMENTATEURS AVEC DES REMARQUES NOUVELLES " ...
## ..$ : chr ""
## ..$ : chr "OEUVRES " "COMPLÈTES " "DE MOLIÈRE " "MONSIEUR DE POURCEAUGNAC' " ...
## ..$ : chr "MONSIEUR DE POURCEAUGNAC. " "MATASSINS dansants. DEUX AVOCATS chantants. DEUX PROCUREURS dansants. DEUX SERGENTS dansants. TROUPE DE MASQUES"| __truncated__ "La scène est à Paris. " "ACTE PREMIER " ...
This captures empty lines as well, but I have no idea what you need outside of what you described.
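If the empty lines are not wanted, one small follow-up (a sketch, assuming xdf was built exactly as above) is to drop blank strings from each page's vector:
# remove blank lines from every page's txt vector (map comes from purrr via tidyverse)
xdf <- mutate(xdf, txt = map(txt, ~ .x[nzchar(trimws(.x))]))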
Another approach:
As @hrbrmstr already mentioned in his answer, XPath is not very friendly if you want to extract nodes between other nodes... things get very inefficient, very fast...
So keep in mind that the following code will take several minutes to complete (or longer, depending on your machine)... (maybe another user can speed things up using this answer as a base).
Having said that:
library( xml2 )
library( data.table )
#get the contents of the webpage
doc <- read_html( "https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut" )
#determine how many hr-tags/nodes there are in the document
hr <- length( xml_nodes( doc, "hr") )
#create an empty list
l <- list()
#fill the list with a loop. This seems to take forever, but it works!
# just be patient (and get a cup of coffee. Or two...).
for( i in seq(1, hr, by = 1) ) {
#set up the xpath.
#xpath: get all p-nodes after the i-th hr-nodes, that have exactly i preceding hr-nodes
xpath_ <- paste0 ( ".//hr[", i, "]/following-sibling::p[count(preceding-sibling::hr)=", i, "]" )
#
l[[i]] <- xml_find_all( doc, xpath = xpath_ ) %>% xml_text() %>% data.table()
}
Some results:
l[1:5]
# [[1]]
# Empty data.table (0 rows) of 1 col: .
#
# [[2]]
# Empty data.table (0 rows) of 1 col: .
#
# [[3]]
# .
# 1: OEUVRES COMPLETES
# 2: DE MOLIERE
# 3: TOMI: III
#
# [[4]]
# .
# 1: PARIS. — I1IP. SIMON RAÇON ET COUP., RUE D'ERFURTH, 1.
#
# [[5]]
# .
# 1: OEUVRES COMPLETES
# 2: DE MOLIERE
# 3: NOUVELLE ÉDITION
# 4: ACe-OJIPAfi NEES DE NOTES TIRÉES DE TOUS L, E S COMMENTATEURS AVEC DES REMARQUES NOUVELLES
# 5: PAR FÉLIX L E M A I T R E
# 6: P R É C É D É E
# 7: DE LA VIE DE MOLIÈRE PAR VOLTAIRE
# 8: TOME TROISIEME
# 9: PARIS
# 10: GARNIER FRÈRES, LIBRAIRES-ÉDITEURS
# 11: G, RUE DES SAINTS-PÈRES, ET P A L A I S-R 0 V A I., 213
# 12: 8 6 7
Or bind everything together in a data.table:
dt <- rbindlist(l, use.names = TRUE, idcol = "page")
# page .
# 1: 3 OEUVRES COMPLETES
# 2: 3 DE MOLIERE
# 3: 3 TOMI: III
# 4: 4 PARIS. — I1IP. SIMON RAÇON ET COUP., RUE D'ERFURTH, 1.
# 5: 5 OEUVRES COMPLETES
# 6: 5 DE MOLIERE
# 7: 5 NOUVELLE ÉDITION
# 8: 5 ACe-OJIPAfi NEES DE NOTES TIRÉES DE TOUS L, E S COMMENTATEURS AVEC DES REMARQUES NOUVELLES
# 9: 5 PAR FÉLIX L E M A I T R E
# 10: 5 P R É C É D É E
# 11: 5 DE LA VIE DE MOLIÈRE PAR VOLTAIRE
# 12: 5 TOME TROISIEME
# 13: 5 PARIS
# 14: 5 GARNIER FRÈRES, LIBRAIRES-ÉDITEURS
# 15: 5 G, RUE DES SAINTS-PÈRES, ET P A L A I S-R 0 V A I., 213
# 16: 5 8 6 7
# 17: 7 OEUVRES
# 18: 7 COMPLÈTES
# 19: 7 DE MOLIÈRE
# 20: 7 MONSIEUR DE POURCEAUGNAC'
Finding an index of all the hr nodes is a straightforward way to go about it. The mutate section is the most notable part: it uses %in% to flag the (now blank) hr nodes and cumsum to assign a page number to every line.
# set up and read
library(rvest)
library(xml2)
library(dplyr)
page_url<- "https://gallica.bnf.fr/ark:/12148/bpt6k5619759j.texteBrut"
page_html<- read_html(page_url)
# filter to body only, so no need to deal with child nodes
allbodynodes <- page_html %>%
xml_node('body')
# get all nodes and all hr nodes to compare later
# the first could be put into the pipeline, but it's more clear to me here
allnodes <- allbodynodes %>%
xml_nodes('*')
allhr <- allbodynodes %>%
xml_nodes('hr')
alltext <- allnodes %>%
html_text(trim = T) %>% # convert to text only
as.data.frame(stringsAsFactors = F) %>% # put into dataframe
select(maintext = '.') %>% # give the text a variable name
mutate(
ishr = allnodes %in% allhr, # check which nodes were <hr> (now blank)
page = cumsum(ishr) + 1 # add page number by running across the hr
) %>%
filter(!ishr) %>% # get rid of blank hr lines
select(-ishr) # get rid of all false ishr column
# split into a list of sorts if desired
alltextlist <- split(alltext$maintext,alltext$page)
I hope there's a more succinct way to create the index (preferably within the dplyr pipeline), but I haven't found it yet.
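One slightly more succinct possibility (a sketch along the same lines, not a definitive answer) is to derive the hr flag inside the pipeline with rvest's html_name(), which removes the need for the separate allhr object:
# same result as above, but the <hr> flag is computed inside mutate()
alltext <- allnodes %>%
  html_text(trim = TRUE) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  select(maintext = '.') %>%
  mutate(
    ishr = html_name(allnodes) == "hr",  # TRUE where the node is an <hr>
    page = cumsum(ishr) + 1
  ) %>%
  filter(!ishr) %>%
  select(-ishr)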
I have a large data file filled with key-value pairs. The key is an ID and the value is a huge JSON object. I have been trying to convert this data file to a data frame in R by importing it as a two-column table and then converting the value column to a data frame.
I keep getting this error, even after I validated my json.
Error: lexical error: invalid string in json text.
[{ f: { SEQNUM: [ 455043, 455044,
(right here) ------^
Below is my code:
part00013 <- read.table(PatientData, sep = '\t', header = F, as.is = T)
colnames(part00013) <- c('k','v')
make_indexDateLists <- function(x) {
# x['v'] <- lapply(x['v'], function(y) as.character(y))
# x['v'] <- lapply(x['v'], function(y) substr(y,1, nchar(y)-1 ))
# x['v'] <- lapply(x['v'], function(y) substr(y,2,nchar(y)))
x["v"] <- lapply(as.character(x["v"]), function(y) jsonlite::fromJSON(y,simplifyVector = T))
#do assignpatienttocohorts
x["v"] <- lapply(x["v"], function(y) RJSONIO::toJSON(y))
cbind(x$k, x$v)
}
make_indexDateLists(part00013)
and here is a sample file https://drive.google.com/open?id=0B6hKduYaYwdJQ3BwbUpNSW9EZk0
It's invalid JSON, but you can turn it into valid JSON:
library(stringi)
library(jsonlite)
library(tidyverse)
tmp <- readLines("oneline_part00013")
parts <- stri_split_fixed(tmp, "\t", 2)[[1]]
fromJSON(parts[2], flatten = FALSE) %>%
glimpse()
## Observations: 1
## Variables: 7
## $ f <data.frame> 455043, 455044, 455045, 455046, 455047, 455048, 45504...
## $ s <data.frame> 246549, 246550, 246551, 246552, 246553, 246554, 24655...
## $ i <data.frame> 8224, 8788, 770102, 30, 10, 30, 3301, 3301, 3301, 192...
## $ d <data.frame> 1114386, 1114387, 1114388, 1114389, 1114390, 1114391,...
## $ o <data.frame> 162072527, 162072528, 162072529, 162072530, 162072531...
## $ t <data.frame> 408352, 408353, 408354, 408355, 408356, 408357, 40835...
## $ a <data.frame> 36527, 42259, 35562, 42458, 39119, 30, 10, 30, 20, 30...
flatten = TRUE will un-nest all the data.frame columns (you'll end up with over 450 columns that way)
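For example, per the note above, a quick way to look at the fully flattened version (same parts[2] as before) would be:
# flatten = TRUE un-nests the nested data.frame columns into one wide data frame
wide <- fromJSON(parts[2], flatten = TRUE)
dim(wide)  # expect a single row and several hundred columns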
I keep getting errors with the code; which version would be correct?
The first one is correct, but generally you don't want to use for loops in R. Try using apply.
Answer
v <- runif(10000)
v[sample(1000,120)] <- NA
mydata <- as.data.frame(matrix(v,10,100))
for (i in 1:ncol(mydata)){
mydata[is.na(mydata[,i]), i] <- median(mydata[,i], na.rm = TRUE)  # median of the same column
}
Apply Solution
v <- runif(10000)
v[sample(1000,120)] <- NA
mydata <- as.data.frame(matrix(v,10,100))
medrep <- function(x){
x[is.na(x)] <- median(x, na.rm=TRUE)
x
}
dat <- data.frame(apply(mydata, 2, medrep))
I'm having a lot of trouble trying to read some JSON data obtained from a URL in R. I'm able to read in the data and call on each observation to get the values (as characters, which is fine), but I can't seem to find a way to get the data into a table format (basically like in Excel).
I've tried to create a loop which calls on each field to place it in an empty matrix; however, not every object has the same number of fields (i.e. some have Label1 and Label2, while others just have Label1). I get the error that the subscripts are out of bounds. What I was thinking was to make a conditional statement whereby, if the field exists, the value of the field is put in the data matrix, and if the field does not exist, I insert an NA. I get a subscript error automatically though and cannot do the conditional evaluation - I've looked to see if I can coerce an error to become an NA, but I don't think this is possible.
I'm starting the index from j=3, since the first two observations in the JSON code are not needed for me. My problem is that, for example, "json$poi[[j]]$label[[2]]$value" may not exist for every observation, and I automatically get an error when the code comes across the first observation missing this field.
The data is quite big - around 4480 observations with up to 20 fields each. I only require the 9 fields I have listed, however. Here is a link to the data URL - it may take some time to load. I'm quite new to coding, and especially to dealing with JSON files, so my apologies if this has a simple solution that I'm not seeing.
Thanks!
http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1
library(rjson)
library(RCurl)
json <- fromJSON(getURL('http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1'))
ljson <- length(json$poi)-2
data <- matrix(data=NA, nrow=ljson, ncol=9)
for(i in 1:ljson)
{
j <- i+2
d1 <- json$poi[[j]]$location$point[[1]]$Point$posList
d2 <- json$poi[[j]]$label[[1]]$value
d3 <- json$poi[[j]]$label[[2]]$value
d4 <- json$poi[[j]]$category[[1]]$value
d5 <- json$poi[[j]]$category[[2]]$value
d6 <- json$poi[[j]]$id
d7 <- json$poi[[j]]$author$value
d8 <- json$poi[[j]]$license$value
d9 <- json$poi[[j]]$description[[1]]$value
if(exists("d1") == TRUE){
d1 <- json$poi[[j]]$location$point[[1]]$Point$posList
} else {
d1 <- NA
}
if(exists("d2") == TRUE){
d2 <- json$poi[[j]]$label[[1]]$value
} else {
d2 <- NA
}
if(exists("d3") == TRUE){
d3 <- json$poi[[j]]$label[[2]]$value
} else {
d3 <- NA
}
if(exists("d4") == TRUE){
d4 <- json$poi[[j]]$category[[1]]$value
} else {
d4 <- NA
}
if(exists("d5") == TRUE){
d5 <- json$poi[[j]]$category[[2]]$value
} else {
d5 <- NA
}
if(exists("d6") == TRUE){
d6 <- json$poi[[j]]$id
} else {
d6 <- NA
}
if(exists("d7") == TRUE){
d7 <- json$poi[[j]]$author$value
} else {
d7 <- NA
}
if(exists("d8") == TRUE){
d8 <- json$poi[[j]]$license$value
} else {
d8 <- NA
}
if(exists("d9") == TRUE){
d9 <- json$poi[[j]]$description[[1]]$value
} else {
d9 <- NA
}
data[i,] <- rbind(c(d1,d2,d3,d4,d5,d6,d7,d8,d9))
}
For JSON & XML list structures, str is your friend! You can use it to inspect all or portions of a list structure. Using sapply on the individual components to extract values is probably better than the for construct, and you'll need to handle NULLs and missing sub-structure components to build a data frame from that JSON (and from many JSON files, actually). The following gets you started, but you still have some work to do:
# simplify extraction (saves typing, too)
poi <- json$poi
# start at 3rd element
poi <- poi[3:length(poi)]
# have to do some special checking since the value isn't always there
poi_points <- sapply(poi, function(x) {
if ("point" %in% names(x$location) & length(x$location$point) > 0) {
x$location$point[[1]]$Point$posList
} else {
NA
}
})
# this removes NULLs which the data.frame call won't like later
poi_description <- sapply(poi, function(x) {
if (is.null(x$description[[1]]$value)) {
NA
} else {
x$description[[1]]$value
}
})
# this removes NULLs which the data.frame call won't like later
poi_category <- sapply(poi, function(x) {
if (is.null(x$category[[1]]$value)) {
NA
} else {
x$category[[1]]$value
}
})
# simpler extractions
poi_label <- sapply(poi, function(x) x$label[[1]]$value)
poi_id <- sapply(poi, function(x) x$id)
poi_author <- sapply(poi, function(x) x$author$value)
poi_license <- sapply(poi, function(x) x$license$value)
# make a data frame
poi <- data.frame(poi_label, poi_category, poi_id, poi_points, poi_author, poi_license, poi_description)
str(poi)
## 'data.frame': 4482 obs. of 7 variables:
## $ poi_label : Factor w/ 4482 levels "\"Bloco das Águas Livres\", edifício de habitação, comércio e serviços",..: 363 765 764 1068 174 419 461 762 420 412 ...
## $ poi_category : Factor w/ 129 levels "Acessórios de Uso Pessoal",..: 33 33 33 33 33 33 123 33 33 33 ...
## $ poi_id : Factor w/ 4482 levels "52d7bf4d723e8e0b0cc08b69",..: 2 3 4 5 7 8 15 16 17 18 ...
## $ poi_points : Factor w/ 3634 levels "38.405892 -9.93503",..: 975 244 478 416 301 541 2936 2975 2850 2830 ...
## $ poi_author : Factor w/ 1 level "CitySDK": 1 1 1 1 1 1 1 1 1 1 ...
## $ poi_license : Factor w/ 1 level "open-data": 1 1 1 1 1 1 1 1 1 1 ...
## $ poi_description: Factor w/ 2831 levels "","\n","\n\n",..: 96 1051 NA NA 777 1902 NA 1038 81 82 ...
##
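As an aside on the question's "coerce an error to become an NA" point: that is in fact possible with tryCatch(). A minimal, generic sketch (reusing the json and j objects from the question's own loop):
# wrap any extraction so that an error (e.g. subscript out of bounds) becomes NA
safe_get <- function(expr) tryCatch(expr, error = function(e) NA)
d3 <- safe_get(json$poi[[j]]$label[[2]]$value)  # NA instead of an error when label[[2]] is missing
Note that a missing $value simply returns NULL rather than raising an error, so the NULL checks in the sapply calls above are still needed.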