JSON to R for Data Mining

I am trying to grab tweets using the Topsy Otter api, so I can perform some data mining on it for my dissertation.
So far, I have got:
library(RJSONIO)
library(RCurl)
tweet_data <- getURL("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx")
fromJSON(tweet_data)
This works fine. Now, however, I want to return just a couple of details from this file, 'content' and 'trackback_date'. I cannot seem to figure out how - I have tried cobbling a couple of examples together, but I am unable to extract what I want.
Here is what I've tried so far:
trackback_date <- lapply(tweet_data$result, function(x){x$trackback_date})
content <- lapply(tweet_data$result, function(x){x$content})
Any help would be greatly appreciated, thank you.
Edit:
I have also tried:
library("rjson")
# use rjson
tweet_data <- fromJSON(paste(readLines("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx"), collapse=""))
# get a data from Topsy Otter API
# convert JSON data into R object using fromJSON()
trackback_date <- lapply(tweet_data$result, function(x){x$trackback_date})
content <- lapply(tweet_data$result, function(x){x$content})

Basic processing of Topsy Otter API response:
library(RJSONIO)
library(RCurl)
tweet_data <- getURL("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&perpage=10&maxtime=1304226000&apikey=xxx")
#
# Addition to your code
#
tweets <- fromJSON(tweet_data)$response$list
content <- sapply(tweets, function(x) x$content)
trackback_date <- sapply(tweets, function(x) x$trackback_date)
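To work with the two fields together, here is a minimal sketch that binds them into a data frame. It assumes trackback_date is a Unix epoch timestamp (seconds since 1970-01-01), which is the format Topsy's mintime/maxtime parameters also use:
tweets_df <- data.frame(
  content = content,
  # assumes trackback_date is seconds since the Unix epoch
  trackback_date = as.POSIXct(trackback_date, origin = "1970-01-01", tz = "UTC"),
  stringsAsFactors = FALSE
)
head(tweets_df)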
EDIT: Processing multiple pages
This function gets 100 items from the specified page:
pagetweets <- function(page){
  url <- paste("http://otter.topsy.com/search.json?q=PSN&mintime=1301634000&page=", page,
               "&perpage=100&maxtime=1304226000&apikey=xxx",
               collapse="", sep="")
  tweet_data <- getURL(url)
  fromJSON(tweet_data)$response$list
}
Now we can apply it to multiple pages:
tweets <- unlist(lapply(1:10, pagetweets), recursive = FALSE)
And, voila, this code:
content <- sapply(tweets, function(x) x$content)
trackback_date <- sapply(tweets, function(x) x$trackback_date)
returns 1,000 records.
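If the API rate-limits you, a hedged variant of the loop that pauses between pages and skips failed requests might look like this (the one-second pause is an assumption, not a documented limit):
tweets <- unlist(lapply(1:10, function(page){
  Sys.sleep(1)  # pause between requests; 1 second is a guess, not a documented limit
  tryCatch(pagetweets(page), error = function(e) list())  # skip pages that error out
}), recursive = FALSE)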

Related

R read_html running indefinitely

I am trying to scrape data from this website: edmunds cost to own data. But whenever I run read_html("link"), nothing happens; as far as I can tell, it just runs indefinitely:
library(rvest)
htm <- read_html("https://www.edmunds.com/lexus/rx-350/2019/cost-to-own/?style=401771404")
I have also tried things like the following, but they all just run forever. Why can't I read this HTML?
library(httr)
library(XML)
library(dplyr)
library(rvest)
h <- handle("https://www.edmunds.com/lexus/rx-350/2019/cost-to-own/?style=401771404")
res <- GET(handle = h)
#parse the HTML
resXML <- htmlParse(content(res, as = "text"))
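One way to turn the indefinite hang into a diagnosable failure is to set an explicit timeout and a browser-like User-Agent. This is only a sketch, assuming the site stalls default R clients; it is not a confirmed fix:
library(httr)
res <- GET("https://www.edmunds.com/lexus/rx-350/2019/cost-to-own/?style=401771404",
           user_agent("Mozilla/5.0"),  # browser-like UA: an assumption about what the site expects
           timeout(10))                # give up after 10 seconds instead of hanging
status_code(res)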

Web scraping with R, solution with Jsonlite seems flaky

I maintain small scripts to extract financial data from websites. One of them retrieves the Dutch natural gas grid balance. However, I keep running into problems with it: it works for a while, then I get an error message, and finally I find a workaround. Anyway, it seems that I am using a rather flaky method. Could anyone point me in a better direction (package) for getting this done?
Below is the code (which has again stopped working):
library(curl)
library(bitops)
url <- "https://www.gasunietransportservices.nl/en/shippers/balancing-regime/sbs-and-pos/graphactualjson/MWh"
h <- new_handle(copypostfields ="moo=moomooo")
handle_setheaders(h, "Content-Type" = "text/moo", "Cache-Control" = "no-cache", "User-Agent" = "A cow")
req <- curl_fetch_memory(url, handle=h)
x <- rawToChar(req$content)
library(jsonlite)
json_data <- fromJSON(x)
data <- json_data[,c(1,4)]
n=tail(data,1)
Many thanks
You can use rvest for this (but there could be better approaches too):
library(rvest)
json_data <- read_html('https://www.gasunietransportservices.nl/en/shippers/balancing-regime/sbs-and-pos/graphactualjson/MWh') %>%
  html_text() %>%
  jsonlite::fromJSON(.)
data <- json_data[, c(1, 4)]
n <- tail(data, 1)
n
Output:
> n
sbsdatetime position
37 2017-11-16 12:00:00 -9
A slightly more elegant solution, if the intermediate data frame isn't required:
library(rvest)
library(dplyr)
read_html('https://www.gasunietransportservices.nl/en/shippers/balancing-regime/sbs-and-pos/graphactualjson/MWh') %>%
  html_text() %>%
  jsonlite::fromJSON(.) %>%
  select(1, 4) %>%  # columns 1 and 4, matching json_data[, c(1, 4)] above
  tail(n = 1)
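If the flakiness comes from transient server errors rather than from parsing, a sketch that fetches with httr and fails loudly on a bad HTTP status, instead of silently parsing an error page, would be:
library(httr)
library(jsonlite)
res <- GET("https://www.gasunietransportservices.nl/en/shippers/balancing-regime/sbs-and-pos/graphactualjson/MWh")
stop_for_status(res)  # raise an error on HTTP failure rather than parsing an error page
json_data <- fromJSON(content(res, as = "text", encoding = "UTF-8"))
tail(json_data[, c(1, 4)], 1)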

R highcharter get data from plots saved as html

I plot data with the highcharter package in R and save the plots as HTML to keep their interactive features. In most cases I plot more than one graph, so I bring them together as a canvas.
require(highcharter)
hc_list <- lapply(list(sin, cos, tan, tanh), mapply, seq(1, 5, by = 0.1)) %>%
  lapply(function(x) highchart() %>% hc_add_series(x))
hc_grid <- hw_grid(hc_list, ncol = 2)
htmltools::browsable(hc_grid) # print
htmltools::save_html(hc_grid, "test_grid.html") # save
I want to extract the data from plots that I have saved as HTML in the past, just like these. Normally I would do hc_list[[1]]$x$hc_opts$series, but when I import the HTML into R and try the same, I get an error. It won't do the job.
> hc_imported <- htmltools::includeHTML("test_grid.html")
> hc_imported[[1]]$x$hc_opts$series
Error in hc_imported$x : $ operator is invalid for atomic vectors
If I could write a function like
get_my_data(my_imported_highcharter, 3) # get data from the 3rd plot
that would be ideal. Regards.
You can use the code below:
require(highcharter)
hc_list <- lapply(list(sin, cos, tan, tanh), mapply, seq(1, 5, by = 0.1)) %>%
  lapply(function(x) highchart() %>% hc_add_series(x))
hc_grid <- hw_grid(hc_list, ncol = 2)
htmltools::browsable(hc_grid) # print
htmltools::save_html(hc_grid, "test_grid.html") # save
# hc_imported <- htmltools::includeHTML("test_grid.html")
# hc_imported[[1]]$x$hc_opts$series
library(jsonlite)
library(RCurl)
library(XML)
get_my_data <- function(my_imported_highcharter, n){
  # read the saved HTML and parse it into a tree
  webpage <- readLines(my_imported_highcharter)
  pagetree <- htmlTreeParse(webpage, error = function(...){})
  body <- pagetree$children$html$children$body
  # the n-th div under <body> holds the n-th chart widget
  divbodyContent <- body$children$div$children[[n]]
  # the widget's JSON payload lives in its second child, a <script> tag
  script <- divbodyContent$children[[2]]
  data <- as.character(script$children[[1]])[6]
  data <- fromJSON(data, simplifyVector = FALSE)
  data <- data$x$hc_opts$series[[1]]$data
  return(data)
}
get_my_data("test_grid.html",3)
get_my_data("test_grid.html",1)

Converting JSON file to data.frame

I'm having a heck of a time trying to convert a JSON file to a data frame. I have searched and tried to adapt others' code to my example, but none of it seems to fit. The output is always still a list instead of a data frame.
library(jsonlite)
library(RCurl) # getURL() below comes from RCurl
URL <- getURL("http://scores.nbcsports.msnbc.com/ticker/data/gamesMSNBC.js.asp?xml=true&sport=NBA&period=20160104")
URLP <- fromJSON(URL, simplifyDataFrame = TRUE, flatten = FALSE)
URLP
Here is the format the answer always ends up in:
$games
[1] "<ticker-entry gamecode=\"2016010405\" gametype=\"Regular Season\"><visiting-team display_name=\"Toronto\" alias=\"Tor\" nickname=\"Raptors\" id=\"28\" division=\"ECA\" conference=\"EC\" score=\"\"><score heading=\"\" value=\"0\" team-fouls=\"0\"></score><team-record wins=\"21\" losses=\"14\"></team-record><team-logo link=\"http://hosted.stats.com/nba/logos/nba_50x33/Toronto_Raptors.png\" gz-image=\"http://hosted.stats.com/GZ/images/NBAlogos/TorontoRaptors.png\"></team-logo></visiting-team><home-team display_name=\"Cleveland\" alias=\"Cle\" nickname=\"Cavaliers\" id=\"5\" division=\"ECC\" conference=\"EC\" score=\"\"><score heading=\"\" value=\"0\" team-fouls=\"0\"></score><team-record wins=\"22\" losses=\"9\" ties=\"\"></team-record><team-logo link=\"http://hosted.stats.com/nba/logos/nba_50x33/Cleveland_Cavaliers.png\" gz-image=\"http://hosted.stats.com/GZ/images/NBAlogos/ClevelandCavaliers.png\"></team-logo></home-team><gamestate status=\"Pre-Game\" display_status1=\"7:00 PM\" display_status2=\"\" href=\"http://scores.nbcsports.msnbc.com/nba/preview.asp?g=2016010405\" tv=\"FSOH/SNT\" gametime=\"7:00 PM\" gamedate=\"1/4\" is-dst=\"0\" is-world-dst=\"0\"></gamestate></ticker-entry>"
With regard to @jbaums's comment, you could try:
library(jsonlite)
library(RCurl)
library(dplyr)
library(XML)
URL <- getURL("http://scores.nbcsports.msnbc.com/ticker/data/gamesMSNBC.js.asp?xml=true&sport=NBA&period=20160104")
lst <- lapply(fromJSON(URL)$games, function(x) as.data.frame(t(unlist(xmlToList(xmlParse(x)))), stringsAsFactors=FALSE))
df <- bind_rows(lst)
View(df)
... in theory. However, as @hrbrmstr pointed out, in practice this would violate the website owner's terms of service.
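For completeness: if only a couple of attributes were needed, XPath could pull them straight out of each game string instead of unlisting everything. A sketch, with the same terms-of-service caveat:
doc <- xmlParse(fromJSON(URL)$games[[1]], asText = TRUE)
# pull single attributes by XPath instead of flattening the whole tree
xpathSApply(doc, "//visiting-team", xmlGetAttr, "display_name")
xpathSApply(doc, "//home-team", xmlGetAttr, "display_name")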

R: Extract JSON Variable Info

I'm trying to download NBA player information from Numberfire and then put that information into a data frame. However, I seem to be running into a few issues.
The following snippet downloads the information just fine:
require(RCurl)
require(stringr)
require(rjson)
#download data from numberfire
nf <- "https://www.numberfire.com/nba/fantasy/fantasy-basketball-projections"
html <- getURL(nf)
Then there is what I assume to be a JSON data structure:
#extract json variable (?)
pat <- "NF_DATA.*}}}"
jsn <- str_extract(html, pat)
jsn <- str_split(jsn, "NF_DATA = ")
parse <- newJSONParser()
parse$addData(jsn)
It seems to add the data OK, as it doesn't throw any errors, but I can't tell whether there is data in that object, and I can't seem to get it out!
I'd paste in the jsn variable, but it's way over the character limit. Any hints as to where I'm going wrong would be much appreciated.
Adding the final line gets a nice list format that you can transform into a data.frame:
require(RCurl); require(stringr); require(rjson)
#download data from numberfire
nf <- "https://www.numberfire.com/nba/fantasy/fantasy-basketball-projections"
html <- getURL(nf)
#extract json variable (?)
pat <- "NF_DATA.*}}}"
jsn <- str_extract(html, pat)
jsn <- str_split(jsn, "NF_DATA = ")
fromJSON(jsn[[1]][[2]])
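fromJSON here returns a deeply nested list; before pulling fields out, it helps to inspect the top level first (a sketch: the exact structure of NF_DATA is whatever the site currently ships):
nf_data <- fromJSON(jsn[[1]][[2]])
str(nf_data, max.level = 1)  # list the top-level components before drilling down
names(nf_data)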