I have a file containing over 1500 json objects that I want to work with in R. I've been able to import the data as a list, but am having trouble coercing it into a useful structure. I want to create a data frame containing a row for each json object and a column for each key:value pair.
I've recreated my situation with this small, fake data set:
[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]
Some features of the data:
The objects all contain the same number of key:value pairs although
some of the values are null
There are two non-numeric columns per object (name and group)
name is the unique identifier, there are 10 or so groups
many of the name and group entires contain spaces, commas and other punctuation.
Based on this question: R list(structure(list())) to data frame, I tried the following:
json_file <- "test.json"
json_data <- fromJSON(json_file)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))
With both my real data and this fake data, the last line give me this error:
Error in data.frame(name = "Doe, John", group = "Red", `age (y)` = 24, :
arguments imply differing number of rows: 1, 0
You just need to replace your NULLs with NAs:
require(RJSONIO)
json_file <- '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'
json_file <- fromJSON(json_file)
json_file <- lapply(json_file, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
Once you have a non-null value for each element, you can call rbind without getting an error:
do.call("rbind", json_file)
name group age (y) height (cm) wieght (kg) score
[1,] "Doe, John" "Red" "24" "182" "74.8" NA
[2,] "Doe, Jane" "Green" "30" "170" "70.1" "500"
[3,] "Smith, Joan" "Yellow" "41" "169" "60" NA
[4,] "Brown, Sam" "Green" "22" "183" "75" "865"
[5,] "Jones, Larry" "Green" "31" "178" "83.9" "221"
[6,] "Murray, Seth" "Red" "35" "172" "76.2" "413"
[7,] "Doe, Jane" "Yellow" "22" "164" "68" "902"
This is very simple if you use either library(jsonlite) or library(jsonify)
Both of these handle the null values and converts them to NA, and they preserve the data types.
Data
json_file <- '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'
jsonlite
library(jsonlite)
jsonlite::fromJSON( json_file )
# name group age (y) height (cm) wieght (kg) score
# 1 Doe, John Red 24 182 74.8 NA
# 2 Doe, Jane Green 30 170 70.1 500
# 3 Smith, Joan Yellow 41 169 60.0 NA
# 4 Brown, Sam Green 22 183 75.0 865
# 5 Jones, Larry Green 31 178 83.9 221
# 6 Murray, Seth Red 35 172 76.2 413
# 7 Doe, Jane Yellow 22 164 68.0 902
str( jsonlite::fromJSON( json_file ) )
# 'data.frame': 7 obs. of 6 variables:
# $ name : chr "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group : chr "Red" "Green" "Yellow" "Green" ...
# $ age (y) : int 24 30 41 22 31 35 22
# $ height (cm): int 182 170 169 183 178 172 164
# $ wieght (kg): num 74.8 70.1 60 75 83.9 76.2 68
# $ score : int NA 500 NA 865 221 413 902
jsonify
library(jsonify)
jsonify::from_json( json_file )
# name group age (y) height (cm) wieght (kg) score
# 1 Doe, John Red 24 182 74.8 NA
# 2 Doe, Jane Green 30 170 70.1 500
# 3 Smith, Joan Yellow 41 169 60.0 NA
# 4 Brown, Sam Green 22 183 75.0 865
# 5 Jones, Larry Green 31 178 83.9 221
# 6 Murray, Seth Red 35 172 76.2 413
# 7 Doe, Jane Yellow 22 164 68.0 90
str( jsonify::from_json( json_file ) )
# 'data.frame': 7 obs. of 6 variables:
# $ name : chr "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group : chr "Red" "Green" "Yellow" "Green" ...
# $ age (y) : int 24 30 41 22 31 35 22
# $ height (cm): int 182 170 169 183 178 172 164
# $ wieght (kg): num 74.8 70.1 60 75 83.9 76.2 68
# $ score : int NA 500 NA 865 221 413 902
To remove null values use parameter nullValue
json_data <- fromJSON(json_file, nullValue = NA)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))
this way there won´t be any unnecessary quotes in your output
library(rjson)
Lines <- readLines("yelp_academic_dataset_business.json")
business <- as.data.frame(t(sapply(Lines, fromJSON)))
You may try this to load JSON data into R
dplyr::bind_rows(fromJSON(file_name))
Changing the package from rjson to jsonlite fixed it for me.
So instead of this:
fromAPIPlantsPages <- rjson::fromJSON(content(apiGetPlants,type="text",encoding = "UTF-8"))
dfPlantenAPI <- as.data.frame(fromAPIPlantsPages)
I changed it to this:
fromAPIPlantsPages <- jsonlite::fromJSON(content(apiGetPlants,type="text",encoding = "UTF-8"))
dfPlantenAPI <- as.data.frame(fromAPIPlantsPages)
Related
I am trying to write a R script that can download the csv file from the following website,"https://www.covidanalytics.io/projections There is a link for download data at the bottom of the page, which takes the form of "data:text/csv...". I was wondering if I can have a R script to download the file in csv format. Any help will be highly appreciated.
Thanks
IC
There are easier ways to get this data, but it is possible with a bit of low-level work using the httr package.
As #r2evans pointed out, this is a url-encoded csv built by Dash. To get the url, you need to request a json file containing the html page information using an xhr request. This needs all the correct headers as well as a json request in the body of the POST request:
library(httr)
page1 <- GET("https://www.covidanalytics.io/projections")
H <- add_headers( `Host` = "www.covidanalytics.io",
`User-Agent` = paste("Mozilla/5.0 (Windows NT 6.1; rv:77.0)",
"Gecko/20100101 Firefox/77.0"),
`Accept` = "application/json",
`Accept-Language` = "en-GB,en;q=0.5",
`Accept-Encoding` = "gzip, deflate",
`Referer` = "https://www.covidanalytics.io/projections",
`Content-Type` = "application/json",
`X-CSRFToken` = "undefined",
`Origin` = "https://www.covidanalytics.io",
`Connection` = "keep-alive")
post_data <- paste0('{"output":"page-content.children","outputs":{"id":',
'"page-content","property":"children"},"inputs":',
'[{"id":"url","property":"pathname","value":',
'"/projections"}],"changedPropIds":["url.pathname"]}')
res <- httr::POST("https://www.covidanalytics.io/_dash-update-component", H,
body = post_data, encode = "raw")
'res` now contains the json response, and our url-encoded csv is deep inside it. We get this parsed content and extract the string containing the url:
body <- parsed_content(res)$response$`page-content`$children$props$children[[2]]
div <- body$props$children[[10]]$props$children[[1]]
url <- div$props$children$props$children$props$href
Now we need to cut off the data:text/csv;charset=utf-8, part and unescape the url encoding. I actually found this was far quicker using nested gsubs, since my machine choked on URLdecode:
csv <- strsplit(url, ",")[[1]][2]
df <- read.csv(text = gsub("%0A", "\n", gsub("%20", " ", gsub("%2C", ",", csv))))
Your data is now in df. It's big, so I'll show it as a tibble here:
tidyr::as_tibble(df)
#> # A tibble: 7,106 x 10
#> Continent Country Province Day Total.Detected Active Active.Hospital~ Cumulative.Hosp~
#> <fct> <fct> <fct> <fct> <fct> <int> <int> <int>
#> 1 Africa Algeria None 2020~ 5651 1531 302 834
#> 2 Africa Algeria None 2020~ 5742 1514 300 848
#> 3 Africa Algeria None 2020~ 5831 1497 298 861
#> 4 Africa Algeria None 2020~ 5917 1477 296 874
#> 5 Africa Algeria None 2020~ 6000 1457 293 886
#> 6 Africa Algeria None 2020~ 6079 1435 291 898
#> 7 Africa Algeria None 2020~ 6156 1411 287 910
#> 8 Africa Algeria None 2020~ 6230 1387 284 921
#> 9 Africa Algeria None 2020~ 6300 1361 280 932
#> 10 Africa Algeria None 2020~ 6368 1335 277 942
#> # ... with 7,096 more rows, and 2 more variables: Total.Detected.Deaths <int>,
#> # Active.Ventilated <int>
I am trying to scrap results of Polish elections that were held this weekend, but I come to problem that before every intager random float is added.
I have tried using htmltab, but it did not work - as you can see random number is added
library(htmltab)
url <- "https://wybory2018.pkw.gov.pl/pl/geografia/020000#results_vote_council"
tmp <- htmltab::htmltab(doc = html, which = 1)
tmp
Wyszczególnienie Liczba
2 Mieszkańców 0.972440432 755 957
3 Wyborców 0.977263472 273 653
4 Obwodów 0.99998061 940
I have checked in html what is the problem:
library(xml2)
library(rvest)
webpage <- xml2::read_html(url)
a <- webpage %>%
rvest::html_nodes("tbody")
a[1]
<tbody>\n<tr>\n<td>Mieszkańców</td>\n <td class=\"table-number\">\n<span class=\"hidden\">0.97244043</span>2 755 957</td>\n </tr>\n<tr>\n<td>Wyborców</td>\n <td class=\"table-number\">\n<span class=\"hidden\">0.97726347</span>2 273 653</td>\n </tr>\n<tr>\n<td>Obwodów</td>\n <td class=\"table-number\">\n<span class=\"hidden\">0.9999806</span>1 940</td>\n </tr>\n</tbody>"
I assume the problem is with <span class=\"hidden\">, but how to get rid of it?
EDIT
I need the info from the 9th table with results of the parties
Nr listy Komitet wyborczy Liczba % głosów ważnych
Głosów na kandydatów komitetu Kandydatów
12 KOMITET WYBORCZY WYBORCÓW Z DUTKIEWICZEM DLA DOLNEGO ŚLĄSKA 93 260 45 8.29%
9 KOMITET WYBORCZY WYBORCÓW WOLNOŚĆ W SAMORZĄDZIE 15 499 46 1.38%
8 KOMITET WYBORCZY WYBORCÓW KUKIZ'15 53 800 41 4.78%
1 KOMITET WYBORCZY WYBORCÓW BEZPARTYJNI SAMORZĄDOWCY 168 442 46 14.98%
11 KOMITET WYBORCZY WOLNI I SOLIDARNI 9 624 38 0.86%
7 KOMITET WYBORCZY RUCH NARODOWY RP 14 874 38 1.32%
10 KOMITET WYBORCZY PRAWO I SPRAWIEDLIWOŚĆ 320 908 45 28.53%
2 KOMITET WYBORCZY POLSKIE STRONNICTWO LUDOWE 58 820 46 5.23%
6 KOMITET WYBORCZY PARTII RAZEM 18 087 44 1.61%
3 KOMITET WYBORCZY PARTIA ZIELONI 19 783 36 1.76%
5 KOALICYJNY KOMITET WYBORCZY SLD LEWICA RAZEM 61 889 46 5.50%
4 KOALICYJNY KOMITET WYBORCZY PLATFORMA.NOWOCZESNA KOALICJA OBYWATELSKA 289 831 46 25.77%
EDIT 2
I have found not the most elegant solution:
#https://stackoverflow.com/questions/7963898/extracting-the-last-n-characters-from-a-string-in-r
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
}
tmp <- htmltab::htmltab(doc = html, which = 9)
tmp2 <- xml2::read_html(html) %>%
rvest::html_nodes("tbody") %>%
magrittr::extract2(9) %>%
rvest::html_nodes("tr") %>%
rvest::html_nodes("td") %>%
rvest::html_nodes("span") %>%
rvest::html_text() %>%
matrix(ncol = 4, byrow = T) %>%
data.frame()
names(tmp) <- c("a", "b", "c", "d", "e", "f", "g")
tmp3 <- cbind(tmp, tmp2) %>%
mutate(n_to_delate = nchar(X1),
c1 = as.character(c),
n_whole = nchar(c1),
c2 = substrRight(c1, n_whole - n_to_delate),
c3 = gsub(" ", "", c2),
c4 = as.numeric(c3)) %>%
select(b, c4)
names(tmp3) <- c("party", "n_of_votes")
Solving the original question:
You can remove those nodes before the conversion to a table:
library(rvest)
pg <- read_html("https://wybory2018.pkw.gov.pl/pl/geografia/020000#results_vote_council")
tbl_1 <- html_nodes(pg, xpath=".//table[#class = 'stat_table']")[1]
xml_remove(html_nodes(tbl_1, xpath=".//span[#class='hidden']"))
html_table(tbl_1)
## [[1]]
## Wyszczególnienie Liczba
## 1 Mieszkańców 2 755 957
## 2 Wyborców 2 273 653
## 3 Obwodów 1 940
Solving the updated requirements:
library(rvest)
pg <- read_html("https://wybory2018.pkw.gov.pl/pl/geografia/020000#results_vote_council")
Let's target that particular table. Using the "View Source" version of the document, we can go for the header that precedes that table and then got to the table:
target_tbl <- html_node(pg, xpath=".//header[contains(., 'mandatów pomiędzy')]/following-sibling::table")
Still get rid of the hidden spans:
xml_remove(html_nodes(target_tbl, xpath=".//span[#class='hidden']"))
Now, we need to know how many real columns there are since it has one of those daft headers that are multi-line with <td>'s that span multiple columns:
length(
html_nodes(target_tbl, xpath=".//tbody/tr[1]") %>%
html_nodes("td")
) -> n_cols
Now we pull out each column, set good column names, turn it into a data frame and remove the junk column that is just feeding the filled in bars:
as.data.frame(
setNames(
lapply(1:n_cols, function(.idx) {
html_nodes(target_tbl, xpath=sprintf(".//tbody/tr/td[%s]", .idx)) %>%
html_text(trim=TRUE)
}),
c(
"nr_listy", "komitet_wyborczy", "głosów_na_kandydatów_komitetu",
"kandydatów", "mandatów", "pct_głosów_ważnych", "junk",
"udział_w_podziale_mandatów"
)
),
stringsAsFactors = FALSE
) -> xdf
xdf$junk <- NULL
str(xdf)
## 'data.frame': 12 obs. of 7 variables:
## $ nr_listy : chr "1" "2" "3" "4" ...
## $ komitet_wyborczy : chr "KOMITET WYBORCZY WYBORCÓW BEZPARTYJNI SAMORZĄDOWCY" "KOMITET WYBORCZY POLSKIE STRONNICTWO LUDOWE" "KOMITET WYBORCZY PARTIA ZIELONI" "KOALICYJNY KOMITET WYBORCZY PLATFORMA.NOWOCZESNA KOALICJA OBYWATELSKA" ...
## $ głosów_na_kandydatów_komitetu: chr "168 442" "58 820" "19 783" "289 831" ...
## $ kandydatów : chr "46" "46" "36" "46" ...
## $ mandatów : chr "6" "1" "0" "13" ...
## $ pct_głosów_ważnych : chr "14.98%" "5.23%" "1.76%" "25.77%" ...
## $ udział_w_podziale_mandatów : chr "Tak" "Tak" "Nie" "Tak" ...
I don't think piping makes the lapply() block more readable but just in case it's preferred:
lapply(1:n_cols, function(.idx) {
html_nodes(target_tbl, xpath=sprintf(".//tbody/tr/td[%s]", .idx)) %>%
html_text(trim=TRUE)
}) %>%
setNames(c(
"nr_listy", "komitet_wyborczy", "głosów_na_kandydatów_komitetu",
"kandydatów", "mandatów", "pct_głosów_ważnych", "junk",
"udział_w_podziale_mandatów"
)) %>%
as.data.frame(stringsAsFactors = FALSE) -> xdf
I am trying to load a json file into a data.frame in R. But there is some list() which is null in my data.
Here is my json data:
json_file1 <- jsonlite::fromJSON('{"txtId":"20180101","data":{"user":[{"id":"123","phone":"00001","realName":"Eric","addr":{},"source":{},"registerDate":{},"type":0,"remain":{}}],"score":[]}}')
json_file2 <- jsonlite::fromJSON('{"txtId":"20180102","data":{"user":[{"id":"456","phone":"00002","realName":"Amy","addr":{},"source":{},"registerDate":{},"type":0,"remain":100}],"score":[]}}')
json_file = list(json_file1, json_file2)
zt.detail = lapply(json_file, function(y){
if(!is.null(y$data$user)) data.frame(y$data$user, stringsAsFactors = F)
})
when I rbind zt.detail, I get the error:
# > dat_callrecord = data.table::rbindlist(zt.detail, fill = T)
# Error in data.table::rbindlist(zt.detail, fill = T) :
# Column 4 of item 1 is length 0, inconsistent with first column of that item which is length 1. rbind/rbindlist doesn't recycle as it already expects each item to be a uniform list, data.frame or data.table
# > str(zt.detail[[1]])
# 'data.frame': 1 obs. of 9 variables:
# $ id : chr "123"
# $ phone : chr "00001"
# $ realName : chr "Eric"
# $ addr :'data.frame': 1 obs. of 0 variables
# $ source :'data.frame': 1 obs. of 0 variables
# $ registerDate:'data.frame': 1 obs. of 0 variables
# $ type : int 0
# $ remain :'data.frame': 1 obs. of 0 variables
The error was caused because the structure of my data contains data.frame of 1 observation but 0 variables. So I want to transfer those list() into NA before and get the following result:
> dat_callrecord
id phone realName type remain addr source registerDate
123 00001 Eric 0 NA NA NA NA
456 00002 Amy 0 100 NA NA NA
We can loop through the list and if there is a data.frame, replace it with NA and then do the rbindlist
data.table::rbindlist(lapply(zt.detail, function(x) {
x[] <- lapply(x, function(y) if(is.data.frame(y)) NA else y)
x}))
# id phone realName addr source registerDate type remain
#1: 123 00001 Eric NA NA NA 0 NA
#2: 456 00002 Amy NA NA NA 0 100
I'm just learning how to use R to scrape data from webpages, and I'm running into a couple of issues.
For reference, the website that I am practicing on is here: http://www.rsssf.com/tables/34q.html
As far as I know, the website I am scraping data from is not a table so I can't directly scrape the information into a table, so here is the code I wrote to just have all of the text:
wcq_1934_html <- read_html("http://www.rsssf.com/tables/34q.html")
wcq_1934_node <- html_nodes(wcq_1934_html, "pre")
wcq_1934_text <- html_text(wcq_1934_node, trim = TRUE)
This results in a very long text file with all of the information that I need, just not formatted in an ideal way.
So I am next attempting to substring this text in order to get an output that looks something like this.
Country A - Country A Score - Country B - Country B Score
It doesn't have to be exactly like this, I just basically need for each game the country and how many goals they scored and ideally it should be comparable with the other country from the same game so I can know who won or lost! I do not need any of the other information like where the game was played, etc.
So I've tried three different ways to get this:
First test: split text by dashes:
test <- strsplit(wcq_1934_text, "-")
df_test <- data.frame(test)
This gives me the information I need in a table but the rows don't match the exact scores that I need (i.e. Lithuania 0, and Sweden 2 are in separate rows)
Second test: split text by spaces:
test2 <- strsplit(wcq_1934_text, " ")
df_test2 <- data.frame(test2)
This is helpful because it gives me the scores in one row (0-2 for the first game), but the countries are unevenly spaced out across rows.
Third test: split text by "tabs"
test3 <- strsplit(wcq_1934_text, " ")
df_test3 <- data.frame(test3)
This has a similar issue to the first test.
Any suggestions would be much appreciated. This is my first ever Stack Overflow post, although I've lurked around and this website has been helpful to me for a very long time. Thank you in advance!
Here's a solution that provides you most of what you need, though as MrFlick commented, it is a little fragile to this page. I'll stay with rvest, though as biomiha suggested, it isn't really buying you a lot here (though it does cleanly break out the <pre> block).
Starting with your wcq_1934_text, it's a single long string, let's break it up by newlines (CRLF in this case):
wcq_1934_text <- strsplit(wcq_1934_text, "[\r\n]+")[[1]]
str(wcq_1934_text)
# chr [1:51] "Hosts: Italy (not automatically qualified)" "Holders: Uruguay (did not enter)" "Group 1 [Sweden]" ...
I'll the magrittr package merely because it helps break out each step of the process using the %>% non-pipe; you can convert it non-magrittr by changing (say) func1() %>% func2() %>% func3() to func3(func2(func1())) (yuck) or intermediate assignment of return values, ret1 <- func1(); ret2 <- func2(ret1); ....
library(magrittr)
dat <- Filter(function(a) grepl("^[0-9][0-9]", a), wcq_1934_text) %>%
paste(., collapse = "\n") %>%
textConnection() %>%
read.fwf(file = ., widths = c(10, 16, 17, 4, 99), stringsAsFactors = FALSE) %>%
lapply(trimws) %>%
as.data.frame(stringsAsFactors = FALSE)
The widths are fragile and unique to this page. If other reporting pages have slightly different column layouts, you'll need to use a different function, perhaps one that can automatically determine the breaks.
head(dat)
# V1 V2 V3 V4 V5
# 1 11.06.33 Stockholm Sweden 6-2 Estonia
# 2 29.06.33 Kaunas Lithuania 0-2 Sweden
# 3 11.03.34 Madrid Spain 9-0 Portugal
# 4 18.03.34 Lisboa Portugal 1-2 Spain
# 5 25.03.34 Milano Italy 4-0 Greece
# 6 25.03.34 Sofia Bulgaria 1-4 Hungary
From here, it's up to you which columns you want to use.
For instance, handling of the date, you might want:
dat$V1 <- as.POSIXct(gsub("([0-9]+)$", "19\\1", dat$V1), format = "%d.%m.%Y")
dat$V1
# [1] "1933-06-11 PST" "1933-06-29 PST" "1934-03-11 PST" "1934-03-18 PST" "1934-03-25 PST" "1934-03-25 PST" "1934-04-25 PST" "1934-04-29 PST"
# [9] "1933-10-15 PST" "1934-03-15 PST" "1933-09-24 PST" "1933-10-29 PST" "1934-04-29 PST" "1934-02-25 PST" "1934-04-08 PST" "1934-04-29 PST"
# [17] "1934-03-11 PST" "1934-04-15 PST" "1934-01-28 PST" "1934-02-01 PST" "1934-02-04 PST" "1934-03-04 PST" "1934-03-11 PST" "1934-03-18 PST"
# [25] "1934-05-24 PST" "1934-03-16 PST" "1934-04-06 PST"
The gsub stuff is because as.POSIXct assumes 2-digit years less than 69 are in the 20th century, 19th for 69-99.
It's easy enough to use either strsplit on the scores, but you could also do:
library(tidyr)
dat %>%
separate(V4, c("score1", "score2"), sep="-") %>%
head()
# Warning: Too few values at 1 locations: 10
# V1 V2 V3 score1 score2 V5
# 1 1933-06-11 Stockholm Sweden 6 2 Estonia
# 2 1933-06-29 Kaunas Lithuania 0 2 Sweden
# 3 1934-03-11 Madrid Spain 9 0 Portugal
# 4 1934-03-18 Lisboa Portugal 1 2 Spain
# 5 1934-03-25 Milano Italy 4 0 Greece
# 6 1934-03-25 Sofia Bulgaria 1 4 Hungary
(The warning is expected, since one game was not played so has "n/p" for a score. You might want to handle non-score values in V4 before trying the split, perhaps replacing anything not numeric-dash-numeric with NA.)
Equally specific to this particular site but may be easier to generalize:
library(rvest)
library(purrr)
library(dplyr)
library(stringi)
pg <- read_html("http://www.rsssf.com/tables/34q.html")
Target the <pre> and strip out some things that aren't part of "tables":
html_nodes(pg, "pre") %>%
html_text() %>%
stri_split_lines() %>%
flatten_chr() %>%
discard(stri_detect_regex, "^(NB| )") -> lines
Now, we get the start and end lines indexes of each "group":
starts <- which(grepl("^Group", lines))
ends <- c(starts[-1], length(lines))
We iterate over those starts and ends and:
extract the group info
clean up the table
discard any "empty" tables
turn the tabular data into a data frame, doing some munging along the way
I can annotate the following more if needed:
map2_df(starts, ends, ~{
grp_info <- stri_match_all_regex(lines[.x], "Group ([[:digit:]]+) \\[(.*)]")[[1]][,2:3]
lines[(.x+1):.y] %>%
discard(stri_detect_regex, "(^[^[:digit:]]| round)") %>%
discard(`==`, "") -> grp
if (length(grp) == 0) return(NULL)
stri_split_regex(grp, "\ \ +") %>%
map_df(~{
.x[1:4] %>%
as.list() %>%
set_names(c("date", "team_a", "team_b", "score_team")) %>%
flatten_df() %>%
separate(score_team, c("score", "team_c"), sep=" ") %>%
mutate(group_num = grp_info[1], group_info = grp_info[2]) %>%
separate(date, c("d", "m", "y")) %>%
mutate(date = as.Date(sprintf("19%s-%s-%s", y, m, d))) %>%
select(-d, -m, -y)
})
})
## # A tibble: 27 x 7
## team_a team_b score team_c group_num group_info date
## <chr> <chr> <chr> <chr> <chr> <chr> <date>
## 1 Stockholm Sweden 6-2 Estonia 1 Sweden 1933-06-11
## 2 Kaunas Lithuania 0-2 Sweden 1 Sweden 1933-06-29
## 3 Madrid Spain 9-0 Portugal 2 Spain 1934-03-11
## 4 Lisboa Portugal 1-2 Spain 2 Spain 1934-03-18
## 5 Milano Italy 4-0 Greece 3 Italy 1934-03-25
## 6 Sofia Bulgaria 1-4 Hungary 4 Hungary, Austria 1934-03-25
## 7 Wien Austria 6-1 Bulgaria 4 Hungary, Austria 1934-04-25
## 8 Budapest Hungary 4-1 Bulgaria 4 Hungary, Austria 1934-04-29
## 9 Warszawa Poland 1-2 Czechoslovakia 5 Czechoslovakia 1933-10-15
## 10 Praha Czechoslovakia n/p Poland 5 Czechoslovakia 1934-03-15
## 11 Beograd Yugoslavia 2-2 Switzerland 6 Romania, Switzerland 1933-09-24
## 12 Bern Switzerland 2-2 Romania 6 Romania, Switzerland 1933-10-29
## 13 Bucuresti Romania 2-1 Yugoslavia 6 Romania, Switzerland 1934-04-29
## 14 Dublin Ireland 4-4 Belgium 7 Netherlands, Belgium 1934-02-25
## 15 Amsterdam Netherlands 5-2 Ireland 7 Netherlands, Belgium 1934-04-08
## 16 Antwerpen Belgium 2-4 Netherlands 7 Netherlands, Belgium 1934-04-29
## 17 Luxembourg Luxembourg 1-9 Germany 8 Germany, France 1934-03-11
## 18 Luxembourg Luxembourg 1-6 France 8 Germany, France 1934-04-15
## 19 Port-au-Prince Haiti 1-3 Cuba 11 USA 1934-01-28
## 20 Port-au-Prince Haiti 1-1 Cuba 11 USA 1934-02-01
## 21 Port-au-Prince Haiti 0-6 Cuba 11 USA 1934-02-04
## 22 Cd. de Mexico Mexico 3-2 Cuba 11 USA 1934-03-04
## 23 Cd. de Mexico Mexico 5-0 Cuba 11 USA 1934-03-11
## 24 Cd. de Mexico Mexico 4-1 Cuba 11 USA 1934-03-18
## 25 Roma USA 4-2 Mexico 11 USA 1934-05-24
## 26 Cairo Egypt 7-1 Palestina 12 Egypt 1934-03-16
## 27 Tel Aviv Palestina 1-4 Egypt 12 Egypt 1934-04-06
I want to play with data that is now saved in JSON format. But I am very new to R and have little clue of how to play with data. You can see below what I managed to achieve. But first, my code:
library(rjson)
json_file <- "C:\\Users\\Saonkfas\\Desktop\\WOWPAPI\\wowpfinaljson.json"
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
I was able to the data:
for (x in json_data){print (x)}
Although output looks pretty raw:
[[1]]
[[1]]$wins
[1] "118"
[[1]]$losses
[1] "40"
# And so on
Note that the JSON is somewhat nested. I could create tables with Python, but R seems much more complicated.
Edit:
My JSON:
{
"play1": [
{
"wins": "118",
"losses": "40",
"max_killed": "7",
"battles": "158",
"plane_id": "4401",
"max_ground_object_destroyed": "3"
},
{
"wins": "100",
"losses": "58",
"max_killed": "7",
"battles": "158",
"plane_id": "2401",
"max_ground_object_destroyed": "3"
},
{
"wins": "120",
"losses": "38",
"max_killed": "7",
"battles": "158",
"plane_id": "2403",
"max_ground_object_destroyed": "3"
}
],
"play2": [
{
"wins": "12",
"losses": "450",
"max_killed": "7",
"battles": "158",
"plane_id": "4401",
"max_ground_object_destroyed": "3"
},
{
"wins": "150",
"losses": "8",
"max_killed": "7",
"battles": "158",
"plane_id": "2401",
"max_ground_object_destroyed": "3"
},
{
"wins": "120",
"losses": "328",
"max_killed": "7",
"battles": "158",
"plane_id": "2403",
"max_ground_object_destroyed": "3"
}
],
fromJSON returns a list, you can use the *apply functions to go through each element.
It's fairly straightforward (once you know what to do!) to convert it to a "table" (data frame is the correct R terminology).
library(rjson)
# You can pass directly the filename
my.JSON <- fromJSON(file="test.json")
df <- lapply(my.JSON, function(play) # Loop through each "play"
{
# Convert each group to a data frame.
# This assumes you have 6 elements each time
data.frame(matrix(unlist(play), ncol=6, byrow=T))
})
# Now you have a list of data frames, connect them together in
# one single dataframe
df <- do.call(rbind, df)
# Make column names nicer, remove row names
colnames(df) <- names(my.JSON[[1]][[1]])
rownames(df) <- NULL
df
wins losses max_killed battles plane_id max_ground_object_destroyed
1 118 40 7 158 4401 3
2 100 58 7 158 2401 3
3 120 38 7 158 2403 3
4 12 450 7 158 4401 3
5 150 8 7 158 2401 3
6 120 328 7 158 2403 3
I find jsonlite to be a little more user friendly for this task. Here is a comparison of three JSON parsing packages (biased in favor of jsonlite)
library(jsonlite)
data <- fromJSON('path/to/file.json')
data
#> $play1
# wins losses max_killed battles plane_id max_ground_object_destroyed
# 1 118 40 7 158 4401 3
# 2 100 58 7 158 2401 3
# 3 120 38 7 158 2403 3
#
# $play2
# wins losses max_killed battles plane_id max_ground_object_destroyed
# 1 12 450 7 158 4401 3
# 2 150 8 7 158 2401 3
# 3 120 328 7 158 2403 3
If you want to collapse those list names into a new column, I recommend dplyr::bind_rows rather than do.call(rbind, data)
library(dplyr)
data <- bind_rows(data, .id = 'play')
# Source: local data frame [6 x 7]
# play wins losses max_killed battles plane_id max_ground_object_destroyed
# (chr) (chr) (chr) (chr) (chr) (chr) (chr)
# 1 play1 118 40 7 158 4401 3
# 2 play1 100 58 7 158 2401 3
# 3 play1 120 38 7 158 2403 3
# 4 play2 12 450 7 158 4401 3
# 5 play2 150 8 7 158 2401 3
# 6 play2 120 328 7 158 2403 3
Beware that the columns may not have the type you expect (notice the columns are all characters since all of the numbers were quoted in the provided JSON data)!
Edit Nov. 2017: One approach to type conversion would be to use mutate_if to guess the intended type of character columns.
data <- mutate_if(data, is.character, type.convert, as.is = TRUE)
I prefer tidyjson over rjson and jsonlite as it has a easy workflow for converting multilevel nested json objects to 2 dimensional tables. Your problem can be easily solved using this package from github.
devtools::install_github("sailthru/tidyjson")
library(tidyjson)
library(dplyr)
> json %>% as.tbl_json %>% gather_keys %>% gather_array %>%
+ spread_values(
+ wins = jstring("wins"),
+ losses = jstring("losses"),
+ max_killed = jstring("max_killed"),
+ battles = jstring("battles"),
+ plane_id = jstring("plane_id"),
+ max_ground_object_destroyed = jstring("max_ground_object_destroyed")
+ )
Output
document.id key array.index wins losses max_killed battles plane_id max_ground_object_destroyed
1 1 play1 1 118 40 7 158 4401 3
2 1 play1 2 100 58 7 158 2401 3
3 1 play1 3 120 38 7 158 2403 3
4 1 play2 1 12 450 7 158 4401 3
5 1 play2 2 150 8 7 158 2401 3
6 1 play2 3 120 328 7 158 2403 3