How to convert json file into dataframe in R? - json

I have a json text file which reads
{"type":"session.ended","v":2,"post.source":"1306210600-col001.sv1.movenetworks.com:2097#5","post.ip4":"75.114.187.146","post.rtime_secs":1371794661,"post.rtime_text":"2013-06-21 06:04:20","post.time_secs":1371794596,"post.time_text":"2013-06-21 06:03:16","post.time_date":"2013-06-21","post.time_hour":6,"post.late_secs":65,"id.session":"3625657","id.sub":"2370b726-b96e-11e2-b3eb-1231380e1adf","id.partner":"0CB48A664E514CA48D378D152574EDBB","id.service":"BBTV (CBD46B77)","device.make":"Roku","device.model":"3050X","device.info":"Roku;3050X","device.serial":"12C241003940","device.version":"2.5.0.7","device.uuid":"51ce2255-62ad-5778-b2d7-b9543c1476c6","device.os":"Linux","device.os_version":"2.6.35","device.platform":"Roku","device.platform_vendor":"Move Networks","device.platform_version":"1.0.0.20130329","device.licenses":[],"user.type":"Subscriber","ip.provider":"netacuity","ip.postal_code":"48154","ip.dma":505,"ip.dma_name":"unknown","ip.city":"livonia","ip.country":"united states","ip.region":"michigan","ip.continent":"north america","ip.isp":"bright house networks llc","ip.asn":0,"ip.asn_owner":"?","clip.id":"1338713","clip.pub":"CBD46B77","asset.id":524768,"asset.pub":"CBD46B77","asset.length_ms":1800000,"asset.guid":"b7c12c09dc5aec832e142a00b0f191fa","asset.title":"Diya Aur Bati Hum","asset.type":"captured","asset.adult":false,"asset.franchise_guid":"941496e1452b4fce9acfe7b3339924eb","asset.franchise_title":"Diya Aur Bati 
Hum","container.id":14,"container.pub":"CBD46B77","container.guid":"3caa6afd715e4c57ac4750d29e449a9c","container.title":"SPLUS","usage.elapsed_ms":2312,"usage.viewed_ms":392,"usage.stage":"mainVideo","exp.idle_ms":350,"exp.stalls":0,"exp.stalled_ms":0,"exp.frame_renders":0,"exp.frame_drops":0,"exp.ghost_session_ms":0,"exp.ghost_sessions":0,"exp.qmx_stale_ms":0,"exp.qmx_error_ms":0,"exp.qss_late_ms":0,"exp.qss_error_ms":0,"exp.fom":0,"exp.fom_weight":0,"data.dl_bytes":228,"data.ul_bytes":406,"http.oks":2,"http.errors":0,"http.timeouts":0,"net.throughput":8977,"http.slows":0,"data.bitrate_mean":1968,"data.bitrate_stddev":0,"data.bitrate_median":1950,"data.bitrate_modes":[1950],"data.streamlets":1,"data.late_streamlets":0,"data.all_streamlets":0,"data.bf_streamlets":0,"data.ab_streamlets":0}
{"type":"session.started","v":2,"post.source":"1306210600-col004.sv1.movenetworks.com:2183#6","post.ip4":"63.225.172.43","post.rtime_secs":1371794671,"post.rtime_text":"2013-06-21 06:04:31","post.time_secs":1371794660,"post.time_text":"2013-06-21 06:04:20","post.time_date":"2013-06-21","post.time_hour":6,"post.late_secs":11,"id.session":"232169818","id.sub":"55d514ba-3858-11e2-91a7-12313d08e01f","id.partner":"0CB48A664E514CA48D378D152574EDBB","id.service":"BBTV (CBD46B77)","device.make":"Roku","device.model":"3100X","device.info":"Roku;3100X","device.serial":"13C2AE061481","device.version":"2.5.0.37","device.uuid":"7f5654d5-3aa7-5a5f-bb2b-8084da358942","device.os":"Linux","device.os_version":"2.6.35","device.platform":"Roku","device.platform_vendor":"Move Networks","device.platform_version":"1.0.0.20130615","device.licenses":[],"user.type":"Subscriber","ip.provider":"netacuity","ip.postal_code":"98115","ip.dma":819,"ip.dma_name":"unknown","ip.city":"seattle","ip.country":"united states","ip.region":"washington","ip.continent":"north america","ip.isp":"qwest communications company llc","ip.asn":0,"ip.asn_owner":"?","clip.id":"1339170","clip.pub":"CBD46B77","asset.id":522015,"asset.pub":"CBD46B77","asset.length_ms":7200000,"asset.guid":"c6938cfa200a21e90dce41f5ed131cc2","asset.title":"Spark Top 20","asset.type":"captured","asset.adult":false,"container.id":277,"container.pub":"CBD46B77","container.guid":"03e3a689e245457bba2f98c30ef931fa","container.title":"BIGMGC","usage.stage":"mainVideo","exp.idle_ms":5772}
I want to load it in R and convert it to a data frame. Here the field names are part of the data, and the rows have unequal numbers of fields (there are 13 rows in total).
Any help will be appreciated.

Here's one way to do it:
file <- '[
{"type":"session.ended","v":2,"post.source":"1306210600-col001.sv1.movenetworks.com:2097#5","post.ip4":"75.114.187.146","post.rtime_secs":1371794661,"post.rtime_text":"2013-06-21 06:04:20","post.time_secs":1371794596,"post.time_text":"2013-06-21 06:03:16","post.time_date":"2013-06-21","post.time_hour":6,"post.late_secs":65,"id.session":"3625657","id.sub":"2370b726-b96e-11e2-b3eb-1231380e1adf","id.partner":"0CB48A664E514CA48D378D152574EDBB","id.service":"BBTV (CBD46B77)","device.make":"Roku","device.model":"3050X","device.info":"Roku;3050X","device.serial":"12C241003940","device.version":"2.5.0.7","device.uuid":"51ce2255-62ad-5778-b2d7-b9543c1476c6","device.os":"Linux","device.os_version":"2.6.35","device.platform":"Roku","device.platform_vendor":"Move Networks","device.platform_version":"1.0.0.20130329","device.licenses":[],"user.type":"Subscriber","ip.provider":"netacuity","ip.postal_code":"48154","ip.dma":505,"ip.dma_name":"unknown","ip.city":"livonia","ip.country":"united states","ip.region":"michigan","ip.continent":"north america","ip.isp":"bright house networks llc","ip.asn":0,"ip.asn_owner":"?","clip.id":"1338713","clip.pub":"CBD46B77","asset.id":524768,"asset.pub":"CBD46B77","asset.length_ms":1800000,"asset.guid":"b7c12c09dc5aec832e142a00b0f191fa","asset.title":"Diya Aur Bati Hum","asset.type":"captured","asset.adult":false,"asset.franchise_guid":"941496e1452b4fce9acfe7b3339924eb","asset.franchise_title":"Diya Aur Bati 
Hum","container.id":14,"container.pub":"CBD46B77","container.guid":"3caa6afd715e4c57ac4750d29e449a9c","container.title":"SPLUS","usage.elapsed_ms":2312,"usage.viewed_ms":392,"usage.stage":"mainVideo","exp.idle_ms":350,"exp.stalls":0,"exp.stalled_ms":0,"exp.frame_renders":0,"exp.frame_drops":0,"exp.ghost_session_ms":0,"exp.ghost_sessions":0,"exp.qmx_stale_ms":0,"exp.qmx_error_ms":0,"exp.qss_late_ms":0,"exp.qss_error_ms":0,"exp.fom":0,"exp.fom_weight":0,"data.dl_bytes":228,"data.ul_bytes":406,"http.oks":2,"http.errors":0,"http.timeouts":0,"net.throughput":8977,"http.slows":0,"data.bitrate_mean":1968,"data.bitrate_stddev":0,"data.bitrate_median":1950,"data.bitrate_modes":[1950],"data.streamlets":1,"data.late_streamlets":0,"data.all_streamlets":0,"data.bf_streamlets":0,"data.ab_streamlets":0}
,{"type":"session.started","v":2,"post.source":"1306210600-col004.sv1.movenetworks.com:2183#6","post.ip4":"63.225.172.43","post.rtime_secs":1371794671,"post.rtime_text":"2013-06-21 06:04:31","post.time_secs":1371794660,"post.time_text":"2013-06-21 06:04:20","post.time_date":"2013-06-21","post.time_hour":6,"post.late_secs":11,"id.session":"232169818","id.sub":"55d514ba-3858-11e2-91a7-12313d08e01f","id.partner":"0CB48A664E514CA48D378D152574EDBB","id.service":"BBTV (CBD46B77)","device.make":"Roku","device.model":"3100X","device.info":"Roku;3100X","device.serial":"13C2AE061481","device.version":"2.5.0.37","device.uuid":"7f5654d5-3aa7-5a5f-bb2b-8084da358942","device.os":"Linux","device.os_version":"2.6.35","device.platform":"Roku","device.platform_vendor":"Move Networks","device.platform_version":"1.0.0.20130615","device.licenses":[],"user.type":"Subscriber","ip.provider":"netacuity","ip.postal_code":"98115","ip.dma":819,"ip.dma_name":"unknown","ip.city":"seattle","ip.country":"united states","ip.region":"washington","ip.continent":"north america","ip.isp":"qwest communications company llc","ip.asn":0,"ip.asn_owner":"?","clip.id":"1339170","clip.pub":"CBD46B77","asset.id":522015,"asset.pub":"CBD46B77","asset.length_ms":7200000,"asset.guid":"c6938cfa200a21e90dce41f5ed131cc2","asset.title":"Spark Top 20","asset.type":"captured","asset.adult":false,"container.id":277,"container.pub":"CBD46B77","container.guid":"03e3a689e245457bba2f98c30ef931fa","container.title":"BIGMGC","usage.stage":"mainVideo","exp.idle_ms":5772}
]'
You need the function fromJSON of the RJSONIO package:
library(RJSONIO)
json <- fromJSON(file, nullValue = NA)
Replace (empty) lists by NA and convert to data frames:
# Convert each parsed JSON record into a one-row data frame.
# Fields that are themselves lists (e.g. the empty array "device.licenses": [])
# cannot become a data-frame column, so they are replaced by NA first.
dat <- lapply(json, function(j) {
  # vapply() always yields a logical vector, even for a record with no fields;
  # sapply() would return list() there, which replace() cannot use as an index.
  is_nested <- vapply(j, is.list, logical(1))
  as.data.frame(replace(j, is_nested, NA))
})
Create a single data frame:
library(plyr)
res <- rbind.fill(dat)
The result (res):
type v
1 session.ended 2
2 session.started 2
post.source
1 1306210600-col001.sv1.movenetworks.com:2097#5
2 1306210600-col004.sv1.movenetworks.com:2183#6
post.ip4 post.rtime_secs post.rtime_text
1 75.114.187.146 1371794661 2013-06-21 06:04:20
2 63.225.172.43 1371794671 2013-06-21 06:04:31
post.time_secs post.time_text post.time_date
1 1371794596 2013-06-21 06:03:16 2013-06-21
2 1371794660 2013-06-21 06:04:20 2013-06-21
post.time_hour post.late_secs id.session
1 6 65 3625657
2 6 11 232169818
id.sub
1 2370b726-b96e-11e2-b3eb-1231380e1adf
2 55d514ba-3858-11e2-91a7-12313d08e01f
id.partner id.service
1 0CB48A664E514CA48D378D152574EDBB BBTV (CBD46B77)
2 0CB48A664E514CA48D378D152574EDBB BBTV (CBD46B77)
device.make device.model device.info device.serial
1 Roku 3050X Roku;3050X 12C241003940
2 Roku 3100X Roku;3100X 13C2AE061481
device.version device.uuid
1 2.5.0.7 51ce2255-62ad-5778-b2d7-b9543c1476c6
2 2.5.0.37 7f5654d5-3aa7-5a5f-bb2b-8084da358942
device.os device.os_version device.platform
1 Linux 2.6.35 Roku
2 Linux 2.6.35 Roku
device.platform_vendor device.platform_version
1 Move Networks 1.0.0.20130329
2 Move Networks 1.0.0.20130615
device.licenses user.type ip.provider ip.postal_code
1 NA Subscriber netacuity 48154
2 NA Subscriber netacuity 98115
ip.dma ip.dma_name ip.city ip.country ip.region
1 505 unknown livonia united states michigan
2 819 unknown seattle united states washington
ip.continent ip.isp ip.asn
1 north america bright house networks llc 0
2 north america qwest communications company llc 0
ip.asn_owner clip.id clip.pub asset.id asset.pub
1 ? 1338713 CBD46B77 524768 CBD46B77
2 ? 1339170 CBD46B77 522015 CBD46B77
asset.length_ms asset.guid
1 1800000 b7c12c09dc5aec832e142a00b0f191fa
2 7200000 c6938cfa200a21e90dce41f5ed131cc2
asset.title asset.type asset.adult
1 Diya Aur Bati Hum captured FALSE
2 Spark Top 20 captured FALSE
asset.franchise_guid asset.franchise_title
1 941496e1452b4fce9acfe7b3339924eb Diya Aur Bati Hum
2 <NA> <NA>
container.id container.pub
1 14 CBD46B77
2 277 CBD46B77
container.guid container.title
1 3caa6afd715e4c57ac4750d29e449a9c SPLUS
2 03e3a689e245457bba2f98c30ef931fa BIGMGC
usage.elapsed_ms usage.viewed_ms usage.stage
1 2312 392 mainVideo
2 NA NA mainVideo
exp.idle_ms exp.stalls exp.stalled_ms
1 350 0 0
2 5772 NA NA
exp.frame_renders exp.frame_drops exp.ghost_session_ms
1 0 0 0
2 NA NA NA
exp.ghost_sessions exp.qmx_stale_ms exp.qmx_error_ms
1 0 0 0
2 NA NA NA
exp.qss_late_ms exp.qss_error_ms exp.fom
1 0 0 0
2 NA NA NA
exp.fom_weight data.dl_bytes data.ul_bytes http.oks
1 0 228 406 2
2 NA NA NA NA
http.errors http.timeouts net.throughput http.slows
1 0 0 8977 0
2 NA NA NA NA
data.bitrate_mean data.bitrate_stddev
1 1968 0
2 NA NA
data.bitrate_median data.bitrate_modes data.streamlets
1 1950 1950 1
2 NA NA NA
data.late_streamlets data.all_streamlets
1 0 0
2 NA NA
data.bf_streamlets data.ab_streamlets
1 0 0
2 NA NA

Related

statsmodels OLS gives parameters despite perfect multicollinearity

Assume the following df:
ib c d1 d2
0 1.14 1 1 0
1 1.0 1 1 0
2 0.71 1 1 0
3 0.6 1 1 0
4 0.66 1 1 0
5 1.0 1 1 0
6 1.26 1 1 0
7 1.29 1 1 0
8 1.52 1 1 0
9 1.31 1 1 0
10 0.89 1 0 1
d1 and d2 are perfectly colinear. Now I estimate the following regression model:
import statsmodels.api as sm
reg = sm.OLS(df['ib'], df[['c', 'd1', 'd2']]).fit().summary()
reg
This gives me the following output:
<class 'statsmodels.iolib.summary.Summary'>
"""
OLS Regression Results
==============================================================================
Dep. Variable: ib R-squared: 0.087
Model: OLS Adj. R-squared: -0.028
Method: Least Squares F-statistic: 0.7590
Date: Thu, 17 Nov 2022 Prob (F-statistic): 0.409
Time: 12:19:34 Log-Likelihood: -1.5470
No. Observations: 10 AIC: 7.094
Df Residuals: 8 BIC: 7.699
Df Model: 1
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
c 0.7767 0.111 7.000 0.000 0.521 1.033
d1 0.2433 0.127 1.923 0.091 -0.048 0.535
d2 0.5333 0.213 2.499 0.037 0.041 1.026
==============================================================================
Omnibus: 0.257 Durbin-Watson: 0.760
Prob(Omnibus): 0.879 Jarque-Bera (JB): 0.404
Skew: 0.043 Prob(JB): 0.817
Kurtosis: 2.019 Cond. No. 8.91e+15
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.34e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
"""
However, including c, d1 and d2 represents the well known dummy variable trap which, from my understanding, should make it impossible to estimate the model. Why is this not the case here?

Scraping html text into table with delimiters that do not have a clear pattern using R (rvest)

I'm just learning how to use R to scrape data from webpages, and I'm running into a couple of issues.
For reference, the website that I am practicing on is here: http://www.rsssf.com/tables/34q.html
As far as I know, the website I am scraping data from is not a table so I can't directly scrape the information into a table, so here is the code I wrote to just have all of the text:
wcq_1934_html <- read_html("http://www.rsssf.com/tables/34q.html")
wcq_1934_node <- html_nodes(wcq_1934_html, "pre")
wcq_1934_text <- html_text(wcq_1934_node, trim = TRUE)
This results in a very long text file with all of the information that I need, just not formatted in an ideal way.
So I am next attempting to substring this text in order to get an output that looks something like this.
Country A - Country A Score - Country B - Country B Score
It doesn't have to be exactly like this, I just basically need for each game the country and how many goals they scored and ideally it should be comparable with the other country from the same game so I can know who won or lost! I do not need any of the other information like where the game was played, etc.
So I've tried three different ways to get this:
First test: split text by dashes:
test <- strsplit(wcq_1934_text, "-")
df_test <- data.frame(test)
This gives me the information I need in a table but the rows don't match the exact scores that I need (i.e. Lithuania 0, and Sweden 2 are in separate rows)
Second test: split text by spaces:
test2 <- strsplit(wcq_1934_text, " ")
df_test2 <- data.frame(test2)
This is helpful because it gives me the scores in one row (0-2 for the first game), but the countries are unevenly spaced out across rows.
Third test: split text by "tabs"
test3 <- strsplit(wcq_1934_text, " ")
df_test3 <- data.frame(test3)
This has a similar issue to the first test.
Any suggestions would be much appreciated. This is my first ever Stack Overflow post, although I've lurked around and this website has been helpful to me for a very long time. Thank you in advance!
Here's a solution that provides you most of what you need, though as MrFlick commented, it is a little fragile to this page. I'll stay with rvest, though as biomiha suggested, it isn't really buying you a lot here (though it does cleanly break out the <pre> block).
Starting with your wcq_1934_text, it's a single long string, let's break it up by newlines (CRLF in this case):
wcq_1934_text <- strsplit(wcq_1934_text, "[\r\n]+")[[1]]
str(wcq_1934_text)
# chr [1:51] "Hosts: Italy (not automatically qualified)" "Holders: Uruguay (did not enter)" "Group 1 [Sweden]" ...
I'll use the magrittr package merely because it helps break out each step of the process using the %>% non-pipe; you can convert it to non-magrittr code by changing (say) func1() %>% func2() %>% func3() to func3(func2(func1())) (yuck) or by intermediate assignment of return values, ret1 <- func1(); ret2 <- func2(ret1); ....
library(magrittr)
# Build a data frame of match rows from the <pre> text, one step per pipe stage.
# Keep only the lines that begin with two digits (the dated match lines).
dat <- Filter(function(a) grepl("^[0-9][0-9]", a), wcq_1934_text) %>%
# Re-join the kept lines into a single newline-separated string ...
paste(., collapse = "\n") %>%
# ... and expose that string as a connection so read.fwf() can read from it.
textConnection() %>%
# Cut each line into five fixed-width fields; the widths are specific to this page.
read.fwf(file = ., widths = c(10, 16, 17, 4, 99), stringsAsFactors = FALSE) %>%
# Strip the padding whitespace from every column (this yields a plain list) ...
lapply(trimws) %>%
# ... then turn the list of columns back into a data frame.
as.data.frame(stringsAsFactors = FALSE)
The widths are fragile and unique to this page. If other reporting pages have slightly different column layouts, you'll need to use a different function, perhaps one that can automatically determine the breaks.
head(dat)
# V1 V2 V3 V4 V5
# 1 11.06.33 Stockholm Sweden 6-2 Estonia
# 2 29.06.33 Kaunas Lithuania 0-2 Sweden
# 3 11.03.34 Madrid Spain 9-0 Portugal
# 4 18.03.34 Lisboa Portugal 1-2 Spain
# 5 25.03.34 Milano Italy 4-0 Greece
# 6 25.03.34 Sofia Bulgaria 1-4 Hungary
From here, it's up to you which columns you want to use.
For instance, handling of the date, you might want:
dat$V1 <- as.POSIXct(gsub("([0-9]+)$", "19\\1", dat$V1), format = "%d.%m.%Y")
dat$V1
# [1] "1933-06-11 PST" "1933-06-29 PST" "1934-03-11 PST" "1934-03-18 PST" "1934-03-25 PST" "1934-03-25 PST" "1934-04-25 PST" "1934-04-29 PST"
# [9] "1933-10-15 PST" "1934-03-15 PST" "1933-09-24 PST" "1933-10-29 PST" "1934-04-29 PST" "1934-02-25 PST" "1934-04-08 PST" "1934-04-29 PST"
# [17] "1934-03-11 PST" "1934-04-15 PST" "1934-01-28 PST" "1934-02-01 PST" "1934-02-04 PST" "1934-03-04 PST" "1934-03-11 PST" "1934-03-18 PST"
# [25] "1934-05-24 PST" "1934-03-16 PST" "1934-04-06 PST"
The gsub stuff is because as.POSIXct (with %y) assumes 2-digit years less than 69 are in the 21st century (20xx) and 69-99 are in the 20th (19xx), so these 1933-34 dates would otherwise be read as 2033-34.
It's easy enough to use strsplit on the scores, but you could also do:
library(tidyr)
dat %>%
separate(V4, c("score1", "score2"), sep="-") %>%
head()
# Warning: Too few values at 1 locations: 10
# V1 V2 V3 score1 score2 V5
# 1 1933-06-11 Stockholm Sweden 6 2 Estonia
# 2 1933-06-29 Kaunas Lithuania 0 2 Sweden
# 3 1934-03-11 Madrid Spain 9 0 Portugal
# 4 1934-03-18 Lisboa Portugal 1 2 Spain
# 5 1934-03-25 Milano Italy 4 0 Greece
# 6 1934-03-25 Sofia Bulgaria 1 4 Hungary
(The warning is expected, since one game was not played so has "n/p" for a score. You might want to handle non-score values in V4 before trying the split, perhaps replacing anything not numeric-dash-numeric with NA.)
Equally specific to this particular site but may be easier to generalize:
# Packages used by this answer. tidyr is attached as well because the parsing
# code below calls separate(), which none of the other packages provide.
library(rvest)
library(purrr)
library(dplyr)
library(stringi)
library(tidyr)
# Download and parse the qualifying-results page.
pg <- read_html("http://www.rsssf.com/tables/34q.html")
Target the <pre> and strip out some things that aren't part of "tables":
html_nodes(pg, "pre") %>%
html_text() %>%
stri_split_lines() %>%
flatten_chr() %>%
discard(stri_detect_regex, "^(NB| )") -> lines
Now, we get the start and end lines indexes of each "group":
starts <- which(grepl("^Group", lines))
ends <- c(starts[-1], length(lines))
We iterate over those starts and ends and:
extract the group info
clean up the table
discard any "empty" tables
turn the tabular data into a data frame, doing some munging along the way
I can annotate the following more if needed:
# For each group (.x = index of its "Group ..." header line, .y = index of the
# next header or of the last line), parse the group's match lines into one row
# per match and bind all groups into a single data frame.
map2_df(starts, ends, ~{
  # Columns 2:3 of the match matrix are the two capture groups: the group
  # number and the text inside the square brackets of the header.
  grp_info <- stri_match_all_regex(lines[.x], "Group ([[:digit:]]+) \\[(.*)]")[[1]][,2:3]
  # Drop every line that does not start with a digit or that mentions " round",
  # then drop empty strings; what remains are the dated match lines.
  lines[(.x+1):.y] %>%
    discard(stri_detect_regex, "(^[^[:digit:]]| round)") %>%
    discard(`==`, "") -> grp
  # A group without any match lines contributes no rows.
  if (length(grp) == 0) return(NULL)
  # Fields are separated by runs of two or more spaces. The previous pattern
  # "\ \ +" is not a valid R string literal ("\ " is an unrecognized escape),
  # so the same pattern is written here without backslashes.
  stri_split_regex(grp, " {2,}") %>%
    map_df(~{
      # Inside this inner lambda .x is one split line (a character vector).
      # The column names are kept as in the printed result: "team_a" receives
      # the second field and "team_b" the third.
      .x[1:4] %>%
        as.list() %>%
        set_names(c("date", "team_a", "team_b", "score_team")) %>%
        flatten_df() %>%
        # The fourth field holds "<score> <team>"; split it at the space.
        separate(score_team, c("score", "team_c"), sep=" ") %>%
        mutate(group_num = grp_info[1], group_info = grp_info[2]) %>%
        # Dates are dd.mm.yy; rebuild them as 19yy-mm-dd before parsing.
        separate(date, c("d", "m", "y")) %>%
        mutate(date = as.Date(sprintf("19%s-%s-%s", y, m, d))) %>%
        select(-d, -m, -y)
    })
})
## # A tibble: 27 x 7
## team_a team_b score team_c group_num group_info date
## <chr> <chr> <chr> <chr> <chr> <chr> <date>
## 1 Stockholm Sweden 6-2 Estonia 1 Sweden 1933-06-11
## 2 Kaunas Lithuania 0-2 Sweden 1 Sweden 1933-06-29
## 3 Madrid Spain 9-0 Portugal 2 Spain 1934-03-11
## 4 Lisboa Portugal 1-2 Spain 2 Spain 1934-03-18
## 5 Milano Italy 4-0 Greece 3 Italy 1934-03-25
## 6 Sofia Bulgaria 1-4 Hungary 4 Hungary, Austria 1934-03-25
## 7 Wien Austria 6-1 Bulgaria 4 Hungary, Austria 1934-04-25
## 8 Budapest Hungary 4-1 Bulgaria 4 Hungary, Austria 1934-04-29
## 9 Warszawa Poland 1-2 Czechoslovakia 5 Czechoslovakia 1933-10-15
## 10 Praha Czechoslovakia n/p Poland 5 Czechoslovakia 1934-03-15
## 11 Beograd Yugoslavia 2-2 Switzerland 6 Romania, Switzerland 1933-09-24
## 12 Bern Switzerland 2-2 Romania 6 Romania, Switzerland 1933-10-29
## 13 Bucuresti Romania 2-1 Yugoslavia 6 Romania, Switzerland 1934-04-29
## 14 Dublin Ireland 4-4 Belgium 7 Netherlands, Belgium 1934-02-25
## 15 Amsterdam Netherlands 5-2 Ireland 7 Netherlands, Belgium 1934-04-08
## 16 Antwerpen Belgium 2-4 Netherlands 7 Netherlands, Belgium 1934-04-29
## 17 Luxembourg Luxembourg 1-9 Germany 8 Germany, France 1934-03-11
## 18 Luxembourg Luxembourg 1-6 France 8 Germany, France 1934-04-15
## 19 Port-au-Prince Haiti 1-3 Cuba 11 USA 1934-01-28
## 20 Port-au-Prince Haiti 1-1 Cuba 11 USA 1934-02-01
## 21 Port-au-Prince Haiti 0-6 Cuba 11 USA 1934-02-04
## 22 Cd. de Mexico Mexico 3-2 Cuba 11 USA 1934-03-04
## 23 Cd. de Mexico Mexico 5-0 Cuba 11 USA 1934-03-11
## 24 Cd. de Mexico Mexico 4-1 Cuba 11 USA 1934-03-18
## 25 Roma USA 4-2 Mexico 11 USA 1934-05-24
## 26 Cairo Egypt 7-1 Palestina 12 Egypt 1934-03-16
## 27 Tel Aviv Palestina 1-4 Egypt 12 Egypt 1934-04-06

Using table() to count occurrences of strings in the R results in some, but not all, incorrect outputs

I usually find all my answers by searching here, and I've never had to post anything before. However this is a very particular problem and I haven't been able to find an answer. I hope you can help.
I have this table, called "FSR":
Mouse Day Percent.Rewarded Percent.Premature
Y3.5 1 0.72 0.73
Y3.6 1 0.47 0.68
Y3.7 1 0.74 0.71
X7.1 1 0.74 0.79
X7.2 1 0.74 0.80
AA1.1 1 0.91 0.84
AA1.2 1 0.70 0.75
AA1.3 1 0.95 0.85
I want to count the number of times each Mouse ID appears in the column Mouse, which should be easy:
FSRCounts <- table(FSR$Mouse)
So far, so good. This appears to work:
print(FSRCounts)
AA1.1 AA1.2 AA1.3 X7.1 X7.2 Y3.5 Y3.6 Y3.7
1 1 1 1 1 1 1 1
If I want to know how many times a particular mouse has appeared, this also works, no matter which mouse:
FSRCounts["Y3.6"]
Y3.6
1
FSRCounts["AA1.1"]
AA1.1
1
However, for some reason when I use another table, called data, to get the mouse IDs the code doesn't work for every mouse and I can't figure out why.
Here's the table "data". I have deleted columns irrelevant to this question:
Experiment mouse
1 RIGHT_autoshape_gonogo_LMR X6.1
2 LEFT_autoshape_gonogo_LMR X6.2
3 RIGHT_autoshape_gonogo_LMR Y3.1
4 LEFT_autoshape_gonogo_LMR Y3.2
5 RIGHT_autoshape_gonogo_LMR Y3.3
6 LEFT_autoshape_gonogo_LMR Y3.4
7 RIGHT_5sec_reactiontime_gonogo_LMR Y3.5
8 LEFT_5sec_reactiontime_gonogo_LMR Y3.6
9 RIGHT_5sec_reactiontime_gonogo_LMR Y3.7
10 LEFT_5sec_reactiontime_gonogo_LMR X7.1
11 RIGHT_5sec_reactiontime_gonogo_LMR X7.2
12 LEFT_5sec_reactiontime_gonogo_LMR AA1.1
13 RIGHT_5sec_reactiontime_gonogo_LMR AA1.2
14 LEFT_5sec_reactiontime_gonogo_LMR AA1.3
15 RIGHT_autoshape_gonogo_LMR AA1.4
16 LEFT_autoshape_gonogo_LMR AA1.5
17 RIGHT_autoshape_gonogo_LMR AA1.6
18 RIGHT_autoshape_gonogo_LMR Y4.2
19 LEFT_autoshape_gonogo_LMR Y4.3
And here's the code:
for (i in 1:nrow(data)) {
if (grepl("5sec", data[i,"Experiment"])) {
FiveToday <- T
FM <- data[i,"mouse"]
FD <- FSRCounts[FM] + 1
if (is.na(FSRCounts[FM])){
FD <- 1
}
}
}
It works for some mice, but not others. I've added the "print()" lines to show exactly where the code is screwing up.
For example, it works for AA1.2:
> FM <- data[13,"mouse"]
> print("FM:")
[1] "FM:"
> print(FM)
[1] AA1.2
Levels: AA1.1 AA1.2 AA1.3 AA1.4 AA1.5 AA1.6 X6.1 X6.2 X7.1 X7.2 Y3.1 Y3.2 Y3.3 Y3.4 Y3.5 Y3.6 Y3.7 Y4.2 Y4.3
> print("FSR Count Table:")
[1] "FSR Count Table:"
> print(FSRCounts)
AA1.1 AA1.2 AA1.3 X7.1 X7.2 Y3.5 Y3.6 Y3.7
1 1 1 1 1 1 1 1
> print("Count:")
[1] "Count:"
> print(FSRCounts[FM])
AA1.2
1
> FD <- FSRCounts[FM] + 1
> print("FD:")
[1] "FD:"
> print(FD)
AA1.2
2
> if (is.na(FSRCounts[FM])){print("FD is 1")
+ FD <- 1
}
But not for Y3.6:
> FM <- data[8,"mouse"]
> print("FM:")
[1] "FM:"
> print(FM)
[1] Y3.6
Levels: AA1.1 AA1.2 AA1.3 AA1.4 AA1.5 AA1.6 X6.1 X6.2 X7.1 X7.2 Y3.1 Y3.2 Y3.3 Y3.4 Y3.5 Y3.6 Y3.7 Y4.2 Y4.3
> print("FSR Count Table:")
[1] "FSR Count Table:"
> print(FSRCounts)
AA1.1 AA1.2 AA1.3 X7.1 X7.2 Y3.5 Y3.6 Y3.7
1 1 1 1 1 1 1 1
> print("Count:")
[1] "Count:"
> print(FSRCounts[FM])
<NA>
NA
> FD <- FSRCounts[FM] + 1
> print("FD:")
[1] "FD:"
> print(FD)
<NA>
NA
> if (is.na(FSRCounts[FM])){print("FD is 1")
+ FD <- 1
+ }
[1] "FD is 1"
Can anyone help me figure out why this is happening and fix it? Or suggest an alternate way to do the same thing? Thanks for your help!
Dan Hoops

Conditional sum on data.frame based on duplicates

I have been trying to make a conditional sum based on a data.frame that has duplicates. I want to sum the ones that have an identical permno and date and create a separate column with this information, filling in NA's or preferably 0's.
My data set looks like this:
data.frame(crsp)
permno date PAYDT DISTCD divamt FACPR FACSHR PRC RET
1 10022 19280929 19281001 1272 0.25 0 0 71.00 0.045208
2 10022 19280929 19281001 1232 1.00 0 0 71.00 0.045208
3 10022 19281031 NA NA NA NA NA 73.50 0.035211
4 10022 19281130 NA NA NA NA NA 72.50 -0.013605
5 10022 19281231 19290202 1232 1.00 0 0 68.00 -0.044828
6 10022 19281231 19290202 1272 0.25 0 0 68.00 -0.044828
7 10022 19290131 NA NA NA NA NA 73.75 0.084559
8 10022 19290228 NA NA NA NA NA 69.00 -0.064407
9 10022 19290328 19290401 1232 1.00 0 0 65.00 -0.039855
10 10022 19290328 19290401 1272 0.25 0 0 65.00 -0.039855
11 10022 19290430 NA NA NA NA NA 67.00 0.030769
12 10022 19290531 NA NA NA NA NA 64.75 -0.033582
First, I have created permno + date to make a unique pickup-code
crsp$permnodate = paste(as.character(crsp$permno),as.character(crsp$date),sep="")
Second, I have then tried to sum the duplicates and making this into a new frame:
crsp_divsingl <- aggregate(crsp$divamt, by = list(permnodate = crsp$permnodate), FUN = sum, na.rm = TRUE)
However, I am unable to transfer this information back correctly to the original data.frame(crsp), as the columns have different lengths, so cbind and cbind.fill don't allow me to match this correctly. Specifically, I want the sum of the divamts for one/the first of the unique permnodates so it corresponds with the remaining data.frame in length. I have not had success with merge or match either.
I haven't tried loop functions yet or managed to create any if or ifelse functions with success. Basically, this can be done in Excel with the VLOOKUP or the index.match formula; however, this is more tricky in R than I first thought.
Help is much appreciated.
Best regards
Troels
You can use duplicated and merge to achieve this more easily. I've written an example. You'll have to alter this for your purposes, but hopefully it will put you on the right track:
# Creating a fake sample dataset (seeded so the random draws are reproducible).
set.seed(9)
permno <- 10022:10071 # Allowing 50 possible permno's.
date <- 19280929:19280978 # Allowing 50 possible date codes (consecutive integers, so not all are real calendar dates).
value <- c(NA, 1:9) # Allowing NA or a 1 through 9 value.
# Creating fake data frame: 1000 rows, each column sampled with replacement from the pools above.
crsp <- data.frame(permno = sample(permno, 1000, TRUE), date = sample(date, 1000, TRUE), value = sample(value, 1000, TRUE))
# Helper: TRUE for every entry (or row) of x that occurs more than once,
# counting the first occurrence as well as the later repeats.
fullDup <- function(x) {
  has_earlier_twin <- duplicated(x)
  has_later_twin <- duplicated(x, fromLast = TRUE)
  has_earlier_twin | has_later_twin
}
# Keep only the rows whose (permno, date) pair occurs more than once; fullDup()
# flags every member of such a pair, the first occurrence included.
crsp.dup <- crsp[fullDup(crsp[c("permno", "date")]), , drop = FALSE]
# Replace every NA in those rows by 0 so that it counts as 0 in the sum.
crsp.dup[is.na(crsp.dup)] <- 0
# One row per (permno, date) with the summed value.
crsp.dup <- aggregate(value ~ permno + date, data = crsp.dup, FUN = sum)
# Rename the summed column so it does not clash with crsp$value in the merge.
colnames(crsp.dup)[3] <- "value.dup"
# Left-join the sums onto the full data; rows without a duplicate get NA in value.dup.
crsp <- merge(crsp, crsp.dup, by = c("permno", "date"), all.x = TRUE)

Getting imported json data into a data frame

I have a file containing over 1500 json objects that I want to work with in R. I've been able to import the data as a list, but am having trouble coercing it into a useful structure. I want to create a data frame containing a row for each json object and a column for each key:value pair.
I've recreated my situation with this small, fake data set:
[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]
Some features of the data:
The objects all contain the same number of key:value pairs although
some of the values are null
There are two non-numeric columns per object (name and group)
name is the unique identifier, there are 10 or so groups
many of the name and group entries contain spaces, commas and other punctuation.
Based on this question: R list(structure(list())) to data frame, I tried the following:
json_file <- "test.json"
json_data <- fromJSON(json_file)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))
With both my real data and this fake data, the last line gives me this error:
Error in data.frame(name = "Doe, John", group = "Red", `age (y)` = 24, :
arguments imply differing number of rows: 1, 0
You just need to replace your NULLs with NAs:
require(RJSONIO)
json_file <- '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'
json_file <- fromJSON(json_file)
# Replace JSON nulls (parsed as NULL) by NA so every record keeps all of its
# fields, then flatten each record into a named vector.
json_file <- lapply(json_file, function(x) {
  # vapply() guarantees a logical index even for a record with no fields,
  # where sapply() would return list() and the assignment would fail.
  x[vapply(x, is.null, logical(1))] <- NA
  unlist(x)
})
Once you have a non-null value for each element, you can call rbind without getting an error:
do.call("rbind", json_file)
name group age (y) height (cm) wieght (kg) score
[1,] "Doe, John" "Red" "24" "182" "74.8" NA
[2,] "Doe, Jane" "Green" "30" "170" "70.1" "500"
[3,] "Smith, Joan" "Yellow" "41" "169" "60" NA
[4,] "Brown, Sam" "Green" "22" "183" "75" "865"
[5,] "Jones, Larry" "Green" "31" "178" "83.9" "221"
[6,] "Murray, Seth" "Red" "35" "172" "76.2" "413"
[7,] "Doe, Jane" "Yellow" "22" "164" "68" "902"
This is very simple if you use either library(jsonlite) or library(jsonify)
Both of these handle the null values, converting them to NA, and they preserve the data types.
Data
json_file <- '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'
jsonlite
library(jsonlite)
jsonlite::fromJSON( json_file )
# name group age (y) height (cm) wieght (kg) score
# 1 Doe, John Red 24 182 74.8 NA
# 2 Doe, Jane Green 30 170 70.1 500
# 3 Smith, Joan Yellow 41 169 60.0 NA
# 4 Brown, Sam Green 22 183 75.0 865
# 5 Jones, Larry Green 31 178 83.9 221
# 6 Murray, Seth Red 35 172 76.2 413
# 7 Doe, Jane Yellow 22 164 68.0 902
str( jsonlite::fromJSON( json_file ) )
# 'data.frame': 7 obs. of 6 variables:
# $ name : chr "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group : chr "Red" "Green" "Yellow" "Green" ...
# $ age (y) : int 24 30 41 22 31 35 22
# $ height (cm): int 182 170 169 183 178 172 164
# $ wieght (kg): num 74.8 70.1 60 75 83.9 76.2 68
# $ score : int NA 500 NA 865 221 413 902
jsonify
library(jsonify)
jsonify::from_json( json_file )
# name group age (y) height (cm) wieght (kg) score
# 1 Doe, John Red 24 182 74.8 NA
# 2 Doe, Jane Green 30 170 70.1 500
# 3 Smith, Joan Yellow 41 169 60.0 NA
# 4 Brown, Sam Green 22 183 75.0 865
# 5 Jones, Larry Green 31 178 83.9 221
# 6 Murray, Seth Red 35 172 76.2 413
# 7 Doe, Jane Yellow 22 164 68.0 902
str( jsonify::from_json( json_file ) )
# 'data.frame': 7 obs. of 6 variables:
# $ name : chr "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group : chr "Red" "Green" "Yellow" "Green" ...
# $ age (y) : int 24 30 41 22 31 35 22
# $ height (cm): int 182 170 169 183 178 172 164
# $ wieght (kg): num 74.8 70.1 60 75 83.9 76.2 68
# $ score : int NA 500 NA 865 221 413 902
To remove null values use parameter nullValue
json_data <- fromJSON(json_file, nullValue = NA)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))
This way there won't be any unnecessary quotes in your output.
library(rjson)
Lines <- readLines("yelp_academic_dataset_business.json")
business <- as.data.frame(t(sapply(Lines, fromJSON)))
You may try this to load JSON data into R
dplyr::bind_rows(fromJSON(file_name))
Changing the package from rjson to jsonlite fixed it for me.
So instead of this:
fromAPIPlantsPages <- rjson::fromJSON(content(apiGetPlants,type="text",encoding = "UTF-8"))
dfPlantenAPI <- as.data.frame(fromAPIPlantsPages)
I changed it to this:
fromAPIPlantsPages <- jsonlite::fromJSON(content(apiGetPlants,type="text",encoding = "UTF-8"))
dfPlantenAPI <- as.data.frame(fromAPIPlantsPages)