I imported a json file using rjson and converted it to a data.frame but all of the data is spread widthwise and the column names contain the key information.
stations <- fromJSON(file = "station_information.json")
test <- as.data.frame(stations[3])
What this looks like is:
> dim(test)
[1] 2 5985
> test[1:27]
data.stations.station_id data.stations.name data.stations.short_name
1 72 W 52 St & 11 Ave 6926.01
2 72 W 52 St & 11 Ave 6926.01
data.stations.lat data.stations.lon data.stations.region_id
1 40.76727 -73.99393 71
2 40.76727 -73.99393 71
data.stations.rental_methods data.stations.capacity
1 KEY 39
2 CREDITCARD 39
data.stations.eightd_has_key_dispenser data.stations.station_id.1
1 FALSE 79
2 FALSE 79
data.stations.name.1 data.stations.short_name.1 data.stations.lat.1
1 Franklin St & W Broadway 5430.08 40.71912
2 Franklin St & W Broadway 5430.08 40.71912
data.stations.lon.1 data.stations.region_id.1 data.stations.rental_methods.1
1 -74.00667 71 KEY
2 -74.00667 71 CREDITCARD
data.stations.capacity.1 data.stations.eightd_has_key_dispenser.1
1 33 FALSE
2 33 FALSE
data.stations.station_id.2 data.stations.name.2 data.stations.short_name.2
1 82 St James Pl & Pearl St 5167.06
2 82 St James Pl & Pearl St 5167.06
data.stations.lat.2 data.stations.lon.2 data.stations.region_id.2
1 40.71117 -74.00017 71
2 40.71117 -74.00017 71
data.stations.rental_methods.2 data.stations.capacity.2
1 KEY 27
2 CREDITCARD 27
data.stations.eightd_has_key_dispenser.2
1 FALSE
2 FALSE
As you can see, this cannot be fixed with a simple transpose t() or melt() solution. I am wondering what I am doing wrong on import or conversion to data.frame which leaves me with a dataframe stretched with the index of what should be rows appended to the column names.
I have tried both of these methods but I am left with the same stretched data:
plyr::ldply(stations, data.frame)
do.call(rbind, lapply(stations, data.frame, stringsAsFactors=FALSE))
In the end I would like my output to look like every 9 columns were "cut" and stacked on to the first 9 - so that I am left with 655 rows and 9 columns Any suggestions would be appreciated.
NOTE: I am taking the JSON directly from this link (it is not a large file)
Here is a reproducible example of the first 27 columns, which should be reshaped to a 9 x 3 dataframe:
> dput(df)
structure(list(data.stations.station_id = structure(c(1L, 1L), class = "factor", .Label = "72"),
data.stations.name = structure(c(1L, 1L), class = "factor", .Label = "W 52 St & 11 Ave"),
data.stations.short_name = structure(c(1L, 1L), class = "factor", .Label = "6926.01"),
data.stations.lat = c(40.76727216, 40.76727216), data.stations.lon = c(-73.99392888,
-73.99392888), data.stations.region_id = c(71, 71), data.stations.rental_methods = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity = c(39, 39), data.stations.eightd_has_key_dispenser = c(FALSE,
FALSE), data.stations.station_id.1 = structure(c(1L, 1L), class = "factor", .Label = "79"),
data.stations.name.1 = structure(c(1L, 1L), class = "factor", .Label = "Franklin St & W Broadway"),
data.stations.short_name.1 = structure(c(1L, 1L), class = "factor", .Label = "5430.08"),
data.stations.lat.1 = c(40.71911552, 40.71911552), data.stations.lon.1 = c(-74.00666661,
-74.00666661), data.stations.region_id.1 = c(71, 71), data.stations.rental_methods.1 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.1 = c(33, 33), data.stations.eightd_has_key_dispenser.1 = c(FALSE,
FALSE), data.stations.station_id.2 = structure(c(1L, 1L), class = "factor", .Label = "82"),
data.stations.name.2 = structure(c(1L, 1L), class = "factor", .Label = "St James Pl & Pearl St"),
data.stations.short_name.2 = structure(c(1L, 1L), class = "factor", .Label = "5167.06"),
data.stations.lat.2 = c(40.71117416, 40.71117416), data.stations.lon.2 = c(-74.00016545,
-74.00016545), data.stations.region_id.2 = c(71, 71), data.stations.rental_methods.2 = structure(c(2L,
1L), .Label = c("CREDITCARD", "KEY"), class = "factor"),
data.stations.capacity.2 = c(27, 27), data.stations.eightd_has_key_dispenser.2 = c(FALSE,
FALSE)), .Names = c("data.stations.station_id", "data.stations.name",
"data.stations.short_name", "data.stations.lat", "data.stations.lon",
"data.stations.region_id", "data.stations.rental_methods", "data.stations.capacity",
"data.stations.eightd_has_key_dispenser", "data.stations.station_id.1",
"data.stations.name.1", "data.stations.short_name.1", "data.stations.lat.1",
"data.stations.lon.1", "data.stations.region_id.1", "data.stations.rental_methods.1",
"data.stations.capacity.1", "data.stations.eightd_has_key_dispenser.1",
"data.stations.station_id.2", "data.stations.name.2", "data.stations.short_name.2",
"data.stations.lat.2", "data.stations.lon.2", "data.stations.region_id.2",
"data.stations.rental_methods.2", "data.stations.capacity.2",
"data.stations.eightd_has_key_dispenser.2"), row.names = c(NA,
-2L), class = "data.frame")
So the output structure should look like this (obviously with the values not NA). Each row represents the appended index number of the original dataframe's column names
> output
data.stations.station_id data.stations.name data.stations.short_name
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.lat data.stations.lon data.stations.region_id
1 NA NA NA
2 NA NA NA
3 NA NA NA
data.stations.rental_methods data.stations.capacity
1 NA NA
2 NA NA
3 NA NA
data.stations.eightd_has_key_dispenser
1 NA
2 NA
3 NA
I would try:
library(data.table)
rbindlist(lapply(split(seq_along(df), c(0, (seq_along(df)%/%9)[-length(df)])),
function(x) df[, x]), use.names = FALSE)
## data.stations.station_id data.stations.name data.stations.short_name data.stations.lat
## 1: 72 W 52 St & 11 Ave 6926.01 40.76727
## 2: 72 W 52 St & 11 Ave 6926.01 40.76727
## 3: 79 Franklin St & W Broadway 5430.08 40.71912
## 4: 79 Franklin St & W Broadway 5430.08 40.71912
## 5: 82 St James Pl & Pearl St 5167.06 40.71117
## 6: 82 St James Pl & Pearl St 5167.06 40.71117
## data.stations.lon data.stations.region_id data.stations.rental_methods
## 1: -73.99393 71 KEY
## 2: -73.99393 71 CREDITCARD
## 3: -74.00667 71 KEY
## 4: -74.00667 71 CREDITCARD
## 5: -74.00017 71 KEY
## 6: -74.00017 71 CREDITCARD
## data.stations.capacity data.stations.eightd_has_key_dispenser
## 1: 39 FALSE
## 2: 39 FALSE
## 3: 33 FALSE
## 4: 33 FALSE
## 5: 27 FALSE
## 6: 27 FALSE
That is, create a list of data.frames with 9 columns each, and rbind them. This way, you won't have problems of data coercion when converting to a matrix.
This results in a 6 row x 9 column data.table. Not sure what rule you want to use to drop rows to end up with just 3 rows....
But I think you're trying to solve a problem that does not exist. Try reading your data like this:
library(jsonlite)
x <- fromJSON("https://gbfs.citibikenyc.com/gbfs/en/station_information.json")
head(x[[3]]$stations)
## station_id name short_name lat lon region_id
## 1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71
## 2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71
## 3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71
## 4 83 Atlantic Ave & Fort Greene Pl 4354.07 40.68383 -73.97632 71
## 5 116 W 17 St & 8 Ave 6148.02 40.74178 -74.00150 71
## 6 119 Park Ave & St Edwards St 4700.06 40.69609 -73.97803 71
## rental_methods capacity eightd_has_key_dispenser
## 1 KEY, CREDITCARD 39 FALSE
## 2 KEY, CREDITCARD 33 FALSE
## 3 KEY, CREDITCARD 27 FALSE
## 4 KEY, CREDITCARD 62 FALSE
## 5 KEY, CREDITCARD 39 FALSE
## 6 KEY, CREDITCARD 19 FALSE
dim(x[[3]]$stations)
# [1] 665 9
You can use matrix but make sure all your factor columns are characters, i.e.
ind <- sapply(df, is.factor)
df[ind] <- lapply(df[ind], as.character)
final_df <- as.data.frame(matrix(unlist(df), ncol = 9, byrow = TRUE))
final_df[c(TRUE, FALSE),]
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 72 W 52 St & 11 Ave W 52 St & 11 Ave 6926.01 6926.01 40.76727216 40.76727216 -73.99392888
#3 79 79 Franklin St & W Broadway Franklin St & W Broadway 5430.08 5430.08 40.71911552 40.71911552 -74.00666661
#5 82 82 St James Pl & Pearl St St James Pl & Pearl St 5167.06 5167.06 40.71117416 40.71117416 -74.00016545
On the other hand, as #A5C1D2H2I1M1N2O1R2T1 notes, you might be looking for this instead:
as.data.frame(matrix(c(t(df)), ncol = 9, byrow = TRUE))
# V1 V2 V3 V4 V5 V6 V7 V8 V9
#1 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 KEY 39 FALSE
#2 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 KEY 33 FALSE
#3 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 KEY 27 FALSE
#4 72 W 52 St & 11 Ave 6926.01 40.76727 -73.99393 71 CREDITCARD 39 FALSE
#5 79 Franklin St & W Broadway 5430.08 40.71912 -74.00667 71 CREDITCARD 33 FALSE
#6 82 St James Pl & Pearl St 5167.06 40.71117 -74.00017 71 CREDITCARD 27 FALSE
Related
I would like to scrape a series of tables from a website whose URL does not change when I click through the tables in my browser. Each table corresponds to a unique date. The default table is that which corresponds to today's date. I can scroll through past dates in my browser, but can't seem to find a way to do so in R.
Using library(rvest) this bit of code will reliably download the table that corresponds to today's date (I'm only interested in the first of the three tables).
webad <- "https://official.nba.com/referee-assignments/"
off <- webad %>%
read_html() %>%
html_table()
off <- off[[1]]
How can I download the table that corresponds to, say "2022-10-04", to "2022-10-06", or to yesterday?
I've tried to work through it by identifying the node under which the table lies, in the hopes that I could manipulate it to reflect a prior date. However, the following reproduces the same table as above:
webad <- "https://official.nba.com/referee-assignments/"
off <- webad %>%
read_html() %>%
html_nodes("#main > div > section:nth-child(1) > article > div > div.dayContent > div > table") %>%
html_table()
off <- off[[1]]
Scrolling through past dates in my browser, I've identified various places in the html that reference the prior date; but I can't seem to change it from R, yet alone get the table I download to reflect a change:
webad %>%
read_html() %>%
html_nodes("#main > div > section:nth-child(1) > article > header > div")
I've messed around some with html_form(), follow_link(), and set_values() also, but to no avail.
Is there a good way to navigate this particular URL in R?
You can consider the following approach :
library(RSelenium)
library(rvest)
port <- as.integer(4444L + rpois(lambda = 1000, 1))
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)
remDr <- rd$client
remDr$open()
url <- "https://official.nba.com/referee-assignments/"
remDr$navigate(url)
web_Obj_Date <- remDr$findElement("css selector", "#ref-filters-menu > li > div > button")
web_Obj_Date$clickElement()
web_Obj_Date_Input <- remDr$findElement("id", 'ref-date')
web_Obj_Date_Input$clearElement()
web_Obj_Date_Input$sendKeysToElement(list("2022-10-05"))
web_Obj_Date_Input$doubleclick()
web_Obj_Date <- remDr$findElement("css selector", "#ref-filters-menu > li > div > button")
web_Obj_Date$clickElement()
web_Obj_Go_Button <- remDr$findElement("css selector", "#date-filter")
web_Obj_Go_Button$submitElement()
html_Content <- remDr$getPageSource()[[1]]
read_html(html_Content) %>% html_table()
[[1]]
# A tibble: 5 x 5
Game `Official 1` `Official 2` `Official 3` Alternate
<chr> <chr> <chr> <chr> <lgl>
1 Indiana # Charlotte John Goble (#10) Lauren Holtkamp (#7) Phenizee Ransom (#70) NA
2 Cleveland # Philadelphia Marc Davis (#8) Jacyn Goble (#68) Tyler Mirkovich (#97) NA
3 Toronto # Boston Josh Tiven (#58) Matt Boland (#18) Intae hwang (#96) NA
4 Dallas # Oklahoma City Courtney Kirkland (#61) Mitchell Ervin (#27) Cheryl Flores (#91) NA
5 Phoenix # L.A. Lakers Bill Kennedy (#55) Rodney Mott (#71) Jenna Reneau (#93) NA
[[2]]
# A tibble: 0 x 5
# ... with 5 variables: Game <lgl>, Official 1 <lgl>, Official 2 <lgl>, Official 3 <lgl>, Alternate <lgl>
# i Use `colnames()` to see all variable names
[[3]]
# A tibble: 0 x 5
# ... with 5 variables: Game <lgl>, Official 1 <lgl>, Official 2 <lgl>, Official 3 <lgl>, Alternate <lgl>
# i Use `colnames()` to see all variable names
[[4]]
# A tibble: 6 x 7
S M T W T F S
<int> <int> <int> <int> <int> <int> <int>
1 NA NA NA NA NA NA 1
2 2 3 4 5 6 7 8
3 9 10 11 12 13 14 15
4 16 17 18 19 20 21 22
5 23 24 25 26 27 28 29
6 30 31 NA NA NA NA NA
Here is another approach that can be considered :
library(RDCOMClient)
library(rvest)
url <- "https://official.nba.com/referee-assignments/"
IEApp <- COMCreate("InternetExplorer.Application")
IEApp[['Visible']] <- TRUE
IEApp$Navigate(url)
Sys.sleep(5)
doc <- IEApp$Document()
clickEvent <- doc$createEvent("MouseEvent")
clickEvent$initEvent("click", TRUE, FALSE)
web_Obj_Date <- doc$querySelector("#ref-filters-menu > li > div > button")
web_Obj_Date$dispatchEvent(clickEvent)
web_Obj_Date_Input <- doc$GetElementById('ref-date')
web_Obj_Date_Input[["Value"]] <- "2022-10-05"
web_Obj_Go_Button <- doc$querySelector("#date-filter")
web_Obj_Go_Button$dispatchEvent(clickEvent)
html_Content <- doc$Body()$innerHTML()
read_html(html_Content) %>% html_table()
[[1]]
# A tibble: 5 x 5
Game `Official 1` `Official 2` `Official 3` Alternate
<chr> <chr> <chr> <chr> <lgl>
1 Indiana # Charlotte John Goble (#10) Lauren Holtkamp (#7) Phenizee Ransom (#70) NA
2 Cleveland # Philadelphia Marc Davis (#8) Jacyn Goble (#68) Tyler Mirkovich (#97) NA
3 Toronto # Boston Josh Tiven (#58) Matt Boland (#18) Intae hwang (#96) NA
4 Dallas # Oklahoma City Courtney Kirkland (#61) Mitchell Ervin (#27) Cheryl Flores (#91) NA
5 Phoenix # L.A. Lakers Bill Kennedy (#55) Rodney Mott (#71) Jenna Reneau (#93) NA
[[2]]
# A tibble: 8 x 7
Game `Official 1` `Official 2` `Official 3` Alternate `` ``
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "Game" "Official 1" "Official 2" "Official 3" "Alternate" NA NA
2 "S" "M" "T" "W" "T" "F" "S"
3 "" "" "" "" "" "" "1"
4 "2" "3" "4" "5" "6" "7" "8"
5 "9" "10" "11" "12" "13" "14" "15"
6 "16" "17" "18" "19" "20" "21" "22"
7 "23" "24" "25" "26" "27" "28" "29"
8 "30" "31" "" "" "" "" ""
[[3]]
# A tibble: 7 x 7
Game `Official 1` `Official 2` `Official 3` Alternate `` ``
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 "S" "M" "T" "W" "T" "F" "S"
2 "" "" "" "" "" "" "1"
3 "2" "3" "4" "5" "6" "7" "8"
4 "9" "10" "11" "12" "13" "14" "15"
5 "16" "17" "18" "19" "20" "21" "22"
6 "23" "24" "25" "26" "27" "28" "29"
7 "30" "31" "" "" "" "" ""
[[4]]
# A tibble: 6 x 7
S M T W T F S
<int> <int> <int> <int> <int> <int> <int>
1 NA NA NA NA NA NA 1
2 2 3 4 5 6 7 8
3 9 10 11 12 13 14 15
4 16 17 18 19 20 21 22
5 23 24 25 26 27 28 29
6 30 31 NA NA NA NA NA
If you install the Docker software (see https://docs.docker.com/engine/install/), you can consider the following approach with firefox :
library(RSelenium)
library(rvest)
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
url <- "https://official.nba.com/referee-assignments/"
remDr$navigate(url)
web_Obj_Date <- remDr$findElement("css selector", "#ref-filters-menu > li > div > button")
web_Obj_Date$clickElement()
web_Obj_Date_Input <- remDr$findElement("id", 'ref-date')
web_Obj_Date_Input$clearElement()
web_Obj_Date_Input$sendKeysToElement(list("2022-10-05"))
web_Obj_Date_Input$doubleclick()
web_Obj_Date <- remDr$findElement("css selector", "#ref-filters-menu > li > div > button")
web_Obj_Date$clickElement()
web_Obj_Go_Button <- remDr$findElement("css selector", "#date-filter")
web_Obj_Go_Button$submitElement()
html_Content <- remDr$getPageSource()[[1]]
read_html(html_Content) %>% html_table()
[[1]]
# A tibble: 5 x 5
Game `Official 1` `Official 2` `Official 3` Alternate
<chr> <chr> <chr> <chr> <lgl>
1 Indiana # Charlotte John Goble (#10) Lauren Holtkamp (#7) Phenizee Ransom (#70) NA
2 Cleveland # Philadelphia Marc Davis (#8) Jacyn Goble (#68) Tyler Mirkovich (#97) NA
3 Toronto # Boston Josh Tiven (#58) Matt Boland (#18) Intae hwang (#96) NA
4 Dallas # Oklahoma City Courtney Kirkland (#61) Mitchell Ervin (#27) Cheryl Flores (#91) NA
5 Phoenix # L.A. Lakers Bill Kennedy (#55) Rodney Mott (#71) Jenna Reneau (#93) NA
[[2]]
# A tibble: 0 x 5
# ... with 5 variables: Game <lgl>, Official 1 <lgl>, Official 2 <lgl>, Official 3 <lgl>, Alternate <lgl>
# i Use `colnames()` to see all variable names
[[3]]
# A tibble: 0 x 5
# ... with 5 variables: Game <lgl>, Official 1 <lgl>, Official 2 <lgl>, Official 3 <lgl>, Alternate <lgl>
# i Use `colnames()` to see all variable names
[[4]]
# A tibble: 6 x 7
S M T W T F S
<int> <int> <int> <int> <int> <int> <int>
1 NA NA NA NA NA NA 1
2 2 3 4 5 6 7 8
3 9 10 11 12 13 14 15
4 16 17 18 19 20 21 22
5 23 24 25 26 27 28 29
6 30 31 NA NA NA NA NA
I have created a database in MySQL with data from the Chinook dataset, which has fictitious information on customers that buy music.
One of the tables ("Invoice"), has the billing addresses, which has characters in diverse languages:
InvoiceId CustomerId InvoiceDate BillingAddress
1 2 2009-01-01 00:00:00 Theodor-Heuss-Straße 34
2 4 2009-01-02 00:00:00 Ullevålsveien 14
3 8 2009-01-03 00:00:00 Grétrystraat 63
4 14 2009-01-06 00:00:00 8210 111 ST NW
I tried to retrieve the data using R, with the following code:
library(DBI)
library(RMySQL)
library(dplyr)
library(magrittr)
library(lubridate)
library(stringi)
# Step 1 - Connect to the database ----------------------------------------
con <- DBI::dbConnect(MySQL(),
dbname = Sys.getenv("DB_CHINOOK"),
host = Sys.getenv("HST_CHINOOK"),
user = Sys.getenv("USR_CHINOOK"),
password = Sys.getenv("PASS_CHINOOK"),
port = XXXX)
invoices_tbl <- tbl(con, "Invoice") %>%
collect()
The connection is ok, but when trying to visualize the data, I can't see the special characters:
> head(invoices_tbl[,1:4])
# A tibble: 6 x 4
InvoiceId CustomerId InvoiceDate BillingAddress
<int> <int> <chr> <chr>
1 1 2 2009-01-01 00:00:00 "Theodor-Heuss-Stra\xdfe 34"
2 2 4 2009-01-02 00:00:00 "Ullev\xe5lsveien 14"
3 3 8 2009-01-03 00:00:00 "Gr\xe9trystraat 63"
4 4 14 2009-01-06 00:00:00 "8210 111 ST NW"
5 5 23 2009-01-11 00:00:00 "69 Salem Street"
6 6 37 2009-01-19 00:00:00 "Berger Stra\xdfe 10"
My question is, should I change something in the configuration inside MySQL? Or is it an issue with R? How can I see the special characters? What is the meaning of \xdfe?
Please, any help will be greatly appreciated.
The hexadecimal format can be converted with iconv
invoices_tbl$BillingAddress <- iconv(invoices_tbl$BillingAddress,
"latin1", "utf-8")
-output
invoices_tbl
InvoiceId CustomerId InvoiceDate BillingAddress
1 1 2 2009-01-01 00:00:00 Theodor-Heuss-Straße 34
2 2 4 2009-01-02 00:00:00 Ullevålsveien 14
3 3 8 2009-01-03 00:00:00 Grétrystraat 63
4 4 14 2009-01-06 00:00:00 8210 111 ST NW
5 5 23 2009-01-11 00:00:00 69 Salem Street
6 6 37 2009-01-19 00:00:00 Berger Straße 10
data
invoices_tbl <- structure(list(InvoiceId = 1:6, CustomerId = c(2L, 4L, 8L, 14L,
23L, 37L), InvoiceDate = c("2009-01-01 00:00:00", "2009-01-02 00:00:00",
"2009-01-03 00:00:00", "2009-01-06 00:00:00", "2009-01-11 00:00:00",
"2009-01-19 00:00:00"), BillingAddress = c("Theodor-Heuss-Stra\xdfe 34",
"Ullev\xe5lsveien 14", "Gr\xe9trystraat 63", "8210 111 ST NW",
"69 Salem Street", "Berger Stra\xdfe 10")), row.names = c("1",
"2", "3", "4", "5", "6"), class = "data.frame")
library(XML)
library(RCurl)
library(rlist)
theurl <- getURL("http://legacy.baseballprospectus.com/sortable/index.php?cid=2022181",.opts = list(ssl.verifypeer = FALSE) )
tables <- readHTMLTable(theurl)
I'm trying to scrape the 2016 table data from the above webpage. If I change the Year to 2010, the url changes to http://legacy.baseballprospectus.com/sortable/index.php?cid=1966487.
I want to automate my algorithm so that it can obtain the table across different Year, but I'm not sure how I can obtain the unique identifiers (e.g. 1966487) for each page automatically. Is there a way to find the list of these?
I've tried looking at the html source code, but no luck.
With rvest, you can set the value in the form and submit it. Wrapped in purrr::map_dfr to iterate and row-bind the results in to a data frame,
library(rvest)
sess <- html_session("http://legacy.baseballprospectus.com/sortable/index.php?cid=2022181")
baseball <- purrr::map_dfr(
2017:2015,
function(y){
Sys.sleep(10 + runif(1)) # be polite
form <- sess %>%
html_node(xpath = '//form[#action="index.php"]') %>%
html_form() %>%
set_values(year = y)
sess <- submit_form(sess, form)
sess %>%
read_html() %>%
html_node('#TTdata') %>%
html_table(header = TRUE)
}
)
tibble::as_data_frame(baseball) # for printing
#> # A tibble: 4,036 x 38
#> `#` NAME TEAM LG YEAR AGE G PA AB R
#> <dbl> <chr> <chr> <chr> <int> <int> <int> <int> <int> <int>
#> 1 1 Giancarlo Stanton MIA NL 2017 27 159 692 597 123
#> 2 2 Joey Votto CIN NL 2017 33 162 707 559 106
#> 3 3 Charlie Blackmon COL NL 2017 30 159 725 644 137
#> 4 4 Aaron Judge NYA AL 2017 25 155 678 542 128
#> 5 5 Nolan Arenado COL NL 2017 26 159 680 606 100
#> 6 6 Kris Bryant CHN NL 2017 25 151 665 549 111
#> 7 7 Mike Trout ANA AL 2017 25 114 507 402 92
#> 8 8 Jose Altuve HOU AL 2017 27 153 662 590 112
#> 9 9 Paul Goldschmidt ARI NL 2017 29 155 665 558 117
#> 10 10 Jose Ramirez CLE AL 2017 24 152 645 585 107
#> # ... with 4,026 more rows, and 28 more variables: H <int>, `1B` <int>,
#> # `2B` <int>, `3B` <int>, HR <int>, TB <int>, BB <int>, IBB <int>,
#> # SO <int>, HBP <int>, SF <int>, SH <int>, RBI <int>, DP <int>,
#> # NETDP <dbl>, SB <int>, CS <int>, AVG <dbl>, OBP <dbl>, SLG <dbl>,
#> # OPS <dbl>, ISO <dbl>, BPF <int>, oppOPS <dbl>, TAv <dbl>, VORP <dbl>,
#> # FRAA <dbl>, BWARP <dbl>
I'm new to R, and have been trying a bunch of examples but I couldn't get anything to change all of my empty cells into NA.
library(XML)
theurl <- "http://www.pro-football-reference.com/teams/sfo/1989.htm"
table <- readHTMLTable(theurl)
table
Thank you.
The result you get from readHTMLTable is giving you a list of two tables, so you need to work on each list element, which can be done using lapply
table <- lapply(table, function(x){
x[x == ""] <- NA
return(x)
})
table$team_stats
Player PF Yds Ply Y/P TO FL 1stD Cmp Att Yds TD Int NY/A 1stD Att Yds TD Y/A 1stD Pen Yds 1stPy
1 Team Stats 442 6268 1021 6.1 25 14 350 339 483 4302 35 11 8.1 209 493 1966 14 4.0 124 109 922 17
2 Opp. Stats 253 4618 979 4.7 37 16 283 316 564 3235 15 21 5.3 178 372 1383 9 3.7 76 75 581 29
3 Lg Rank Offense 1 1 <NA> <NA> 2 10 1 <NA> 20 2 1 1 1 <NA> 13 10 12 13 <NA> <NA> <NA> <NA>
4 Lg Rank Defense 3 4 <NA> <NA> 11 9 9 <NA> 25 11 3 9 5 <NA> 1 3 3 8 <NA> <NA> <NA> <NA>
You have a list of data.frames of factors, though the actual data is mostly numeric. Converting to the appropriate type with type.convert will automatically insert the appropriate NAs for you:
df_list <- lapply(table, function(x){
x[] <- lapply(x, function(y){type.convert(as.character(y), as.is = TRUE)});
x
})
df_list[[1]][, 1:18]
## Player PF Yds Ply Y/P TO FL 1stD Cmp Att Yds.1 TD Int NY/A 1stD.1 Att.1 Yds.2 TD.1
## 1 Team Stats 442 6268 1021 6.1 25 14 350 339 483 4302 35 11 8.1 209 493 1966 14
## 2 Opp. Stats 253 4618 979 4.7 37 16 283 316 564 3235 15 21 5.3 178 372 1383 9
## 3 Lg Rank Offense 1 1 NA NA 2 10 1 NA 20 2 1 1 1.0 NA 13 10 12
## 4 Lg Rank Defense 3 4 NA NA 11 9 9 NA 25 11 3 9 5.0 NA 1 3 3
Or more concisely but with a lot of packages,
library(tidyverse) # for purrr functions and readr::type_convert
library(janitor) # for clean_names
df_list <- map(table, ~.x %>% clean_names() %>% dmap(as.character) %>% type_convert())
df_list[[1]]
## # A tibble: 4 × 23
## player pf yds ply y_p to fl x1std cmp att yds_2 td int ny_a
## <chr> <int> <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 Team Stats 442 6268 1021 6.1 25 14 350 339 483 4302 35 11 8.1
## 2 Opp. Stats 253 4618 979 4.7 37 16 283 316 564 3235 15 21 5.3
## 3 Lg Rank Offense 1 1 NA NA 2 10 1 NA 20 2 1 1 1.0
## 4 Lg Rank Defense 3 4 NA NA 11 9 9 NA 25 11 3 9 5.0
## # ... with 9 more variables: x1std_2 <int>, att_2 <int>, yds_3 <int>, td_2 <int>, y_a <dbl>,
## # x1std_3 <int>, pen <int>, yds_4 <int>, x1stpy <int>
Aim: I am trying to scrape the historical daily stock price for all companies from the webpage http://www.nepalstock.com/datanepse/previous.php. The following code works; however, it always generates the daily stock price for the most recent (Feb 5, 2015) date only. In another words, output is the same, irrespective of the date that I entered. I would appreciate if you could help in this regard.
library(RHTMLForms)
library(RCurl)
library(XML)
url <- "http://www.nepalstock.com/datanepse/previous.php"
forms <- getHTMLFormDescription(url)
# we are interested in the second list with date forms
# forms[[2]]
# HTML Form: http://www.nepalstock.com/datanepse/
# Date: [ ]
get_stock<-createFunction(forms[[2]])
#create sequence of dates from start to end and store it as a list
date_daily<-as.list(seq(as.Date("2011-08-24"), as.Date("2011-08-30"), "days"))
# determine the number of elements in the list
num<-length(date_daily)
daily_1<-lapply(date_daily,function(x){
show(x) #displays the particular date
readHTMLTable(htmlParse(get_stock(Date = x)), which = 7)
})
#18 tables out of which 7 is one what we desired
# change the colnames
col_name<-c("SN","Traded_Companies","No_of_Transactions","Max_Price","Min_Price","Closing_Price","Total_Share","Amount","Previous_Closing","Difference_Rs.")
daily_2<-lapply(daily_1,setNames,nm=col_name)
Output:
> head(daily_2[[1]],5)
SN Traded_Companies No_of_Transactions Max_Price Min_Price Closing_Price Total_Share Amount
1 1 Agricultural Development Bank Ltd 24 489 471 473 2,868 1,359,038
2 2 Arun Valley Hydropower Development Company Limited 40 365 360 362 8,844 3,199,605
3 3 Alpine Development Bank Limited 11 297 295 295 150 44,350
4 4 Asian Life Insurance Co. Limited 10 1,230 1,215 1,225 898 1,098,452
5 5 Apex Development Bank Ltd. 23 131 125 131 6,033 769,893
Previous_Closing Difference_Rs.
1 480 -7
2 363 -1
3 303 -8
4 1,242 -17
5 132 -1
> tail(daily_2[[1]],5)
SN Traded_Companies No_of_Transactions Max_Price Min_Price Closing_Price Total_Share Amount Previous_Closing
140 140 United Finance Ltd 4 255 242 242 464 115,128 255
141 141 United Insurance Co.(Nepal)Ltd. 3 905 905 905 234 211,770 915
142 142 Vibor Bikas Bank Limited 7 158 152 156 710 109,510 161
143 143 Western Development Bank Limited 35 320 311 313 7,631 2,402,497 318
144 144 Yeti Development Bank Limited 22 139 132 139 14,355 1,921,511 134
Difference_Rs.
140 -13
141 -10
142 -5
143 -5
144 5
Here's one quick approach. Note that the site uses a POST request to send the date to the server.
library(rvest)
library(httr)
page <- "http://www.nepalstock.com/datanepse/previous.php" %>%
POST(body = list(Date = "2015-02-01")) %>%
html()
page %>%
html_node(".dataTable") %>%
html_table(header = TRUE)