Stop web scraper from showing a 404 error in R

I am trying to scrape journal articles from the Oxford Academic website.
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# Getting the number of pages
getPageNumber <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  return(pageNumber)
}
# Getting all articles based on their DOI
getAllArticles <- function(URL) {
  parsedDocument <- read_html(URL)
  findLocationDiv <- html_nodes(parsedDocument, 'div')
  foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(foundClass)))
  DOImain <- "https://doi.org/10.1093/dnares/"
  fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
  return(fullDOI)
}
# Get the title of each journal article
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+", "", .) %>%
    trimws(.)
  Title <- ifelse(is.na(Title), "No", Title)
  return(Title)
}
# Getting the authors of each article
Authors <- function(parsedDocument) {
  Authors <- parsedDocument %>%
    html_node("a.linked-name") %>%
    html_text()
  return(Authors)
}
# Main function; takes the publication year as its parameter
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage - 1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
    DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      print(URLallArticles)
      for (j in 1:(length(URLallArticles))) {
        parsedDocument <- read_html(URLallArticles[j])
        paste(parsedDocument)
        # Need work on getting the full text
        # allData <- data.frame("Full text" = FullText(parsedDocument), stringsAsFactors = FALSE)
        # Scraped items that work:
        # "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publish Date" = PublicationDate(parsedDocument), "Abstract" = Abstract(parsedDocument), "Keywords" = Keywords(parsedDocument)
        allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 1994 to the present")
  }
}
##################### Main function test
findURL(2015)
The code shows a 404 error. I believe the problem is in getAllArticles: the last URL it produces is bad. I've tried using tryCatch to stop the error from being thrown, but haven't been successful. It may also be a flaw in my logic.
The output for the year 2015 is:
[1] "https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2015%20TO%2012%2F31%2F2015&fl_SiteID=5275&page="
[1] "https://doi.org/10.1093/dnares/dsv028"
[2] "https://doi.org/10.1093/dnares/dsv027"
[3] "https://doi.org/10.1093/dnares/dsv029"
[4] "https://doi.org/10.1093/dnares/dsv030"
[1] "https://doi.org/10.1093/dnares/dsv022"
[1] "https://doi.org/10.1093/dnares/dsv024"
[2] "https://doi.org/10.1093/dnares/dsv025"
[3] "https://doi.org/10.1093/dnares/dsv026"
[4] "https://doi.org/10.1093/dnares/dsv021"
[5] "https://doi.org/10.1093/dnares/dsv023"
[1] "https://doi.org/10.1093/dnares/dsv020"
[2] "https://doi.org/10.1093/dnares/dsv019"
[3] "https://doi.org/10.1093/dnares/dsv017"
[1] "https://doi.org/10.1093/dnares/dsv018"
[2] "https://doi.org/10.1093/dnares/dsv015"
[1] "https://doi.org/10.1093/dnares/dsv013"
[2] "https://doi.org/10.1093/dnares/dsv016"
[3] "https://doi.org/10.1093/dnares/dsv014"
[1] "https://doi.org/10.1093/dnares/dsv012"
[2] "https://doi.org/10.1093/dnares/dsv010"
[1] "https://doi.org/10.1093/dnares/dsv011"
[2] "https://doi.org/10.1093/dnares/dsv009"
[3] "https://doi.org/10.1093/dnares/dsv005"
[1] "https://doi.org/10.1093/dnares/dsv008"
[2] "https://doi.org/10.1093/dnares/dsv007"
[3] "https://doi.org/10.1093/dnares/dsv004"
[1] "https://doi.org/10.1093/dnares/dsv006"
[2] "https://doi.org/10.1093/dnares/dsv002"
[3] "https://doi.org/10.1093/dnares/dsv003"
[4] "https://doi.org/10.1093/dnares/dsv001"
[1] "https://doi.org/10.1093/dnares/dsu047"
[2] "https://doi.org/10.1093/dnares/dsu045"
[3] "https://doi.org/10.1093/dnares/dsu046"
[1] "https://doi.org/10.1093/dnares/dsu044"
[2] "https://doi.org/10.1093/dnares/dsu041"
[3] "https://doi.org/10.1093/dnares/dsu038"
[4] "https://doi.org/10.1093/dnares/dsu040"
[5] "https://doi.org/10.1093/dnares/dsu042"
[6] "https://doi.org/10.1093/dnares/dsu043"
[1] "https://doi.org/10.1093/dnares/"
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_along(specs)) { :
Error in open.connection(x, "rb") : HTTP error 404.
A year like 1994, for example, runs without an error, but years like 2015 and 2016 have this error.

You can check for a valid URL and add an exception:
if (url.exists(URLallArticles[j])) {
  parsedDocument <- read_html(URLallArticles[j])
  paste(parsedDocument)
  # Need work on getting the full text
  # allData <- data.frame("Full text" = FullText(parsedDocument), stringsAsFactors = FALSE)
  # Scraped items that work:
  # "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publish Date" = PublicationDate(parsedDocument), "Abstract" = Abstract(parsedDocument), "Keywords" = Keywords(parsedDocument)
  allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
}
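Note that url.exists() comes from the RCurl package, which the question already loads. If you would rather not depend on a separate existence check, wrapping read_html() in tryCatch() and skipping any URL that fails to open achieves the same effect. A minimal sketch, assuming the rest of the inner loop stays as above (the helper name safe_read_html is made up for illustration):
# Hypothetical helper: returns NULL instead of raising an error on a bad URL
safe_read_html <- function(url) {
  tryCatch(read_html(url), error = function(e) NULL)
}
parsedDocument <- safe_read_html(URLallArticles[j])
if (!is.null(parsedDocument)) {
  allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
}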

Related

R program has stopped working possibly because of Rdata file or R package

I built a web scraper that was working fine until it randomly stopped working. I thought it was because of my .Rdata files, but I deleted the ones I found. I now get an error in my first function because I can't access the URL correctly anymore.
# Getting the number of pages
getPageNumber <- function(URL) {
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  browser()
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  return(pageNumber)
}
getPageNumber("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2010%20TO%2012%2F31%2F2010&fl_SiteID=5275&page=")
The output of getPageNumber("academic.oup.com/dnaresearch/…") should be [1] 9, but instead I get NA.

R web scraping function getPageNumber error

I am building a web scraper and trying to understand why my getPageNumber function does not work. The function worked last night, but tonight I have been getting the wrong output.
library(rvest)
library(RCurl)
library(XML)
library(stringr)
getPageNumber <- function(URL) {
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  return(pageNumber)
}
getPageNumber("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2010%20TO%2012%2F31%2F2010&fl_SiteID=5275&page=")
The output I am getting is NA, when it should be a numeric value.
To troubleshoot your problem, do this:
getPageNumber <- function(URL) {
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  browser() ## <-- inject this here to stop when you get to this point
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  return(pageNumber)
}
getPageNumber("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2010%20TO%2012%2F31%2F2010&fl_SiteID=5275&page=")
This is what my R session looks like (I typed total_results and results_per_page there myself to see what they contained):
> getPageNumber("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2010%20TO%2012%2F31%2F2010&fl_SiteID=5275&page=")
Called from: getPageNumber("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2010%20TO%2012%2F31%2F2010&fl_SiteID=5275&page=")
Browse[1]> debug at #10: pageNumber <- tryCatch(ceiling(total_results/results_per_page),
error = function(e) {
1
})
Browse[2]> total_results
[1] 176
Browse[2]> results_per_page
[1] 20
Browse[2]>
debug at #11: return(pageNumber)
Browse[2]>
[1] 9
>
Other things you can do while you are inside the running function with browser():
- Put the HTML somewhere you can inspect it later: cat(toString(parsedDocument), file="~/foo.html")
- Look at: parsedDocument %>% toString() %>% str_match('num_results":"(.*?)"')
If you still experience NA as output, try changing your function code to this:
getPageNumber <- function(URL) {
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  if (is.na(pageNumber)) {
    cat("PAGENUMBER IS NA!\n")
    cat("total_results is: ", total_results, "\n")
    cat("results_per_page is: ", results_per_page, "\n")
    cat(toString(parsedDocument), file = "~/foo.html")
    cat("The HTML is stored here: ", normalizePath("~/foo.html"))
    cat("Open it in a text editor and investigate why it went wrong\n")
  }
  return(pageNumber)
}
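One detail worth keeping in mind: tryCatch() only triggers on actual errors, and ceiling(NA / results_per_page) simply returns NA without erroring, so the error = function(e) {1} fallback never runs when the num_results regex finds nothing. A minimal sketch of an explicit fallback instead:
pageNumber <- ceiling(total_results / results_per_page)
if (is.na(pageNumber) || !is.finite(pageNumber)) {
  pageNumber <- 1  # fall back to a single page when the count could not be parsed
}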

How can I get the data of the second web page?

I am trying to get data from a web page in R using the rvest package: https://etfdb.com/stock/AAPL/
But no matter what I tried, I can only get the table on the first page. Could anybody help me with this? Thank you so much.
See the code below. tb1 and tb2 are the same! That's weird.
url1 <- "https://etfdb.com/stock/AAPL/#etfs&sort_name=weighting&sort_order=desc&page=1"
url2 <- "https://etfdb.com/stock/AAPL/#etfs&sort_name=weighting&sort_order=desc&page=2"
tbs1 <- rvest::html_nodes(xml2::read_html(url1), "table")
tbs2 <- rvest::html_nodes(xml2::read_html(url2), "table")
tb1 <- rvest::html_table(tbs1[1])[[1]]
tb2 <- rvest::html_table(tbs2[1])[[1]]
Everything after # in those URLs is a fragment that the browser handles client-side and never sends to the server, which is why both URLs return the same HTML. The website instead sends GET requests that return JSON and uses that to populate the table. After some attempts, this is the code I came up with to deal with the JSON data (not beautiful code, but it works):
library(rjson)
library(rvest)
library(writexl)
lastpage <- 9;
df <- data.frame();
for (i in 1:lastpage){
  x <- fromJSON(file = paste("https://etfdb.com/data_set/?tm=40274&cond={%22by_stock%22:25}&no_null_sort=&count_by_id=true&limit=25&sort=weighting&order=desc&limit=25&offset=", 25 * (i-1), sep = ""));
  x <- x[2][[1]];
  pg_df <- data.frame(matrix(unlist(x), nrow=length(x), byrow=T), stringsAsFactors=FALSE);
  df <- rbind(df, pg_df);
}
for (i in 1:nrow(df)){
  df$X1[i] <- read_html(df$X1[i]) %>% html_text(trim = TRUE);
  df$X3[i] <- read_html(df$X3[i]) %>% html_text(trim = TRUE);
  df$X5[i] <- read_html(df$X5[i]) %>% html_text(trim = TRUE);
}
df <- data.frame(df$X1, df$X3, df$X5, df$X7, df$X9);
colnames(df) <- c("Ticker", "ETF", "ETFdb.com Category", "Expense Ratio", "Weighting");
write_xlsx(
  df,
  path = "stock.xlsx",
  col_names = TRUE,
  format_headers = TRUE,
  use_zip64 = FALSE
)
Update:
You can see the data source in the data-url attribute of the table element in the page's HTML. Here is updated code that makes this easier for you:
library(rjson)
library(rvest)
library(writexl)
stock_ticket <- "AAPL";
url <- paste("https://etfdb.com/stock/", stock_ticket, sep = "");
lastpage <- 9;
df <- data.frame();
data_url <- read_html(url) %>% html_node(xpath = "//table[@id='etfs']") %>% html_attr("data-url");
for (i in 1:lastpage){
  x <- fromJSON(file = paste("https://etfdb.com", data_url, "&offset=", 25 * (i-1), sep = ""));
  x <- x[2][[1]];
  pg_df <- data.frame(matrix(unlist(x), nrow=length(x), byrow=T), stringsAsFactors=FALSE);
  df <- rbind(df, pg_df);
}
for (i in 1:nrow(df)){
  df$X1[i] <- read_html(df$X1[i]) %>% html_text(trim = TRUE);
  df$X3[i] <- read_html(df$X3[i]) %>% html_text(trim = TRUE);
  df$X5[i] <- read_html(df$X5[i]) %>% html_text(trim = TRUE);
}
df <- data.frame(df$X1, df$X3, df$X5, df$X7, df$X9);
colnames(df) <- c("Ticker", "ETF", "ETFdb.com Category", "Expense Ratio", "Weighting");
write_xlsx(
  df,
  path = "stock.xlsx",
  col_names = TRUE,
  format_headers = TRUE,
  use_zip64 = FALSE
)
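If you want to avoid hard-coding lastpage <- 9, one option is to keep requesting pages until the endpoint returns no more rows. An untested sketch, reusing the same data_url and the x[2][[1]] row structure from the code above:
df <- data.frame()
offset <- 0
repeat {
  x <- fromJSON(file = paste0("https://etfdb.com", data_url, "&offset=", offset))
  rows <- x[2][[1]]
  if (length(rows) == 0) break  # no rows returned: we are past the last page
  df <- rbind(df, data.frame(matrix(unlist(rows), nrow = length(rows), byrow = TRUE),
                             stringsAsFactors = FALSE))
  offset <- offset + 25
}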

Scraping a table with multiple headers in R using any package? (XML, RCurl, rlist, htmltab, rvest, etc.)

I am attempting to scrape this table:
http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1
Here are all my attempts. None of them get even close to extracting any information. Am I missing something?
library("rvest")
library("tidyverse")
# METHOD 1
url <- "http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1"
data <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="t1"]/tbody/tr[1]') %>%
  html_table()
data <- data[[1]]
# METHOD 2
library(XML)
library(RCurl)
library(rlist)
theurl <- getURL("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",.opts = list(ssl.verifypeer = FALSE) )
tables <- readHTMLTable(theurl)
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
tables[[which.max(n.rows)]]
# METHOD 3
library(htmltab)
tab <- htmltab("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1",
               which = '//*[@id="t1"]/tbody/tr[4]',
               header = '//*[@id="t1"]/tbody/tr[3]',
               rm_nodata_cols = TRUE)
# METHOD 4
website <- read_html("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1")
scraped <- website %>%
  html_nodes("table") %>%
  .[(2)] %>%
  html_table(fill = TRUE) %>%
  `[[`(1)
# METHOD 5
getHrefs <- function(node, encoding)
  if (!is.null(xmlChildren(node)$a)) {
    paste(xpathSApply(node, './a', xmlGetAttr, "href"), collapse = ",")
  } else {
    return(xmlValue(xmlChildren(node)$text))
  }
data <- ( readHTMLTable("http://www.hko.gov.hk/cis/dailyExtract_e.htm?y=1999&m=1", which = 1, elFun = getHrefs) )
The expected result should be the 12 column names in the table and the data below them.

web scraping html in R

I want to get the list of URLs by scraping http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm, like this:
[1] "P-Obama-Inaugural-Speech-Inauguration.htm"
[2] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
and this is my code:
library(XML)
url = "http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm"
doc = htmlTreeParse(url, useInternalNodes = T)
url.list = xpathSApply(doc, "//a[contains(@href, 'htm')]")
The problem is that I want to unlist() url.list so I can strsplit() it, but it doesn't unlist.
One more step ought to do it (just need to get the href attribute):
library(XML)
url <- "http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm"
doc <- htmlTreeParse(url, useInternalNodes=TRUE)
url.list <- xpathSApply(doc, "//a[contains(@href, 'htm')]")
hrefs <- gsub("^/", "", sapply(url.list, xmlGetAttr, "href"))
head(hrefs, 6)
## [1] "P-Obama-Inaugural-Speech-Inauguration.htm"
## [2] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
## [3] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
## [4] "E-Barack-Obama-Speech-Manassas-Virgina-Last-Rally-2008-Election.htm"
## [5] "E10-Barack-Obama-The-American-Promise-Acceptance-Speech-at-the-Democratic-Convention-Mile-High-Stadium--Denver-Colorado-August-28-2008.htm"
## [6] "E10-Barack-Obama-The-American-Promise-Acceptance-Speech-at-the-Democratic-Convention-Mile-High-Stadium--Denver-Colorado-August-28-2008.htm"
free(doc)
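Since the original goal was to strsplit() the links, the hrefs character vector above can be split directly. A small usage sketch, splitting on "-" just as an example:
parts <- strsplit(hrefs, "-", fixed = TRUE)
head(parts, 2)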
UPDATE Obligatory rvest + dplyr way:
library(rvest)
library(dplyr)
speeches <- html("http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm")
speeches %>% html_nodes("a[href*=htm]") %>% html_attr("href") %>% head(6)
## same output as above