Web scraping HTML in R

I want to get the URL list from scraping http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm, like this:
[1] "P-Obama-Inaugural-Speech-Inauguration.htm"
[2] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
and this is my code:
library(XML)
url = "http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm"
doc = htmlTreeParse(url, useInternalNodes = T)
url.list = xpathSApply(doc, "//a[contains(@href, 'htm')]")
The problem is that I want to unlist() url.list so I can strsplit() it, but it doesn't unlist.

One more step ought to do it (just need to get the href attribute):
library(XML)
url <- "http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm"
doc <- htmlTreeParse(url, useInternalNodes=TRUE)
url.list <- xpathSApply(doc, "//a[contains(@href, 'htm')]")
hrefs <- gsub("^/", "", sapply(url.list, xmlGetAttr, "href"))
head(hrefs, 6)
## [1] "P-Obama-Inaugural-Speech-Inauguration.htm"
## [2] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
## [3] "E11-Barack-Obama-Election-Night-Victory-Speech-Grant-Park-Illinois-November-4-2008.htm"
## [4] "E-Barack-Obama-Speech-Manassas-Virgina-Last-Rally-2008-Election.htm"
## [5] "E10-Barack-Obama-The-American-Promise-Acceptance-Speech-at-the-Democratic-Convention-Mile-High-Stadium--Denver-Colorado-August-28-2008.htm"
## [6] "E10-Barack-Obama-The-American-Promise-Acceptance-Speech-at-the-Democratic-Convention-Mile-High-Stadium--Denver-Colorado-August-28-2008.htm"
free(doc)
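Since hrefs is already a plain character vector, there is nothing left to unlist(); strsplit() works on it directly. A hypothetical example, splitting each file name on its hyphens:
parts <- strsplit(hrefs, "-")  # one character vector of name parts per URL
head(parts, 1)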
UPDATE Obligatory rvest + dplyr way:
library(rvest)
library(dplyr)
speeches <- read_html("http://obamaspeeches.com/P-Obama-Inaugural-Speech-Inauguration.htm")
speeches %>% html_nodes("a[href*=htm]") %>% html_attr("href") %>% head(6)
## same output as above


No connection in R

I've been trying to learn web scraping from an online course, and they give the following as an example:
library(rvest)
url <- "https://www.canada.ca/en/employment-social-development/services/labour-relations/international/agreements.html"
website <- read_html(url)
treaties_links <- website %>% html_nodes("li") %>% html_nodes("a") %>% html_attr("href")
treaties_links <- treaties_links[23:30]
treaties_links_full <- lapply(treaties_links, function(x) paste("https://www.canada.ca", x, sep = ""))
treaties_links_full[8] <- treaties_links[8]
treaty_texts <- lapply(treaties_links_full, function(x) read_html(x))
When I get to the last line, it returns an error:
Error in open.connection(x, "rb") :
Could not resolve host: www.canada.cahttp
Your error is in your lapply() call. If you print treaties_links, you will see that they are not all internal links (i.e. links starting with /); some are links to other domains:
print(treaties_links)
[1] "/en/employment-social-development/services/labour-relations/international/agreements/chile.html"
[2] "/en/employment-social-development/services/labour-relations/international/agreements/costa-rica.html"
[3] "/en/employment-social-development/services/labour-relations/international/agreements/peru.html"
[4] "/en/employment-social-development/services/labour-relations/international/agreements/colombia.html"
[5] "/en/employment-social-development/services/labour-relations/international/agreements/jordan.html"
[6] "/en/employment-social-development/services/labour-relations/international/agreements/panama.html"
[7] "http://www.international.gc.ca/trade-agreements-accords-commerciaux/agr-acc/honduras/labour-travail.aspx?lang=eng"
[8] "http://international.gc.ca/trade-commerce/assets/pdfs/agreements-accords/korea-coree/18_CKFTA_EN.pdf"
This means that when you are running paste("https://www.canada.ca",x,sep="") on e.g. link 7, you get:
"https://www.canada.cahttp://www.international.gc.ca/trade-agreements-accords-commerciaux/agr-acc/honduras/labour-travail.aspx?lang=eng"
Assuming you want to keep that link, you might change your lapply() to:
treaties_links_full <- lapply(
  treaties_links,
  function(x) {
    ifelse(
      substr(x, 1, 1) == "/",
      paste("https://www.canada.ca", x, sep = ""),
      x
    )
  }
)
This will only prepend "https://www.canada.ca" to the links within that domain.
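If you would rather not hand-roll the prefix test, a more compact sketch (assuming the xml2 package, which rvest already depends on, is available) is to resolve every href against the site's base URL with xml2::url_absolute(); links that are already absolute pass through unchanged:
library(xml2)
# Relative links get "https://www.canada.ca" prepended; absolute links are left as-is
treaties_links_full <- url_absolute(treaties_links, "https://www.canada.ca")
Note that this returns a character vector rather than a list, which the subsequent lapply() over read_html() handles just as well.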

stop web scraper from showing 404 error in R

I am trying to scrape journal articles from the Oxford Academic website (academic.oup.com).
library(rvest)
library(RCurl)
library(XML)
library(stringr)
# Get the number of result pages for a search URL
getPageNumber <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  results_per_page <- length(parsedDocument %>% html_nodes(".sr-list"))
  total_results <- parsedDocument %>%
    toString() %>%
    str_match(., 'num_results":"(.*?)"') %>%
    .[, 2] %>%
    as.integer()
  pageNumber <- tryCatch(ceiling(total_results / results_per_page), error = function(e) {1})
  return(pageNumber)
}
# Get all articles on a results page via their DOIs
getAllArticles <- function(URL) {
  parsedDocument <- read_html(URL)
  findLocationDiv <- html_nodes(parsedDocument, 'div')
  foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(foundClass)))
  DOImain <- "https://doi.org/10.1093/dnares/"
  fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
  return(fullDOI)
}
# Get the title of an article
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\\r\\n\\s+", "", .) %>%
    trimws(.)
  Title <- ifelse(is.na(Title), "No", Title)
  return(Title)
}
# Get the authors of an article
Authors <- function(parsedDocument) {
  Authors <- parsedDocument %>%
    html_node("a.linked-name") %>%
    html_text()
  return(Authors)
}
# Main function; takes the publication year as its parameter
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage - 1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
    DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      print(URLallArticles)
      for (j in 1:length(URLallArticles)) {
        parsedDocument <- read_html(URLallArticles[j])
        paste(parsedDocument)
        # need work on getting full text
        # allData <- data.frame("Full text" = FullText(parsedDocument), stringsAsFactors = FALSE)
        # scraped items that are good:
        # "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publish Date" = PublicationDate(parsedDocument), "Abstract" = Abstract(parsedDocument), "Keywords" = Keywords(parsedDocument)
        allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 1994 to present")
  }
}
##################### Main function test
findURL(2015)
The code shows a 404 error.
I believe the problem is in getAllArticles: the last URL it produces is malformed. I've tried using a tryCatch to suppress the error but haven't been successful. It may also be my logic.
The output for the year 2015 is:
[1] "https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F2015%20TO%2012%2F31%2F2015&fl_SiteID=5275&page="
[1] "https://doi.org/10.1093/dnares/dsv028"
[2] "https://doi.org/10.1093/dnares/dsv027"
[3] "https://doi.org/10.1093/dnares/dsv029"
[4] "https://doi.org/10.1093/dnares/dsv030"
[1] "https://doi.org/10.1093/dnares/dsv022"
[1] "https://doi.org/10.1093/dnares/dsv024"
[2] "https://doi.org/10.1093/dnares/dsv025"
[3] "https://doi.org/10.1093/dnares/dsv026"
[4] "https://doi.org/10.1093/dnares/dsv021"
[5] "https://doi.org/10.1093/dnares/dsv023"
[1] "https://doi.org/10.1093/dnares/dsv020"
[2] "https://doi.org/10.1093/dnares/dsv019"
[3] "https://doi.org/10.1093/dnares/dsv017"
[1] "https://doi.org/10.1093/dnares/dsv018"
[2] "https://doi.org/10.1093/dnares/dsv015"
[1] "https://doi.org/10.1093/dnares/dsv013"
[2] "https://doi.org/10.1093/dnares/dsv016"
[3] "https://doi.org/10.1093/dnares/dsv014"
[1] "https://doi.org/10.1093/dnares/dsv012"
[2] "https://doi.org/10.1093/dnares/dsv010"
[1] "https://doi.org/10.1093/dnares/dsv011"
[2] "https://doi.org/10.1093/dnares/dsv009"
[3] "https://doi.org/10.1093/dnares/dsv005"
[1] "https://doi.org/10.1093/dnares/dsv008"
[2] "https://doi.org/10.1093/dnares/dsv007"
[3] "https://doi.org/10.1093/dnares/dsv004"
[1] "https://doi.org/10.1093/dnares/dsv006"
[2] "https://doi.org/10.1093/dnares/dsv002"
[3] "https://doi.org/10.1093/dnares/dsv003"
[4] "https://doi.org/10.1093/dnares/dsv001"
[1] "https://doi.org/10.1093/dnares/dsu047"
[2] "https://doi.org/10.1093/dnares/dsu045"
[3] "https://doi.org/10.1093/dnares/dsu046"
[1] "https://doi.org/10.1093/dnares/dsu044"
[2] "https://doi.org/10.1093/dnares/dsu041"
[3] "https://doi.org/10.1093/dnares/dsu038"
[4] "https://doi.org/10.1093/dnares/dsu040"
[5] "https://doi.org/10.1093/dnares/dsu042"
[6] "https://doi.org/10.1093/dnares/dsu043"
[1] "https://doi.org/10.1093/dnares/"
Error in open.connection(x, "rb") : HTTP error 404.
In addition: Warning message:
In for (i in seq_along(specs)) { :
Error in open.connection(x, "rb") : HTTP error 404.
A year like 1994, for example, runs without an error, but years like 2015 and 2016 give this error.
You can check for a valid URL and add an exception:
if (url.exists(URLallArticles[j])) {  # url.exists() comes from the RCurl package loaded above
  parsedDocument <- read_html(URLallArticles[j])
  paste(parsedDocument)
  # need work on getting full text
  # allData <- data.frame("Full text" = FullText(parsedDocument), stringsAsFactors = FALSE)
  # scraped items that are good:
  # "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publish Date" = PublicationDate(parsedDocument), "Abstract" = Abstract(parsedDocument), "Keywords" = Keywords(parsedDocument)
  allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
}
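If you prefer to avoid RCurl, a sketch of an alternative (assuming the httr package is installed and the same loop variables as above) is to issue a HEAD request first and only parse pages that answer with HTTP 200:
resp <- tryCatch(httr::HEAD(URLallArticles[j]), error = function(e) NULL)
if (!is.null(resp) && httr::status_code(resp) == 200) {
  # The DOI resolved to a real article page, so it is safe to parse it
  parsedDocument <- read_html(URLallArticles[j])
  allData <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
}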

How can I time scraping news stories from a list of urls with R?

I am trying to download the text of newspaper articles for textual analysis using R. I have a large list of URLs to individual articles and want to use rvest to extract each article's text and title and convert them into a data frame.
As an example, I have a subset of my dataset with articles from The Guardian:
> items$link[1:8]
[1] "https://www.theguardian.com/uk-news/2019/nov/16/concerns-raised-cladding-bolton-student-building-fire"
[2] "https://www.theguardian.com/uk-news/2019/nov/16/top-lawyer-calls-prince-andrew-bbc-interview-catastrophic-error"
[3] "https://www.theguardian.com/politics/live/2019/nov/16/general-election-labour-meet-decide-manifesto-clause-v-live-news"
[4] "https://www.theguardian.com/politics/2019/nov/16/priti-patel-block-rescue-british-isis-children"
[5] "https://www.theguardian.com/politics/2019/nov/16/police-assessing-claims-that-tories-offered-peerages-to-brexit-party"
[6] "https://www.theguardian.com/world/2019/nov/16/paris-police-fire-teargas-on-anniversary-of-gilets-jaunes-protests"
[7] "https://www.theguardian.com/us-news/2019/nov/16/trump-personally-kept-pressure-ukraine-impeachment-inquiry-witness-david-holmes-diplomat"
[8] "https://www.theguardian.com/world/2019/nov/16/hong-kong-chinese-troops-deployed-to-help-clear-roadblocks"
My code so far is:
## SETUP ##
rm(list=ls())
library(tidyverse)
library(rvest)
library(stringr)
library(readtext)
library(quanteda)
library(beepr)
setwd("uk")
## Functions ##
parse_texts <- function(nod){
  body <- str_squish(as.character(nod) %>% read_html() %>%
                       html_nodes('.js-article__body > p') %>% # collects all text in the article
                       html_text())
  one_body <- paste(body, collapse = " ") # puts all of the text together
  data.frame(title = str_squish(nod %>% read_html() %>%
                                  html_node('.content__headline') %>%
                                  html_text()),
             date_time = str_squish(nod %>% read_html() %>%
                                      html_node('.content__dateline-wpd--modified') %>%
                                      html_text()),
             text = one_body,
             stringsAsFactors = FALSE)
}
#extract file text
test_df <- lapply(items$link[1:5], parse_texts) %>% bind_rows()
This works, for the most part. My problem is that I have thousands of URLs in my data. How can I automate a script that will slowly work through this list?
Thanks to Dave2e for answering the question.
I added Sys.sleep(2) to the parse_texts function and was able to go through my list of URLs.
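A sketch of what that change might look like (assumptions: the Guardian CSS selectors from the original function still match, and a two-second pause per article is acceptable). Reading each page once and reusing the parsed result also avoids downloading every URL three times, as the original function does:
parse_texts <- function(nod) {
  Sys.sleep(2)                          # pause before each request to throttle the scraper
  page <- read_html(as.character(nod))  # download and parse the article once
  body <- page %>% html_nodes('.js-article__body > p') %>% html_text() %>% str_squish()
  data.frame(
    title     = page %>% html_node('.content__headline') %>% html_text() %>% str_squish(),
    date_time = page %>% html_node('.content__dateline-wpd--modified') %>% html_text() %>% str_squish(),
    text      = paste(body, collapse = " "),
    stringsAsFactors = FALSE
  )
}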

R Webscraping RCurl and httr Content

I'm learning a bit about web scraping and I have a small doubt regarding two packages (httr and RCurl). I'm trying to get a journal's ISSN from the ResearchGate website, and I ran into a situation: when extracting the site's content, I get the ISSN with the RCurl package, but with httr my function returns NULL. Could anyone tell me why? In my opinion both approaches should work. The code follows below.
library(rvest)
library(httr)
library(RCurl)
library(stringr)  # for str_to_title() and str_split() used below
url <- "https://www.researchgate.net/journal/0730-0301_Acm_Transactions_On_Graphics"
########
# httr #
########
conexao <- GET(url)
conexao_status <- http_status(conexao)
conexao_status
content(conexao, as = "text", encoding = "utf-8") %>% read_html() -> webpage1
ISSN <- webpage1 %>%
html_nodes(xpath = '//*/div/div[2]/div[1]/div[1]/table[2]/tbody/tr[7]/td') %>%
html_text %>%
str_to_title() %>%
str_split(" ") %>%
unlist
ISSN
########
# RCurl #
########
options(RCurlOptions = list(verbose = FALSE,
capath = system.file("CurlSSL", "cacert.pem", package = "RCurl"),
ssl.verifypeer = FALSE))
webpage <- getURLContent(url) %>% read_html()
ISSN <- webpage %>%
html_nodes(xpath = '//*/div/div[2]/div[1]/div[1]/table[2]/tbody/tr[7]/td') %>%
html_text %>%
str_to_title() %>%
str_split(" ") %>%
unlist
ISSN
sessionInfo()

R version 3.5.0 (2018-04-23)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

Matrix products: default

locale:
[1] LC_COLLATE=Portuguese_Brazil.1252  LC_CTYPE=Portuguese_Brazil.1252    LC_MONETARY=Portuguese_Brazil.1252
[4] LC_NUMERIC=C                       LC_TIME=Portuguese_Brazil.1252

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] testit_0.7      dplyr_0.7.4     progress_1.1.2  readxl_1.1.0    stringr_1.3.0   RCurl_1.95-4.10 bitops_1.0-6
[8] httr_1.3.1      rvest_0.3.2     xml2_1.2.0      jsonlite_1.5

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.16      bindr_0.1.1       magrittr_1.5      R6_2.2.2          rlang_0.2.0       tools_3.5.0
 [7] yaml_2.1.19       assertthat_0.2.0  tibble_1.4.2      bindrcpp_0.2.2    curl_3.2          glue_1.2.0
[13] stringi_1.1.7     pillar_1.2.2      compiler_3.5.0    cellranger_1.1.0  prettyunits_1.0.2 pkgconfig_2.0.1
Because the content type is JSON and not HTML, you can't use read_html() on it:
> conexao
Response [https://www.researchgate.net/journal/0730-0301_Acm_Transactions_On_Graphics]
Date: 2018-06-02 03:15
Status: 200
Content-Type: application/json; charset=utf-8
Size: 328 kB
Use fromJSON() instead to extract the ISSN:
library(jsonlite)
result <- fromJSON(content(conexao, as = "text", encoding = "utf-8") )
result$result$data$journalFullInfo$data$issn
result:
> result$result$data$journalFullInfo$data$issn
[1] "0730-0301"

When scraping with rvest expected html_node not appearing

The ITTO website produces a table of timber products and flows directly under the search form once the query is submitted (on the same page). Using information I obtained from Chrome's SelectorGadget, I expect the table cells to appear as the CSS element "td". Using rvest to scrape information on Albania for 2014:
library(rvest)
session <- html_session("http://www.itto.int/annual_review_output/?mode=searchdata")
form <- html_form(session)[[2]]
form <- set_values(form, "countries[]" = "8", "products[]" = "1" ,"flows[]" = "1", "years[]" = "2014")
query <- submit_form(session, form, submit = NULL)
page <- read_html(query) %>% html_nodes("td")
page
This results in no "td" nodes being found:
{xml_nodeset (0)}
Examining other elements of the page with html_nodes() suggests that submit_form() otherwise performed as expected.
So my question is: where is the expected table?
It might be easier (in the long run) to scrape the select box options and just feed the POST call directly:
library(httr)
library(rvest)
res <- POST(url = "http://www.itto.int/annual_review_output/?mode=searchdata",
body = list(`countries[]` = "76",
`products[]` = "1", `flows[]` = "1",
`years[]` = "2014"),
encode = "form")
pg <- content(res, as="parsed")
html_nodes(pg, "td")
## {xml_nodeset (7)}
## [1] <td>Brazil</td>
## [2] <td>Ind. roundwood</td>
## [3] <td>Exports Quantity</td>
## [4] <td>1000 m3</td>
## [5] <td>2014</td>
## [6] <td style="text-align:right;">204.59</td>
## [7] <td>I</td>