read html table in R

I'm trying to read head-to-head (h2h) data from the Tennis Abstract webpage in R using the XML package.
I want the big h2h table at the bottom; its CSS selector is:
html > body > div#main > table#maintable > tbody > tr > td#stats > table#matches.tablesorter
I have tried following the suggestions from scraping html into r data frame.
I believe the difficulty is caused by a table nested within a table.
url = "http://www.tennisabstract.com/cgi-bin/player.cgi?p=NovakDjokovic&f=ACareerqqs00&view=h2h"
library(RCurl)
library(XML)
webpage <- getURL(url)
webpage <- readLines(tc <- textConnection(webpage)); close(tc) # doesn't have the h2h table
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
results <- xpathSApply(pagetree, "//*/table[@class='tablesorter']/tr/td", xmlValue) # gives NULL
tables <- readHTMLTable(url, stringsAsFactors = TRUE) # has 4 tables, not the desired one
I'm new to HTML parsing, so please bear with me.

This is not the most efficient but it will do the job.
library(rvest)
library(RSelenium)
tennis.url <- "http://www.tennisabstract.com/cgi-bin/player.cgi?p=NovakDjokovic&f=ACareerqqs00&view=h2h"
checkForServer(); startServer()
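# note: checkForServer()/startServer() come from older RSelenium releases; newer versions manage the Selenium server differently (e.g. via rsDriver())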
remDrv <- remoteDriver()
remDrv$open()
remDrv$navigate(tennis.url)
tennis.html <- html(remDrv$getPageSource()[[1]])
remDrv$close()
H2Hs <- tennis.html %>% html_nodes(".h2hclick") %>% html_text %>% as.numeric
Opponent <- tennis.html %>% html_nodes("#matches a") %>% html_text
Country <- tennis.html %>% html_nodes("a+ span") %>% html_text %>% gsub("[^(A-Z)]", "", .)
W <- tennis.html %>% html_nodes("#matches td:nth-child(3)") %>% .[-1] %>% html_text %>% as.numeric
L <- tennis.html %>% html_nodes("#matches td:nth-child(4)") %>% .[-1] %>% html_text %>% as.numeric
Win.Prc <- tennis.html %>% html_nodes("#matches td:nth-child(5)") %>% .[-1] %>% html_text
And so on for the rest: you just need to increment the number in nth-child() and then create a data frame, as sketched below.
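For example, a minimal sketch of that final step, assuming the remaining columns are pulled the same way and all the vectors end up the same length:
# combine the scraped vectors into one data frame (column names are illustrative)
h2h <- data.frame(H2H = H2Hs, Opponent, Country, W, L, Win.Prc,
                  stringsAsFactors = FALSE)
head(h2h)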

Webscraping between two tags

I am trying to scrape the following page(s):
https://mywebsite.com
In particular, I would like to get the name of each entry. I noticed that the text I am interested in (MY TEXT) always sits between these two tags: <div class="title"> MY TEXT
I know how to search for these tags individually:
# load libraries
library(rvest)
library(httr)
library(XML)
# set up page
url <- "https://www.mywebsite.com"
page <- read_html(url)
#option 1
b = page %>% html_nodes("title")
option1 <- b %>% html_text() %>% strsplit("\\n")
#option 2
b = page %>% html_nodes("a")
option2 <- b %>% html_text() %>% strsplit("\\n")
Is there some way I could have specified the html_nodes() argument so that it picked up "MY TEXT", i.e. scraped between <div class="title"> and </a>:
<div class="title"> MY TEXT
Scraping of pages 1:10
library(tidyverse)
library(rvest)
my_function <- function(page_n) {
  cat("Scraping page ", page_n, "\n")
  page <- paste0("https://www.dentistsearch.ca/search-doctor/",
                 page_n, "?category=0&services=0&province=55&city=&k=") %>% read_html
  tibble(title = page %>%
           html_elements(".title a") %>%
           html_text2(),
         address = page %>%
           html_elements(".marker") %>%
           html_text2(),
         page = page_n)
}
df <- map_dfr(1:10, my_function)
You can use the xpath argument inside html_elements to locate each a tag inside a div with class "title".
Here's a complete reproducible example.
library(rvest)
"https://www.mywebsite.ca/extension1/" %>%
paste0("2?extension2") %>%
read_html() %>%
html_elements(xpath = "//div[#class='title']/a") %>%
html_text()
Or to get all entries on the first 10 pages:
library(rvest)
unlist(lapply(1:10, function(page){
  "https://www.mywebsite.ca/extension1/" %>%
    paste0(page, "?extension2") %>%
    read_html() %>%
    html_elements(xpath = "//div[@class='title']/a") %>%
    html_text()
}))
Created on 2022-07-26 by the reprex package (v2.0.1)

webscraping a pdf file using R

I've been web scraping articles in R from the Oxford journals and want to grab the full text of specific articles. All of the articles have a PDF link, so I've been trying to pull the PDF link and write the entire text to a CSV. The full text should fit into a single row, but the output in the CSV file spreads one article across 11 rows. How can I fix this issue?
The code is below:
####install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#for Fulltext to read pdf
####install.packages("pdftools")
library(pdftools)
fullText <- function(parsedDocument){
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>% html_attr('href')
  frontLink <- "https://academic.oup.com"
  # link of pdf
  pdfLink <- paste(frontLink, endLink, sep = "")
  # extract full text from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, sep = "\n")
  return(fulltext)
}
#############################################
# main function, taking the article's DOI URL as input
testFullText <- function(DOIurl){
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument), stringsAsFactors = FALSE)
  DNAresearch <- rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}
testFullText("https://doi.org/10.1093/dnares/dsm026")
This is how I would approach this task.
library(tidyverse)
library(rvest)
library(pdftools)
df <- data.frame(
  # you have a data.frame with a column where there are links to html research articles
  links_to_articles = c("https://doi.org/10.1093/dnares/dsm026", "https://doi.org/10.1093/dnares/dsm027")
) %>%
  # telling R to process each row separately (it is useful because functions such as read_html process one link rather than a vector of links)
  rowwise() %>%
  mutate(
    pdf_link = read_html(links_to_articles) %>%
      html_node('.article-pdfLink') %>%
      html_attr('href') %>%
      paste0("https://academic.oup.com", .),
    articles_txt = pdf_text(pdf_link) %>%
      paste0(collapse = " ")
  ) %>%
  ungroup()
# writing the csv
df %>%
  write_csv(file = "DNAresearch.csv")
Using your code, I would do:
####install.packages("rvest")
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#for Fulltext to read pdf
####install.packages("pdftools")
library(pdftools)
fullText <- function(parsedDocument){
  endLink <- parsedDocument %>%
    html_node('.article-pdfLink') %>% html_attr('href')
  frontLink <- "https://academic.oup.com"
  # link of pdf
  pdfLink <- paste(frontLink, endLink, sep = "")
  # extract full text from pdfLink
  pdfFullText <- pdf_text(pdfLink)
  fulltext <- paste(pdfFullText, collapse = " ") # here I changed sep to collapse
  return(fulltext)
}
#############################################
# main function, taking the article's DOI URL as input
testFullText <- function(DOIurl){
  parsedDocument <- read_html(DOIurl)
  DNAresearch <- data.frame()
  allData <- data.frame("Full Text" = fullText(parsedDocument) %>% str_squish(), stringsAsFactors = FALSE) # here I used str_squish to remove extra spaces
  DNAresearch <- rbind(DNAresearch, allData)
  write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
}
testFullText("https://doi.org/10.1093/dnares/dsm026")
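If you later want to process several articles, a minimal sketch along the same lines (reusing fullText() as fixed above; the second DOI is the one used in the other answer) collects the texts first and then writes a single CSV:
dois <- c("https://doi.org/10.1093/dnares/dsm026",
          "https://doi.org/10.1093/dnares/dsm027")
texts <- vapply(dois, function(doi) {
  fullText(read_html(doi)) %>% str_squish()
}, character(1))
write.csv(data.frame(Full.Text = texts, stringsAsFactors = FALSE),
          "DNAresearch.csv", row.names = FALSE)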

Remove href and/or deactivate anchored links when printing a PDF from HTML using xml2 and pagedown with R

I'm using R to extract hundreds of articles from a food blog and convert them to PDF. I'm 99% done, but when I print the final PDF, in-line hyperlinks have their URL written out right within the text. I do not want every link rendered as text in the PDF, and I believe I need to remove the href attributes from the HTML prior to printing with pagedown. Does anyone know how to do this? My example code below should get you to my PDF-creation loop for the first article. The initial portion pulls all of the URLs into a vector; the PDF-creation loop at the end is the part that needs this enhancement. Thanks.
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(downloader)
library(pagedown)
library(xml2)
library(htmltools)
#Specifying the url for desired website to be scraped
url1 <- paste0('https://www.foodrepublic.com/author/george-embiricos/page/', '1', '/')
#Reading the HTML code from the website
webpage1 <- read_html(url1)
# Pull the links for all articles on George's initial author page
dat <- html_attr(html_nodes(webpage1, 'a'), "href") %>%
  as_tibble() %>%
  filter(str_detect(value, "([0-9]{4})")) %>%
  unique() %>%
  rename(link = value)
dat <- head(dat, 10)
# Pull the links for all articles on George's 2nd-89th author page
for (i in 2:89) {
  url <- paste0('https://www.foodrepublic.com/author/george-embiricos/page/', i, '/')
  # Reading the HTML code from the website
  webpage <- read_html(url)
  links <- html_attr(html_nodes(webpage, 'a'), "href") %>%
    as_tibble() %>%
    filter(str_detect(value, "([0-9]{4})")) %>%
    unique() %>%
    rename(link = value)
  dat <- bind_rows(dat, links) %>%
    unique()
}
dat <- dat %>%
arrange(link)
dat <- tail(dat, 890)
articleUrls <- dat$link[1]
# Mac
# Windows
setwd("YOUR-WD")
# articleUrls <- articleUrls[1]
for(i in seq_along(articleUrls)) {
  filename <- str_extract(articleUrls[i], "[^/]+(?=/$|$)")
  a <- read_html(articleUrls[i])
  xml_remove(a %>% xml_find_all("aside"))
  xml_remove(a %>% xml_find_all("footer"))
  xml_remove(a %>% xml_find_all(xpath = "//*[contains(@class, 'article-related mb20')]"))
  xml_remove(a %>% xml_find_all(xpath = "//*[contains(@class, 'tags')]"))
  # xml_remove(a %>% xml_find_all("head") %>% xml2::xml_find_all("script"))
  xml_remove(a %>% xml2::xml_find_all("//script"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'ad box')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'newsletter-signup')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'article-footer')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'article-footer-sidebar')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'site-footer')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'sticky-newsletter')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'site-header')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, '.fb_iframe_widget')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, '_8f1i')]"))
  xml_remove(a %>% xml_find_all("//*[contains(@class, 'newsletter-toggle')]"))
  # xml_remove(a %>% xml_find_all("//*[contains(@class, 'articleBody')]"))
  # xml_remove(a %>% xml_find_all("//href='([^\"]*)'"))
  xml2::write_html(a, file = paste0("html/", filename, ".html"))
  tryCatch(pagedown::chrome_print(input = paste0("html/", filename, ".html"),
                                  output = paste0("pdf/", filename, ".pdf"),
                                  format = "pdf", timeout = 300, verbose = 0,
                                  wait = 20), error = function(e) paste("wrong"))
}
In the printed PDF, the "< >" portion containing the URL should not display; it should only say "King's Brew".
Try something like this:
library(dplyr)
library(xml2)
allHref <- a %>% xml_find_all("//a")
for (l in allHref) {
  cntnt <- l %>% xml_text(trim = TRUE)
  xml_replace(l, read_xml(paste0("<span>", cntnt, "</span>")))
}
First we find all the links. Then, for each of them, we extract its text content and replace the link node itself with that content.
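A rough sketch of how this could slot into the existing loop (assuming a is still the parsed article, as in the question), doing the replacement right before the HTML is written out:
a <- read_html(articleUrls[i])
# ... the xml_remove() clean-up calls from the loop above ...
for (l in a %>% xml_find_all("//a")) {
  xml_replace(l, read_xml(paste0("<span>", xml_text(l, trim = TRUE), "</span>")))
}
xml2::write_html(a, file = paste0("html/", filename, ".html"))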

Reading off links on a site and storing them in a list

I am trying to read off the URLs to data from StatsCan as follows:
library(rvest)
# 2015
url <- "https://www.nrcan.gc.ca/our-natural-resources/energy-sources-distribution/clean-fossil-fuels/crude-oil/oil-pricing/crude-oil-prices-2015/18122"
x1 <- read_html(url) %>%
  html_nodes(xpath = '//*[@class="col-md-4"]/ul/li/ul/li/a') %>%
  html_attr("href")
# 2014
url2 <- "https://www.nrcan.gc.ca/our-natural-resources/energy-sources-distribution/clean-fossil-fuels/crude-oil/oil-pricing/crude-oil-prices-2014/16993"
x2 <- read_html(url2) %>%
  html_nodes(xpath = '//*[@class="col-md-4"]/ul/li/ul/li/a') %>%
  html_attr("href")
Doing so returns two empty lists; I am confused, as this worked for this link: https://www.nrcan.gc.ca/our-natural-resources/energy-sources-distribution/clean-fossil-fuels/crude-oil/oil-pricing/18087. Ultimately I want to loop over the list and read off the tables on each page like so:
for (i in 1:length(x2)){
  out.data <- read_html(x2[i]) %>%
    html_table(fill = TRUE) %>%
    `[[`(1) %>%
    as_tibble()
  write.xlsx(out.data, str_c(destination, i, ".xlsx"))
}
In order to extract all the URLs, I recommend using the CSS selector ".field-item li a" and subsetting according to a pattern.
links <- read_html(url) %>%
  html_nodes(".field-item li a") %>%
  html_attr("href") %>%
  str_subset("fuel-prices/crude")
Your XPath needs to be fixed. You can use the following one:
//strong[contains(.,"Oil")]/following-sibling::ul//a
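For example, a minimal sketch of plugging that XPath into the same pipeline, reusing the url defined in the question:
library(rvest)
x1 <- read_html(url) %>%
  html_nodes(xpath = '//strong[contains(.,"Oil")]/following-sibling::ul//a') %>%
  html_attr("href")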

htmlParse - inner text

I need to scrape text from the following HTML document using htmlParse (package XML) in R:
<h1 class="IT">
<span class="f" id="hotel">HOTEL</span>
<span class="nowrap">
<i class="b stars ratings_stars_5 star_track" data-track-on-mouseover=""></i>
</span>
</span>
</h1>
I am using this code to scrape the names of the hotels. However, I also need the rating of each hotel:
library(RCurl)  # getURL()
library(XML)    # htmlParse()
# cssApply()/cssApplyInNodeSet() come from the CSS package
for (i in seq_len(3)){
  txt <- getURL(url = baseURL[i], followlocation = TRUE, encoding = "UTF-8")
  doc <- htmlParse(txt)
  hotel <- cssApply(doc, ".details>h3", cssCharacter)
  hotel <- cssApplyInNodeSet(doc, ".details", "h3", cssCharacter)
  data <- cbind(hotel)
}
rvest can generally make these operations much easier:
library(rvest)
library(stringr)
pg <- html("http://www.booking.com/hotel/es/starwoodalfonso.es.html#tab-reviews")
pg %>%
  html_nodes("i.b-sprite.stars") %>%
  html_attr("class") %>%
  str_extract("ratings_stars_[[:digit:]]+") %>%
  str_replace("ratings_stars_", "") %>%
  as.numeric()
## [1] 5
pg %>%
  html_nodes("span#hp_hotel_name") %>%
  html_text()
## [1] "Hotel Alfonso XIII"
It should be very straightforward to stick the results in a data.frame: wrap the iteration in an lapply, then dplyr::bind_rows the pieces, as sketched below.
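A minimal sketch of that idea, assuming urls holds the hotel pages to scrape (only the page from above is shown here):
library(rvest)
library(stringr)
library(dplyr)
urls <- c("http://www.booking.com/hotel/es/starwoodalfonso.es.html#tab-reviews")
hotels <- lapply(urls, function(u) {
  pg <- html(u)  # read_html() in current rvest
  data.frame(
    hotel = pg %>% html_nodes("span#hp_hotel_name") %>% html_text(),
    stars = pg %>% html_nodes("i.b-sprite.stars") %>% html_attr("class") %>%
      str_extract("ratings_stars_[[:digit:]]+") %>%
      str_replace("ratings_stars_", "") %>% as.numeric(),
    stringsAsFactors = FALSE
  )
})
bind_rows(hotels)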
EDIT
Since you're stuck with the CSS package, you can use rvest + cssApply in exactly the same manner:
pg <- html("http://www.booking.com/hotel/es/starwoodalfonso.es.html#tab-reviews")
pg %>%
  cssApply("i.b-sprite.stars", cssClass) %>%
  str_extract("ratings_stars_[[:digit:]]+") %>%
  str_replace("ratings_stars_", "") %>%
  as.numeric()
pg %>% cssApply("span#hp_hotel_name", cssCharacter)