Error Handling on VBA Excel (Web Scraper) - html

This is the program I created. The goal of this program is to visit each link on a specific cell range and get the "a href" of each listed links.
I used a list of links but there's a certain link that ends with .pdf and from there I get a type mismatch. Is there a way that I could make my program continue and just skip the error that it got from a specific link?
This is the link that causes the error https://ir-web-assets-v.s3.amazonaws.com/uploads/nuggets/5d40644eafe17554cf969aab/Islands_Locals_Program_Guest_FAQ.pdf
Sub extensiveScrape()
Dim extractedLinks As Range 'Links taken from RUN
Dim urls As String 'Links taken from Extensive Search
Dim appIE As Object
Dim LastRow As Long 'Number of rows
Dim rCell As Range
Dim rRng As Range
Dim html2 As HTMLDocument
Dim itemEle As Object
Dim linkurl As Object
Dim y As Integer
Application.ScreenUpdating = False
Set appIE = CreateObject("InternetExplorer.Application")
Set sht = ThisWorkbook.Worksheets("results")
LastRow = sht.Cells(sht.Rows.Count, "A").End(xlUp).Row
Set rRng = sht.Range("A1:A" & LastRow)
For Each rCell In rRng.Cells
With appIE
.navigate rCell.Value
.Visible = True
End With
Do While appIE.readyState <> 4: Wait 5: Loop
Application.StatusBar = "Scraping Extensively..."
DoEvents
Set html2 = appIE.document
Set itemEle = html2.getElementsByTagName("a")
y = 1
For Each linkurl In itemEle
Sheets("results").Range("B" & y).Value = linkurl
y = y + 1
Next
'rCell.Offset(0, 1).Value = itemEle
Next rCell
appIE.Quit
Set appIE = Nothing
Application.StatusBar = ""
Application.ScreenUpdating = True
End Sub
Private Sub Wait(ByVal nSec As Long)
nSec = nSec + Timer
While nSec > Timer
DoEvents
Wend
End Sub

Related

Downloading the whole webpage as HTML/PDF using Excel VBA

I'm creating a program that can visit each link that is located in Column A and download it as HTML/PDF into my local storage. So far, the program can visit each link listed in the column.
Code for opening the links one by one:
Sub visitLinks()
Dim extractedLinks As Range
Dim urls As String
Dim appIE As Object
Dim LastRow As Long
Dim rCell As Range
Dim rRng As Range
Application.ScreenUpdating = False
Set appIE = CreateObject("InternetExplorer.Application")
Set sht = ThisWorkbook.Worksheets("s1")
LastRow = sht.Cells(sht.Rows.Count, "A").End(xlUp).Row
Set rRng = sht.Range("A2:A" & LastRow)
For Each rCell In rRng.Cells
With appIE
.navigate rCell.Value
.Visible = True
End With
Do While appIE.readyState <> 4: Wait 5: Loop
Application.StatusBar = "Downloading as HTML..."
DoEvents
On Error Resume Next
Next rCell
appIE.Quit
Set appIE = Nothing
Application.StatusBar = ""
Application.ScreenUpdating = True
End Sub
------------------------
Private Sub Wait(ByVal nSec As Long)
nSec = nSec + Timer
While nSec > Timer
DoEvents
Wend
End Sub
What I'm trying to do is, after the program visits the first link. It will save the webpage as HTML/PDF and save it into C:......\folder[name].html/.pdf
If it's possible I would also like it to take the file name in column B.
Column A
Column B
google.com
google
facebook.com
facebook
The files located in C:......\folder\ should be google.html/.pdf and facebook.html/.pdf

Excel VBA to Login to Website and Grab TR & TD Elements

I am testing the code below. I think this is very close, but I can't seem to login to the site for some reason.
Sub Website_Login_Test()
Dim oHTML_Element As IHTMLElement
Dim sURL As String
On Error GoTo Err_Clear
sURL = "https://login.my_site_here.jsp?"
Set oBrowser = New InternetExplorer
oBrowser.Silent = True
oBrowser.timeout = 60
oBrowser.navigate sURL
oBrowser.Visible = True
Do
' Wait till the Browser is loaded
Loop Until oBrowser.readyState = READYSTATE_COMPLETE
Set HTMLDoc = oBrowser.document
For Each oHTML_Element In HTMLDoc
Debug.Print oHTML_Element
Next
HTMLDoc.all.UserId.Value = "my_id"
HTMLDoc.all.Password.Value = "my_pass"
For Each oHTML_Element In HTMLDoc.getElementsByTagName("Button")
Debug.Print oHTML_Element.Name
'oHTML_Element.Click: Exit For
'Debug.Print oHTML_Element.Type
'If oHTML_Element.Type = "submit" Then oHTML_Element.Click: Exit For
Next
' oBrowser.Refresh ' Refresh If Needed
Err_Clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
Call Test
End Sub
Sub Test()
Dim ie As Object, i As Long, strText As String
Dim doc As Object, hTable As Object, hBody As Object, hTR As Object, hTD As Object
Dim tb As Object, bb As Object, tr As Object, td As Object
Dim y As Long, z As Long, wb As Excel.Workbook, ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
ie.navigate "https://after_login_move_to_page_for_scraping.jsp"
Do While ie.busy: DoEvents: Loop
Do While ie.readyState <> 4: DoEvents: Loop
Set doc = ie.document
Set hTable = doc.getElementsByTagName("table")
For Each tb In hTable
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each tr In hTR
Set hTD = tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each td In hTD
ws.Cells(z, y).Value = td.innertext
y = y + 1
Next td
DoEvents
z = z + 1
Next tr
Exit For
Next bb
Exit For
Next tb
End Sub
I can't login to the site, so I can't do the scraping, but I think the code is pretty close. Here is the HTML for the id object, the password object and the button object. What am I doing wrong?
I think you must trigger the keypress event of the input fields. If there are other events you must trigger, have a look here, how you can find them:
Automate IE via Excel to fill in a dropdown and continue
Sub WebsiteLogin()
Const url As String = "https://login.my_site_here.jsp"
Const userName As String = "Here Your LogIn Name"
Const passWord As String = "Here Your Password"
Dim ie As Object
Dim htmlDoc As Object
Dim nodeInputUserName As Object
Dim nodeInputPassWord As Object
'Initialize Internet Explorer, set visibility,
'call URL and wait until page is fully loaded
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = True
ie.navigate url
Do Until ie.readyState = 4: DoEvents: Loop
Set htmlDoc = ie.document
'Set the log in name
Set nodeInputUserName = htmlDoc.getElementById("USERID")
nodeInputUserName.Value = userName
Call TriggerEvent(htmlDoc, nodeInputUserName, "onkeypress")
'Set the password
Set nodeInputPassWord = htmlDoc.getElementById("PASSWORD")
nodeInputPassWord.Value = passWord
Call TriggerEvent(htmlDoc, nodeInputPassWord, "onkeypress")
'Click submit button
htmlDoc.querySelector("a[role='button']").Click
End Sub
This is the procedure to trigger events:
Private Sub TriggerEvent(htmlDocument As Object, htmlElementWithEvent As Object, eventType As String)
Dim theEvent As Object
htmlElementWithEvent.Focus
Set theEvent = htmlDocument.createEvent("HTMLEvents")
theEvent.initEvent eventType, True, False
htmlElementWithEvent.dispatchEvent theEvent
End Sub

HTML Webpage Query with Various Search Parameters

I am attempting to import data from a website that requires certain search parameters. There are a selection of about 5 options that is required on the website. I'm trying to build a script that will query the website, select specific search parameters and search. From there import the results to my excel sheet.
The code I created is not working. I am new to VBA so would appreciate the help.
This is what I have:
Private Sub RegulatoryDataPull_Click()
Dim eRow As Long
Dim objIE As Object
Dim HDoc As HTMLDocument
Dim HEle As HTMLUListElement
Set objIE = CreateObject("InternetExplorer.Application") ' Create document object.
Set HDoc = objIE.document ' Create HTML element (<ul>) object.
Set HEle = HDoc.getElementById("dnn_ctr85406_StateNetDB_resultsCount") ' Get the element reference using its ID.
Set sht = Sheets("Sheet1")
eRow = Sheet1.Cells(Rows.Count, 7).End(x1Up.Offset(7, 0)).Row
With objIE
.Visible = True
.navigate "https://www.ncsl.org/research/energy/energy-legislation-tracking-database.aspx"
Do While .Busy Or _
.readyState <> 4
DoEvents
Loop
Var arr = [document.querySelectorAll('["name=dnn$ctr85406$StateNetDB$ckBxTopics$16"],[name="dnn$ctr85406$StateNetDB$ckBxTopics$5"],[name="dnn$ctr85406$StateNetDB$ckBxTopics$3"],[name="dnn$ctr85406$StateNetDB$ckBxTopics$8"]')]
Topics.Item(0).Value = Topicchoice
Set States = .document.getElementsByName("dnn$ctr85406$StateNetDB$ckBxAllStates")
States.Item(0).Value = Stateschoice
Set Status = .document.getElementsByName("dnn$ctr85406$StateNetDB$ddlStatus")
Status.Item(0).Value = Statuschoice
Set Year = .document.getElementsByName("dnn$ctr85406$StateNetDB$ddlYear")
Year.Item(0).Value = Yearchoice
.document.getElementById("dnn_ctr85406_StateNetDB_btnSearch").Click
Do While .Busy Or _
.readyState <> 4
DoEvents
Loop
Dim ele As Object
' Loop through elements inside the <ul> element and find <br>, which has the texts we want.
With HEle
For ele = 0 To .getElementsByTagName("br").Length - 1
Debug.Print .getElementsByTagName("br").Item(ele).getElementsByTagName("br").Item(0).innerHTML
End Select
Next ele
End With
Set objIE = Nothing
End Sub
Welcome to SO! I copy-pasted your code in Excel-VBA and it indeed crashed. In that case the easiest thing to do is step through it with F8 (don't just run the code with F5/a button). That does help in finding the line where the code blocks/crashes. After some modifications I came up with this code that works on my machine. It's by no means finished, but should give you a good start.
Private Sub RegulatoryDataPullTWO()
Dim eRow As Long
Dim objIE As Object
Dim HDoc As HTMLDocument
Dim HEle As HTMLUListElement
Set objIE = CreateObject("InternetExplorer.Application") ' Create document object.
objIE.Visible = True
objIE.navigate "https://www.ncsl.org/research/energy/energy-legislation-tracking-database.aspx"
Do While objIE.Busy Or objIE.readyState <> 4
DoEvents
Loop
Set HDoc = objIE.document ' Create HTML element (<ul>) object.
Set Top1 = objIE.document.getElementsByName("dnn$ctr85406$StateNetDB$ckBxTopics$16")
Top1.Item(0).Value = True
Set States = objIE.document.getElementsByName("dnn$ctr85406$StateNetDB$ckBxAllStates")
States.Item(0).Value = True
Set Status = objIE.document.getElementsByName("dnn$ctr85406$StateNetDB$ddlStatus")
Status.Item(0).Value = "Adopted"
Set yr = objIE.document.getElementsByName("dnn$ctr85406$StateNetDB$ddlYear")
yr.Item(0).Value = "2019"
objIE.document.getElementById("dnn_ctr85406_StateNetDB_btnSearch").Click
Do While objIE.Busy Or objIE.readyState <> 4
DoEvents
Loop
Set HEle = HDoc.getElementById("dnn_ctr85406_StateNetDB_resultsCount") ' Get the element reference using its ID.
Set HList = HDoc.getElementById("dnn_ctr85406_StateNetDB_linkList")
Set Sht = Sheets("Sheet1")
Debug.Print HEle.outerText
Sht.Range("B2").Value = HEle.outerText
ResRw = 3
For e = 0 To HList.getElementsByTagName("a").Length - 1
Set lnk = HList.getElementsByTagName("a").Item(e)
'Debug.Print e1.outerText, e1.outerHTML
If lnk.outerText <> "Bill Text Lookup" And lnk.outerText <> "*" Then
Debug.Print Replace(Replace(lnk.ParentNode.innerText, Chr(10), ""), Chr(13), "")
Debug.Print lnk.ParentNode.NextSibling.NextSibling.innerText
Sht.Range("A" & ResRw).Value = Replace(Replace(lnk.ParentNode.innerText, Chr(10), ""), Chr(13), "")
Sht.Range("B" & ResRw).Value = lnk.ParentNode.NextSibling.NextSibling.innerText
ResRw = ResRw + 1
End If
Next e
Set objIE = Nothing
End Sub

How to download a table from a web with VBA?

I'am trying to download a table from this page
to excel with VBA: http://www.merval.sba.com.ar/Vistas/Cotizaciones/Acciones.aspx --> table "Panel General"
I can download the table "Panel Merval" but i couldn't download the other table.
I use this code for table "Panel Merval":
Sub GetTable()
Dim ieApp As InternetExplorer
Dim ieDoc As Object
Dim ieTable As Object
Dim clip As DataObject
'create a new instance of ie
Set ieApp = New InternetExplorer
'you don’t need this, but it’s good for debugging
ieApp.Visible = False
'now that we’re in, go to the page we want
ieApp.Navigate "http://www.merval.sba.com.ar/Vistas/Cotizaciones/Acciones.aspx"
Do While ieApp.Busy: DoEvents: Loop
Do Until ieApp.ReadyState = READYSTATE_COMPLETE: DoEvents: Loop
'get the table based on the table’s id
Set ieDoc = ieApp.Document
Set ieTable = ieDoc.all.Item("ctl00_ContentCentral_tcAcciones_tpMerval_grdMerval")
'copy the tables html to the clipboard and paste to teh sheet
If Not ieTable Is Nothing Then
Set clip = New DataObject
clip.SetText "" & ieTable.outerHTML & ""
clip.PutInClipboard
Sheet1.Select
Sheet1.Range("b2").Select
Sheet1.PasteSpecial "Unicode Text"
End If
'close 'er up
ieApp.Quit
Set ieApp = Nothing
End Sub
or this one
Public Sub PanelLider()
Dim oDom As Object: Set oDom = CreateObject("htmlFile")
Dim x As Long, y As Long
Dim oRow As Object, oCell As Object
Dim vData As Variant
Dim link As String
link = "http://www.merval.sba.com.ar/Vistas/Cotizaciones/Acciones.aspx"
y = 1: x = 1
With CreateObject("msxml2.xmlhttp")
.Open "GET", link, False
.Send
oDom.body.innerHTML = .ResponseText
End With
With oDom.getElementsByTagName("table")(27)
Dim dataObj As Object
Set dataObj = CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
dataObj.SetText "<table>" & .innerHTML & "</table>"
dataObj.PutInClipboard
End With
Sheets(2).Paste Sheets(2).Cells(1, 1)
End Sub
Could someone help me to download the table "Panel General"?
Many thanks.
Selenium
The following gets the table using selenium basic.
Option Explicit
Public Sub GetTable()
Dim html As New HTMLDocument, htable As HTMLTable, headers()
headers = Array("Especie", "Hora Cotización", "Cierre Anterior", "Precio Apertura", "Precio Máximo", _
"Precio Mínimo", "Último Precio", "Variación Diaria", "Volumen Efectivo ($)", "Volumen Nominal", "Precio Prom. Pon")
With New ChromeDriver
.get "http://www.merval.sba.com.ar/Vistas/Cotizaciones/Acciones.aspx"
.FindElementById("__tab_ctl00_ContentCentral_tcAcciones_tpGeneral").Click
Do
DoEvents
Loop While .FindElementById("ctl00_ContentCentral_tcAcciones_tpGeneral_dgrGeneral", timeout:=7000).Text = vbNullString
html.body.innerHTML = .PageSource
Set htable = html.getElementById("ctl00_ContentCentral_tcAcciones_tpGeneral_dgrGeneral")
WriteTable2 htable, headers, 1, ActiveSheet
.Quit
End With
End Sub
Public Sub WriteTable2(ByVal htable As HTMLTable, ByRef headers As Variant, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
If ws Is Nothing Then Set ws = ActiveSheet
Dim tRow As Object, tCell As Object, tr As Object, td As Object, R As Long, c As Long, tBody As Object
R = startRow: c = 1
With ActiveSheet
Set tRow = htable.getElementsByTagName("tr")
For Each tr In tRow
Set tCell = tr.getElementsByTagName("td")
For Each td In tCell
.Cells(R, c).Value = td.innerText
c = c + 1
Next td
R = R + 1: c = 1
Next tr
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
End With
End Sub
References:
HTML Object Library
Selenium Type Library
With IE (Using WriteTable2 sub from above):
Option Explicit
Public Sub GetInfo()
Dim ie As New InternetExplorer, html As HTMLDocument, hTable As HTMLTable, headers(), a As Object
headers = Array("Especie", "Hora Cotización", "Cierre Anterior", "Precio Apertura", "Precio Máximo", _
"Precio Mínimo", "Último Precio", "Variación Diaria", "Volumen Efectivo ($)", "Volumen Nominal", "Precio Prom. Pon")
Application.ScreenUpdating = False
With ie
.Visible = True
.navigate "http://www.merval.sba.com.ar/Vistas/Cotizaciones/Acciones.aspx"
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("__tab_ctl00_ContentCentral_tcAcciones_tpGeneral").Click
Do
DoEvents
On Error Resume Next
Set hTable = .document.getElementById("ctl00_ContentCentral_tcAcciones_tpGeneral_dgrGeneral")
On Error GoTo 0
Loop While hTable Is Nothing
WriteTable2 hTable, headers, 1, ActiveSheet
.Quit '<== Remember to quit application
Application.ScreenUpdating = True
End With
End Sub
References:
Microsoft Internet Explorer Controls

Fetching data from web page

I am trying to fetch the publication date corresponding to each patent number.
Here is the Excel sheet:
The database is espacenet.com
Here's the link for the first patent you see in the Excel sheet:
http://worldwide.espacenet.com/searchResults?compact=false&PN=US7055777B2&ST=advanced&locale=en_EP&DB=EPODOC
Under the "Publication Info" header, I need to get the date after matching the patent number with the one in the Excel sheet.
Here's the code:
Sub tryextraction()
Dim ie As New InternetExplorer
Dim sdd As String
Dim tdd() As String
Dim num0 As Integer
Dim num1 As Integer
Dim doc As HTMLDocument
Dim i As Integer
Dim j As Integer
ie.Visible = True
num1 = ActiveSheet.UsedRange.Rows.Count
For num0 = 2 To num1
ie.navigate "http://worldwide.espacenet.com/searchResults?compact=false&PN=" & Range("A" & num0) & "&ST=advanced&locale=en_EP&DB=EPODOC"
Do
DoEvents
Loop Until ie.readyState = READYSTATE_COMPLETE
Set doc = ie.document
sdd = Trim(doc.getElementsByTagName("td")(5).innerText)
tdd() = Split(sdd, vbLf)
j = UBound(tdd)
For i = 0 To j
If InStr(tdd(i), "(") <> 0 Then
tdd(i) = Replace(tdd(i), " ", "")
tdd(i) = Replace(tdd(i), "(", "")
tdd(i) = Replace(tdd(i), ")", "")
If tdd(i) = Range("A" & num0).Value Then
Range("B" & num0).Value = tdd(i + 1)
End If
End If
Next i
Next num0
ie.Quit
End Sub
The code is not giving any error. The column "Publication Date" remains blank after the code finishes running.
The html tag which contains the publication info has been taken correctly.
There are some trailing white space characters after the ID you are searching for in the document so tdd(i) = Range("A" & num0).Value never evaluates to true. It's not just a space, so a simple Trim(tdd(i)) = Range("A" & num0).Value call does not help. Try instead InStr(tdd(i), Range("A" & num0).Value) If that is not good enough, you'll have to specifically remove CRLF from the end of the string before doing the compare.
There are often multiple publication dates under the publication info header.
Example:
The following script obtains all of these and the preceeding line (so you have the associated publication along with date).
It loops from row 2 of the Activesheet, to the last populated row, picking up the Publication Numbers from column A and writing out the results starting from column B. Depending on how many dates there are, the data will extend across multiple columns from B.
Regex:
A regex of ^(.*)\s\d{4}-\d{2}-\d{2} is used to retrieve the date pattern and the preceeding line i.e. The publication identifier and the date. Try it
Example output:
VBA:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, html As New HTMLDocument, url As String, pubInfo As Object
Dim loopRange As Range, iRow As Range, counter As Long
'example US7055777B2
Application.ScreenUpdating = False
With ActiveSheet
Set loopRange = Range("A2:A" & .Cells(.Rows.Count, "A").End(xlUp).Row)
End With
With IE
.Visible = True
counter = 2 '<== start from row 2
For Each iRow In loopRange
If Not IsEmpty(iRow) Then
url = "https://worldwide.espacenet.com/searchResults?compact=false&PN=" & iRow.Value & "&ST=advanced&locale=en_EP&DB=EPODOC"
.navigate url
While .Busy Or .readyState < 4: DoEvents: Wend
Set html = .document
Do
DoEvents
On Error Resume Next
Set pubInfo = html.querySelector(".publicationInfoColumn")
On Error GoTo 0
Loop While pubInfo Is Nothing
Dim tempArr()
tempArr = GetDateAndPatent(pubInfo.innerText, "^(.*)\s\d{4}-\d{2}-\d{2}") '"(?m)^(.*)\s\d{4}-\d{2}-\d{2}" '<==This is not supported
With ActiveSheet
.Cells(counter, 2).Resize(1, UBound(tempArr) + 1) = tempArr
End With
End If
counter = counter + 1
Next iRow
.Quit '<== Remember to quit application
End With
Application.ScreenUpdating = True
End Sub
Public Function GetDateAndPatent(ByVal inputString As String, ByVal sPattern As String) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With CreateObject("vbscript.regexp")
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .test(inputString) Then
Set matches = .Execute(inputString)
For Each iMatch In matches
ReDim Preserve arrMatches(i)
arrMatches(i) = iMatch.Value
i = i + 1
Next iMatch
End If
End With
GetDateAndPatent = arrMatches
End Function