VBA Web Scraping, SPAN issue - html

I'm currently trying to scrape some data from a website using VBA, but I've hit a complete block on this.
My VBA code is as follows;
' WISE: opens the member-details page in a visible Internet Explorer window,
' waits for it to load, and tries to copy text from the page into cell A5.
' NOTE(review): as posted, the hlpe assignment cannot succeed - see the notes
' placed directly above that statement.
Sub WISE()
Dim IE As InternetExplorer
Dim HTML As HTMLDocument
Dim WPage As String
WPage = "www.thencc.org.uk/Our_Members/MemDetails.aspx?CompID=AAA-01-01"
Set IE = New InternetExplorer
IE.Visible = True
IE.navigate WPage
' Poll until the browser reports the page as completely loaded, yielding to
' Excel on each pass and showing progress in the status bar.
Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Trying to go to " & WPage
DoEvents
Loop
Dim hlpe As String
'Clearly missing something
' NOTE(review): HTML is declared above but never assigned in this procedure
' (there is no "Set HTML = IE.document"), so it is still Nothing here.
' NOTE(review): the result of getElementsByTagName("span") is used as if it were
' a single element; presumably it is a collection and needs an index - confirm.
hlpe = _
HTML.getElementsByTagName("span").getElementsByTagName("b").innerText
Range("a5").Value = hlpe
' Only the object reference is released here; IE.Quit is not called anywhere in
' this procedure.
Set IE = Nothing
Application.StatusBar = ""
End Sub
The website HTML details which i'm trying to get sits in the below;
Essentially, I was going to pull the lot and then parse it in Excel; ideally, I want the address and the telephone number.
<span id="MainContent_lblDetails"><table class=tabLayout width='90%'> <tr><td style='height:20px'><b>AA Autovan Leisure Ltd</b><br/><br/>Servicing and repairs – mobile specialists in servicing and repairing touring caravans and motorhomes. Contact us for more information<br/><br/>7 Sycamore Lane, Holmfirth, Huddersfield, HD9 7RT<br/>West Yorkshire, England<br/><br/><b>Tel - </b>01484 683636<br/><b>Web - </b><a href='http://www.aaautovanleisure.com' style='color:#0061a0' target='_blank'>www.aaautovanleisure.com</div></td></tr><tr><td align='right'><a href='javascript:history.go(-1)' style='color:#0061a0'> Back </a></td></tr></table></span>

We can parse text related to the element, but we want to use its html so as to have useful delimiters to split on. Also, we can do away with the slow browser opening and issue an XMLHTTP GET request.
Option Explicit
' GetInfo: downloads the member-details page with a synchronous XMLHTTP GET,
' loads the markup into an in-memory HTMLDocument, and prints the address and
' telephone line found inside the #MainContent_lblDetails element.
'
' The element's outerHTML is split on its <BR> tags so that each visual line of
' the details block becomes one array item. Output goes to the Immediate window.
' Requires a reference to the Microsoft HTML Object Library.
Public Sub GetInfo()
    Const PAGE_URL As String = "http://www.thencc.org.uk/Our_Members/MemDetails.aspx?CompID=AAA-01-01"
    Const DOCTYPE_MARKER As String = "<!DOCTYPE "

    Dim sResponse As String, HTML As New htmldocument, arrayItems() As String
    Dim detailsNode As Object, markerPos As Long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", PAGE_URL, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    ' Drop anything that precedes the doctype. InStr returns 0 when the marker
    ' is absent and Mid$(s, 0) raises run-time error 5, so only trim on a hit.
    markerPos = InStr(1, sResponse, DOCTYPE_MARKER)
    If markerPos > 0 Then sResponse = Mid$(sResponse, markerPos)

    HTML.body.innerHTML = sResponse

    Set detailsNode = HTML.querySelector("#MainContent_lblDetails")
    If detailsNode Is Nothing Then
        Debug.Print "Element #MainContent_lblDetails was not found in the response"
        Exit Sub
    End If

    ' Text compare so the split works whether the parser serialises the tag as
    ' <BR> or <br>.
    arrayItems = Split(detailsNode.outerHTML, "<BR>", -1, vbTextCompare)

    ' Items 4, 5 and 7 are read below; bail out rather than raise "subscript
    ' out of range" if the page layout has fewer lines than expected.
    If UBound(arrayItems) < 7 Then
        Debug.Print "Unexpected layout: only " & (UBound(arrayItems) + 1) & " line(s) in the details block"
        Exit Sub
    End If

    Debug.Print "Address: " & arrayItems(4) & Chr$(32) & arrayItems(5)
    ' Strip the bold markup and the "- >" residue left around the "Tel" label.
    Debug.Print Replace$(Replace$(Replace$(arrayItems(7), "<B>", vbNullString, , , vbTextCompare), "</B", vbNullString, , , vbTextCompare), "- >", vbNullString)
End Sub

Related

Grabbing a single piece of data from a website's HTML and assign it to a variable

I'm working on a project where I grab data that I stored in an excel sheet and search a specific website that can be seen in the code below. Once the website completes the search, I want to grab the "worth" from the top right of the page. I'm fairly new to using VBA with HTML, so I'm not sure how to take the element (worth) that I'm looking for from the web page, and assign it to a variable in VBA so I can paste it into my excel sheet.
Right now I'm able to open IE, insert my data into the search bar of the specific website that I'm using, and click search. What I have is seen below. Thank you in advance!
' BrowsetoSite: opens the card-search site in a hidden Internet Explorer
' instance, builds a search string from columns A, B, D and E of row i on the
' active sheet, submits the search, and shows the text of the first <h4> element.
' NOTE(review): several statements below do not compile or do not read the page
' that was just loaded - see the inline notes.
Sub BrowsetoSite()
Dim IE As New SHDocVw.InternetExplorer
Dim website As String
Dim i As Integer
i = 2
'Set ie = New SHDocVw.InternetExplorer
website = "https://cardmavin.com/category/football"
IE.navigate website
IE.Visible = False
' Wait for the landing page to finish loading. This loop has no DoEvents call.
Do While IE.readyState <> READYSTATE_COMPLETE
'assign info to variable to enter into the search bar
Loop
Dim idoc As MSHTML.HTMLDocument
' idoc is bound to the document of the landing page at this point.
Set idoc = IE.document
Dim Brand As String
Dim Year As String
Dim Num As String
Dim Name As String
Dim search As String
Dim value As Variant
' Read the search terms from row i of the active sheet.
Brand = Range("A" & i).value
Year = Range("B" & i).value
Num = Range("D" & i).value
Name = Range("E" & i).value
search = (Year & " " & Brand & " " & Name & " " & Num)
i=i+1
' Fill in the search box and click the search button.
idoc.getElementById("search-field").value = search
idoc.getElementById("to-mavin").Click
' NOTE(review): this loop opens with While but is closed with Loop; the other
' wait loop in this procedure uses Do While ... Loop.
While IE.readyState <> READYSTATE_COMPLETE
DoEvents
Loop
' NOTE(review): value is already declared earlier in this procedure, so this is
' a second declaration of the same name in the same scope.
Dim value As Variant
' NOTE(review): idoc has not been re-assigned since the search was submitted, so
' this reads from the document reference obtained before the click.
value = idoc.getElementsByTagName("h4")(0).innerText
MsgBox value
IE.Quit
End Sub
The issue that I'm having is the value = idoc.getElementsByTagName("h4")(0).innerText. I've tried to get the element a few different ways, but have been unsuccessful so far.
You need Set idoc = IE.document after you've submitted the search, to get a reference to that new page. Otherwise you're still trying to access the previous page.
i = i + 1
' Fill in the search box and submit the search.
idoc.getElementById("search-field").value = search
idoc.getElementById("to-mavin").Click
' Wait for the results page. The loop is written as Do While ... Loop because a
' plain While must be closed with Wend; "While ... Loop" does not compile.
Do While IE.readyState <> READYSTATE_COMPLETE
    DoEvents
Loop
' Re-acquire the document: the earlier reference still points at the page that
' was showing before the search was submitted.
Set idoc = IE.document '<<<<<<<<<<<<<<
' Omit this declaration if value is already declared earlier in the procedure.
Dim value As Variant
' Narrow the lookup to the worth box before taking its first <h4>.
value = idoc.getElementById("worthBox") _
    .getElementsByTagName("h4")(0).innerText
MsgBox value
Try this approach. Suppose cell A1 contains the string 2008 Topps Thomas DeCoud:
' Test: URL-encodes the search text held in cell A1, asks GetJSONFromHTMLHead
' for the parsed JSON found on the matching search page, and prints the offer's
' currency followed by its price to the Immediate window.
Sub Test()
    Const sURL As String = "https://mavin.io/search?q="
    Dim encodedQuery As String
    Dim parsed As Object
    Dim offer As Object

    encodedQuery = Application.WorksheetFunction.EncodeURL(Range("A1").Value)
    Set parsed = GetJSONFromHTMLHead(sURL & encodedQuery)

    ' Both printed values live under the "offers" entry of the parsed JSON.
    Set offer = parsed("offers")
    Debug.Print offer("priceCurrency")
    Debug.Print offer("price")
End Sub
' GetJSONFromHTMLHead: requests sURL, extracts the <head>...</head> section of
' the response, re-parses it as body content so that querySelector can see it,
' and returns the parsed JSON held in the first
' <script type="application/ld+json"> element.
'
' Parameters:
'   sURL - address of the page to download (synchronous GET).
' Returns:
'   The object produced by JSONConverter.ParseJson for the script's content.
' Raises:
'   A descriptive run-time error when the request does not return HTTP 200,
'   when no <head> section is found, or when the ld+json script is missing.
' Requires references to Microsoft XML v6.0 and the Microsoft HTML Object
' Library, plus the JSONConverter module in the project.
Function GetJSONFromHTMLHead(ByVal sURL As String) As Object
    Const ERR_BASE As Long = vbObjectError + 513

    Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument, re As Object, json As Object
    Dim headMatches As Object, scriptNode As Object

    Set http = New MSXML2.XMLHTTP60
    Set html = New MSHTML.HTMLDocument
    Set re = CreateObject("VBScript.RegExp")
    re.Pattern = "<head>([\s\S]+)<\/head>"

    With http
        .Open "GET", sURL, False
        .send
        If .Status <> 200 Then
            Err.Raise ERR_BASE, "GetJSONFromHTMLHead", "HTTP " & .Status & " " & .statusText & " for " & sURL
        End If

        ' Indexing item 0 of an empty match collection fails with an unhelpful
        ' error, so check that the head section was actually found first.
        Set headMatches = re.Execute(.responseText)
        If headMatches.Count = 0 Then
            Err.Raise ERR_BASE + 1, "GetJSONFromHTMLHead", "No <head> section found in the response from " & sURL
        End If

        ' Swap the head tags for body tags so the content can be assigned to
        ' body.innerHTML and queried.
        html.body.innerHTML = Replace$(Replace$(headMatches(0).Value, "<head>", "<body>"), "</head>", "</body>")
    End With

    Set scriptNode = html.querySelector("script[type='application/ld+json']")
    If scriptNode Is Nothing Then
        Err.Raise ERR_BASE + 2, "GetJSONFromHTMLHead", "No application/ld+json script found in the response from " & sURL
    End If

    Set json = JSONConverter.ParseJson(scriptNode.innerHTML)
    Set GetJSONFromHTMLHead = json
End Function

VBA Web Scraping: Object turns out empty (getelementbyID)

I intend to extract the information from the website below (website is in Malay) containing information on tender awards.
https://tender.selangor.my/results?type=tenders
My code is as below, but the 'tenders' object appears as Nothing causing me unable to proceed further. Hope you can share some guidance on what I am doing wrong.
' Fragment: clears the "Data" sheet, downloads the tender results page with a
' synchronous XMLHTTP GET, loads the response into an in-memory HTMLDocument and
' looks up the element with id "DataTables_Table_0".
Worksheets("Data").Cells.Clear
Dim xhr As Object
Dim html As New HTMLDocument
Dim tenders As Object, item As Object, item2 As Object
Dim tender As Object
Dim i As Integer
Set xhr = CreateObject("MSXML2.XMLHTTP")
With xhr
.Open "GET", "https://tender.selangor.my/results?type=tenders", False
.send
' Only parse the response when the request completed with HTTP 200; otherwise
' report the state and status. Execution continues past End With either way.
If .readyState = 4 And .Status = 200 Then
html.body.innerHTML = .responseText
Else
MsgBox "Error" & vbNewLine & "Ready state: " & .readyState & _
vbNewLine & "HTTP request status: " & .Status
End If
End With
' NOTE(review): html holds only the markup returned by the GET above; if the
' element with this id is not present in that markup, tenders will be Nothing.
Set tenders = html.getElementById("DataTables_Table_0")
The tabular content that you are interested in is generated dynamically, so you can either use Internet Explorer or issue GET HTTP requests with the appropriate parameters and parse the JSON response using a third-party library. As the first option is the easier one, I've created an example using it:
' GetInformation: opens the tender results page in a visible Internet Explorer
' window, waits for the page and then a further fixed five seconds, and prints
' the text of the table with id "DataTables_Table_0" to the Immediate window.
'
' The browser is closed on every exit path, including when the table is missing
' or an error occurs, so no orphaned Internet Explorer process is left behind.
Sub GetInformation()
    Dim tenders As Object
    Dim browser As Object

    Set browser = CreateObject("InternetExplorer.Application")
    On Error GoTo Fail

    With browser
        .Visible = True
        .navigate "https://tender.selangor.my/results?type=tenders"
        While .Busy Or .readyState < 4: DoEvents: Wend
        ' Extra fixed pause after the page reports ready, before the table is
        ' looked up.
        Application.Wait Now + TimeValue("00:00:05")

        Set tenders = .document.getElementById("DataTables_Table_0")
        If tenders Is Nothing Then
            Debug.Print "Table DataTables_Table_0 was not found on the page"
        Else
            Debug.Print tenders.innerText
        End If
        .Quit
    End With
    Exit Sub

Fail:
    Debug.Print "GetInformation failed: " & Err.Number & " " & Err.Description
    ' Best effort: the browser may already be gone by the time we get here.
    On Error Resume Next
    browser.Quit
End Sub

Scrape values from website using VBA

Help needed in order to scrape some data from a website.
As a first step I managed to visit the website and pass in my variables, but:
1. I don't know how to press the "Convert currencies" button,
2. and afterwards how to get the "Converted Amount" and "Rate" into Excel.
Any help would be appreciated!
' Test: opens the ECB currency converter in a visible Internet Explorer window,
' passing the amount, currencies and date as query-string parameters, and then
' tries to click the convert button.
' NOTE(review): the click statement below refers to a name that is not declared
' anywhere in this procedure - see the note above the With block.
Sub Test()
Dim IE As InternetExplorer
Dim Amount As String
Dim Source As String
Dim Target As String
Dim Datestring As String
' Amount is declared As String but assigned a numeric literal here.
Amount = 10000
' The currencies are given as full names, which contain spaces and are placed
' into the URL below as-is.
Source = "Euro"
Target = "UK pound sterling"
Datestring = "03-08-2018"
'Open Browser and download data
Set IE = New InternetExplorer
' NOTE(review): inside the With block, submitConvert.Click is issued right after
' .Navigate and before the wait loop; submitConvert is not a declared variable
' and is not qualified with the document.
With IE
.Visible = True
.Navigate "http://sdw.ecb.europa.eu/curConverter.do?sourceAmount=" & _
Amount & _
"&sourceCurrency=" & _
Source & _
"&targetCurrency=" & _
Target & _
"&inputDate=" & _
Datestring & _
"&submitConvert.x=209&submitConvert.y=10"
submitConvert.Click
While .Busy Or .readyState < 4: DoEvents: Wend
End With
End Sub
XmlHttpRequest (XHR):
Faster to use XHR where there is no browser opening.
Option Explicit
' GetRates: requests the ECB currency converter with the amount, currencies and
' date supplied as query-string parameters, then pastes the second
' "table.tableopenpage" table from the response into the active sheet at A1.
'
' The table is transferred through the clipboard as HTML so that Excel lays it
' out as cells on paste.
' Requires references to the Microsoft HTML Object Library and the Microsoft
' Forms Object Library.
Public Sub GetRates()
    Const DOCTYPE_MARKER As String = "<!DOCTYPE "

    Dim sResponse As String, html As New HTMLDocument, clipboard As Object
    Dim sourceAmount As String, sourceCurrency As String, targetCurrency As String, inputDate As String
    Dim url As String, markerPos As Long
    Dim resultTables As Object, resultTable As Object

    sourceAmount = "10000"
    sourceCurrency = "EUR"
    targetCurrency = "GBP"
    inputDate = "03-08-2018"

    ' Single slash before the path, matching the address used for the same
    ' service elsewhere in this file.
    url = "http://sdw.ecb.europa.eu/curConverter.do?sourceAmount=" & sourceAmount & "&sourceCurrency=" & sourceCurrency & _
          "&targetCurrency=" & targetCurrency & "&inputDate=" & inputDate & "&submitConvert.x=52&submitConvert.y=8"

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", url, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    ' InStr returns 0 when the marker is absent and Mid$(s, 0) raises run-time
    ' error 5, so only trim the leading text when the doctype is present.
    markerPos = InStr(1, sResponse, DOCTYPE_MARKER)
    If markerPos > 0 Then sResponse = Mid$(sResponse, markerPos)

    html.body.innerHTML = sResponse

    ' The results are in the second matching table (index 1, zero-based).
    Set resultTables = html.querySelectorAll("table.tableopenpage")
    If resultTables.Length > 1 Then Set resultTable = resultTables.item(1)
    If resultTable Is Nothing Then
        Debug.Print "Results table not found in the response from " & url
        Exit Sub
    End If

    Set clipboard = New MSForms.DataObject
    clipboard.SetText resultTable.outerHTML
    clipboard.PutInClipboard
    ActiveSheet.Cells(1, 1).PasteSpecial
End Sub
Less robust but if you just want the
converted amount:
.querySelectorAll("table.tableopenpage").item(1).getElementsbytagname("td")(7).innertext
And the rate:
.querySelectorAll("table.tableopenpage").item(1).getElementsbytagname("td")(10).innertext
Output:
References:
Microsoft Forms Object Library (or add a userform to your project)
Microsoft HTML Object Library
Internet Explorer:
The data is already there due to the query string you are using in the URL. No need for a click.
Just use the correct abbreviations for currencies.
Option Explicit
' Test: opens the ECB currency converter in a visible Internet Explorer window
' with the amount, currencies and date supplied in the query string, copies the
' second element of class "tableopenpage" to the clipboard as HTML, pastes it
' into the active sheet at A1, and closes the browser.
'
' No button click is needed: the query string already causes the page to show
' the converted result.
' Requires references to Microsoft Internet Controls, the Microsoft HTML Object
' Library and the Microsoft Forms Object Library.
Public Sub Test()
    Dim IE As InternetExplorer, Amount As String, Source As String, Target As String
    Dim Datestring As String, url As String
    Dim resultTables As Object, clipboard As Object

    ' Amount is a String, so assign it as text rather than relying on an
    ' implicit number-to-string conversion.
    Amount = "10000"
    Source = "EUR"
    Target = "GBP"
    Datestring = "03-08-2018"

    url = "http://sdw.ecb.europa.eu/curConverter.do?sourceAmount=" & _
          Amount & _
          "&sourceCurrency=" & _
          Source & _
          "&targetCurrency=" & _
          Target & _
          "&inputDate=" & _
          Datestring & _
          "&submitConvert.x=209&submitConvert.y=10"

    Set IE = New InternetExplorer
    With IE
        .Visible = True
        .navigate url
        While .Busy Or .readyState < 4: DoEvents: Wend

        ' The results are in the second matching table (index 1, zero-based).
        Set resultTables = .document.getElementsByClassName("tableopenpage")
        If resultTables.Length > 1 Then
            Set clipboard = New MSForms.DataObject
            clipboard.SetText resultTables(1).outerHTML
            clipboard.PutInClipboard
            ActiveSheet.Cells(1, 1).PasteSpecial
        Else
            Debug.Print "Results table not found at " & url
        End If

        ' Close the browser so the Internet Explorer process is not left running.
        .Quit
    End With
End Sub
If interested in how to click though:
1) Use the correct 3 letter abbreviations for the currencies.
2) You can click the submit button with:
.document.querySelector("input[name=submitConvert]").Click
It uses a CSS selector of
input[name=submitConvert]
This says
an element with an input tag that has a name attribute whose value is submitConvert.
3) You then need a
While .Busy Or .readyState < 4: DoEvents: Wend
to allow the page to refresh.
4) You can then grab the results table with:
.document.querySelectorAll("table.tableopenpage").item(1)
This collects all elements with a tag table and class tableopenpage. You want the second of these, which is 1 on a 0-based index system.
References required:
Microsoft Internet Controls
Microsoft HTML Object Library
Microsoft Forms Object Library
Other:
I find it simpler to grab the table in one go but you could target the rate, for example, more specifically with a CSS selector of:
a[target*=quickview]
Be aware that Excel may swap the date from dd/mm/yyyy to mm/dd/yyyy on output, so you may need to correct this.

XMLHTTP60 Req not showing entire HTML Document

I am trying to obtain the HTML document from a website to what else, scrape for data!
Unfortunately, I cannot obtain the entire HTML Document associated with the web page. My debug.print statement doesn't show the entire web page as I would like, it gets cut off. I'm somewhat new to programming, help would be greatly appreciated!
My code is below:
Const SecForm4 As String = "https://www.secform4.com/significant-buys.htm"
' LoadWebPage: fetches the page addressed by the SecForm4 constant with a
' synchronous GET. On a completed HTTP 200 response the body text is handed to
' ParsingHTMLDocument; otherwise the status code and text are shown in a
' message box and nothing is parsed.
Sub LoadWebPage()
    Dim request As New MSXML2.XMLHTTP60

    With request
        .Open "GET", SecForm4, False
        .send

        If .Status = 200 And .readyState = 4 Then
            ParsingHTMLDocument .responseText
        Else
            MsgBox "Problem" & vbNewLine & .Status & "-" & .statusText
        End If
    End With
End Sub
' ParsingHTMLDocument: loads the supplied markup into a fresh in-memory
' HTMLDocument and echoes the raw text to the Immediate window.
'   HTMLText - the page source to load and print.
Sub ParsingHTMLDocument(HTMLText As String)
    Dim parsedDoc As MSHTML.HTMLDocument

    Set parsedDoc = New MSHTML.HTMLDocument
    parsedDoc.body.innerHTML = HTMLText

    Debug.Print HTMLText
End Sub
The following works in terms of grabbing the document and the table is present. You are unlikely to be able to print the entire document to the immediate window as it has limitations on capacity. Instead you could write to a text file and inspect.
Change the filepath ,"C:\Users\User\Desktop\Test.txt", to one for you.
Option Explicit
' GetInfo: downloads the significant-buys page with a synchronous XMLHTTP GET,
' writes the response text to a file via WriteTxtFile so it can be inspected in
' full, loads it into an in-memory HTMLDocument, and shows the localName of the
' element with id "filing_table" to confirm the table is present.
' Requires a reference to the Microsoft HTML Object Library.
Public Sub GetInfo()
    Const DOCTYPE_MARKER As String = "<!DOCTYPE "

    Dim sResponse As String, html As New HTMLDocument, hTable As HTMLTable
    Dim markerPos As Long

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.secform4.com/significant-buys.htm", False
        .Send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

    ' InStr returns 0 when the marker is absent and Mid$(s, 0) raises run-time
    ' error 5, so only trim the leading text when the doctype is present.
    markerPos = InStr(1, sResponse, DOCTYPE_MARKER)
    If markerPos > 0 Then sResponse = Mid$(sResponse, markerPos)

    WriteTxtFile sResponse

    html.body.innerHTML = sResponse
    Set hTable = html.getElementById("filing_table")
    If hTable Is Nothing Then
        MsgBox "Element filing_table was not found in the response"
    Else
        MsgBox hTable.localName
    End If
End Sub
' WriteTxtFile: writes aString to a text file through the FileSystemObject.
'   aString  - text to write.
'   filePath - destination file; defaults to Test.txt on the "User" desktop.
' The file is created with both the overwrite and unicode arguments set to True.
Public Sub WriteTxtFile(ByVal aString As String, Optional ByVal filePath As String = "C:\Users\User\Desktop\Test.txt")
    With CreateObject("Scripting.FileSystemObject").CreateTextFile(filePath, True, True)
        .Write aString
        .Close
    End With
End Sub
Reference to HTML Object Library required.

VBA Excel pulling new webpage data after clicking on "submit"

I'm trying to pull some info from a website that provides oil well data by API number (API is a unique number for every well in the US)
Website: http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellapi1
API example: 1708300502
The issue is, when I get to the 2nd page, IE.document.getElementsByTagName("body")(0).innerText still returns data from the initial page. How do I fetch the updated page data?
The ultimate goal is to get to the 2nd page, click on "30570" via IE.document.getElementsByTagName("a")(0).Click and then read the final 3rd page. I just cannot figure out how to read the updated page :(
Option Explicit
' sonris_WellData: opens the SONRIS well-lookup form in a visible Internet
' Explorer window, enters a well API number, submits the form, waits for the
' browser to report ready plus one second, shows the body text of whatever
' document the browser then exposes, and closes the browser.
Sub sonris_WellData()
    Dim IE As InternetExplorer
    Dim i As Integer

    Set IE = CreateObject("InternetExplorer.Application")
    With IE
        .Visible = True

        'Open SONRIS website
        Application.StatusBar = "Opening Website"
        .navigate "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellapi1"
        Do Until .readyState = 4
            DoEvents
        Loop
        Application.Wait Now() + TimeValue("00:00:01")
        Application.StatusBar = False

        'plug-in API
        .document.forms(0).p_apinum.Value = "1708300502"
        .document.forms(0).submit

        ' Wait until the next page opens
        Application.StatusBar = "Opening Website"
        Do Until .readyState = 4
            DoEvents
        Loop
        Application.Wait Now() + TimeValue("00:00:01")
        Application.StatusBar = False

        ' Read the page - this is where the issue occurs, MsgBox keeps returning text from the very 1st page
        MsgBox .document.getElementsByTagName("body")(0).innerText

        .Quit
    End With
End Sub
This seems to be working. Rather than DoEvents, use the WinAPI Sleep function. I also added a call to the Sleep function after the form submit.
More and more often we see sites whose content is served dynamically by JavaScript or similar; in these cases the browser may appear to be READYSTATE_COMPLETE or not Busy even though the page has not yet rendered the "new" results.
Option Explicit
' Sleep from kernel32 pauses the calling thread for the given number of
' milliseconds. VBA7 hosts (Office 2010 and later, required on 64-bit Office)
' need the PtrSafe keyword on Declare statements; older hosts do not accept it.
#If VBA7 Then
    Public Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
#Else
    Public Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
#End If

' sonris_WellData: opens the SONRIS well-lookup form in a visible Internet
' Explorer window, enters a well API number, submits the form, waits for the
' next page, shows its body text, and closes the browser.
'
' Waiting uses Sleep rather than DoEvents, with an extra one-second pause right
' after the submit so the browser has time to begin navigating. Each wait is
' capped so that a page that never finishes loading cannot hang Excel forever.
Sub sonris_WellData()
    Const START_URL As String = "http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellapi1"

    Dim IE As Object 'InternetExplorer
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = True

    'Open SONRIS website
    Application.StatusBar = "Opening Website"
    IE.navigate START_URL
    If Not WaitForBrowserReady(IE) Then GoTo TimedOut
    Application.StatusBar = False

    IE.document.forms(0).p_apinum.Value = "1708300502" 'plug-in API
    IE.document.forms(0).submit
    Sleep 1000

    ' Wait until the next page opens
    Application.StatusBar = "Opening Website"
    If Not WaitForBrowserReady(IE) Then GoTo TimedOut
    Application.StatusBar = False

    ' Read the page that is now loaded in the browser.
    MsgBox IE.document.getElementsByTagName("body")(0).innerText
    IE.Quit
    Exit Sub

TimedOut:
    Application.StatusBar = False
    Debug.Print "sonris_WellData: timed out waiting for the page to load"
    IE.Quit
End Sub

' WaitForBrowserReady: sleeps in one-second steps until the browser is neither
' busy nor short of readyState 4. Returns True once ready, or False if
' maxSeconds elapse first.
Private Function WaitForBrowserReady(ByVal browser As Object, Optional ByVal maxSeconds As Long = 30) As Boolean
    Dim secondsWaited As Long

    Do While browser.Busy Or browser.readyState <> 4
        If secondsWaited >= maxSeconds Then Exit Function
        Sleep 1000
        secondsWaited = secondsWaited + 1
    Loop
    WaitForBrowserReady = True
End Function
You can experiment maybe with a slightly longer Sleep after the .submit.
Alternatively, I notice that after you submit, the URL changes, so you could also try changing the second waiting loop to:
' Alternative wait loop: keep sleeping in one-second steps for as long as the
' browser still reports the address of the original form page.
Do While IE.LocationURL ="http://sonlite.dnr.state.la.us/sundown/cart_prod/cart_con_wellapi1"
Sleep 1000
Loop
This should put the Excel.Application to wait until the URL has changed.
Alternatively, you may have better luck using an XMLHTTPRequest (there are many examples of this here on SO and elsewhere on the internet). This allows you to send a request just like the browser, without actually using a web browser. Then you can simply parse the return text as HTML or XML. I would use the Microsoft XML, v6.0 library reference for this.
POST requests:
① Entering the Well API number
I examined the web page making the selections you mention. I inspected the web traffic using fiddler and noticed that the initial request, when you submit the API number is handled by a POST request.
② POST request:
The POST body has the following parameter:
p_apinum is the key and the associated value is the original Well API number.
Using this info I formulated a POST request direct thus avoiding your first landing page.
③ Pressing the hyperlink:
Next, I noticed that the element you wanted to press:
Looking at the associated HTML it has an associated relative hyperlink:
I use a helper function to parse the page HTML to get this relative link and construct the absolute path: GetNextURL(page.body.innerHTML).
④ Making a new request:
I re-use my HTTPRequest function GetPage to send a second request, with an empty body, and grab all the tables from the HTML document returned via: page.getElementsByTagName("table").
⑤ Writing the tables to the Excel worksheet:
I loop all the tables on the page using helper function AddHeaders to write out the table headers, and WriteTables to write the current table to the sheet.
Example page content:
Example code output:
VBA:
Option Explicit
' GetWellInfo: for each well API number in the list, posts the number to the
' lookup page, follows the first hyperlink found in the response, and writes
' every table on the resulting page (headers first, then rows) to Sheet1 of
' this workbook.
'
' A well whose pages cannot be retrieved is reported in the Immediate window
' and skipped. Screen updating is switched back on however the procedure ends.
' Requires a reference to the Microsoft HTML Object Library.
Public Sub GetWellInfo()
    Const PARAM1 As String = "p_apinum"
    Const BASESTRING As String = "http://sonlite.dnr.state.la.us/sundown/cart_prod/"

    Dim ws As Worksheet, page As HTMLDocument, targetTable As HTMLTable, apiNumbers(), currNumber As Long
    Dim allTables As Object

    apiNumbers = Array(1708300502, 1708300503)

    Application.ScreenUpdating = False
    On Error GoTo CleanUp

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    ws.Cells.ClearContents

    For currNumber = LBound(apiNumbers) To UBound(apiNumbers)
        ' First request: submit the API number to the lookup page.
        Set page = GetPage(BASESTRING & "cart_con_wellapi2", apiNumbers(currNumber), PARAM1)

        ' Second request: follow the relative link found in the first response.
        ' GetPage returns Nothing on failure, so only continue with a real page.
        If Not page Is Nothing Then
            Set page = GetPage(BASESTRING & GetNextURL(page.body.innerHTML))
        End If

        If page Is Nothing Then
            Debug.Print "Skipping API number " & apiNumbers(currNumber) & ": page could not be retrieved"
        Else
            Set allTables = page.getElementsByTagName("table")
            For Each targetTable In allTables
                ' Headers go two rows below the last used row, leaving a blank
                ' row between tables; data rows start from the header row.
                AddHeaders targetTable, GetLastRow(ws, 1) + 2, ws
                WriteTables targetTable, GetLastRow(ws, 1), ws
            Next targetTable
        End If
    Next currNumber

CleanUp:
    If Err.Number <> 0 Then Debug.Print "GetWellInfo failed: " & Err.Number & " " & Err.Description
    Application.ScreenUpdating = True
End Sub
' GetPage: sends a synchronous POST to url through WinHttpRequest and returns
' the response parsed into an HTMLDocument.
'
' Parameters:
'   url       - address to post to.
'   apiNumber - value sent in the request body; only used when paramN is given.
'   paramN    - name of the body parameter; when empty the body is sent empty.
' Returns:
'   The parsed document on HTTP 200. Nothing when the send fails or the status
'   is not 200; the reason is printed to the Immediate window.
'
' Error trapping is limited to the .send call so that only transport failures
' (timeouts, name resolution and the like) are absorbed. Problems in the later
' statements are no longer silently ignored.
Public Function GetPage(ByVal url As String, Optional ByVal apiNumber As Long, Optional ByVal paramN As String = vbNullString) As HTMLDocument
    Dim objHTTP As Object, html As New HTMLDocument
    Dim sBody As String
    Dim sendErrNumber As Long, sendErrDetail As String

    Set objHTTP = CreateObject("WinHttp.WinHttpRequest.5.1")
    If Len(paramN) > 0 Then sBody = paramN & "=" & apiNumber

    With objHTTP
        .SetTimeouts 10000, 10000, 10000, 10000
        .Open "POST", url, False
        .setRequestHeader "User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"

        ' Capture the outcome of the send, then restore normal error handling
        ' straight away.
        On Error Resume Next
        .send (sBody)
        sendErrNumber = Err.Number
        sendErrDetail = Err.Source & " " & Err.Description
        On Error GoTo 0

        If sendErrNumber <> 0 Then
            Debug.Print "Error " & sendErrNumber & " " & sendErrDetail
            Exit Function
        End If

        If .Status <> 200 Then
            Debug.Print "HTTP " & .Status & " " & .statusText
            Exit Function
        End If

        html.body.innerHTML = .responseText
        Set GetPage = html
    End With
End Function
' GetNextURL: returns the target of the first href attribute found in
' inputString, with double quotes and any "about:" prefix removed.
'
' The target is taken as the text between the first "href=" and the next ">".
' Returns an empty string when inputString contains no "href=" at all, instead
' of failing with "subscript out of range".
Public Function GetNextURL(ByVal inputString As String)
    Dim hrefPos As Long, closePos As Long
    Dim remainder As String, rawLink As String

    GetNextURL = vbNullString

    hrefPos = InStr(1, inputString, "href=")
    If hrefPos = 0 Then Exit Function

    ' Text following the first "href=", up to the next occurrence of "href="
    ' (which is what the second item of a split on "href=" contains).
    remainder = Mid$(inputString, hrefPos + Len("href="))
    closePos = InStr(1, remainder, "href=")
    If closePos > 0 Then remainder = Left$(remainder, closePos - 1)

    ' Keep everything before the closing ">" of the tag, if there is one.
    closePos = InStr(1, remainder, ">")
    If closePos > 0 Then
        rawLink = Left$(remainder, closePos - 1)
    Else
        rawLink = remainder
    End If

    GetNextURL = Replace$(Replace$(rawLink, Chr$(34), vbNullString), "about:", vbNullString)
End Function
' AddHeaders: writes the text of every <th> element inside hTable across one
' worksheet row, starting in column 1.
'   hTable   - table element whose header cells are read.
'   startRow - worksheet row that receives the header texts.
'   ws       - worksheet to write to.
Public Sub AddHeaders(ByVal hTable As Object, ByVal startRow As Long, ByVal ws As Worksheet)
    Dim headerCells As Object
    Dim idx As Long

    Set headerCells = hTable.getElementsByTagName("th")

    ' The collection is zero-based while worksheet columns start at 1.
    For idx = 0 To headerCells.Length - 1
        ws.Cells(startRow, idx + 1) = headerCells.Item(idx).innerText
    Next idx
End Sub
' WriteTables: writes the <td> cell texts of hTable to a worksheet, one
' worksheet row per <tr>, starting at startRow and column 1.
'   hTable   - table element to read.
'   startRow - first worksheet row to write to (default 1).
'   ws       - destination worksheet; the active sheet is used when omitted.
'
' A <tr> that contains no <td> cells (for example a header row made of <th>)
' still advances the row counter, so it occupies a row without writing to it.
Public Sub WriteTables(ByVal hTable As HTMLTable, Optional ByVal startRow As Long = 1, Optional ByRef ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long

    r = startRow
    ' Columns are 1-based; starting at 0 would make the first write fail
    ' whenever the first row contains <td> cells.
    c = 1

    ' Write to the worksheet that was passed in, not to whichever sheet happens
    ' to be active.
    With ws
        Set tRow = hTable.getElementsByTagName("tr")
        For Each tr In tRow
            Set tCell = tr.getElementsByTagName("td")
            For Each td In tCell
                .Cells(r, c).Value = td.innerText
                c = c + 1
            Next td
            r = r + 1: c = 1
        Next tr
    End With
End Sub
' GetLastRow: returns the row number reached by moving up (End + xlUp) from the
' bottom cell of the given column on ws.
'   ws           - worksheet to inspect.
'   columnNumber - column to inspect (default 1).
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    Dim bottomCell As Range

    Set bottomCell = ws.Cells(ws.Rows.Count, columnNumber)
    GetLastRow = bottomCell.End(xlUp).Row
End Function
References:
VBE > Tools > References > HTML Object Library.