How to Isolate multiple innertext entries when using get elementbyID - html

I'm trying to isolate 2 different innerText strings from a webpage, but cannot single them out. The innerText for all the tags comes as a whole.
The date and season number are the issue.
I am using getElementById and this gives me a single element. The div with id "next_episode" has what looks like 2 different entries for inner text that I'm interested in. When I loop through the inner text of its children, these 2 entries are skipped. I can't figure out how to isolate the 2 different innerText entries of just the "next_episode" tag. I'm isolating the text I require by using the index number in the arrays my code returns.
Dim IE_00 As SHDocVw.InternetExplorer
Dim HTMLDoc_00 As MSHTML.HTMLDocument
Set IE_00 = New SHDocVw.InternetExplorer
IE_00.Visible = True
IE_00.navigate "https://next-episode.net/final-space"
Do While IE_00.readyState <> READYSTATE_COMPLETE
Loop
Set HTMLDoc_00 = IE_00.document
Dim NETC_05 As MSHTML.IHTMLElementCollection
Dim NET_05 As MSHTML.IHTMLElement
'Can loop through the inner text of the children one by one and find what
I need
Set NETC_05 = HTMLDoc_00.getElementById("next_episode").Children
For Each NET_05 In NETC_05
Debug.Print NET_05.innerText
Next NET_05
'This just gives a big block of text that includes the missing inner text
I need
Set NET_05 = HTMLDoc_00.getElementById("next_episode")
Debug.Print NET_05.innerText

Data (for the most part) is in NextSiblings:
The Node.nextSibling read-only property returns the node immediately
following the specified one in their parent's childNodes, or returns
null if the specified node is the last child in the parent element.
*1
You could write a function, like GetNextSiblings, which checks the current node for specific search strings and then extracts required values from NextSibling. I have re-ordered output columns to make for less code, but you could easily loop an alternative headers array, and use that ordering to access from dict info to write out values in a different order. I determine order of output by order of entry of keys in the dict. I loop the headers array to populate the dict keys and later update the dict with the values scraped.
The overhead of a browser is not needed as the required content is not dynamically loaded. A simple, and much faster, xhr request will suffice.
Side-note:
I would recommend, for this type of page, to use Python 3 and BeautifulSoup (bs4 4.7.1+) as this gives you access to pseudo selector :contains. The code could then be much more concise and the program faster. I show this at the end.
VBA:
Option Explicit
Public Sub GetShowInfo()
Dim html As MSHTML.HTMLDocument, headers(), i As Long, aCollection As Object, info As Object
headers = Array("Name:", "Countdown:", "Date:", "Season:", "Episode:", "Status:")
Set html = New HTMLDocument
With CreateObject("Msxml2.xmlhttp")
.Open "GET", "https://next-episode.net/final-space", False
.send
html.body.innerHTML = .responseText
End With
Set info = CreateObject("Scripting.Dictionary")
For i = LBound(headers) To UBound(headers)
info(headers(i)) = vbNullString
Next
info("Name:") = html.querySelector("#next_episode .sub_main").innerText
info("Countdown:") = html.querySelector("#next_episode span").innerText
Set aCollection = html.getElementById("middle_section").getElementsByTagName("div")
Set info = GetNextSiblings(aCollection, headers, info)
Set aCollection = html.getElementById("next_episode").getElementsByTagName("div")
Set info = GetNextSiblings(aCollection, headers, info)
With ThisWorkbook.Worksheets("Sheet1")
.Cells(1, 1).Resize(1, info.Count) = info.keys
.Cells(2, 1).Resize(1, info.Count) = info.items
End With
End Sub
Public Function GetNextSiblings(ByVal aCollection As Object, ByRef headers(), ByVal info As Object) As Object
Dim item As Object, i As Long
For Each item In aCollection
For i = 2 To UBound(headers)
If InStr(item.outerHTML, headers(i)) > 0 Then
If headers(i) = "Episode:" Then
info(headers(i)) = item.NextSibling.innerText
Else
info(headers(i)) = item.NextSibling.NodeValue
End If
Exit For
End If
Next
Next
Set GetNextSiblings = info
End Function
Reading:
NextSibling
CSS selectors
querySelector
Python (with bs4 4.7.1+):
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://next-episode.net/final-space')
soup = bs(r.content, 'lxml')
current_nodes = ['Status:','Name:', 'Countdown:','Date:','Season:','Episode:']
for node in current_nodes:
selector = f'#middle_section div:contains("{node}"), #next_episode div:contains("{node}")'
if node in ['Episode:','Name:']:
print(node, soup.select_one(selector).text.replace(node,''))
elif node == 'Countdown:':
print(node, soup.select_one(selector).next_sibling.text)
else:
print(node, soup.select_one(selector).next_sibling)

'Setting XML 05 as an Object
Dim XML_05 As New MSXML2.XMLHTTP60
'Setting HTML Document 05 as an Object
Dim HTML_05 As New MSHTML.HTMLDocument
XML_05.Open "GET", Cells(Row, NextEpisodeURL).Value, False
XML_05.send
HTML_05.body.innerHTML = XML_05.responseText
'Setting Net Element Tag Collection 05 as an Object
Dim NETC_05 As MSHTML.IHTMLElementCollection
'Setting Net Element Tag 05 as an Object
Dim NET_05 As MSHTML.IHTMLElement
'Setting Reg EX 05 as an Object
Dim REO_05 As VBScript_RegExp_55.RegExp
'Setting Match Object 05 as Object
Dim MO_05 As Object
'Setting Season array as Array
Dim SN_05() As String
'Setting Episode Name 05 as Array
Dim ENA_05() As String
'Setting Episode Number 05 as Array
Dim EN_05() As String
'Getting Episode Name Episode Number and Season Number From Net
'Set NETC_05 = HTML_05.getElementsByClassName("sub_main")
Set NET_05 = HTML_05.getElementById("previous_episode")
Set REO_05 = New VBScript_RegExp_55.RegExp
REO_05.Global = True
REO_05.IgnoreCase = True
'Getting Episode Name
REO_05.Pattern = "(Name:(.*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
ENA_05 = Split(MO_05(0), ":")
Debug.Print ENA_05(1)
Cells(Row, NextEpName).Value = ENA_05(1)
'Getting Episode Number
REO_05.Pattern = "(Episode:([0-9]*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
EN_05 = Split(MO_05(0), ":")
Debug.Print EN_05(1)
Cells(Row, EpisodeNet).Value = EN_05(1)
'Getting Season Number
REO_05.Pattern = "(Season:([0-9]*))"
Set MO_05 = REO_05.Execute(NET_05.innerText)
Debug.Print MO_05.Count
Debug.Print MO_05(0).Value
SN_05 = Split(MO_05(0), ":")
Debug.Print SN_05(1)
Cells(Row, SeasonNet).Value = SN_05(1)
'Getting Countdown From Net
Set NETC_05 = HTML_05.getElementById("next_episode").Children
Cells(Row, Countdown).Value = NETC_05(5).innerText
Debug.Print NETC_05(5).innerText
Public Sub GetShowInfo()
Dim html As MSHTML.HTMLDocument, headers(), i As Long, aCollection As Object, info As Object
Set html = New HTMLDocument
With CreateObject("Msxml2.xmlhttp")
.Open "GET", "https://next-episode.net/chicago-fire", False
.send
html.body.innerHTML = .responseText
End With
Set aCollection = html.getElementById("next_episode").getElementsByTagName("div")
Set aCollection = html.getElementById("next_episode").getElementsByClassName("subheadline")
On Error Resume Next
For Each ele In aCollection
Debug.Print ele.innerText
Debug.Print ele.outerText
Debug.Print ele.nextElementSibling.innerText
Debug.Print ele.nextElementSibling.innerText
Next ele
End Sub

Related

Amazon DVD details Web Scraping not able to pick the required Element

I passing EAN numbers of the certain movies and extracting movie name and ASIN number in Amazon.
"https://www.amazon.de/s?k=7321925005738&__mk_de_DE=ÅMÅŽÕÑ&ref=nb_sb_noss"
But in Amazon website,i face problem like some time the search results contains Sponsored product result also (It may or may not come),How ever i want to extract other than Sponsored products.
So when ever i debug.print the Amazon ASIN number and Movie name,it prints all the ASIN number and movie name (Which includes sponsored product).
For identifying the sponsored product,the way i am using is data-component-type="sp-sponsored-result"
in the response text,
where as actual product does not contain this id at "data-component-type" all,so i am not able to separate the actual movie name (Other than sponsored result)
I tried if not xxxx then ,still my code prints all here i am attaching my code
Here is my code
Sub Amazon_Pull()
Dim Link_2 As String
Link_2 = "https://www.amazon.de/s?k=7321925005738&__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&ref=nb_sb_noss"
Dim xhr As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set xhr = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
With xhr
.Open "GET", Link_2, False
.send
html.body.innerHTML = StrConv(.responseBody, vbUnicode)
End With
'Debug.Print html.body.innerHTMLDebug.Print html.getElementsByTagName("div").getAttribute("data-index").Length
Dim hTable As Object
Dim hba As Object
Set hTable = html.getElementsByTagName("div")
For Each hba In hTable
If Left(hba.getAttribute("data-asin"), 1) = "B" Then
If hba.getElementsByTagName("div")(2).getAttribute("data-component-type") <> "sp-sponsored-result" Then
Debug.Print hba.getAttribute("data-asin")
End If
End If
Next hba
Set xhr = Nothing
Set html = Nothing
'-------------
End Sub
Use a css attribute = value selector to restrict to the appropriate nodes
Dim nodeList As Object, i As Long
Set nodelist = hba.querySelectorall("[data-asin]")
For i = 0 To nodeList.Length - 1
Debug.Print nodeList.item(i).getAttribute("data-asin")
Next
You can remove your conditional statements and add all the conditional logic into the css selector with starts with ^ operator for the character B
Dim nodeList As Object, i As Long
Set nodelist = hba.querySelectorall("[data-asin^=B]")
For i = 0 To nodeList.Length - 1
Debug.Print nodeList.item(i).getAttribute("data-asin")
Next
You can do it quick and dirty like this. But if the word "Gesponsert" is part of film title, it fails ;-)
Sub Amazon_Pull()
Dim Link_2 As String
Dim xhr As MSXML2.XMLHTTP60
Dim html As MSHTML.HTMLDocument
Dim hTable As Object
Dim hba As Object
Dim i As Long
Link_2 = "https://www.amazon.de/s?k=7321925005738"
Set xhr = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
With xhr
.Open "GET", Link_2, False
.send
html.body.innerHTML = StrConv(.responseBody, vbUnicode)
End With
Set hTable = html.querySelectorAll("div[data-index]")
For i = 0 To hTable.Length - 1
If InStr(1, hTable(i).innerText, "Gesponsert") = 0 Then
Debug.Print hTable(i).getAttribute("data-asin") & " " & hTable(i).getElementsByTagName("h2")(0).innerText
End If
Next i
Set xhr = Nothing
Set html = Nothing
End Sub
In my opinion it's always better to use the code structure of a page and not a part of the content. I know, that's not always possible and it's often more complex.
To check if an offer on amazon is sponsored, you can use the structure of the page code like this. One advantage is that it also works on the international Amazon platforms, regardless of the national language.
(Not tested, because Amazon blocked me as a bot.)
Sub Amazon_Pull()
Dim Link_2 As String
Dim xhr As MSXML2.XMLHTTP60
Dim html As MSHTML.HTMLDocument
Dim hTable As Object
Dim hba As Object
Dim i As Long
Dim check As Long
Dim sponsored As Boolean
Dim checkSponsored As Object
Link_2 = "https://www.amazon.de/s?k=7321925005738"
'Link_2 = "https://www.amazon.de/s?k=apple"
Set xhr = New MSXML2.XMLHTTP60
Set html = New MSHTML.HTMLDocument
With xhr
.Open "GET", Link_2, False
.send
html.body.innerHTML = StrConv(.responseBody, vbUnicode)
End With
Set hTable = html.querySelectorAll("div[data-index]")
For i = 0 To hTable.Length - 1
sponsored = False
Set checkSponsored = hTable(i).querySelectorAll("div[data-component-type]")
For check = 0 To checkSponsored.Length - 1
If checkSponsored.getAttribute("data-component-type") = "sp-sponsored-result" Then
sponsored = True
End If
Next check
If Not sponsored Then
Debug.Print hTable(i).getAttribute("data-asin") & " " & hTable(i).getElementsByTagName("h2")(0).innerText
End If
Next i
Set xhr = Nothing
Set html = Nothing
End Sub

I'm getting stuck at vba runtime error 424

I'm getting
run-time error 424
in 68th row (line)
request.Open "GET", Url, False
and I don't know how to fix it.
My previous question I posted ;
How to scrape specific part of online english dictionary?
My final goal is to get result like this;
A B
beginning bɪˈɡɪnɪŋ
behalf bɪˈhæf
behave bɪˈheɪv
behaviour bɪˈheɪvjər
belong bɪˈlɔːŋ
below bɪˈloʊ
bird bɜːrd
biscuit ˈbɪskɪt
Here's code I wrote, and it's mostly based on someone else's code I found on internet.
' Microsoft ActiveX Data Objects x.x Library
' Microsoft XML, v3.0
' Microsoft VBScript Regular Expressions
Sub ParseHelp()
' Word reference from
Dim Url As String
Url = "https://www.oxfordlearnersdictionaries.com/definition/english/" & Cells(ActiveCell.Row, "B").Value
' Get dictionary's html
Dim Html As String
Html = GetHtml(Url)
' Check error
If InStr(Html, "<TITLE>Not Found</Title>") > 0 Then
MsgBox "404"
Exit Sub
End If
' Extract phonetic alphabet from HTML
Dim wrapPattern As String
wrapPattern = "<span class='name' (.*?)</span>"
Set wrapCollection = FindRegexpMatch(Html, wrapPattern)
' MsgBox StripHtml(CStr(wrapCollection(1)))
' Fill phonetic alphabet into cell
If Not wrapCollection Is Nothing Then
Dim wrap As String
On Error Resume Next
wrap = StripHtml(CStr(wrapCollection(1)))
If Err.Number <> 0 Then
wrap = ""
End If
Cells(ActiveCell.Row, "C").Value = wrap
Else
MsgBox "not found"
End If
End Sub
Public Function StripHtml(Html As String) As String
Dim RegEx As New RegExp
Dim sOut As String
Html = Replace(Html, "</li>", vbNewLine)
Html = Replace(Html, " ", " ")
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = "<[^>]+>"
End With
sOut = RegEx.Replace(Html, "")
StripHtml = sOut
Set RegEx = Nothing
End Function
Public Function GetHtml(Url As String) As String
Dim xmlhttp As Object
Set xmlhttp = CreateObject("MSXML2.serverXMLHTTP")
Dim converter As New ADODB.stream
' Get
request.Open "GET", Url, False
request.send
' raw bytes
converter.Open
converter.Type = adTypeBinary
converter.Write request.responseBody
' read
converter.Position = 0
converter.Type = adTypeText
converter.Charset = "utf-8"
' close
GetHtml = converter.ReadText
converter.Close
End Function
Public Function FindRegexpMatch(txt As String, pat As String) As Collection
Set FindRegexpMatch = New Collection
Dim rx As New RegExp
Dim matcol As MatchCollection
Dim mat As Match
Dim ret As String
Dim delimiter As String
txt = Replace(txt, Chr(10), "")
txt = Replace(txt, Chr(13), "")
rx.Global = True
rx.IgnoreCase = True
rx.MultiLine = True
rx.Pattern = pat
Set matcol = rx.Execute(txt)
'MsgBox "Match:" & matcol.Count
On Error GoTo ErrorHandler
For Each mat In matcol
'FindRegexpMatch.Add mat.SubMatches(0)
FindRegexpMatch.Add mat.Value
Next mat
Set rx = Nothing
' Insert code that might generate an error here
Exit Function
ErrorHandler:
' Insert code to handle the error here
MsgBox "FindRegexpMatch. " & Err.GetException()
Resume Next
End Function
Any kind of help would be greatly appreciated.
The following is an example of how to read in values from column A and write out pronounciations to column B. It uses css selectors to match a child node then steps up to parentNode in order to ensure entire pronounciation is grabbed. There are a number of ways you could have matched on the parent node to get the second pronounciation. Note that I use a parent node and Replace as the pronounciation may span multiple childNodes.
If doing this for lots of lookups please be a good netizen and put some waits in the code so as to not bombard the site with requests.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, urls()
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row 'you need at least two words in column A or change the redim.
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(urls))
Set html = New MSHTML.HTMLDocument
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", "https://www.oxfordlearnersdictionaries.com/definition/english/" & urls(i), False
.send
html.body.innerHTML = .responseText
data = Replace$(Replace$(html.querySelector(".name ~ .wrap").ParentNode.innerText, "/", vbNullString), Chr$(10), Chr$(32))
results(i) = Right$(data, Len(data) - 4)
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
Required references (VBE>Tools>References):
Microsoft HTML Object Library
Should you go down the API route then here is a small example. You can make 1000 free calls in a month with Prototype account. The next best, depending on how many calls you wish to make looks like the 10,001 calls (that one extra PAYG call halves the price). # calls will be affected by whether word is head word or needs lemmas lookup call first. The endpoint construction you need is GET /entries/{source_lang}/{word_id}?fields=pronunciations though that doesn't seem to filter massively. You will need a json parser to handle the json returned e.g. github.com/VBA-tools/VBA-JSON/blob/master/JsonConverter.bas. Download raw code from there and add to standard module called JsonConverter. You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, words()
'If not performing lemmas lookup then must be head word e.g. behave, behalf
Const appId As String = "yourAppId"
Const appKey As String = "yourAppKey"
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row
words = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(words))
Set html = New MSHTML.HTMLDocument
Dim json As Object
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(words) To UBound(words)
.Open "GET", "https://od-api.oxforddictionaries.com/api/v2/entries/en-us/" & LCase$(words(i)) & "?fields=pronunciations", False
.setRequestHeader "app_id", appId
.setRequestHeader "app_key", appKey
.setRequestHeader "ContentType", "application/json"
.send
Set json = JsonConverter.ParseJson(.responseText)
results(i) = IIf(json("results")(1)("type") = "headword", json("results")(1)("lexicalEntries")(1)("pronunciations")(2)("phoneticSpelling"), "lemmas lookup required")
Set json = Nothing
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub

Webscraping in VBA where some HTML information has no way to refer to it

I have this VBA script scraping from this URL https://accessgudid.nlm.nih.gov/devices/10806378034350
I want the LOT,SERIAL, and EXPIRATION information which in the below pic, has a "Yes" or "No" inside the HTML.
How do I return just that Yes or No information?
Sub GetInnerInformation(HTMLPage As MSHTML.HTMLDocument)
Dim HTMLResult As MSHTML.IHTMLElement
Dim HTMLResults As MSHTML.IHTMLElementCollection
Dim HTMLRow As MSHTML.IHTMLElement
Dim HTMLCell As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer
Set HTMLResults = HTMLPage.getElementsByClassName("device-attribute")
For Each HTMLResult In HTMLResults
If (HTMLResult.innerText Like "*Lot*") = True Then
Debug.Print HTMLResult.innerText, HTMLResult.outerText, HTMLResult.innerHTML
End If
Next HTMLResult
End Sub
In my Immediate Window I get:
Lot or Batch Number: Lot or Batch Number: Lot or Batch Number:
So no reference to the Yes or No that is in the HTML.
HTML Parser:
You could use a css attribute = value selector to target the span with [?] that is just before the div of interest. Then climb up to shared parent with parentElement, and move to the div of interest with NextSibling. You can then use getElementsByTagName to grab the labels nodes, and loop that nodeList to write out required info. To get the values associated with labels, you again need to use NextSibling to handle the br children within the parent div.
I use xmlhttp to make the request which is faster than opening a browser.
Option Explicit
Public Sub WriteOutYesNos()
Dim html As MSHTML.HTMLDocument
Set html = New MSHTML.HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://accessgudid.nlm.nih.gov/devices/10806378034350", False
.send
html.body.innerHTML = .responseText
End With
Dim nodes As Object, i As Long
Set nodes = html.querySelector("[title*='A production identifier (PI) is a variable']").parentElement.NextSibling.getElementsByTagName("LABEL")
For i = 0 To nodes.Length - 3
With ActiveSheet
.Cells(i + 1, 1) = nodes(i).innerText
.Cells(i + 1, 2) = nodes(i).NextSibling.NodeValue
End With
Next
End Sub
JSON Parser:
Data is also available as json which means you can use a json parser to handle. I use jsonconverter.bas as the json parser to handle response. Download raw code from here and add to standard module called JsonConverter . You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
Option Explicit
Public Sub WriteOutYesNos()
Dim json As Object, ws As Worksheet, results(), i As Long, s As String
Set ws = ThisWorkbook.Worksheets("Sheet1")
results = Array("lotBatch", "serialNumber", "manufacturingDate")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://accessgudid.nlm.nih.gov/devices/10806378034350.json", False
.send
Set json = JsonConverter.ParseJson(.responseText)
End With
With ws
For i = LBound(results) To UBound(results)
.Cells(i + 1, 1) = results(i)
.Cells(i + 1, 2).Value = IIf(json(results(i)), "Yes", "No")
Next
End With
End Sub
XML Parser:
Results also come as xml which you can parse with xml parser provided you handle the default namespace appropriately:
Option Explicit
Public Sub WriteOutYesNos()
Dim xmlDoc As Object, ws As Worksheet, results(), i As Long
Set xmlDoc = CreateObject("MSXML2.DOMDocument")
Set ws = ThisWorkbook.Worksheets("Sheet1")
results = Array("lotBatch", "serialNumber", "manufacturingDate")
With xmlDoc
.validateOnParse = True
.setProperty "SelectionLanguage", "XPath"
.setProperty "SelectionNamespaces", "xmlns:i='http://www.fda.gov/cdrh/gudid'"
.async = False
If Not .Load("https://accessgudid.nlm.nih.gov/devices/10806378034350.xml") Then
Err.Raise .parseError.ErrorCode, , .parseError.reason
Exit Sub
End If
End With
With ws
For i = LBound(results) To UBound(results)
.Cells(i + 1, 1) = results(i)
.Cells(i + 1, 2).Value = IIf(xmlDoc.SelectSingleNode("//i:" & results(i)).Text, "Yes", "No")
Next
End With
End Sub
Tinkered around and found it. I had to hardcode the results a little but here is what I got. Let me know if you've found a more elegant answer!
Sub GetInnerInformation(HTMLPage As MSHTML.HTMLDocument)
Dim HTMLResult As MSHTML.IHTMLElement
Dim HTMLResults As MSHTML.IHTMLElementCollection
Dim HTMLRow As MSHTML.IHTMLElement
Dim HTMLCell As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer
Dim Lot As Boolean
Dim Serial As Boolean
Dim Expiration As Boolean
Set HTMLResults = HTMLPage.getElementsByClassName("expandable-device-content")
For Each HTMLResult In HTMLResults
If (HTMLResult.innerText Like "*Lot or Batch Number*") = True Then
Debug.Print HTMLResult.innerText
If HTMLResult.innerText Like "*Lot or Batch Number: Yes*" Then
Lot = True
End If
If HTMLResult.innerText Like "*Lot or Batch Number: No*" Then
Lot = False
End If
If HTMLResult.innerText Like "*Serial Number: Yes*" Then
Serial = True
End If
If HTMLResult.innerText Like "*Serial Number: No*" Then
Serial = False
End If
If HTMLResult.innerText Like "*Expiration Date: Yes*" Then
Serial = True
End If
If HTMLResult.innerText Like "*Expiration Date: No*" Then
Serial = False
End If
Debug.Print Lot, Serial, Expiration
End If
Next HTMLResult
End Sub

Exploring the Instr VBA Function In Webscraping

I want to scrape this URL https://www.realtor.com/realestateandhomes-search/06510 using the VBA InStr function and extract all URLs with this substring "06510"
Here's is a sample code I've been trying to make work.
Option Explicit
Sub GetLinks()
'
'To use HTMLDocument you need to set a reference to Tools -> References -> Microsoft HTML Object Library
Dim HTML As New HTMLDocument
Dim http As Object
Dim links As Object
Dim link As HTMLHtmlElement
Dim counter As Long
Dim website As Range
Dim LastRange As Range
Dim row As Long
Dim continue As Boolean
Dim respHead As String
Dim lRow As Long
Application.ScreenUpdating = False
' The row where website addresses start
row = 30
continue = True
lRow = Cells(Rows.count, 1).End(xlUp).row + 1
' XMLHTTP gives errors where ServerXMLHTTP does not
' even when using the same URL's
'Set http = CreateObject("MSXML2.XMLHTTP")
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")
Do While continue
' Could set this to first cell with URL then OFFSET columns to get next web site
Set website = Range("A" & row)
Set LastRange = Range("B" & lRow)
If Len(website.Value) < 1 Then
continue = False
Exit Sub
End If
If website Is Nothing Then
continue = False
End If
'Debug.Print website
With http
On Error Resume Next
.Open "GET", website.Value, False
.send
' If Err.Num is not 0 then an error occurred accessing the website
' This checks for badly formatted URL's. The website can still return an error
' which should be checked in .Status
'Debug.Print Err.Number
' Clear the row of any previous results
Range("B" & row & ":e" & row).Clear
' If the website sent a valid response to our request
If Err.Number = 0 Then
If .Status = 200 Then
HTML.body.innerHTML = http.responseText
Set links = HTML.getElementsByTagName("a")
For Each link In links
If InStr(link.outerHTML, "06510") Then
LastRange.Value = link.href
End If
Next
End If
Set website = Nothing
Else
'Debug.Print "Error loading page"
LastRange.Value = "Error with website address"
End If
On Error GoTo 0
End With
row = row + 1
Loop
Application.ScreenUpdating = True
End Sub
After inspecting the page, here's a sample of the kind of URL to extract - https://www.realtor.com/realestateandhomes-detail/239-Bradley-St_New-Haven_CT_06510_M36855-92189. Any help will be appreciated
Using QHarr's code in a simplified way...
Sub GetLinks()
Dim url As String, links_count As Integer
Dim j As Integer, row As Integer
Dim XMLHTTP As Object, html As Object
'Dim tr_coll As Object, tr As Object
'Dim elements As Object
Dim i As Long, allLinksOfInterest As Object
'Dim td_coll As Object, td As Object, td_col, objT
url = "https://www.realtor.com/realestateandhomes-search/06510"
Set XMLHTTP = CreateObject("MSXML2.XMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.responseText
Set allLinksOfInterest = html.querySelectorAll("[href*='06510']")
For i = 0 To allLinksOfInterest.Length - 1
Debug.Print allLinksOfInterest.Item(i).href
Next
End Sub
Please check if I'm missing anything. I'm still getting the error "Object doesn't support this property or method"
Don't use Instr on entire node outerHTML during a loop of all a tags. There are times when this is required but this shouldn't be one of them (hopefully).
You want to use attribute = value css selector with contains, *, operator. It is specifically for the purpose of matching on substrings in attribute values. This is more efficient.
Set allLinksOfInterest = html.querySelectorAll("[href*='06510']")
So,
Dim i As Long, allLinksOfInterest As Object
Set allLinksOfInterest = html.querySelectorAll("[href*='06510']")
For i = 0 To allLinksOfInterest.Length - 1
Debug.Print allLinksOfInterest.Item(i).href
Next
Attribute = value with contains operator:
[attr*=value]
Represents elements with an attribute name of attr whose
value contains at least one occurrence of value within the string.
VBA:
Produces 26 links currently.All are relative links so need domain added as shown in loop. Some are duplicates so consider adding to a dictionary as keys so as remove duplicates.
Option Explicit
Public Sub GetLinks()
Dim html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.realtor.com/realestateandhomes-search/06510", False
.send
html.body.innerHTML = .responseText
End With
Dim i As Long, allLinksOfInterest As Object
Set allLinksOfInterest = html.querySelectorAll("[href*='06510']")
For i = 0 To allLinksOfInterest.Length - 1
Debug.Print Replace$(allLinksOfInterest.item(i).href,"about:","https://www.realtor.com")
Next
End Sub
If InStr(link.outerHTML, "06510") Then
In the code above, InStr function was used like boolean function. But it is not boolean, instead it returns integer. So, you should add comparison operator after function. May be like:
If InStr(link.outerHTML, "06510")>0 Then

Web scraping using VBA

I would like to extract data from this URL.
I want to extract Title, mobile contact number and address from each of 10 business cards.
Here is some code I tried but did not get success.
Public Sub GetValueFromBrowser()
On Error Resume Next
Dim Sn As Integer
Dim ie As Object
Dim url As String
Dim Doc As HTMLDocument
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
For Sn = 1 To 1
url = Sheets("Infos").Range("C" & Sn).Value
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = 0
.navigate url
While .Busy Or .readyState <> 4
DoEvents
Wend
End With
Set Doc = ie.document
Set elements = Doc.getElementsByClassName(" col-sm-5 col-xs-8 store-details sp-detail paddingR0")
Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "lng_cont_name" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = Doc.getElementsByClassName("Store-Name")(count).innerText
Cells(erow, 2) = Doc.getElementsByClassName("cont_fl_addr")(count).innerText
count = count + 1
End If
Next element
If Val(Left(Sn, 2)) = 99 Then
ActiveWorkbook.Save
End If
Next Sn
End Sub
The tel numbers were not easy as I think they have purposefully been made difficult to scrape. I have found a way to decipher the values from the CSS pseudo ::before element content. The addresses and titles were straightforward CSS selections.
I have since written a cleaner script in python here.
So, how do the various parts of the code work?
titles:
Set titles = .querySelectorAll(".jcn [title]")
I target the titles as elements that have a title attribute with a parent jcn class attribute. The "." indicates a class selector, the "[]" an attribute selector, and the " " in between is a descendant combinator.
querySelectorAll method of document returns a nodeList of all matching elements on the page i.e. the 10 titles.
addresses:
Set addresses = .querySelectorAll(".desk-add.jaddt")
Addresses are targeted by their class attribute desk-add jaddt. As compound class names are not allowed, an additional "." has to be replace the white space in the name.
Telephone numbers (via deciphering contents within storesTextToDecipher) :
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
This is where the magic happens. The numbers are not available via the DOM directly as they are pseudo element content.
If you inspect the relevant HTML you will find a series of pseudo ::before elements. VBA exposes no mechanism for applying pseudo selectors to try and get at this info in the CSS for the page.
What you see is in fact a series of span elements that each have a class attribute beginning with mobilesv. These elements sit within a single parent element of class col-sm-5 col-xs-8 store-details sp-detail paddingR0 (note again the compound class name).
I initially gather a nodeList of all the parent elements.
Sample of returned elements:
Each of these parent elements houses the class name (beginning with mobilesv) elements that constitute the characters of the telephone number string. Some characters are numbers in the string, others represent the +()-. Note: The 2|3 letter strings, in the class names, after icon- e.g. dc, fe.
For example, the first search result on the page, for the initial number 9 in the telephone number:
The actual CSS content for this pseudo element /telephone character can be observed in the CSS styling:
Note the class name and before pseudo element selector: .icon-ji:before
And the content is \9d010.
Long story short.... you can extract the 2 or 3 letters after icon- i.e. ji in this case, and the number string after \9d0, i.e. 10 in this case, and use these two bits of info to decipher the telephone number. This info is available in the response:
See the same 2/3 letter strings that are associated with the class names of the telephone string on the left, and the content instructions on the right. A little maths deduces that the number on the right is 1 greater than the telephone number, for that class, shown in the image of the webpage. I simply create a dictionary that then maps the 2/3 letter abbreviation to the telephone number by parsing this section of the html.
When looping over storesTextToDecipher, I use this dictionary to decipher the actual telephone number from the matching 2/3 letter abbreviation in the class name.
VBA:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""\9d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
html.body.innerHTML = sResponse
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
With html
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
End With
For i = 0 To titles.Length - 1
Debug.Print "title: " & titles.item(i).innerText
Debug.Print "address: " & addresses.item(i).innerText
Debug.Print GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
Next
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
Sample output:
Edit: All page results
As you now want more than 10 the following uses the expected page result count (NUMBER_RESULTS_ON_PAGE) to gather the information from the page. It scrolls the page until the expected number of telephone numbers (which should be unique) are found, or the MAX_WAIT_SEC is hit. This means you avoid an infinite loop and can set your expected result count if you expect a different number. It does rely on each store having a telephone number listed - this seems a fairly reasonable assumption.
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, resultCountDict As Object, cipherDict As Object, t As Date
Const MAX_WAIT_SEC As Long = 300 'wait 5 minutes max before exiting loop to get all results
Const NUMBER_RESULTS_ON_PAGE As Long = 80
Const URL = "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3"
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
Application.ScreenUpdating = True
Set resultCountDict = CreateObject("Scripting.Dictionary")
Set cipherDict = GetCipherDict(URL)
With IE
.Visible = True
.Navigate2 URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
t = Timer
Do
DoEvents
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
Dim telNumber As String, i As Long
For i = 0 To titles.Length - 1
telNumber = GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
If Not resultCountDict.Exists(telNumber) Then
resultCountDict.Add telNumber, Array(titles.item(i).innerText, addresses.item(i).innerText, telNumber)
End If
Next
.parentWindow.execScript "window.scrollBy(0, window.innerHeight);", "javascript"
While IE.Busy Or IE.readyState < 4: DoEvents: Wend
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop Until resultCountDict.Count = NUMBER_RESULTS_ON_PAGE
End With
.Quit
End With
Dim key As Variant, rowCounter As Long
rowCounter = 1
With ThisWorkbook.Worksheets("Sheet1")
For Each key In resultCountDict.keys
.Cells(rowCounter, 1).Resize(1, 3) = resultCountDict(key)
rowCounter = rowCounter + 1
Next
End With
Application.ScreenUpdating = True
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
Public Function GetCipherDict(ByVal URL As String) As Object
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""\9d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
Set GetCipherDict = cipherDict
End Function
EDIT:
Version for where more than one number is present at top (Please note that if you make too many requests or too quickly server will serve you random pages):
Option Explicit
Public Sub GetDetails()
Dim re As Object, decodeDict As Object, i As Long
Dim html As MSHTML.htmlDocument, responseText As String, keys(), values()
Set decodeDict = CreateObject("Scripting.Dictionary")
Set re = CreateObject("vbscript.regexp")
Set html = New MSHTML.htmlDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/chengalpattu/Oasis-Pharma-Near-Saraswathi-Children-School-Revathypuram-Urapakkam/9999PXX44-XX44-181123145524-X8G7_BZDET", False
.setRequestHeader "User-Agent", "Mozilla/4.0"
.send
responseText = .responseText
html.body.innerHTML = responseText
End With
keys = GetMatches(re, responseText, "-(\w+):before")
If UBound(keys) = 0 Then Exit Sub
values = GetMatches(re, responseText, "9d0(\d+)", True)
For i = LBound(values) To UBound(values)
decodeDict(keys(i)) = values(i)
Next
Dim itemsToDecode()
decodeDict(keys(UBound(keys))) = "+"
itemsToDecode = GetValuesToDecode(html)
PrintNumbers re, html, itemsToDecode, decodeDict
End Sub
Public Function GetMatches(ByVal re As Object, ByVal inputString As String, ByVal sPattern As String, Optional ByVal numeric = False, Optional ByVal spanSearch = False) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With re
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .Test(inputString) Then
Set matches = .Execute(inputString)
ReDim arrMatches(0 To matches.Count - 1)
For Each iMatch In matches
If numeric Then
arrMatches(i) = iMatch.SubMatches.item(0) - 1
Else
If spanSearch Then
arrMatches(i) = iMatch
Else
arrMatches(i) = iMatch.SubMatches.item(0)
End If
End If
i = i + 1
Next iMatch
Else
ReDim arrMatches(0)
arrMatches(0) = vbNullString
End If
End With
GetMatches = arrMatches
End Function
Public Function GetValuesToDecode(ByVal html As MSHTML.htmlDocument) As Variant
Dim i As Long, elements As Object, results(), Class As String
Set elements = html.querySelectorAll(".telCntct span[class*='icon']")
ReDim results(elements.Length - 1)
For i = 0 To elements.Length - 1
Class = elements.item(i).className
results(i) = Right$(Class, Len(Class) - InStrRev(Class, "-"))
Next
GetValuesToDecode = results
End Function
Public Sub PrintNumbers(ByVal re As Object, ByVal html As htmlDocument, ByVal itemsToDecode As Variant, ByVal decodeDict As Object)
Dim output As String, i As Long
For i = LBound(itemsToDecode) To UBound(itemsToDecode)
output = output & decodeDict(itemsToDecode(i))
Next
Dim htmlToSearch As String, groups As Variant, startPos As Long, oldStartPos As Long
htmlToSearch = html.querySelector(".telCntct").outerHTML
groups = GetMatches(re, htmlToSearch, "mobilesv|,", False, True)
startPos = 1
Dim totalNumbers As Long
For i = LBound(groups) To UBound(groups)
If InStr(groups(i), ",") > 0 Then
totalNumbers = totalNumbers + 1
Debug.Print Mid$(output, startPos, IIf(startPos = 1, i, i - startPos))
startPos = i + 1
End If
Next
If totalNumbers = 1 Then Debug.Print Right$(output, Len(output) - startPos - 1)
End Sub