Regex VBA Match - html

How can i get the value 81.16 in second msgbox?
Regex get only numeric values from string
Sub Tests()
Const strTest As String = "<td align=""right"">116.83<span class=""up2""></span><br>81.16<span class=""dn2""></span></td>"
RE6 strTest
End Sub
Function RE6(strData As String) As String
Dim RE As Object, REMatches As Object
Set RE = CreateObject("vbscript.regexp")
With RE
' .MultiLine = True
'.Global = False
.Pattern = "\b[\d.]+\b"
End With
Set REMatches = RE.Execute(strData)
MsgBox REMatches(0)
MsgBox REMatches(1) 'getting error here
End Function

First of all, try not to use RegEx to parse any kind of xml. Try using xml parsers or XPath
XPath, the XML Path Language, is a query language for selecting nodes
from an XML document. In addition, XPath may be used to compute values
(e.g., strings, numbers, or Boolean values) from the content of an XML
document.
To solve your problem change
'.Global = False 'Matches only first occurrence
Into
.Global = True 'Matches all occurrences
How to properly parse XML in VBA:
Please note that I had to close your <br /> tag to be valid.
Private Sub XmlTestSub()
On Error GoTo ErrorHandler
Dim xml As MSXML2.DOMDocument60
Dim nodes As MSXML2.IXMLDOMNodeList
Dim node As MSXML2.IXMLDOMNode
yourXmlString = "<td align=""right"">116.83<span class=""up2""></span><br />81.16<span class=""dn2""></span></td>"
Set xml = New MSXML2.DOMDocument60
If (Not xml.LoadXML(yourXmlString)) Then
Err.Raise xml.parseError.ErrorCode, "XmlTestSub", xml.parseError.reason
End If
Set nodes = xml.SelectNodes("/td/text()") 'XPath Query
For Each node In nodes
Debug.Print node.NodeValue
Next node
Done:
Exit Sub
ErrorHandler:
MsgBox Err.Number & " " & Err.Description, vbCritical
Resume Done
End Sub

Related

Working code gives error when run on any other PC

I have working code that requests information from a website.
When I send the file to another PC and run the code, I get:
"Run-time error'91': Object variable or With block variable not set"
I ensured:
Macro security levels are the same (Enable all macros & trust access to VBA project object model)
All the checked boxes in VBA editor > Tools > References are the same (Specifically Microsoft HTML Object Library & Microsoft XML, V6.0 is checked)
Sub Macro1()
Dim request As Object
Dim response As String
Dim html As New HTMLDocument
Dim website As String
Dim Current As Variant
website = "https://www.thalia.de/shop/home/artikeldetails/A1062020980"
Set request = CreateObject("MSXML2.XMLHTTP")
request.Open "GET", website, False
request.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
request.send
response = StrConv(request.responseBody, vbUnicode)
html.body.innerHTML = response
Current = html.getElementsByClassName("element-text-standard value").Item(0).innerText
MsgBox (Current)
End Sub
The line on which I get the error:
Current = html.getElementsByClassName("element-text-standard value").Item(0).innerText
WinHttp
I've tried a ton of various solutions, in the end, it came just to replacing MSXML2.XMLHTTP with WinHttp.WinHttpRequest.5.1 to make it work on my computer. While I was researching, I rewrote the whole thing a little bit. I'm a noob at this so I can't explain why one works and the other does not.
Option Explicit
Sub Macro1()
Const URL As String _
= "https://www.thalia.de/shop/home/artikeldetails/A1062020980"
'Const URL As String _
= "https://www.thalia.de/shop/home/artikeldetails/A1060523771"
Const ClassName As String _
= "element-text-standard value"
Dim WhrResponseText As String
WhrResponseText = GetWhrResponseText(URL)
If Len(WhrResponseText) = 0 Then
MsgBox "Could not get a response.", vbExclamation
Exit Sub
End If
' ' Write the response string to a worksheet.
' Dim ws As Worksheet: Set ws = ActiveSheet ' improve!
' Dim arr() As String: arr = Split(WhrResponseText, vbLf)
' ws.Range("A1").Resize(UBound(arr) + 1).Value = Application.Transpose(arr)
Dim Elements As Object
With CreateObject("htmlfile")
.body.innerHTML = WhrResponseText
Set Elements = .getElementsByClassName(ClassName)
End With
' Using 'Length' to determine if a result was found and returning
' the first element.
Dim Result As Variant
With Elements
If .Length > 0 Then
Result = .Item(0).innerText
MsgBox Result
Else
MsgBox "Nothing found."
End If
End With
Dim i As Long
' Loop through the elements using 'For Each... Next'.
Dim Element As Object
For Each Element In Elements
Debug.Print i, Element.innerText
i = i + 1
Next Element
' ' Loop through the elements using 'For... Next'.
' With Elements
' For i = 0 To .Length - 1
' Debug.Print i, .Item(i).innerText
' Next i
' End With
End Sub
Function GetWhrResponseText( _
ByVal URL As String) _
As String
Const ProcName As String = "GetWhrResponseText"
On Error GoTo ClearError
With CreateObject("WinHttp.WinHttpRequest.5.1")
.Open "GET", URL, False
.send
GetWhrResponseText = StrConv(.responseBody, vbUnicode)
End With
ProcExit:
Exit Function
ClearError:
Debug.Print "'" & ProcName & "' Run-time error '" _
& Err.Number & "':" & vbLf & " " & Err.Description
Resume ProcExit
End Function

I'm getting stuck at vba runtime error 424

I'm getting
run-time error 424
in 68th row (line)
request.Open "GET", Url, False
and I don't know how to fix it.
My previous question I posted ;
How to scrape specific part of online english dictionary?
My final goal is to get result like this;
A B
beginning bɪˈɡɪnɪŋ
behalf bɪˈhæf
behave bɪˈheɪv
behaviour bɪˈheɪvjər
belong bɪˈlɔːŋ
below bɪˈloʊ
bird bɜːrd
biscuit ˈbɪskɪt
Here's code I wrote, and it's mostly based on someone else's code I found on internet.
' Microsoft ActiveX Data Objects x.x Library
' Microsoft XML, v3.0
' Microsoft VBScript Regular Expressions
Sub ParseHelp()
' Word reference from
Dim Url As String
Url = "https://www.oxfordlearnersdictionaries.com/definition/english/" & Cells(ActiveCell.Row, "B").Value
' Get dictionary's html
Dim Html As String
Html = GetHtml(Url)
' Check error
If InStr(Html, "<TITLE>Not Found</Title>") > 0 Then
MsgBox "404"
Exit Sub
End If
' Extract phonetic alphabet from HTML
Dim wrapPattern As String
wrapPattern = "<span class='name' (.*?)</span>"
Set wrapCollection = FindRegexpMatch(Html, wrapPattern)
' MsgBox StripHtml(CStr(wrapCollection(1)))
' Fill phonetic alphabet into cell
If Not wrapCollection Is Nothing Then
Dim wrap As String
On Error Resume Next
wrap = StripHtml(CStr(wrapCollection(1)))
If Err.Number <> 0 Then
wrap = ""
End If
Cells(ActiveCell.Row, "C").Value = wrap
Else
MsgBox "not found"
End If
End Sub
Public Function StripHtml(Html As String) As String
Dim RegEx As New RegExp
Dim sOut As String
Html = Replace(Html, "</li>", vbNewLine)
Html = Replace(Html, " ", " ")
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = "<[^>]+>"
End With
sOut = RegEx.Replace(Html, "")
StripHtml = sOut
Set RegEx = Nothing
End Function
Public Function GetHtml(Url As String) As String
Dim xmlhttp As Object
Set xmlhttp = CreateObject("MSXML2.serverXMLHTTP")
Dim converter As New ADODB.stream
' Get
request.Open "GET", Url, False
request.send
' raw bytes
converter.Open
converter.Type = adTypeBinary
converter.Write request.responseBody
' read
converter.Position = 0
converter.Type = adTypeText
converter.Charset = "utf-8"
' close
GetHtml = converter.ReadText
converter.Close
End Function
Public Function FindRegexpMatch(txt As String, pat As String) As Collection
Set FindRegexpMatch = New Collection
Dim rx As New RegExp
Dim matcol As MatchCollection
Dim mat As Match
Dim ret As String
Dim delimiter As String
txt = Replace(txt, Chr(10), "")
txt = Replace(txt, Chr(13), "")
rx.Global = True
rx.IgnoreCase = True
rx.MultiLine = True
rx.Pattern = pat
Set matcol = rx.Execute(txt)
'MsgBox "Match:" & matcol.Count
On Error GoTo ErrorHandler
For Each mat In matcol
'FindRegexpMatch.Add mat.SubMatches(0)
FindRegexpMatch.Add mat.Value
Next mat
Set rx = Nothing
' Insert code that might generate an error here
Exit Function
ErrorHandler:
' Insert code to handle the error here
MsgBox "FindRegexpMatch. " & Err.GetException()
Resume Next
End Function
Any kind of help would be greatly appreciated.
The following is an example of how to read in values from column A and write out pronounciations to column B. It uses css selectors to match a child node then steps up to parentNode in order to ensure entire pronounciation is grabbed. There are a number of ways you could have matched on the parent node to get the second pronounciation. Note that I use a parent node and Replace as the pronounciation may span multiple childNodes.
If doing this for lots of lookups please be a good netizen and put some waits in the code so as to not bombard the site with requests.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, urls()
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row 'you need at least two words in column A or change the redim.
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(urls))
Set html = New MSHTML.HTMLDocument
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", "https://www.oxfordlearnersdictionaries.com/definition/english/" & urls(i), False
.send
html.body.innerHTML = .responseText
data = Replace$(Replace$(html.querySelector(".name ~ .wrap").ParentNode.innerText, "/", vbNullString), Chr$(10), Chr$(32))
results(i) = Right$(data, Len(data) - 4)
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
Required references (VBE>Tools>References):
Microsoft HTML Object Library
Should you go down the API route then here is a small example. You can make 1000 free calls in a month with Prototype account. The next best, depending on how many calls you wish to make looks like the 10,001 calls (that one extra PAYG call halves the price). # calls will be affected by whether word is head word or needs lemmas lookup call first. The endpoint construction you need is GET /entries/{source_lang}/{word_id}?fields=pronunciations though that doesn't seem to filter massively. You will need a json parser to handle the json returned e.g. github.com/VBA-tools/VBA-JSON/blob/master/JsonConverter.bas. Download raw code from there and add to standard module called JsonConverter. You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, words()
'If not performing lemmas lookup then must be head word e.g. behave, behalf
Const appId As String = "yourAppId"
Const appKey As String = "yourAppKey"
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row
words = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(words))
Set html = New MSHTML.HTMLDocument
Dim json As Object
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(words) To UBound(words)
.Open "GET", "https://od-api.oxforddictionaries.com/api/v2/entries/en-us/" & LCase$(words(i)) & "?fields=pronunciations", False
.setRequestHeader "app_id", appId
.setRequestHeader "app_key", appKey
.setRequestHeader "ContentType", "application/json"
.send
Set json = JsonConverter.ParseJson(.responseText)
results(i) = IIf(json("results")(1)("type") = "headword", json("results")(1)("lexicalEntries")(1)("pronunciations")(2)("phoneticSpelling"), "lemmas lookup required")
Set json = Nothing
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub

Extract all delimited Word <b>...</b>

I have a problem extracting words in MS Excel. I have multiple sentences in HTML format in a row and want to extract all of the words that are delimited by <b>....</b>
Example:
<b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b>
I want to extract the words: "buat", "1", "2", "cendol"
Can you help me to solve my problem? Any code in Excel/VBA is appreciated.
This can be done with the worksheet function FILTERXML if you have Excel 2013+
First you need to change your string into a "well-formed" XML by enclosing it in an external tag, and closing off the unmatched <br> tag:
"<t>" & $A$1 & "</br></t>"
Then it's merely a matter of using an Xpath that will return all of the wanted tags:
FILTERXML("<t>" & $A$1 & "</br></t>","//b")
Wrapping that in an INDEX function allows you to extract the substrings one at a time:
Full Formula entered in A3 and filled down
=IFERROR(INDEX(FILTERXML("<t>" & $A$1 & "</br></t>","//b"),ROWS($1:1)),"")
There's a very easy way to do this by using an HTMLDocument object:
In your VB Editor, go to Tools>References and select the Microsoft HTML Object Library.
Then you can use the following code:
Sub extract()
Dim doc As New HTMLDocument 'Declare and create an object of type HTMLDocument
Dim item As HTMLObjectElement 'Declare an object of type HTMLObjectElement. We will use this to loop through a collection of HTML elements
doc.body.innerHTML = "<b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b> " 'Assign your HTML code as a string in doc body
For Each item In doc.getElementsByTagName("b") 'Loop through all the <b></b> elements in doc
Debug.Print item.innerText 'print the text contained in <b></b> element. This will show up in your immediate window
Next item
End Sub
Try this
Sub Test()
Dim objReg As Object
Dim objMatches As Object
Dim match As Object
Dim s As String
Dim i As Integer
s = "<b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b> "
Set objReg = CreateObject("VBScript.RegExp")
With objReg
.IgnoreCase = False
.Global = True
.Pattern = "<b>(.*?)<\/b>"
Set objMatches = .Execute(s)
End With
For Each match In objMatches
For i = 0 To match.Submatches.Count - 1
Debug.Print Trim(match.Submatches.item(i))
Next i
Next match
Set objReg = Nothing
End Sub
Alternative using XML DomDocument
Analyzing a HTML string it seems evident to use document object structures as in a HTMLDocument or in ►XML. That's why I demonstrate a further approach for the sake of completeness and in addition to #StavrosJon 's valid solution (which uses the more lenient HTMLDocument not needing to be well formed as XML is):
Example call
Sub ExtractViaXML()
Dim html$, myArray()
html = "<b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b> "
myArray = getTokens(html, "b") ' assign findings to array via function getTokens()
Debug.Print UBound(myArray) + 1 & " token(s) found: " & Join(myArray, ", ") ' display results
End Sub
Main function getTokens()
Function getTokens(ByVal html$, Optional myTag$ = "b") As Variant()
' Purpose: isolate "<b>"-Tags (default value) out of html string and return found tokens as variant array
' Note: creates temporary XML DOMDocument (late bound MSXML2 reference)
Dim XmlString$
XmlString = wellformed("<?xml version=""1.0"" encoding=""utf-8""?><tokens>" & html & "</tokens>")
With CreateObject("MSXML2.DOMDocument.6.0")
.ValidateOnParse = True: .Async = False
If .LoadXML(XmlString) Then ' load xml string
Dim myNodeList As Object
Set myNodeList = .DocumentElement.SelectNodes(myTag) ' set node list to memory
Dim i&, ii&, arr()
ii = myNodeList.Length - 1 ' calculate upper boundary of zero-based array
If ii > -1 Then ReDim arr(ii) ' (re)dimension variant array arr()
For i = 0 To ii ' loop through node list
arr(i) = myNodeList.item(i).Text ' assign each found text content to array
Next i
If ii = -1 Then arr = Array("**Nothing found**") ' provide for zero findings
getTokens = arr ' return 0-based 1-dim array with found tokens
Else: ShowParseError (.ParseError) ' optional: display possible error message
End If
End With
End Function
Helper functions
XML requires a well formed node structure with opening and closing tags or, whereas HTML is more lenient towards e.g. single line breaks(<br>). Therefore I added a simple function wellformed() to cure such an issue preventing successful loading. Furthermore I demonstrate using an optional procedure ShowParseError to localize (other) possible loading Errors which you can use as supplement to any .load or .loadXML function.
Function wellformed$(ByVal s$)
' Purpose: force a wellformed version of line breaks in html/xml string ("<br/>")
' Note: unclosed tags like <br> only would prevent a successful load of the xml document
wellformed = Replace(Replace(s, "</br>", "<br>"), "<br>", "<br/>")
End Function
Sub ShowParseError(pe As Object)
' Purpose: display possible parse error
' Note: localizes error occurrence also by indicating position
Dim ErrText$
With pe
ErrText = "Load error " & .ErrorCode & " xml file " & vbCrLf & _
Replace(.URL, "file:///", "") & vbCrLf & vbCrLf & _
.reason & _
"Source Text: " & .srcText & vbCrLf & vbCrLf & _
"Line No.: " & .Line & vbCrLf & _
"Line Pos.: " & .linepos & vbCrLf & _
"File Pos.: " & .filepos & vbCrLf & vbCrLf
End With
MsgBox ErrText, vbExclamation
End Sub
I tried something different, with splitting and joining and splitting again and looping trough array. I typed the text <b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b> in cell A1:
Sub Macro1()
Dim MyWords As Variant
Dim i As Long
Dim MyDelimiter As String
Dim MyLen As Byte
MyDelimiter = "||" 'Choose 1 not common delimiter
MyLen = Len(MyDelimiter)
MyWords = Split(Join(Split(Range("A1").Value, "<b>"), MyDelimiter), "</b>")
For i = 0 To UBound(MyWords) Step 1
Debug.Print Mid(MyWords(i), InStr(1, MyWords(i), MyDelimiter) + MyLen, 99) 'Increase 99 if you are sure there will be longer texts between tags <b>..</b>
Next i
Erase MyWords
End Sub
I get this:
i have tried to simulate this on excel. please check my sample solution below.
Sub test()
Dim testString As String
Dim startPos As Integer
Dim endPos As Integer
Dim resultString As String
Dim str As String
testString = "<b>buat</b> <i>v</i> <b>1</b> kerjakan; lakukan; <b>2</b> bikin;<br>--<b> cendol</b>"
'get the position of start tag
startPos = InStr(1, testString, "<b>") + 3
'get the position of end tag
endPos = InStr(startPos, testString, "</b>")
Do While Len(testString) > 1
'check if the start pos and end pos is correct
If startPos > 0 And endPos > startPos Then
'get the value in between the start tag and end tag
str = Mid(testString, startPos, endPos - startPos)
resultString = resultString + str + ","
'remove the value retrieved from the original string
testString = Mid(testString, endPos + 4)
startPos = InStr(1, testString, "<b>") + 3
endPos = InStr(startPos, testString, "</b>")
End If
Loop
End Sub

Web scraping html page with no tags as delimiter

I'm trying to import into a string array all lines of text in a web page. The URL is here: Vaticano-La Sacra Bibbia-Genesi-Cap.1.
Unfortunately (maybe a choice of the web designer), in the tag there aren't ID's or CLASS. All the rows are separated by 1 or more < BR > element. Start and end text is separated from a simple menu by 2 tag < HR >.
A clean extract of page code is here: jsfiddle.
I find a way to bring the text. And now what I do in VBA till now:
Note: objDoc is a Public variable coming from another module, fill with a .responseText without problems.
Public Sub ScriviXHTML(strBook As String, intNumCap As Integer)
Dim strDati2 As String
Dim TagBr As IHTMLElementCollection
Dim BrElement As IHTMLElement
Dim intElement As Integer
Dim objChild as Object
Dim strData, strTextCont, strNodeVal, strWholeText As String
Set objDoc2 = New HTMLDocument
Set objDoc2 = objDoc
Set objDoc = Nothing
'Put in variable string HTML code of the web page.
strDati2 = objDoc2.body.innerHTML
'Set in the variable object TAG type BR.
Set TagBr = objDoc2.body.getElementsByTagName("BR")
'Loop for all BRs in the page.
For Each BrElement In TagBr
'Here I try to get the NextSibling element of the <br>
' because seems contain the text I'm looking for.
Set objChild = BrElement.NextSibling
With objChild
' Here I try to put in the variables
strData = Trim("" & .Data & "")
strTextCont = Trim("" & .textContent & "")
strNodeVal = Trim("" & .NodeValue & "")
strWholeText = Trim("" & .wholeText & "")
End With
intElement = intElement + 1
Next BrElement
Two questions:
1) Is it, about you, the best way to achieve what I'm trying to do?
2) Sometimes the Element.NextSibling.Data doesn't exist, with an Error of runtime '438', so I manually move the point of sospension of the routine to by-pass the error. How can I intercept this error? [Please not with a simple On Error Resume Next!]... better: how can I use an If...Then... End If statement to check if in NextSibling exist the Data member?
Thanks at all.
Well you can get all the text as follows:
Public Sub GetInfo()
Dim sResponse As String, xhr As Object, html As New HTMLDocument
Set xhr = CreateObject("MSXML2.XMLHTTP")
With xhr
.Open "GET", "http://www.vatican.va/archive/ITA0001/__P1.HTM", False
.send
sResponse = StrConv(.responseBody, vbUnicode)
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
html.body.innerHTML = sResponse
[A1] = Replace$(Replace$(regexRemove(html.body.innerHTML, "<([^>]+)>"), " ", Chr$(32)), Chr$(10), Chr$(32))
End With
End Sub
Public Function regexRemove(ByVal s As String, ByVal pattern As String) As String
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
With regex
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
End With
If regex.test(s) Then
regexRemove = regex.Replace(s, vbNullString)
Else
regexRemove = s
End If
End Function

How do I extract text of a single HTML element by tag name using MSXML in VBA?

I'm trying to extract US Patent titles using MSXML6.
On the full-text html view of a patent document on the USPTO website, the patent title appears as the first and only "font" element that is a child of "body".
Here is my function that is not working (I get no error; the cell with the formula just stays blank).
Can somebody help me figure out what is wrong?
An example URL that I am feeding into the function is http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=6293874.PN.&OS=PN/6293874&RS=PN/6293874
Function getUSPatentTitle(url As String)
Static colTitle As New Collection
Dim title As String
Dim pageSource As String
Dim xDoc As MSXML2.DOMDocument
Dim xNode As IXMLDOMNode
On Error Resume Next
title = colTitle(url)
If Err.Number <> 0 Then
Set html_doc = CreateObject("htmlfile")
Set xml_obj = CreateObject("MSXML6.XMLHTTP60")
xml_obj.Open "GET", url, False
xml_obj.send
pageSource = xml_obj.responseText
Set xml_obj = Nothing
Set xDoc = New MSXML2.DOMDocument
If Not xDoc.LoadXML(pageSource) Then
Err.Raise xDoc.parseError.ErrorCode, , xDoc.parseError.reason
End If
Set xNode = xDoc.getElementsByTagName("font").Item(1)
title = xNode.Text
If Not title = "" Then colTitle.Add Item:=title, Key:=url
End If
On Error GoTo 0 ' I understand "GoTo" is dangerous coding but copied from somebody and so far haven't thought of a more natural substitute for a GoTo statement
getUSPatentTitle = title
End Function
Just a few points:
"On Error Goto 0" is not really a traditional Goto statement - it's just how you turn off user error handling in VBA. There were a few errors in your code but the "On Error Resume Next" skipped them so you saw nothing.
The data from the web page is in HTML format not XML.
There were a few "font" elements before the one with the title.
This should work:
Function getUSPatentTitle(url As String)
Static colTitle As New Collection
Dim title As String
Dim pageSource As String
Dim errorNumber As Integer
On Error Resume Next
title = colTitle(url)
errorNumber = Err.Number
On Error GoTo 0
If errorNumber <> 0 Then
Dim xml_obj As XMLHTTP60
Set xml_obj = CreateObject("MSXML2.XMLHTTP")
xml_obj.Open "GET", url, False
xml_obj.send
pageSource = xml_obj.responseText
Set xml_obj = Nothing
Dim html_doc As HTMLDocument
Set html_doc = CreateObject("HTMLFile")
html_doc.body.innerHTML = pageSource
Dim fontElement As IHTMLElement
Set fontElement = html_doc.getElementsByTagName("font").Item(3)
title = fontElement.innerText
If Not title = "" Then colTitle.Add Item:=title, Key:=url
End If
getUSPatentTitle = title
End Function
CSS selector:
You can re-write what you described, which in fact is first font tag within a body tag as a CSS selector of:
body > font
CSS query:
VBA:
As it is the first match/only you want you can use the querySelector method of document to apply the selector and retrieve a single element.
Debug.Print html_doc.querySelector("body > font").innerText
You may need to add a reference to HTML Object Library and use an early bound call of Dim html_doc As HTMLDocument to access the method. The late bound method may expose the querySelector method but if the interface doesn't then use early binding.