VBA: Web scraping with <ul and <li and <div and <span - html

I am using VBA to extract the data from HTML in <span code which is under <Div which is under <li which is under <ul
I'm trying to extract the "date and matter" from HTML. "Date" should be in A column and "Matter" should be in B Column in Excel.
The drawback of my code is, it is pulling all the Date and matter into single cell.
Sub GetDat()
Dim IE As New InternetExplorer, html As HTMLDocument
Dim elem As Object, data As String
With IE
.Visible = True
.navigate "https://www.MyURL/sc/wo/Worders/index?id=76888564"
Do While .readyState <> READYSTATE_COMPLETE: Loop
Set html = .document
End With
data = ""
For Each elem In html.getElementsByClassName("simple-list")(0).getElementsByTagName("li")
data = data & " " & elem.innerText
Next elem
Range("A1").Value = data
IE.Quit
End Sub
The output that I need is shown in the image:
HTML:

You could grab two nodeLists, one for dates and one for matters, and then loop those writing out to sheet. Match dates based on data-bind attribute value; matters on classname:
Dim dates As Object, matters As Object, i As Long, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set dates = ie.document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
Set matters = ie.document.querySelectorAll(".wo-notes")
With ws
For i = 0 To dates.Length - 1
.Cells(i + 1, 1) = dates.Item(i).innertext
.Cells(i + 1, 2) = matters.Item(i).innertext
Next
End With
Example reading values from column C:
Option Explicit
Public Sub GetMatters()
Dim ws As Worksheet, lastRow As Long, urls(), results(), ie As SHDocVw.InternetExplorer, r As Long
Set ie = New SHDocVw.InternetExplorer
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.Rows.Count, "C").End(xlUp).Row
urls = Application.Transpose(ws.Range("C2:C" & lastRow).Value)
ReDim results(1 To 1000, 1 To 2)
With ie
.Visible = True
For i = LBound(urls) To UBound(urls)
.navigate2 "https://www.MyURL/sc/wo/Worders/index?id=" & urls(i)
While .Busy Or .readyState <> 4: DoEvents: Wend
Dim dates As Object, matters As Object, i As Long
Set dates = .document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
Set matters = .document.querySelectorAll(".wo-notes")
For i = 0 To dates.Length - 1
r = r + 1
results(r, 1) = dates.Item(i).innertext
results(r, 2) = matters.Item(i).innertext
Next
Set dates = Nothing: Set matter = dates
Next
.Quit
End With
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
References:
document.querySelectorAll
css selectors

Related

How to get data from class name inside a id name in excel VBA?

How can I get data from the classes date and number (all 3) data?
Sub Fii_dii()
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
IE.Navigate "https://www.nseindia.com/products/content/equities/equities/fii_dii_market_today.htm"
While IE.readyState <> 4
DoEvents
Wend
'Copy data
MsgBox IE.Document.all.Item("fiiTable").innerText
End Sub
Those values are retrieved by the browser dynamically from another URI you can find in the network tab of the browser when refreshing the page. I would simply issue an xmlhttp request (faster and no browser) and use css class selectors to match on the required nodes by class. Loop the returned nodeList and write out to Excel
Option Explicit
Public Sub ScrapeValues()
Dim html As HTMLDocument, values As Object, i As Long, ws As Worksheet
Set html = New HTMLDocument
Set ws = ThisWorkbook.Worksheets("Sheet1")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.nseindia.com/products/dynaContent/equities/equities/htms/fiiEQ.htm", False
.send
html.body.innerHTML = .responseText
End With
Set values = html.querySelectorAll(".date, .number")
For i = 0 To values.Length - 1
With ws
.Cells(i + 1, 1) = values.Item(i).innerText
End With
Next
End Sub
If you want to use IE you need to have a test for those elements to be present and a timed loop e.g.
Option Explicit
Public Sub ScrapeValues()
Dim ie As InternetExplorer, values As Object, i As Long, ws As Worksheet, t As Date
Const MAX_WAIT_SEC As Long = 5 '<==adjust time here
Set ie = New InternetExplorer
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 "https://www.nseindia.com/products/content/equities/equities/fii_dii_market_today.htm"
While .Busy Or .readyState <> 4: DoEvents: Wend
t = Timer
Do
Set values = .document.querySelectorAll(".date, .number")
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While values.Length = 0
If values.Length > 0 Then
For i = 0 To values.Length - 1
With ws
.Cells(i + 1, 1) = values.Item(i).innerText
End With
Next
End If
.Quit
End With
End Sub
--
References (VBE>Tools>References):
Microsoft HTML Object Library
Microsoft Internet Controls

Data Scraping Elements By ClassName

I am trying to pull data from a web-site, I want to copy the '10' x 5'unit (class name is "unit_size medium") ' in row 1 for which I am able to copy data successfully but I also want promo (Class name is "promo_offers") '1st Month Free!' in row 2, the problem is this promo is given for specific cells only. hence the data is misleading and I am getting promo in 1st 4 cells and then getting error. However, I want to copy promo for only those units where promo information is given else the cell should be blank or any other value needs to be set. Below is the code...
Please suggest how to frame the code.
Sub GetClassNames()
Dim html As HTMLDocument
Dim objIE As Object
Dim element As IHTMLElement
Dim ie As InternetExplorer
Dim elements As IHTMLElementCollection
Dim result As String 'string variable that will hold our result link
Dim count As Long
Dim erow As Long
'initiating a new instance of Internet Explorer and asigning it to objIE
Set objIE = New InternetExplorer
'make IE browser visible (False would allow IE to run in the background)
objIE.Visible = True
'navigate IE to this web page (a pretty neat search engine really)
objIE.navigate "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
'wait here a few seconds while the browser is busy
Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop
count = 0
Set html = objIE.document
Set elements = html.getElementsByClassName("unit_size medium")
For Each element In elements
If element.className = "unit_size medium" Then
erow = Sheet2.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = html.getElementsByClassName("unit_size medium")(count).innerText
Cells(erow, 2) = html.getElementsByClassName("promo_offers")(count).innerText
count = count + 1
End If
Next element
End Sub
I would simply wrap in an On Error Resume Next when attempting to access the element. Have a place already reserved for it in an output array so if not present the place remains empty.
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
While .Busy Or .readyState < 4: DoEvents: Wend
Dim listings As Object, listing As Object, headers(), results(), r As Long, c As Long
headers = Array("size", "features", "promo", "in store", "web")
Set listings = .document.getElementById("small_units_accordion_panel").getElementsByTagName("li")
'.unit_size medium, .features, .promo_offers, .board_rate_wrapper p, .board_rate
ReDim results(1 To listings.Length, 1 To UBound(headers) + 1)
For Each listing In listings
r = r + 1
On Error Resume Next
results(r, 1) = listing.getElementsByClassName("unit_size medium")(0).innerText
results(r, 2) = listing.getElementsByClassName("features")(0).innerText
results(r, 3) = listing.getElementsByClassName("promo_offers")(0).innerText
results(r, 4) = listing.getElementsByClassName("board_rate")(0).innerText
results(r, 5) = listing.getElementsByClassName("price")(0).innerText
On Error GoTo 0
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
.Quit
End With
End Sub
All boxes:
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
While .Busy Or .readyState < 4: DoEvents: Wend
Dim listings As Object, listing As Object, headers(), results()
Dim r As Long, list As Object, item As Object
headers = Array("size", "features", "promo", "in store", "web")
Set list = .document.getElementsByClassName("main_unit")
'.unit_size medium, .features, .promo_offers, .board_rate_wrapper p, .board_rate
Dim rowCount As Long
rowCount = .document.querySelectorAll(".main_unit li").Length
ReDim results(1 To rowCount, 1 To UBound(headers) + 1)
For Each listing In list
For Each item In listing.getElementsByTagName("li")
r = r + 1
On Error Resume Next
results(r, 1) = item.getElementsByClassName("unit_size medium")(0).innerText
results(r, 2) = item.getElementsByClassName("features")(0).innerText
results(r, 3) = item.getElementsByClassName("promo_offers")(0).innerText
results(r, 4) = item.getElementsByClassName("board_rate")(0).innerText
results(r, 5) = item.getElementsByClassName("price")(0).innerText
On Error GoTo 0
Next
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
.Quit
End With
End Sub

Retrieving all Excel file links from a webpage

I'm trying to get all the downloadable Excel file links from the website, but having difficulty. Please help to guide me. Thanks.
Sub TYEX()
Dim internet As Object
Dim internetdata As Object
Dim div_result As Object
Dim header_links As Object
Dim link As Object
Dim URL As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = True
URL = "https://www.jpx.co.jp/markets/public/short-selling/index.html"
internet.Navigate URL
Do Until internet.ReadyState >= 4
DoEvents
Loop
Application.Wait Now + TimeSerial(0, 0, 5)
Set internetdata = internet.Document
Set div_result = internetdata.getElementById("readArea")
Set header_links = div_result.getElementsByTagName("td")
For Each h In header_links
Set link = h.ChildNodes.item(0)
Cells(Range("A" & Rows.Count).End(xlUp).Row + 1, 1) = link.href
Next
MsgBox "done"
End Sub
You had the idea down correctly, but here's a different approach:
Sub TYEX()
Dim ie As Object
Set ie = CreateObject("InternetExplorer.Application")
With ie
.navigate "https://www.jpx.co.jp/markets/public/short-selling/index.html"
.Visible = True
Do While .Busy Or .readyState < 4
DoEvents
Loop
Dim doc As Object, tbl As Object
Set doc = .document
Set tbl = doc.getElementsByClassName("component-normal-table")(0).Children(0)
Dim r As Long, xlsArr(), a As Object
With tbl.Rows
ReDim xlsArr(1 To .Length - 1)
For r = 1 To .Length - 1 ' 0 is the table header
xlsArr(r) = .Item(r).Children(1).innerHTML
Next r
End With
With CreateObject("VBScript.RegExp")
.Pattern = "<a href=""(\/markets.*?\.xls)"
For r = 1 To UBound(xlsArr)
xlsArr(r) = "https://www.jpx.co.jp" & .Execute(xlsArr(r))(0).SubMatches(0)
Debug.Print xlsArr(r)
Next
End With
End With
'Add to sheet
Dim ws As Worksheet, rng As Range
Set ws = ThisWorkbook.Worksheets(1)
With ws
Set rng = .Range(.Cells(NextRow(ws), 1), .Cells(NextRow(ws) + UBound( _
xlsArr) - 1, 1))
rng.Value = Application.Transpose(xlsArr)
End With
End Sub
Public Function NextRow(ByVal ws As Worksheet, Optional ByVal col As Variant = 1) As Long
With ws
NextRow = .Cells(.Rows.Count, col).End(xlUp).Row + 1
End With
End Function
Breaking Down the Code
This will loop your html table rows. We start at 1, because 0 is actually just the table header.
With tbl.Rows
ReDim xlsArr(1 To .Length - 1)
For r = 1 To .Length - 1 ' 0 is the table header
xlsArr(r) = .Item(r).Children(1).innerHTML
Next r
End With
This uses regular expressions to extract the url from the innerHTML property. You can see how this particular regex works here: Regex101
With CreateObject("VBScript.RegExp")
.Pattern = "<a href=""(\/markets.*?\.xls)"
For r = 1 To UBound(xlsArr)
xlsArr(r) = "https://www.jpx.co.jp" & .Execute(xlsArr(r))(0).SubMatches(0)
Debug.Print xlsArr(r)
Next
End With
You will size your range to be the same size as your array that contains the links, then write the array to the worksheet. This is usually much faster than writing cells one-by-one.
'Add to sheet
Dim ws As Worksheet, rng As Range
Set ws = ThisWorkbook.Worksheets(1)
With ws
Set rng = .Range(.Cells(NextRow(ws), 1), .Cells(NextRow(ws) + UBound( _
xlsArr) - 1, 1))
rng.Value = Application.Transpose(xlsArr)
End With
You can use an attribute = value CSS selector with $ operator to say the href value must end with .xls. Then use querySelectorAll to retrieve all matched results. Using CSS selectors is a very fast and generally robust method.
Dim list As Object
Set list = ie.document.querySelectorAll("[href$='.xls']")
It is much faster to use XMLHTTP as well, rather than opening IE. Note that you can then pass these links to a function to perform a binary download or to URLMon for downloading.
Option Explicit
Public Sub Links()
Dim sResponse As String, html As HTMLDocument, list As Object, i As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.jpx.co.jp/markets/public/short-selling/index.html", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Set list = html.querySelectorAll("[href$='.xls']")
End With
For i = 0 To list.Length - 1
Debug.Print Replace$(list.item(i), "about:", "https://www.jpx.co.jp")
Next
End Sub
Example download function (though you could re-use your existing XMLHTTP object - this is just to illustrate):
Public Function DownloadFile(ByVal downloadFolder As String, ByVal downloadURL As String) As String
Dim http As Object , tempArr As Variant
Set http = CreateObject("WinHttp.WinHttpRequest.5.1")
http.Open "GET", downloadURL, False
http.send
On Error GoTo errhand
With CreateObject("ADODB.Stream")
.Open
.Type = 1
.write http.responseBody
tempArr = Split(downloadURL, "/")
tempArr = tempArr(UBound(tempArr))
.SaveToFile downloadFolder & tempArr, 2 '< "/" on enter of downloadFolder. 2 for overwrite which is Ok if no file modifications.
.Close
End With
DownloadFile = downloadFolder & tempArr
Exit Function
errhand:
If Err.Number <> 0 Then
Debug.Print Err.Number, Err.Description
MsgBox "Download failed"
End If
DownloadFile = vbNullString
End Function
References (VBE > Tools > References):
Microsoft HTML Object Library

Click related to doPostBack & href in VBA

I was having trouble with coding and I could no longer work.
https://tradingeconomics.com/
I posted the coding logic I was working on below. I imported the excel sheet to Portugal of the countries shown on the homepage.
But,
<a id="ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_LinkButton1" class="btn-group btn-group-sm" href="javascript:__doPostBack('ctl00$ContentPlaceHolder1$defaultUC1$CurrencyMatrixAllCountries1$LinkButton1','')">
<button type = "button" class = "btn btn-default">
<i class = "glyphicon glyphicon-plus"> </ i>
</ button>
</a>
How can I code doPostBack to finish my work here? I looked through stackoverflow homepage and tried various trial and error, but I could not finish it.
Option Explicit
Public Sub New_Listing()
Application.ScreenUpdating = False
Dim IE As New InternetExplorer
Const MAX_WAIT_SEC As Long = 5
Dim http As New MSXML2.XMLHTTP60
Dim html As New HTMLDocument
Dim ws As Worksheet
Dim sResponse0 As String
Dim g As Integer
Set http = CreateObject("MSXML2.XMLHTTP")
Set ws = ThisWorkbook.Worksheets("Sheet1")
Dim url As String
url = "https://tradingeconomics.com/"
With IE
.Visible = True
.navigate "https://tradingeconomics.com/"
While .Busy Or .readyState < 4: DoEvents: Wend
End With
Dim tarTable As HTMLTable
Dim hTable As HTMLTable
For Each tarTable In IE.document.getElementsByTagName("table")
If InStr(tarTable.ID, "ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1") <> 0 Then
Set hTable = tarTable
End If
Next
Dim startRow As Long
Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long
r = startRow
With ws
Set tRow = hTable.getElementsByTagName("tr")
ReDim arr0(tRow.Length - 1, 0)
For Each tr In tRow
r = r + 1
Set tCell = tr.getElementsByTagName("td")
If tCell.Length > UBound(arr0, 2) Then
ReDim Preserve arr0(tRow.Length - 1, tCell.Length)
End If
c = 1
For Each td In tCell
arr0(r - 1, c - 1) = td.innerText
c = c + 1
Next td
Next tr
Dim k As Integer
Dim i As Integer
k = 0
For i = LBound(arr0, 1) To UBound(arr0, 1)
.Cells(2 + k, 2) = arr0(i, 0)
.Cells(2 + k, 3) = arr0(i, 1)
.Cells(2 + k, 4) = arr0(i, 2)
.Cells(2 + k, 5) = arr0(i, 3)
.Cells(2 + k, 6) = arr0(i, 4)
.Cells(2 + k, 7) = arr0(i, 5)
.Cells(2 + k, 8) = arr0(i, 6)
.Cells(2 + k, 9) = arr0(i, 7)
k = k + 1
Next i
End With
With IE
.Visible = True
.document.querySelector("a.btn-group btn-group-sm[href='javascript:__doPostBack('ctl00$ContentPlaceHolder1$defaultUC1$CurrencyMatrixAllCountries1$LinkButton1','')']").Click
End With
Set tRow = Nothing: Set tCell = Nothing: Set tr = Nothing: Set td = Nothing
Set hTable = Nothing: Set tarTable = Nothing
Application.ScreenUpdating = True
End Sub
I have completed my work up to Portugal, how can I fix it to get the next data, Czech Republic ?, I would be very grateful if you could give me details of how to modify the coding. I have learned vba very soon, so I have a lot of difficulties.
enter image description here
The Czech Republic value is part of a values set which is returned from the server after clicking a button in the web page itself, you'll have to simulate a click on this button by the VBA code, wait for IE to fetch the result and then continue with your code:
As this is the HTML of the button for fetching the next set of values:
<button type="button" class="btn btn-default">...
Use this to simulate a click and wait for the result fetched:
IE.document.getElementsByClassName("btn-default")(0).Click
Application.Wait(Now + TimeValue("0:00:05"))
After it you will be able to read the Czech Republic and the next set of values by the VBA code.
Internet Explorer
Here with IE - get's all results (the whole World) in one go including links.
Condition based wait with timeout:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, ws As Worksheet, clipboard As Object, t As Date
Const MAX_WAIT_SEC As Long = 5
Const URL = "https://tradingeconomics.com/"
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
With IE
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
.querySelector("[value=world]").Selected = True
.querySelector("select").FireEvent "onchange"
t = Timer
Do
DoEvents
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_ParameterContinent").Value <> "world"
clipboard.SetText .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").outerHTML
clipboard.PutInClipboard
End With
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub
Explicit wait based:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, ws As Worksheet, clipboard As Object
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Const URL = "https://tradingeconomics.com/"
With IE
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
.querySelector("[value=world]").Selected = True
.querySelector("select").FireEvent "onchange"
Application.Wait Now + TimeSerial(0, 0, 5)
clipboard.SetText .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").outerHTML
clipboard.PutInClipboard
End With
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub
Selenium basic
This is easy enough using selenium basic and allowing enough time for the postBack to update the page. After selenium basic install add reference via VBE> Tools > References > Selenium Type Library. More selenium info [here]. The below gets all the World data in one go.
Option Explicit
Public Sub GetInfo()
Dim d As WebDriver, ws As Worksheet, clipboard As Object
Set d = New ChromeDriver
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Const URL = "https://tradingeconomics.com/"
With d
.Start "Chrome"
.get URL
.FindElementByCss("[value=world]").Click
Application.Wait Now + TimeSerial(0, 0, 5)
clipboard.SetText .FindElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").Attribute("outerHTML")
clipboard.PutInClipboard
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub

Fetching data from web page

I am trying to fetch the publication date corresponding to each patent number.
Here is the Excel sheet:
The database is espacenet.com
Here's the link for the first patent you see in the Excel sheet:
http://worldwide.espacenet.com/searchResults?compact=false&PN=US7055777B2&ST=advanced&locale=en_EP&DB=EPODOC
Under the "Publication Info" header, I need to get the date after matching the patent number with the one in the Excel sheet.
Here's the code:
Sub tryextraction()
Dim ie As New InternetExplorer
Dim sdd As String
Dim tdd() As String
Dim num0 As Integer
Dim num1 As Integer
Dim doc As HTMLDocument
Dim i As Integer
Dim j As Integer
ie.Visible = True
num1 = ActiveSheet.UsedRange.Rows.Count
For num0 = 2 To num1
ie.navigate "http://worldwide.espacenet.com/searchResults?compact=false&PN=" & Range("A" & num0) & "&ST=advanced&locale=en_EP&DB=EPODOC"
Do
DoEvents
Loop Until ie.readyState = READYSTATE_COMPLETE
Set doc = ie.document
sdd = Trim(doc.getElementsByTagName("td")(5).innerText)
tdd() = Split(sdd, vbLf)
j = UBound(tdd)
For i = 0 To j
If InStr(tdd(i), "(") <> 0 Then
tdd(i) = Replace(tdd(i), " ", "")
tdd(i) = Replace(tdd(i), "(", "")
tdd(i) = Replace(tdd(i), ")", "")
If tdd(i) = Range("A" & num0).Value Then
Range("B" & num0).Value = tdd(i + 1)
End If
End If
Next i
Next num0
ie.Quit
End Sub
The code is not giving any error. The column "Publication Date" remains blank after the code finishes running.
The html tag which contains the publication info has been taken correctly.
There are some trailing white space characters after the ID you are searching for in the document so tdd(i) = Range("A" & num0).Value never evaluates to true. It's not just a space, so a simple Trim(tdd(i)) = Range("A" & num0).Value call does not help. Try instead InStr(tdd(i), Range("A" & num0).Value) If that is not good enough, you'll have to specifically remove CRLF from the end of the string before doing the compare.
There are often multiple publication dates under the publication info header.
Example:
The following script obtains all of these and the preceeding line (so you have the associated publication along with date).
It loops from row 2 of the Activesheet, to the last populated row, picking up the Publication Numbers from column A and writing out the results starting from column B. Depending on how many dates there are, the data will extend across multiple columns from B.
Regex:
A regex of ^(.*)\s\d{4}-\d{2}-\d{2} is used to retrieve the date pattern and the preceeding line i.e. The publication identifier and the date. Try it
Example output:
VBA:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, html As New HTMLDocument, url As String, pubInfo As Object
Dim loopRange As Range, iRow As Range, counter As Long
'example US7055777B2
Application.ScreenUpdating = False
With ActiveSheet
Set loopRange = Range("A2:A" & .Cells(.Rows.Count, "A").End(xlUp).Row)
End With
With IE
.Visible = True
counter = 2 '<== start from row 2
For Each iRow In loopRange
If Not IsEmpty(iRow) Then
url = "https://worldwide.espacenet.com/searchResults?compact=false&PN=" & iRow.Value & "&ST=advanced&locale=en_EP&DB=EPODOC"
.navigate url
While .Busy Or .readyState < 4: DoEvents: Wend
Set html = .document
Do
DoEvents
On Error Resume Next
Set pubInfo = html.querySelector(".publicationInfoColumn")
On Error GoTo 0
Loop While pubInfo Is Nothing
Dim tempArr()
tempArr = GetDateAndPatent(pubInfo.innerText, "^(.*)\s\d{4}-\d{2}-\d{2}") '"(?m)^(.*)\s\d{4}-\d{2}-\d{2}" '<==This is not supported
With ActiveSheet
.Cells(counter, 2).Resize(1, UBound(tempArr) + 1) = tempArr
End With
End If
counter = counter + 1
Next iRow
.Quit '<== Remember to quit application
End With
Application.ScreenUpdating = True
End Sub
Public Function GetDateAndPatent(ByVal inputString As String, ByVal sPattern As String) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With CreateObject("vbscript.regexp")
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .test(inputString) Then
Set matches = .Execute(inputString)
For Each iMatch In matches
ReDim Preserve arrMatches(i)
arrMatches(i) = iMatch.Value
i = i + 1
Next iMatch
End If
End With
GetDateAndPatent = arrMatches
End Function