I got this code from a competent user, not sure if he wants to be named. The code searches the HTML content for innerText of certain tags and transfers them to an Excel table, well sorted under the headers, structured as pivot.
Public Sub GetDataFromURL()
Const URL = "URL"
Dim html As MSHTML.HTMLDocument, xhr As Object
Set html = New MSHTML.HTMLDocument
Set xhr = CreateObject("MSXML2.ServerXMLHTTP.6.0")
With xhr
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send "...parameters..."
html.body.innerHTML = .responseText
End With
Dim table As MSHTML.HTMLTable, r As Long, c As Long, headers(), row As MSHTML.HTMLTableRow
Dim results() As Variant, html2 As MSHTML.HTMLDocument
headers = Array("HDR01", "HDR02", "HDR03", "HDR04")
ReDim results(1 To 100, 1 To UBound(headers) + 1)
Set table = html.querySelector("table")
Set html2 = New MSHTML.HTMLDocument
Dim lastRow As Boolean
For Each row In table.Rows
lastRow = False
Dim header As String
html2.body.innerHTML = row.innerHTML
header = Trim$(row.Children(0).innerText)
If header = "HDR01" Then
r = r + 1
Dim dict As Scripting.Dictionary: Set dict = GetBlankDictionary(headers)
On Error Resume Next
dict("HDR02") = Replace$(html2.querySelector("a").href, "about:", "https://URL")
On Error GoTo 0
End If
If dict.Exists(header) Then dict(header) = Trim$(row.Children(1).innerText)
If (header = vbNullString And html2.querySelectorAll("a").Length > 0) Then
dict("HDR03") = Replace$(html2.querySelector("a").href, "about:blank", "URL")
lastRow = True
ElseIf header = "HDR04" Then
If row.NextSibling.NodeType = 1 Then lastRow = True
End If
If lastRow Then
populateArrayFromDict dict, results, r
End If
Next
results = Application.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Application.Transpose(results)
Dim re As Object
Set re = CreateObject("VBScript.RegExp")
With re
.Global = False
.MultiLine = False
.IgnoreCase = True
.Pattern = "\s([0-9.]+)\sm²"
End With
Dim ie As SHDocVw.InternetExplorer
Set ie = New SHDocVw.InternetExplorer
With ie
.Visible = True
For r = LBound(results, 1) To UBound(results, 1)
If results(r, 7) <> vbNullString Then
.Navigate2 results(r, 7), headers:="Referer: " & URL
While .Busy Or .readyState <> READYSTATE_COMPLETE: DoEvents: Wend
'On Error Resume Next
results(r, 8) = re.Execute(.document.querySelector("#anz").innerHTML)(0).Submatches(0)
'On Error GoTo 0
End If
Next
.Quit
End With
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
It works perfectly in Excel, but I need it for an Access-table. My Aceess-table named tblTab01 contains all the fields that are present in the code in the headers = array("..."), and I have disabled the following lines in the code:
results = Application.Transpose(results)
and
ActiveSheet.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
Instead, I added the following lines:
Dim db As DAO.Database
Dim strInsert
Set db = CurrentDb
strInsert = "INSERT INTO tblTab01 VALUES (results);"
db.Execute strInsert
But I only get all possible errors!
How would the code need to be modified for use with the Access table? THX
This produces same output as the Excel code. I attempted a solution that eliminated looping array but this version is actually faster.
Had to use Excel WorksheetFunction to make the Transpose method work. Make sure Excel library is selected in References.
results = Excel.WorksheetFunction.Transpose(results)
ReDim Preserve results(1 To UBound(headers) + 1, 1 To r)
results = Excel.WorksheetFunction.Transpose(results)
Uncomment the On Error lines:
On Error Resume Next
results(r, 8) = re.Execute(.document.querySelector("#anz").innerHTML)(0).Submatches(0)
On Error GoTo 0
Then instead of the With ActiveSheet block, loop through array.
Dim db As DAO.Database
Dim rs As DAO.Recordset
CurrentDb.Execute "DELETE * FROM tblNetzPortDwnLd"
Set db = CurrentDb
Set rs = db.OpenRecordset("tblNetzPortDwnLd", dbOpenDynaset)
For r = LBound(results, 1) To UBound(results, 1)
With rs
.AddNew
.Fields("zpID") = r
.Fields("zpAktenzeichen") = results(r, 1)
.Fields("zpAmtsgericht") = results(r, 2)
.Fields("zpObjekt") = results(r, 3)
.Fields("zpVerkehrswert") = results(r, 4)
.Fields("zpTermin") = results(r, 5)
.Fields("zpPdfLink") = results(r, 6)
.Fields("zpAdditLink") = results(r, 7)
.Fields("zpm2") = results(r, 8)
.Update
End With
Next
All fields in table are text type, per our chat discussion.
I have looked at the solution provided in this link Extract Table from Webpage in Excel using VBA and it was very helpful. But I need to extract the values in the div classes (cscore_score) and not a table Please refer to image below
The URL is: https://www.espncricinfo.com/scores
The div class is: cscore_score
The scores to extract is in nested divs. The sample data for each nested div I want to extract is like Country and Score i.e INDIA and in the next column "416..." into the Excel sheet.
Here's a screenshot of the table structure:
Public Sub GetInfo()
Const URL As String = "https://www.espncricinfo.com/scores"
Dim html As HTMLDocument, hTable As HTMLTable, ws As Worksheet, headers()
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
html.body.innerHTML = .responseText
End With
Set hDiv = html.querySelector("div.cscore")
Dim ul As Object, div As Object, r As Long, c As Long
r = 1
With ws
For Each div In hDiv.getElementsByClassName("cscore_link")
r = r + 1: c = 1
If r > 3 Then
For Each ul In div.getElementsByClassName("cscore_score")
.Cells(r - 2, c) = IIf(c = 2, "'" & div.innerText, div.innerText)
c = c + 1
Next
End If
Next
End With
End Sub
I would be grateful to receive any help to extract those scores from each div into the sheet.
You could use faster css selectors (using only class is faster than tag/type) which if used as shown below will allow you to also reduce your code complexity and improve performance by having only a single loop. Results can then be stored in an array and written out in one go - again another efficiency gain.
Note I am ensuring scores remain correctly formatted on output by concatenating "'" in front.
If you want scores for same match on same row:
Option Explicit
Public Sub GetData()
Dim html As HTMLDocument, ws As Worksheet, countries As Object, scores As Object, results(), i As Long, r As Long
Set ws = ThisWorkbook.Worksheets("Sheet1"): Set html = New HTMLDocument: r = 1
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.espncricinfo.com/scores", False
.send
html.body.innerHTML = .responseText
End With
Set countries = html.querySelectorAll(".cscore--watchNotes .cscore_name--long"): Set scores = html.querySelectorAll(".cscore_score ")
ReDim results(1 To countries.Length / 2, 1 To 4)
For i = 0 To countries.Length - 1 Step 2
results(r, 1) = countries.item(i).innerText: results(r, 2) = "'" & scores.item(i).innerText
results(r, 3) = countries.item(i + 1).innerText: results(r, 4) = "'" & scores.item(i + 1).innerText
r = r + 1
Next
ws.Cells(1, 1).Resize(1, 4) = Array("Home", "Score", "Away", "Score")
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Scores on different rows for every team:
Option Explicit
Public Sub GetData()
Dim html As HTMLDocument, ws As Worksheet, countries As Object, scores As Object, results(), i As Long
Set ws = ThisWorkbook.Worksheets("Sheet1"): Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.espncricinfo.com/scores", False
.send
html.body.innerHTML = .responseText
End With
Set countries = html.querySelectorAll(".cscore--watchNotes .cscore_name--long"): Set scores = html.querySelectorAll(".cscore_score ")
ReDim results(1 To countries.Length, 1 To 2)
For i = 0 To countries.Length - 1
results(i + 1, 1) = countries.item(i).innerText: results(i + 1, 2) = "'" & scores.item(i).innerText
Next
ws.Cells(1, 1) = "Country": ws.Cells(1, 2) = "Score"
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Additional column:
Public Sub GetData()
Dim html As HTMLDocument, ws As Worksheet, countries As Object, scores As Object
Dim descs As Object, results(), i As Long, r As Long
Set ws = ThisWorkbook.Worksheets("Sheet1"): Set html = New HTMLDocument: r = 1
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.espncricinfo.com/scores", False
.send
html.body.innerHTML = .responseText
End With
Set countries = html.querySelectorAll(".cscore--watchNotes .cscore_name--long"): Set scores = html.querySelectorAll(".cscore_score ")
Set descs = html.querySelectorAll(".cscore--watchNotes .cscore_info-overview")
ReDim results(1 To countries.Length / 2, 1 To 5)
For i = 0 To countries.Length - 1 Step 2
results(r, 1) = descs.Item(i / 2).innerText
results(r, 2) = countries.Item(i).innerText: results(r, 3) = "'" & scores.Item(i).innerText
results(r, 4) = countries.Item(i + 1).innerText: results(r, 5) = "'" & scores.Item(i + 1).innerText
r = r + 1
Next
ws.Cells(1, 1).Resize(1, 5) = Array("Desc", "Home", "Score", "Away", "Score")
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Your request seems to be just fine. Parsing the HTML is where your problem is.
You could do something like the following (you can ignore the request part):
Option Explicit
Sub espn()
Dim req As New WinHttpRequest
Dim HTMLDocument As New HTMLDocument
Dim listElement As HTMLUListElement
Dim listItem As HTMLLIElement
Dim sht As Worksheet
Dim i As Long
Dim j As Long
Dim url As String
url = "https://www.espncricinfo.com/scores"
With req
.Open "GET", url, False
.send
HTMLDocument.body.innerHTML = .responseText
End With
Set sht = ThisWorkbook.Worksheets("Name of your worksheet")
i = 2
For Each listElement In HTMLDocument.getElementsByClassName("cscore_competitors")
j = 1
For Each listItem In listElement.getElementsByTagName("li")
sht.Cells(i, j) = listItem.getElementsByClassName("cscore_name cscore_name--long")(0).innerText
sht.Cells(i, j + 1) = listItem.getElementsByClassName("cscore_score")(0).innerText
j = j + 2
Next listItem
i = i + 1
Next listElement
End Sub
The results would look like so:
Basically each game is represented by a ul (unnumbered list) element which consists of two li elements which contain the info about the names and the score.
Very new to VBA. Trying to learn as much as I can. I can get the output that I want in the immediate window, but how can I move all of that to my worksheet?
I'm honestly not sure what to try or where to try it.
Option Explicit
Sub JsonMain()
Dim dict
Dim subDict
Dim strLine As String
' Read from file
Dim FilePath As String
FilePath = ThisWorkbook.Path + "\" + "Main.json"
Dim nFile As Integer
Dim strJson As String
nFile = FreeFile
Open FilePath For Input As #nFile
strJson = Input(LOF(nFile), nFile)
Close #nFile
Dim jp As Scripting.Dictionary
Set jp = JsonConverter.ParseJson(strJson)
Dim gameData As Scripting.Dictionary
Set gameData = jp("data")
Dim theseMonsters As Object
Set theseMonsters = gameData("monsters")
Debug.Print "there are " & theseMonsters.Count & " monsters in the profile"
Dim i As Long
Dim monster As Dictionary
Dim monsterName As Variant
Dim monsterDetails As Variant
For Each monsterName In theseMonsters.Keys
Debug.Print "Monster #" & monsterName
Set monsterDetails = theseMonsters(monsterName)
Debug.Print " -- name: " & monsterDetails("class_name")
Debug.Print " -- total level: " & monsterDetails("total_level")
Debug.Print " -- perfection: " & monsterDetails("perfect_rate")
Debug.Print " -- catch number: " & monsterDetails("create_index")
Dim battleStats As Collection
Set battleStats = monsterDetails("total_battle_stats")
Debug.Print " -- battle stats: ";
For i = 1 To battleStats.Count
Debug.Print battleStats.Item(i) & " ";
Next i
Debug.Print ""
' ...
Next monsterName
End Sub
Edit 1:
Expected results would be bold titles for each category that are printed in row A, with data going down in columns under those titles.
Here is an example output that I get in the immediate window:
Monster #47103
-- name: Monstratos
-- total level: 20
-- perfection: 53.763
-- catch number: 39
-- battle stats: 218 288 221 198 227 201
I would like Row A to contain these bold headers: Monster #, Name, Total Level, Perfection, Catch Number, HP, PA, PD, SA, SD, SPD (Battle Stats is not a header, but the individual battle stats are).
Below that, for this mon as an example, would be: 47103, Monstratos, 20, 53.763, 39, 218, 288, 221, 198, 227, 201.
I think you want something like the following. You incremement the row counter, r, each time you hit a new monster dictionary. For each item of interest within the monster dictionary the column increases by 1.
Option Explicit
Public Sub WriteOutBattleInfo()
Dim headers(), r As Long, i As Long, json As Object, key As Variant, ws As Worksheet, battleStats As Object
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("Monster #", "Name", "Total Level", "Perfection", "Catch Number", "HP", "PA", "PD", "SA", "SD", "SPD")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.etheremon.com/api/user/get_my_monster?trainer_address=0x2Fef65e4D69a38bf0dd074079f367CDF176eC0De", False
.send
Set json = JsonConverter.ParseJson(.responseText)("data")("monsters") 'dictionary of dictionaries
End With
r = 2
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For Each key In json.keys
With ws
.Cells(r, 1) = key
.Cells(r, 2) = json(key)("class_name")
.Cells(r, 3) = json(key)("total_level")
.Cells(r, 4) = json(key)("perfect_rate")
.Cells(r, 5) = json(key)("create_index")
Set battleStats = json(key)("total_battle_stats")
For i = 1 To battleStats.Count
.Cells(r, i + 5) = battleStats.item(i)
Next i
End With
r = r + 1
Next
End Sub
I'm trying to webscrape multiple tables from a website. So far I have built an excel VBA macro to do this. I also figured out how to get all the data when it is on multiple pages in the website. For instance, if I have 1000 results but 50 are displayed on each page. The problem is that I have the same 5 tables on multiple pages because each table has 1000 results.
My code can only loop through each page for 1 table. I also have written code to grab each table, but I cannot figure out how to do that for each of the 50 search results (each page).
How can I loop through multiple tables and click the next page in the process to capture all the data?
Sub ETFDat()
Dim IE As Object
Dim i As Long
Dim strText As String
Dim jj As Long
Dim hBody As Object
Dim hTR As Object
Dim hTD As Object
Dim tb As Object
Dim bb As Object
Dim Tr As Object
Dim Td As Object
Dim ii As Long
Dim doc As Object
Dim hTable As Object
Dim y As Long
Dim z As Long
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
Sheets("Fund Basics").Activate
Cells.Select
Selection.Clear
IE.navigate "http://www.etf.com/channels/smart-beta-etfs/channels/smart- beta-etfs?qt-tabs=0#qt-tabs" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
Do While IE.busy: DoEvents: Loop
Do While IE.ReadyState <> 4: DoEvents: Loop
Set doc = IE.document
Set hTable = doc.getElementsByTagName("table") '.GetElementByID("tablePerformance")
ii = 1
Do While ii <= 17
For Each tb In hTable
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each Td In hTD
ws.Cells(z, y).Value = Td.innerText
y = y + 1
Next Td
DoEvents
z = z + 1
Next Tr
Exit For
Next bb
Exit For
Next tb
With doc
Set elems = .getElementsByTagName("a")
For Each e In elems
If (e.getAttribute("id") = "nextPage") Then
e.Click
Exit For
End If
Next e
End With
ii = ii + 1
Application.Wait (Now + TimeValue("00:00:05"))
Loop
MsgBox "Done"
End Sub
There is the example showing how the data could be retrieved from the website using XHRs and JSON parsing, it consists of several steps.
Retrieve the data.
I looked into a little with XHRs using Chrome Developer Tools Network tab.
Most relevant data I found is JSON string returned by GET XHR from http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/50/50/1 after I clicked the next page button:
The response has the following structure for single row item:
[
{
"productId": 576,
"fund": "iShares Russell 1000 Value ETF",
"ticker": "IWD",
"inceptionDate": "2000-05-22",
"launchDate": "2000-05-22",
"hasSegmentReport": "true",
"genericReport": "false",
"hasReport": "true",
"fundsInSegment": 20,
"economicDevelopment": "Developed Markets",
"totalRows": 803,
"fundBasics": {
"issuer": "<a href='/channels/blackrock-etfs' alt='BlackRock'>BlackRock</a>",
"expenseRatio": {
"value": 20
},
"aum": {
"value": 36957230250
},
"spreadPct": {
"value": 0.000094
},
"segment": "Equity: U.S. - Large Cap Value"
},
"performance": {
"priceTrAsOf": "2017-02-27",
"priceTr1Mo": {
"value": 0.031843
},
"priceTr3Mo": {
"value": 0.070156
},
"priceTr1Yr": {
"value": 0.281541
},
"priceTr3YrAnnualized": {
"value": 0.099171
},
"priceTr5YrAnnualized": {
"value": 0.13778
},
"priceTr10YrAnnualized": {
"value": 0.061687
}
},
"analysis": {
"analystPick": null,
"opportunitiesList": null,
"letterGrade": "A",
"efficiencyScore": 97.977103,
"tradabilityScore": 99.260541,
"fitScore": 84.915658,
"leveragedFactor": null,
"exposureReset": null,
"avgDailyDollarVolume": 243848188.037378,
"avgDailyShareVolume": 2148400.688889,
"spread": {
"value": 0.010636
},
"fundClosureRisk": "Low"
},
"fundamentals": {
"dividendYield": {
"value": 0.021543
},
"equity": {
"pe": 27.529645,
"pb": 1.964124
},
"fixedIncome": {
"duration": null,
"creditQuality": null,
"ytm": {
"value": null
}
}
},
"classification": {
"assetClass": "Equity",
"strategy": "Value",
"region": "North America",
"geography": "U.S.",
"category": "Size and Style",
"focus": "Large Cap",
"niche": "Value",
"inverse": "false",
"leveraged": "false",
"etn": "false",
"selectionCriteria": "Multi-Factor",
"weightingScheme": "Multi-Factor",
"activePerSec": "false",
"underlyingIndex": "Russell 1000 Value Index",
"indexProvider": "Russell",
"brand": "iShares"
},
"tax": {
"legalStructure": "Open-Ended Fund",
"maxLtCapitalGainsRate": 20,
"maxStCapitalGainsRate": 39.6,
"taxReporting": "1099"
}
}
]
The property "totalRows": 803 specifies the total rows count. So to make data retrieving as fast as it possible, better to make the request to get the first row. As you can see from the URL, there is ../-aum/50/50/.. tail, which points sorting order, item to start from, and total items to return. Thus to get the only row it should be http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1
Parse retrieved JSON, get the total number of rows from totalRows property.
Make another one request to get the entire table.
Parse the entire table JSON, convert it to 2d array and output. You can perform further processing with direct access to the array.
For the table shown below:
The resulting table contains 803 rows and header with columns as follows:
productId
fund
ticker
inceptionDate
launchDate
hasSegmentReport
genericReport
hasReport
fundsInSegment
economicDevelopment
totalRows
fundBasics_issuer
fundBasics_expenseRatio_value
fundBasics_aum_value
fundBasics_spreadPct_value
fundBasics_segment
performance_priceTrAsOf
performance_priceTr1Mo_value
performance_priceTr3Mo_value
performance_priceTr1Yr_value
performance_priceTr3YrAnnualized_value
performance_priceTr5YrAnnualized_value
performance_priceTr10YrAnnualized_value
analysis_analystPick
analysis_opportunitiesList
analysis_letterGrade
analysis_efficiencyScore
analysis_tradabilityScore
analysis_fitScore
analysis_leveragedFactor
analysis_exposureReset
analysis_avgDailyDollarVolume
analysis_avgDailyShareVolume
analysis_spread_value
analysis_fundClosureRisk
fundamentals_dividendYield_value
fundamentals_equity_pe
fundamentals_equity_pb
fundamentals_fixedIncome_duration
fundamentals_fixedIncome_creditQuality
fundamentals_fixedIncome_ytm_value
classification_assetClass
classification_strategy
classification_region
classification_geography
classification_category
classification_focus
classification_niche
classification_inverse
classification_leveraged
classification_etn
classification_selectionCriteria
classification_weightingScheme
classification_activePerSec
classification_underlyingIndex
classification_indexProvider
classification_brand
tax_legalStructure
tax_maxLtCapitalGainsRate
tax_maxStCapitalGainsRate
tax_taxReporting
Put the below code into VBA Project standard module:
Option Explicit
Sub GetData()
Dim sJSONString As String
Dim vJSON As Variant
Dim sState As String
Dim lRowsQty As Long
Dim aData()
Dim aHeader()
' Download and parse the only first row to get total rows qty
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1")
JSON.Parse sJSONString, vJSON, sState
lRowsQty = vJSON(0)("totalRows")
' Download and parse the entire data
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/" & lRowsQty & "/1")
JSON.Parse sJSONString, vJSON, sState
' Convert JSON to 2d array
JSON.ToArray vJSON, aData, aHeader
' Output
With Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), aHeader
Output2DArray .Cells(2, 1), aData
.Cells.Columns.AutoFit
End With
End Sub
Function GetXHR(sURL As String) As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sURL, False
.Send
GetXHR = .responseText
End With
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
1, _
UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = "#"
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "#"
.Value = aCells
End With
End With
End Sub
Create one more standard module, name it JSON and put the below code into it, this code provides JSON processing functionality:
Option Explicit
Private sBuffer As String
Private oTokens As Object
Private oRegEx As Object
Private bMatch As Boolean
Private oChunks As Object
Private oHeader As Object
Private aData() As Variant
Private i As Long
Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
' Backus–Naur form JSON parser implementation based on RegEx
' Input:
' sSample - source JSON string
' Output:
' vJson - created object or array to be returned as result
' sState - string Object|Array|Error depending on processing
sBuffer = sSample
Set oTokens = CreateObject("Scripting.Dictionary")
Set oRegEx = CreateObject("VBScript.RegExp")
With oRegEx ' Patterns based on specification http://www.json.org/
.Global = True
.MultiLine = True
.IgnoreCase = True ' Unspecified True, False, Null accepted
.Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
Tokenize "s"
.Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
Tokenize "d"
.Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
Tokenize "c"
.Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
Tokenize "n"
.Pattern = "\s+"
sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
.MultiLine = False
Do
bMatch = False
.Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
Tokenize "p"
.Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
Tokenize "o"
.Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
Tokenize "a"
Loop While bMatch
.Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
Retrieve sBuffer, vJSON
sState = IIf(IsObject(vJSON), "Object", "Array")
Else
vJSON = Null
sState = "Error"
End If
End With
Set oTokens = Nothing
Set oRegEx = Nothing
End Sub
Private Sub Tokenize(sType)
Dim aContent() As String
Dim lCopyIndex As Long
Dim i As Long
Dim sKey As String
With oRegEx.Execute(sBuffer)
If .Count = 0 Then Exit Sub
ReDim aContent(0 To .Count - 1)
lCopyIndex = 1
For i = 0 To .Count - 1
With .Item(i)
sKey = "<" & oTokens.Count & sType & ">"
oTokens(sKey) = .Value
aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
lCopyIndex = .FirstIndex + .Length + 1
End With
Next
End With
sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
bMatch = True
End Sub
Private Sub Retrieve(sTokenKey, vTransfer)
Dim sTokenValue As String
Dim sName As String
Dim vValue As Variant
Dim aTokens() As String
Dim i As Long
sTokenValue = oTokens(sTokenKey)
With oRegEx
.Global = True
Select Case Left(Right(sTokenKey, 2), 1)
Case "o"
Set vTransfer = CreateObject("Scripting.Dictionary")
aTokens = Split(sTokenValue, "<")
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
Next
Case "p"
aTokens = Split(sTokenValue, "<", 4)
Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(sName) = vValue
Else
vTransfer(sName) = vValue
End If
Case "a"
aTokens = Split(sTokenValue, "<")
If UBound(aTokens) = 0 Then
vTransfer = Array()
Else
ReDim vTransfer(0 To UBound(aTokens) - 1)
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(i - 1) = vValue
Else
vTransfer(i - 1) = vValue
End If
Next
End If
Case "n"
vTransfer = sTokenValue
Case "s"
vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
"\""", """"), _
"\\", "\"), _
"\/", "/"), _
"\b", Chr(8)), _
"\f", Chr(12)), _
"\n", vbLf), _
"\r", vbCr), _
"\t", vbTab)
.Global = False
.Pattern = "\\u[0-9a-fA-F]{4}"
Do While .Test(vTransfer)
vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
Loop
Case "d"
vTransfer = Evaluate(sTokenValue)
Case "c"
Select Case LCase(sTokenValue)
Case "true"
vTransfer = True
Case "false"
vTransfer = False
Case "null"
vTransfer = Null
End Select
End Select
End With
End Sub
Function Serialize(vJSON As Variant) As String
Set oChunks = CreateObject("Scripting.Dictionary")
SerializeElement vJSON, ""
Serialize = Join(oChunks.Items(), "")
Set oChunks = Nothing
End Function
Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "{}"
Else
.Item(.Count) = "{" & vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
SerializeElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "}"
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "[]"
Else
.Item(.Count) = "[" & vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & vbTab
SerializeElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "]"
End If
Case vbInteger, vbLong
.Item(.Count) = vElement
Case vbSingle, vbDouble
.Item(.Count) = Replace(vElement, ",", ".")
Case vbNull
.Item(.Count) = "null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "true", "false")
Case Else
.Item(.Count) = """" & _
Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
"\", "\\"), _
"""", "\"""), _
"/", "\/"), _
Chr(8), "\b"), _
Chr(12), "\f"), _
vbLf, "\n"), _
vbCr, "\r"), _
vbTab, "\t") & _
""""
End Select
End With
End Sub
Function ToString(vJSON As Variant) As String
Select Case VarType(vJSON)
Case vbObject, Is >= vbArray
Set oChunks = CreateObject("Scripting.Dictionary")
ToStringElement vJSON, ""
oChunks.Remove 0
ToString = Join(oChunks.Items(), "")
Set oChunks = Nothing
Case vbNull
ToString = "Null"
Case vbBoolean
ToString = IIf(vJSON, "True", "False")
Case Else
ToString = CStr(vJSON)
End Select
End Function
Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & aKeys(i) & ": "
ToStringElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
Next
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & i & ": "
ToStringElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
Next
End If
Case vbNull
.Item(.Count) = "Null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "True", "False")
Case Else
.Item(.Count) = CStr(vElement)
End Select
End With
End Sub
Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
' Input:
' vJSON - Array or Object which contains rows data
' Output:
' aData - 2d array representing JSON data
' aHeader - 1d array of property names
Dim sName As Variant
Set oHeader = CreateObject("Scripting.Dictionary")
Select Case VarType(vJSON)
Case vbObject
If vJSON.Count > 0 Then
ReDim aData(0 To vJSON.Count - 1, 0 To 0)
oHeader("#") = 0
i = 0
For Each sName In vJSON
aData(i, 0) = "#" & sName
ToArrayElement vJSON(sName), ""
i = i + 1
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Is >= vbArray
If UBound(vJSON) >= 0 Then
ReDim aData(0 To UBound(vJSON), 0 To 0)
For i = 0 To UBound(vJSON)
ToArrayElement vJSON(i), ""
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Else
ReDim aData(0 To 0, 0 To 0)
aData(0, 0) = ToString(vJSON)
End Select
aHeader = oHeader.Keys()
Set oHeader = Nothing
aRows = aData
Erase aData
End Sub
Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
Dim sName As Variant
Dim j As Long
Select Case VarType(vElement)
Case vbObject ' collection of objects
For Each sName In vElement
ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
Next
Case Is >= vbArray ' collection of arrays
For j = 0 To UBound(vElement)
ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
Next
Case Else
If Not oHeader.Exists(sFieldName) Then
oHeader(sFieldName) = oHeader.Count
If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
End If
j = oHeader(sFieldName)
aData(i, j) = ToString(vElement)
End Select
End Sub
I tried useful the original question and found little bug that I fixed
The answer to you question is to
1 loop on Doc elements until next page link is found, and you to
2 set/reset the doc var to avoid the Doc lost
Do
Do While IE.Busy: DoEvents: Loop
Do While IE.readyState <> 4: DoEvents: Loop
Set doc = IE.document
....
Loop While nextPageFound 'exit if "next page" not found
when following the next page hyper link, during loop on Doc elements.
And then I added the
3 split of Table Header and Body (if exist) for 1st row field name set
4 RESCUE of cell value from specific class by name cell against simple td.innerText
5 loop on doc for link search until NextPageFound using
e.getAttribute("title") instead id=nextPage
6 use of a SetUp sheet with parameters to customize the script with url and destination data sheet
strUrl = ThisWorkbook.Sheets("Setup").Range("b1").Value
strDestSheet = ThisWorkbook.Sheets("Setup").Range("b2").Value
And here is the VBA function revisited:
Sub ETFDatNew()
Dim IE As Object
Dim i As Long
Dim strText As String
Dim jj As Long
Dim hBody As Object
Dim hTR As Object
Dim hTD As Object
Dim tb As Object
Dim bb As Object
Dim Tr As Object
Dim Td As Object
Dim ii As Long
Dim doc As Object
Dim hTable As Object
Dim y As Long
Dim z As Long
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
'6 I used a SetUp sheet with parameters to customize the script with url and destination data sheet
strUrl = ThisWorkbook.Sheets("Setup").Range("b1").Value
strDestSheet = ThisWorkbook.Sheets("Setup").Range("b2").Value
Sheets(strDestSheet).Activate ' Destination sheet
Cells.Select
Selection.Clear
'IE.navigate "http://halleyweb.com/c058057/mc/mc_p_ricerca.php" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
IE.navigate strUrl
ii = 1
nextPageFound = True 'set to init scope var / settato per iniziare il loop
Do '1) loop until nextPageFound instead of '' ii <= 17 Or
'2 set/reset of doc to avoid doc lost after next page link click (see below)
Do While IE.Busy: DoEvents: Loop
Do While IE.readyState <> 4: DoEvents: Loop
Set doc = IE.document
' all tables or single one by Id
Set hTable = doc.getElementsByTagName("table") 'or by .getElementById("table-albo-pretorio")
For Each tb In hTable
'3) splitting header vs body for 1st row field name search
'tHeader
If z = 1 Then
Set hBody = tb.getElementsByTagName("thead")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("th") 'header th
y = 1 ' Resets back to column A
For Each Th In hTD
ws.Cells(z, y).Value = Th.innerText
y = y + 1
Next Th
DoEvents
z = z + 1
Next Tr
Exit For
Next bb
End If
'tBody
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each Td In hTD
'4) RESCUE cell value from td.innerText against specific class name cell
'ws.Cells(z, y).Value = Td.innerText
If CBool(Td.getElementsByClassName("tablesaw-cell-content").Length) Then 'there is at least 1
'use the first
ws.Cells(z, y).Value = Td.getElementsByClassName("tablesaw-cell-content")(0).innerText
End If
y = y + 1 'colonna successiva /next col
Next Td
DoEvents
z = z + 1 'riga successiva /next row
Next Tr
Exit For
Next bb
Exit For
Next tb
'5 looping on doc for link search until NextPageFound using e.getAttribute("title") instead id=nextPage
With doc 'ricerca dei link
Set elems = .getElementsByTagName("a")
nextPageFound = False ' si predispone per concludere nel caso non sia presente una pagina successiva
For Each e In elems
If (e.getAttribute("title") = "Pagina successiva") Then ' alla ricerca di link con title="Pagina successiva" / instead of id=nextPage
e.Click
nextPageFound = True ' trovata pagina successiva /found next page
Exit For
End If
Next e
End With
ii = ii + 1
Application.Wait (Now + TimeValue("00:00:01"))
Loop While nextPageFound ' conclude nel caso non sia stato trovato il link Pagina successiva / exit if not found
IE.Quit
Set IE = Nothing
Application.StatusBar = ""
MsgBox "Estrazione completata" ' completed
End Sub