Loop through each table on javascrape webpage with VBA macro - json
I'm trying to webscrape multiple tables from a website. So far I have built an excel VBA macro to do this. I also figured out how to get all the data when it is on multiple pages in the website. For instance, if I have 1000 results but 50 are displayed on each page. The problem is that I have the same 5 tables on multiple pages because each table has 1000 results.
My code can only loop through each page for 1 table. I also have written code to grab each table, but I cannot figure out how to do that for each of the 50 search results (each page).
How can I loop through multiple tables and click the next page in the process to capture all the data?
Sub ETFDat()
Dim IE As Object
Dim i As Long
Dim strText As String
Dim jj As Long
Dim hBody As Object
Dim hTR As Object
Dim hTD As Object
Dim tb As Object
Dim bb As Object
Dim Tr As Object
Dim Td As Object
Dim ii As Long
Dim doc As Object
Dim hTable As Object
Dim y As Long
Dim z As Long
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
Sheets("Fund Basics").Activate
Cells.Select
Selection.Clear
IE.navigate "http://www.etf.com/channels/smart-beta-etfs/channels/smart- beta-etfs?qt-tabs=0#qt-tabs" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
Do While IE.busy: DoEvents: Loop
Do While IE.ReadyState <> 4: DoEvents: Loop
Set doc = IE.document
Set hTable = doc.getElementsByTagName("table") '.GetElementByID("tablePerformance")
ii = 1
Do While ii <= 17
For Each tb In hTable
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each Td In hTD
ws.Cells(z, y).Value = Td.innerText
y = y + 1
Next Td
DoEvents
z = z + 1
Next Tr
Exit For
Next bb
Exit For
Next tb
With doc
Set elems = .getElementsByTagName("a")
For Each e In elems
If (e.getAttribute("id") = "nextPage") Then
e.Click
Exit For
End If
Next e
End With
ii = ii + 1
Application.Wait (Now + TimeValue("00:00:05"))
Loop
MsgBox "Done"
End Sub
There is the example showing how the data could be retrieved from the website using XHRs and JSON parsing, it consists of several steps.
Retrieve the data.
I looked into a little with XHRs using Chrome Developer Tools Network tab.
Most relevant data I found is JSON string returned by GET XHR from http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/50/50/1 after I clicked the next page button:
The response has the following structure for single row item:
[
{
"productId": 576,
"fund": "iShares Russell 1000 Value ETF",
"ticker": "IWD",
"inceptionDate": "2000-05-22",
"launchDate": "2000-05-22",
"hasSegmentReport": "true",
"genericReport": "false",
"hasReport": "true",
"fundsInSegment": 20,
"economicDevelopment": "Developed Markets",
"totalRows": 803,
"fundBasics": {
"issuer": "<a href='/channels/blackrock-etfs' alt='BlackRock'>BlackRock</a>",
"expenseRatio": {
"value": 20
},
"aum": {
"value": 36957230250
},
"spreadPct": {
"value": 0.000094
},
"segment": "Equity: U.S. - Large Cap Value"
},
"performance": {
"priceTrAsOf": "2017-02-27",
"priceTr1Mo": {
"value": 0.031843
},
"priceTr3Mo": {
"value": 0.070156
},
"priceTr1Yr": {
"value": 0.281541
},
"priceTr3YrAnnualized": {
"value": 0.099171
},
"priceTr5YrAnnualized": {
"value": 0.13778
},
"priceTr10YrAnnualized": {
"value": 0.061687
}
},
"analysis": {
"analystPick": null,
"opportunitiesList": null,
"letterGrade": "A",
"efficiencyScore": 97.977103,
"tradabilityScore": 99.260541,
"fitScore": 84.915658,
"leveragedFactor": null,
"exposureReset": null,
"avgDailyDollarVolume": 243848188.037378,
"avgDailyShareVolume": 2148400.688889,
"spread": {
"value": 0.010636
},
"fundClosureRisk": "Low"
},
"fundamentals": {
"dividendYield": {
"value": 0.021543
},
"equity": {
"pe": 27.529645,
"pb": 1.964124
},
"fixedIncome": {
"duration": null,
"creditQuality": null,
"ytm": {
"value": null
}
}
},
"classification": {
"assetClass": "Equity",
"strategy": "Value",
"region": "North America",
"geography": "U.S.",
"category": "Size and Style",
"focus": "Large Cap",
"niche": "Value",
"inverse": "false",
"leveraged": "false",
"etn": "false",
"selectionCriteria": "Multi-Factor",
"weightingScheme": "Multi-Factor",
"activePerSec": "false",
"underlyingIndex": "Russell 1000 Value Index",
"indexProvider": "Russell",
"brand": "iShares"
},
"tax": {
"legalStructure": "Open-Ended Fund",
"maxLtCapitalGainsRate": 20,
"maxStCapitalGainsRate": 39.6,
"taxReporting": "1099"
}
}
]
The property "totalRows": 803 specifies the total rows count. So to make data retrieving as fast as it possible, better to make the request to get the first row. As you can see from the URL, there is ../-aum/50/50/.. tail, which points sorting order, item to start from, and total items to return. Thus to get the only row it should be http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1
Parse retrieved JSON, get the total number of rows from totalRows property.
Make another one request to get the entire table.
Parse the entire table JSON, convert it to 2d array and output. You can perform further processing with direct access to the array.
For the table shown below:
The resulting table contains 803 rows and header with columns as follows:
productId
fund
ticker
inceptionDate
launchDate
hasSegmentReport
genericReport
hasReport
fundsInSegment
economicDevelopment
totalRows
fundBasics_issuer
fundBasics_expenseRatio_value
fundBasics_aum_value
fundBasics_spreadPct_value
fundBasics_segment
performance_priceTrAsOf
performance_priceTr1Mo_value
performance_priceTr3Mo_value
performance_priceTr1Yr_value
performance_priceTr3YrAnnualized_value
performance_priceTr5YrAnnualized_value
performance_priceTr10YrAnnualized_value
analysis_analystPick
analysis_opportunitiesList
analysis_letterGrade
analysis_efficiencyScore
analysis_tradabilityScore
analysis_fitScore
analysis_leveragedFactor
analysis_exposureReset
analysis_avgDailyDollarVolume
analysis_avgDailyShareVolume
analysis_spread_value
analysis_fundClosureRisk
fundamentals_dividendYield_value
fundamentals_equity_pe
fundamentals_equity_pb
fundamentals_fixedIncome_duration
fundamentals_fixedIncome_creditQuality
fundamentals_fixedIncome_ytm_value
classification_assetClass
classification_strategy
classification_region
classification_geography
classification_category
classification_focus
classification_niche
classification_inverse
classification_leveraged
classification_etn
classification_selectionCriteria
classification_weightingScheme
classification_activePerSec
classification_underlyingIndex
classification_indexProvider
classification_brand
tax_legalStructure
tax_maxLtCapitalGainsRate
tax_maxStCapitalGainsRate
tax_taxReporting
Put the below code into VBA Project standard module:
Option Explicit
Sub GetData()
Dim sJSONString As String
Dim vJSON As Variant
Dim sState As String
Dim lRowsQty As Long
Dim aData()
Dim aHeader()
' Download and parse the only first row to get total rows qty
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1")
JSON.Parse sJSONString, vJSON, sState
lRowsQty = vJSON(0)("totalRows")
' Download and parse the entire data
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/" & lRowsQty & "/1")
JSON.Parse sJSONString, vJSON, sState
' Convert JSON to 2d array
JSON.ToArray vJSON, aData, aHeader
' Output
With Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), aHeader
Output2DArray .Cells(2, 1), aData
.Cells.Columns.AutoFit
End With
End Sub
Function GetXHR(sURL As String) As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sURL, False
.Send
GetXHR = .responseText
End With
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
1, _
UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = "#"
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "#"
.Value = aCells
End With
End With
End Sub
Create one more standard module, name it JSON and put the below code into it, this code provides JSON processing functionality:
Option Explicit
Private sBuffer As String
Private oTokens As Object
Private oRegEx As Object
Private bMatch As Boolean
Private oChunks As Object
Private oHeader As Object
Private aData() As Variant
Private i As Long
Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
' Backus–Naur form JSON parser implementation based on RegEx
' Input:
' sSample - source JSON string
' Output:
' vJson - created object or array to be returned as result
' sState - string Object|Array|Error depending on processing
sBuffer = sSample
Set oTokens = CreateObject("Scripting.Dictionary")
Set oRegEx = CreateObject("VBScript.RegExp")
With oRegEx ' Patterns based on specification http://www.json.org/
.Global = True
.MultiLine = True
.IgnoreCase = True ' Unspecified True, False, Null accepted
.Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
Tokenize "s"
.Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
Tokenize "d"
.Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
Tokenize "c"
.Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
Tokenize "n"
.Pattern = "\s+"
sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
.MultiLine = False
Do
bMatch = False
.Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
Tokenize "p"
.Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
Tokenize "o"
.Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
Tokenize "a"
Loop While bMatch
.Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
Retrieve sBuffer, vJSON
sState = IIf(IsObject(vJSON), "Object", "Array")
Else
vJSON = Null
sState = "Error"
End If
End With
Set oTokens = Nothing
Set oRegEx = Nothing
End Sub
Private Sub Tokenize(sType)
Dim aContent() As String
Dim lCopyIndex As Long
Dim i As Long
Dim sKey As String
With oRegEx.Execute(sBuffer)
If .Count = 0 Then Exit Sub
ReDim aContent(0 To .Count - 1)
lCopyIndex = 1
For i = 0 To .Count - 1
With .Item(i)
sKey = "<" & oTokens.Count & sType & ">"
oTokens(sKey) = .Value
aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
lCopyIndex = .FirstIndex + .Length + 1
End With
Next
End With
sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
bMatch = True
End Sub
Private Sub Retrieve(sTokenKey, vTransfer)
Dim sTokenValue As String
Dim sName As String
Dim vValue As Variant
Dim aTokens() As String
Dim i As Long
sTokenValue = oTokens(sTokenKey)
With oRegEx
.Global = True
Select Case Left(Right(sTokenKey, 2), 1)
Case "o"
Set vTransfer = CreateObject("Scripting.Dictionary")
aTokens = Split(sTokenValue, "<")
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
Next
Case "p"
aTokens = Split(sTokenValue, "<", 4)
Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(sName) = vValue
Else
vTransfer(sName) = vValue
End If
Case "a"
aTokens = Split(sTokenValue, "<")
If UBound(aTokens) = 0 Then
vTransfer = Array()
Else
ReDim vTransfer(0 To UBound(aTokens) - 1)
For i = 1 To UBound(aTokens)
Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
If IsObject(vValue) Then
Set vTransfer(i - 1) = vValue
Else
vTransfer(i - 1) = vValue
End If
Next
End If
Case "n"
vTransfer = sTokenValue
Case "s"
vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
"\""", """"), _
"\\", "\"), _
"\/", "/"), _
"\b", Chr(8)), _
"\f", Chr(12)), _
"\n", vbLf), _
"\r", vbCr), _
"\t", vbTab)
.Global = False
.Pattern = "\\u[0-9a-fA-F]{4}"
Do While .Test(vTransfer)
vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
Loop
Case "d"
vTransfer = Evaluate(sTokenValue)
Case "c"
Select Case LCase(sTokenValue)
Case "true"
vTransfer = True
Case "false"
vTransfer = False
Case "null"
vTransfer = Null
End Select
End Select
End With
End Sub
Function Serialize(vJSON As Variant) As String
Set oChunks = CreateObject("Scripting.Dictionary")
SerializeElement vJSON, ""
Serialize = Join(oChunks.Items(), "")
Set oChunks = Nothing
End Function
Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "{}"
Else
.Item(.Count) = "{" & vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
SerializeElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "}"
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "[]"
Else
.Item(.Count) = "[" & vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & vbTab
SerializeElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
.Item(.Count) = vbCrLf
Next
.Item(.Count) = sIndent & "]"
End If
Case vbInteger, vbLong
.Item(.Count) = vElement
Case vbSingle, vbDouble
.Item(.Count) = Replace(vElement, ",", ".")
Case vbNull
.Item(.Count) = "null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "true", "false")
Case Else
.Item(.Count) = """" & _
Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
"\", "\\"), _
"""", "\"""), _
"/", "\/"), _
Chr(8), "\b"), _
Chr(12), "\f"), _
vbLf, "\n"), _
vbCr, "\r"), _
vbTab, "\t") & _
""""
End Select
End With
End Sub
Function ToString(vJSON As Variant) As String
Select Case VarType(vJSON)
Case vbObject, Is >= vbArray
Set oChunks = CreateObject("Scripting.Dictionary")
ToStringElement vJSON, ""
oChunks.Remove 0
ToString = Join(oChunks.Items(), "")
Set oChunks = Nothing
Case vbNull
ToString = "Null"
Case vbBoolean
ToString = IIf(vJSON, "True", "False")
Case Else
ToString = CStr(vJSON)
End Select
End Function
Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
Dim aKeys() As Variant
Dim i As Long
With oChunks
Select Case VarType(vElement)
Case vbObject
If vElement.Count = 0 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
aKeys = vElement.Keys
For i = 0 To UBound(aKeys)
.Item(.Count) = sIndent & aKeys(i) & ": "
ToStringElement vElement(aKeys(i)), sIndent & vbTab
If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
Next
End If
Case Is >= vbArray
If UBound(vElement) = -1 Then
.Item(.Count) = "''"
Else
.Item(.Count) = vbCrLf
For i = 0 To UBound(vElement)
.Item(.Count) = sIndent & i & ": "
ToStringElement vElement(i), sIndent & vbTab
If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
Next
End If
Case vbNull
.Item(.Count) = "Null"
Case vbBoolean
.Item(.Count) = IIf(vElement, "True", "False")
Case Else
.Item(.Count) = CStr(vElement)
End Select
End With
End Sub
Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
' Input:
' vJSON - Array or Object which contains rows data
' Output:
' aData - 2d array representing JSON data
' aHeader - 1d array of property names
Dim sName As Variant
Set oHeader = CreateObject("Scripting.Dictionary")
Select Case VarType(vJSON)
Case vbObject
If vJSON.Count > 0 Then
ReDim aData(0 To vJSON.Count - 1, 0 To 0)
oHeader("#") = 0
i = 0
For Each sName In vJSON
aData(i, 0) = "#" & sName
ToArrayElement vJSON(sName), ""
i = i + 1
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Is >= vbArray
If UBound(vJSON) >= 0 Then
ReDim aData(0 To UBound(vJSON), 0 To 0)
For i = 0 To UBound(vJSON)
ToArrayElement vJSON(i), ""
Next
Else
ReDim aData(0 To 0, 0 To 0)
End If
Case Else
ReDim aData(0 To 0, 0 To 0)
aData(0, 0) = ToString(vJSON)
End Select
aHeader = oHeader.Keys()
Set oHeader = Nothing
aRows = aData
Erase aData
End Sub
Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
Dim sName As Variant
Dim j As Long
Select Case VarType(vElement)
Case vbObject ' collection of objects
For Each sName In vElement
ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
Next
Case Is >= vbArray ' collection of arrays
For j = 0 To UBound(vElement)
ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
Next
Case Else
If Not oHeader.Exists(sFieldName) Then
oHeader(sFieldName) = oHeader.Count
If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
End If
j = oHeader(sFieldName)
aData(i, j) = ToString(vElement)
End Select
End Sub
I tried useful the original question and found little bug that I fixed
The answer to you question is to
1 loop on Doc elements until next page link is found, and you to
2 set/reset the doc var to avoid the Doc lost
Do
Do While IE.Busy: DoEvents: Loop
Do While IE.readyState <> 4: DoEvents: Loop
Set doc = IE.document
....
Loop While nextPageFound 'exit if "next page" not found
when following the next page hyper link, during loop on Doc elements.
And then I added the
3 split of Table Header and Body (if exist) for 1st row field name set
4 RESCUE of cell value from specific class by name cell against simple td.innerText
5 loop on doc for link search until NextPageFound using
e.getAttribute("title") instead id=nextPage
6 use of a SetUp sheet with parameters to customize the script with url and destination data sheet
strUrl = ThisWorkbook.Sheets("Setup").Range("b1").Value
strDestSheet = ThisWorkbook.Sheets("Setup").Range("b2").Value
And here is the VBA function revisited:
Sub ETFDatNew()
Dim IE As Object
Dim i As Long
Dim strText As String
Dim jj As Long
Dim hBody As Object
Dim hTR As Object
Dim hTD As Object
Dim tb As Object
Dim bb As Object
Dim Tr As Object
Dim Td As Object
Dim ii As Long
Dim doc As Object
Dim hTable As Object
Dim y As Long
Dim z As Long
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
'6 I used a SetUp sheet with parameters to customize the script with url and destination data sheet
strUrl = ThisWorkbook.Sheets("Setup").Range("b1").Value
strDestSheet = ThisWorkbook.Sheets("Setup").Range("b2").Value
Sheets(strDestSheet).Activate ' Destination sheet
Cells.Select
Selection.Clear
'IE.navigate "http://halleyweb.com/c058057/mc/mc_p_ricerca.php" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
IE.navigate strUrl
ii = 1
nextPageFound = True 'set to init scope var / settato per iniziare il loop
Do '1) loop until nextPageFound instead of '' ii <= 17 Or
'2 set/reset of doc to avoid doc lost after next page link click (see below)
Do While IE.Busy: DoEvents: Loop
Do While IE.readyState <> 4: DoEvents: Loop
Set doc = IE.document
' all tables or single one by Id
Set hTable = doc.getElementsByTagName("table") 'or by .getElementById("table-albo-pretorio")
For Each tb In hTable
'3) splitting header vs body for 1st row field name search
'tHeader
If z = 1 Then
Set hBody = tb.getElementsByTagName("thead")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("th") 'header th
y = 1 ' Resets back to column A
For Each Th In hTD
ws.Cells(z, y).Value = Th.innerText
y = y + 1
Next Th
DoEvents
z = z + 1
Next Tr
Exit For
Next bb
End If
'tBody
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each Td In hTD
'4) RESCUE cell value from td.innerText against specific class name cell
'ws.Cells(z, y).Value = Td.innerText
If CBool(Td.getElementsByClassName("tablesaw-cell-content").Length) Then 'there is at least 1
'use the first
ws.Cells(z, y).Value = Td.getElementsByClassName("tablesaw-cell-content")(0).innerText
End If
y = y + 1 'colonna successiva /next col
Next Td
DoEvents
z = z + 1 'riga successiva /next row
Next Tr
Exit For
Next bb
Exit For
Next tb
'5 looping on doc for link search until NextPageFound using e.getAttribute("title") instead id=nextPage
With doc 'ricerca dei link
Set elems = .getElementsByTagName("a")
nextPageFound = False ' si predispone per concludere nel caso non sia presente una pagina successiva
For Each e In elems
If (e.getAttribute("title") = "Pagina successiva") Then ' alla ricerca di link con title="Pagina successiva" / instead of id=nextPage
e.Click
nextPageFound = True ' trovata pagina successiva /found next page
Exit For
End If
Next e
End With
ii = ii + 1
Application.Wait (Now + TimeValue("00:00:01"))
Loop While nextPageFound ' conclude nel caso non sia stato trovato il link Pagina successiva / exit if not found
IE.Quit
Set IE = Nothing
Application.StatusBar = ""
MsgBox "Estrazione completata" ' completed
End Sub
Related
web scraping from google page no longer returns anything
The below Excel vba code use to work and return the market cap, 52 week low and current price into my spreadsheet. It no longer does though and cannot work out why. Class names haven't changed but the getElementsByClassName doesn't seem to return anything anymore I think. Sub get_title_header() Dim wb As Object Dim doc As Object Dim sURL As String Dim i As Integer Dim allElements As IHTMLElementCollection Application.DisplayAlerts = False Set wb = CreateObject("internetExplorer.Application") sURL = "https://www.google.com/search?q=aapl+stock+quote" wb.navigate sURL wb.Visible = False While wb.Busy Application.Wait Now + #12:00:01 AM# DoEvents Wend Set doc = wb.document.body On Error GoTo err_clear Set allElements = doc.getElementsByClassName("iyjjgb") x = allElements(3).innerText Sheet6.Cells(i + 1, 2).Value = x x = allElements(8).innerText Sheet6.Cells(i + 1, 3).Value = x x = "" x = allElements(0).innerText Sheet6.Cells(i + 1, 4).Value = x x = "" err_clear: If Err <> 0 Then Err.Clear Resume Next End If wb.Quit End Sub
Can't fill excel spreadsheet from json using WebHelpers
I am having trouble using VBA to fill a table from JSON using WebHelpers. The JSON can be accessed here: http://54.152.85.66:5000/get-product-info. The table is very simple, just six columns and about 8k rows. Here is the code: Sub LoadRLSiteData() Dim helperData As Object Dim helperDict As Dictionary Set helperData = WebHelpers.ParseJson(getXMLPage("http://54.152.85.66:5000/get-product-info")) Debug.Print "helperData has " & helperData.Count & " items" ' HERE YOU SHOULD LOOP OVER helperData AND PUT INTO SHEET "Helper" End Sub Function getXMLPage(link) As String On Error GoTo recovery Dim retryCount As Integer retryCount = 0 Dim ie As MSXML2.XMLHTTP60 Set ie = New MSXML2.XMLHTTP60 the_start: ie.Open "GET", link, False ie.setRequestHeader "Content-type", "application/json" ie.send While ie.readyState <> 4 DoEvents Wend Debug.Print " " Debug.Print "MSXML HTTP Request to " & link Debug.Print ie.Status; "XMLHTTP status "; ie.statusText; " at "; Time getXMLPage = ie.responseText Exit Function recovery: retryCount = retryCount + 1 Debug.Print "Error number: " & Err.Number _ & " " & Err.Description & " Retry " & retryCount Application.StatusBar = "Error number: " & Err.Number _ & " " & Err.Description & " Retry " & retryCount If retryCount < 4 Then GoTo the_start Else Exit Function End Function Here is what the table should look like: WebHelpers.ParseJson(getXMLPage("http://54.152.85.66:5000/get-product-info")) returns an object that appears to be a collection of nine dictionaries but I can't seem to unravel how to access the items in the dictionary so I can put them into a sheet. I modified the code based on QHarr's answer like so: Option Explicit Sub LoadRLSiteData() Dim newHeaders() As Variant newHeaders = Array("category", "products_category", "products_master_prod_id", "products_page_name_dub", "products_product_webcat", "products_url") GetInfo "Helper Sample", "http://54.152.85.66:5000/get-product-info", newHeaders newHeaders = Array("category", "products_category", "products_master_prod_id", "products_page_name_dub", "products_product_webcat", "products_url") GetInfo "Images Sample", "http://54.152.85.66:5000/query-missing-images", newHeaders newHeaders = Array("category", "problem", "url") GetInfo "Problems Sample", "http://54.152.85.66:5000/get-problems", newHeaders End Sub Public Sub GetInfo(mySheet As String, link As String, myHeaders As Variant) Dim helperData As Object Dim headers(), item As Object, results(), key As Variant Dim subItem As Object, r As Long, c As Long, cat As String Worksheets(mySheet).Activate Set helperData = WebHelpers.ParseJson(getXMLPage(link)) headers = myHeaders ReDim results(1 To 100000, 1 To UBound(headers) + 1) r = 1 Debug.Print "GetInfo unpacking JSON dictionaries" For Each item In helperData 'col of dict DoEvents cat = item("category") For Each subItem In item("products") c = 2 results(r, 1) = cat For Each key In subItem.Keys results(r, c) = subItem(key) c = c + 1 Next r = r + 1 Next Next Debug.Print "GetInfo loading values to worksheet" ActiveSheet.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers ActiveSheet.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results Debug.Print "GetInfo finished" End Sub Function getXMLPage(link) As String On Error GoTo recovery Dim retryCount As Integer retryCount = 0 Dim ie As MSXML2.XMLHTTP60 Set ie = New MSXML2.XMLHTTP60 the_start: ie.Open "GET", link, False ie.setRequestHeader "Content-type", "application/json" ie.send While ie.readyState <> 4 DoEvents Wend Debug.Print " " Debug.Print "MSXML HTTP Request to " & link Debug.Print ie.Status; "XMLHTTP status "; ie.statusText; " at "; Time getXMLPage = ie.responseText Exit Function recovery: retryCount = retryCount + 1 Debug.Print "Error number: " & Err.Number _ & " " & Err.Description & " Retry " & retryCount Application.StatusBar = "Error number: " & Err.Number _ & " " & Err.Description & " Retry " & retryCount If retryCount < 4 Then GoTo the_start Else Exit Function End Function Except for the third URL ("get-problems"), which has a different schema, this solution works great but it seems like the headers could be extracted from the schema instead of being hardcoded and the same for the variables in the For Each item In helperData loop. This would make the solution cleaner and more generalizable.
I am using a different json parser but this unravels the dictionaries and collections. If installing the code from jsonconverter.bas to your project then go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. You can see how to use from below first End With The [] are collections looped with For Each and accessed by index; the {} are dictionaries accessed by key. You can see some of the structure here: VBA: Option Explicit Public Sub GetInfo() Dim helperData As Object With CreateObject("MSXML2.XMLHTTP") .Open "GET", "http://54.152.85.66:5000/get-product-info", False .send Set helperData = jsonConverter.ParseJson(.responseText) End With Dim headers(), item As Object, results(), key As Variant Dim subItem As Object, r As Long, c As Long, cat As String headers = Array("category", "products_category", "products_master_prod_id", "products_page_name_dub", "products_product_webcat", "products_url") ReDim results(1 To 100000, 1 To UBound(headers) + 1) r = 1 For Each item In helperData 'col of dict cat = item("category") For Each subItem In item("products") c = 2 results(r, 1) = cat For Each key In subItem.keys results(r, c) = subItem(key) c = c + 1 Next r = r + 1 Next Next ActiveSheet.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers ActiveSheet.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results End Sub Sample output: Integrated with yours I would expect something like: Option Explicit Public Sub GetInfo() Dim helperData As Object Dim headers(), item As Object, results(), key As Variant Dim subItem As Object, r As Long, c As Long, cat As String Set helperData = WebHelpers.ParseJson(getXMLPage("http://54.152.85.66:5000/get-product-info")) headers = Array("category", "products_category", "products_master_prod_id", "products_page_name_dub", "products_product_webcat", "products_url") ReDim results(1 To 100000, 1 To UBound(headers) + 1) r = 1 For Each item In helperData 'col of dict cat = item("category") For Each subItem In item("products") c = 2 results(r, 1) = cat For Each key In subItem.keys results(r, c) = subItem(key) c = c + 1 Next r = r + 1 Next Next ActiveSheet.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers ActiveSheet.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results End Sub
How can I move VBA-JSON output to specific cells in worksheet?
Very new to VBA. Trying to learn as much as I can. I can get the output that I want in the immediate window, but how can I move all of that to my worksheet? I'm honestly not sure what to try or where to try it. Option Explicit Sub JsonMain() Dim dict Dim subDict Dim strLine As String ' Read from file Dim FilePath As String FilePath = ThisWorkbook.Path + "\" + "Main.json" Dim nFile As Integer Dim strJson As String nFile = FreeFile Open FilePath For Input As #nFile strJson = Input(LOF(nFile), nFile) Close #nFile Dim jp As Scripting.Dictionary Set jp = JsonConverter.ParseJson(strJson) Dim gameData As Scripting.Dictionary Set gameData = jp("data") Dim theseMonsters As Object Set theseMonsters = gameData("monsters") Debug.Print "there are " & theseMonsters.Count & " monsters in the profile" Dim i As Long Dim monster As Dictionary Dim monsterName As Variant Dim monsterDetails As Variant For Each monsterName In theseMonsters.Keys Debug.Print "Monster #" & monsterName Set monsterDetails = theseMonsters(monsterName) Debug.Print " -- name: " & monsterDetails("class_name") Debug.Print " -- total level: " & monsterDetails("total_level") Debug.Print " -- perfection: " & monsterDetails("perfect_rate") Debug.Print " -- catch number: " & monsterDetails("create_index") Dim battleStats As Collection Set battleStats = monsterDetails("total_battle_stats") Debug.Print " -- battle stats: "; For i = 1 To battleStats.Count Debug.Print battleStats.Item(i) & " "; Next i Debug.Print "" ' ... Next monsterName End Sub Edit 1: Expected results would be bold titles for each category that are printed in row A, with data going down in columns under those titles. Here is an example output that I get in the immediate window: Monster #47103 -- name: Monstratos -- total level: 20 -- perfection: 53.763 -- catch number: 39 -- battle stats: 218 288 221 198 227 201 I would like Row A to contain these bold headers: Monster #, Name, Total Level, Perfection, Catch Number, HP, PA, PD, SA, SD, SPD (Battle Stats is not a header, but the individual battle stats are). Below that, for this mon as an example, would be: 47103, Monstratos, 20, 53.763, 39, 218, 288, 221, 198, 227, 201.
I think you want something like the following. You incremement the row counter, r, each time you hit a new monster dictionary. For each item of interest within the monster dictionary the column increases by 1. Option Explicit Public Sub WriteOutBattleInfo() Dim headers(), r As Long, i As Long, json As Object, key As Variant, ws As Worksheet, battleStats As Object Set ws = ThisWorkbook.Worksheets("Sheet1") headers = Array("Monster #", "Name", "Total Level", "Perfection", "Catch Number", "HP", "PA", "PD", "SA", "SD", "SPD") With CreateObject("MSXML2.XMLHTTP") .Open "GET", "https://www.etheremon.com/api/user/get_my_monster?trainer_address=0x2Fef65e4D69a38bf0dd074079f367CDF176eC0De", False .send Set json = JsonConverter.ParseJson(.responseText)("data")("monsters") 'dictionary of dictionaries End With r = 2 ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers For Each key In json.keys With ws .Cells(r, 1) = key .Cells(r, 2) = json(key)("class_name") .Cells(r, 3) = json(key)("total_level") .Cells(r, 4) = json(key)("perfect_rate") .Cells(r, 5) = json(key)("create_index") Set battleStats = json(key)("total_battle_stats") For i = 1 To battleStats.Count .Cells(r, i + 5) = battleStats.item(i) Next i End With r = r + 1 Next End Sub
Modifying the program for parsing
There is a program that parse a certain table from the site . Works great . I want to parse another table from the site . By the tag number “table” they are the same . I am trying to use the same program , but it gives an error : Run-time error 91 in the line : If oRow.Cells(y).Children.Length > 0 Then New table : http://allscores.ru/soccer/fstats.php?champ=2604&team=439&team2=420&tour=110 Old table : http://allscores.ru/soccer/new_ftour.php?champ=2604&f_team=439 New table : in the attached picture Function extractTable(Ssilka As String, book1 As Workbook, iLoop As Long) Dim oDom As Object, oTable As Object, oRow As Object Dim iRows As Integer, iCols As Integer Dim x As Integer, y As Integer Dim data() Dim vata() Dim tata() Dim oHttp As Object Dim oRegEx As Object Dim sResponse As String Dim oRange As Range Dim odRange As Range ' get page Set oHttp = CreateObject("MSXML2.XMLHTTP") oHttp.Open "GET", Ssilka, False oHttp.Send ' cleanup response sResponse = StrConv(oHttp.responseBody, vbUnicode) Set oHttp = Nothing sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE ")) Set oRegEx = CreateObject("vbscript.regexp") With oRegEx .MultiLine = True .Global = True .IgnoreCase = False .Pattern = "<(script|SCRIPT)[\w\W]+?</\1>" sResponse = .Replace(sResponse, "") End With Set oRegEx = Nothing ' create Document from response Set oDom = CreateObject("htmlFile") oDom.Write sResponse DoEvents ' table with results, indexes starts with zero Set oTable = oDom.getelementsbytagname("table")(3) DoEvents iRows = oTable.Rows.Length iCols = oTable.Rows(1).Cells.Length ' first row and first column contain no intresting data ReDim data(1 To iRows - 1, 1 To iCols - 1) ReDim vata(1 To iRows - 1, 1 To iCols - 1) ReDim tata(1 To iRows - 1, 1 To iCols - 1) ' fill in data array For x = 1 To iRows - 1 Set oRow = oTable.Rows(x) For y = 1 To iCols - 1 If oRow.Cells(y).Children.Length > 0 Then data(x, y) = oRow.Cells(y).getelementsbytagname("a")(0).getattribute("href") data(x, y) = Replace(data(x, y), "about:", "http://allscores.ru/soccer/") vata(x, y) = oRow.Cells(y).innerText End If Next y Next x Set oRow = Nothing Set oTable = Nothing Set oDom = Nothing Set oRange = book1.ActiveSheet.Cells(110, 26 + (iLoop * 21)).Resize(iRows - 1, iCols - 1) oRange.NumberFormat = "#" oRange.Value = data Set odRange = book1.ActiveSheet.Cells(34, 26 + (iLoop * 21)).Resize(iRows - 1, iCols - 1) odRange.NumberFormat = "#" odRange.Value = vata Set oRange = Nothing Set odRange = Nothing End Function
This is not particularly robust but does grab the values from the table. iLoop is not used. Option Explicit Public Sub test() extractTable "http://allscores.ru/soccer/fstats.php?champ=2604&team=439&team2=420&tour=110", ThisWorkbook, 1 End Sub Public Sub extractTable(Ssilka As String, book1 As Workbook) Dim oDom As Object, oTable As Object Dim oHttp As Object Dim oRegEx As Object Dim sResponse As String Set oHttp = CreateObject("MSXML2.XMLHTTP") oHttp.Open "GET", Ssilka, False oHttp.send sResponse = StrConv(oHttp.responseBody, vbUnicode) Set oHttp = Nothing sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE ")) Set oRegEx = CreateObject("vbscript.regexp") With oRegEx .MultiLine = True .Global = True .IgnoreCase = False .Pattern = "<(script|SCRIPT)[\w\W]+?</\1>" sResponse = .Replace(sResponse, "") End With Set oRegEx = Nothing Set oDom = CreateObject("htmlFile") oDom.Write sResponse Set oTable = oDom.getElementsByTagName("table")(3) Dim b As Object, a As Object Set b = oTable.getElementsByTagName("TR") 'DispHTMLElementCollection Dim i As Long, y As Long With ActiveSheet For i = 3 To 17 '17-3 gives the 15 rows of interest. Start at 3 to avoid header and empty row. Set a = b(i).ChildNodes For y = 1 To a.Length - 1 .Cells(i - 2, y) = a(y).innerText Next y Next i End With End Sub
Loading only HTML without any style and scripts
I have one VBA script, that find some data on web (csfd.cz) and put into Excel. But it is quite slow because it takes some time to load the website (in IE) and then extract data. My thought is how to make that IE only loads HTML and it doesn't load any graphics and functional things - only pure HTML. Is it somehow possible? Thanks for help... here is my code: Sub InputData() Dim cursor As String Dim i As Long Dim ie As Object Dim lastRow As Long Dim releasesLength As Long Dim releases As Object Dim oneRelease As Object Dim datumKino As String Dim datumDVD As String Dim origins As String Dim year As Long Dim time As Long Dim name As String Dim genreLong As String Dim genre As String 'zapamatování kurzoru cursor = ActiveCell.Address 'zjištění posledního řádku With ActiveSheet lastRow = .Cells(.Rows.Count, "L").End(xlUp).Row End With 'první viditelná buňka Range("L2").Select ActiveCell.Offset(1, 0).Select Do Until ActiveCell.Height <> 0 ActiveCell.Offset(1, 0).Select Loop 'inicializace Set ie = CreateObject("InternetExplorer.Application") 'ZAČÁTEK SMYČKY-------------------------------------------------------------- For i = ActiveCell.Row To lastRow Cells(i, 12).Select 'resetování proměných releasesLength = vbNullLong Set releases = Nothing Set oneRelease = Nothing datumKino = "" datumDVD = "" origins = "" year = vbNullLong time = vbNullLong name = "" genreLong = "" genre = "" 'vyřazení If (InStr(Cells(i, 12).Value, "csfd.cz") = 0 Or ActiveCell.Height = 0) Then GoTo NextRow End If 'otevření stránky ie.Visible = False ie.navigate Cells(i, 12).Value Application.StatusBar = "Načítám údaje. Prosím počkejte..." Do While ie.busy Application.Wait DateAdd("s", 1, Now) Loop 'úprava procent a datumů Cells(i, 9).Value = ie.document.getElementById("rating").Children(0).innerText releasesLength = ie.document.getElementById("releases").getElementsByClassName("content")(0).getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).Children.Length Set releases = ie.document.getElementById("releases").getElementsByClassName("content")(0).getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).Children For j = 0 To releasesLength - 1 Set oneRelease = releases(j) If (oneRelease.getElementsByTagName("th")(0).getElementsByTagName("img")(0).getAttribute("title") = "Česko") Then If (InStr(oneRelease.getElementsByTagName("th")(0).innerHTML, "V kinech")) Then If (datumKino = "") Then datumKino = Left(Replace(Replace(Replace(oneRelease.getElementsByTagName("td")(0).innerHTML, " ", ""), vbLf, ""), vbTab, ""), 10) End If ElseIf (InStr(oneRelease.getElementsByTagName("th")(0).innerHTML, "Na DVD")) Then If (datumDVD = "") Then datumDVD = Left(Replace(Replace(Replace(oneRelease.getElementsByTagName("td")(0).innerHTML, " ", ""), vbLf, ""), vbTab, ""), 10) End If ElseIf (InStr(oneRelease.getElementsByTagName("th")(0).innerHTML, "Na Blu-ray")) Then If (datumDVD = "") Then datumDVD = Left(Replace(Replace(Replace(oneRelease.getElementsByTagName("td")(0).innerHTML, " ", ""), vbLf, ""), vbTab, ""), 10) End If End If Else GoTo NextRelease End If NextRelease: Next j If (Len(datumKino) <> 0) Then Cells(i, 1).Value = datumKino End If If (Len(datumDVD) <> 0) Then Cells(i, 2).Value = datumDVD End If '1. první zápis do řádku If (Cells(i, 4).Value = "") Then year = ie.document.getElementsByClassName("origin")(0).getElementsByTagName("span")(0).innerHTML Cells(i, 4).Value = year origin = ie.document.getElementsByClassName("origin")(0).innerHTML originSplit = Split(origin, ",") time = Replace(originSplit(UBound(originSplit)), " min", "") Cells(i, 10).Value = time name = Replace(Replace(ie.document.getElementsByClassName("info")(0).getElementsByClassName("header")(0).getElementsByTagName("h1")(0).innerHTML, vbLf, ""), vbTab, "") Cells(i, 3).Value = name genreLong = ie.document.getElementsByClassName("genre")(0).innerHTML genre = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(genreLong, " ", ""), "Akční", "Akč."), "Animovaný", "Anim."), "Dobrodružný", "Dobr."), "Dokumentární", "Dokument."), "Fantasy", "Fant."), "Historický", "Histor."), "Katastrofický", "Katastrof."), "Komedie", "Kom."), "Mysteriózní", "Myster."), "Rodinný", "Rod."), "Romantický", "Romant."), "Thriller", "Thril."), "Životopisný", "Životopis.") Cells(i, 5).Value = genre End If NextRow: Next i 'KONEC SMYČKY---------------------------------------------------------------- 'Clean ie.Quit Set ie = Nothing Set releases = Nothing Set oneRelease = Nothing Application.StatusBar = "" Range(cursor).Select End Sub