Related
Hi I am trying to scrape the Product name (Cohiba Robusto), Product Size (Single Cigar, Pack of 3, Box of 25) and prices (£33.65, £90, £730) from this website: https://www.jjfox.co.uk/cohiba-robusto-621.html
I am trying to get something like this:
I am using the code below, which gives an error ("Object variable or with variable not set").
Will appreciate any help with this.
Sub getproducts()
    ' Scrapes one JJ Fox product page and appends one row per pack size to the
    ' "JJFox" sheet, below the last used row of column A:
    '   A = product name, B = pack size, C = price, D = "JJFox",
    '   E = timestamp, G = page URL.
    '
    ' The page is downloaded once through GetHTML. The product name is the first
    ' line of text of the first element with class "productcart" (falling back
    ' to the first element with class "base"). Sizes and prices are cut out of
    ' the raw page text between the first two occurrences of "spConfig".
    ' When the page has no "spConfig =" block, a single row labelled "Single"
    ' is written from the "productPrice" value instead.
    '
    ' Requires a reference to the Microsoft HTML Object Library.
    Dim ws As Worksheet
    Dim oHtml As HTMLDocument
    Dim titleNode As Object
    Dim gg As String, Text As String, txttitle As String
    Dim counter1 As Long, bb As Long, i As Long
    Dim starts As Long, endS As Long, labelEnd As Long, priceAt As Long

    Set ws = Sheets("JJFox")
    ws.Select
    counter1 = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row + 1

    ' A product page is an ordinary document, so one plain GET serves both the
    ' DOM lookup of the name and the text search for the prices.
    gg = "https://www.jjfox.co.uk/cohiba-robusto-621.html"
    Text = GetHTML(gg)

    Set oHtml = New HTMLDocument
    oHtml.body.innerHTML = Text

    ' Look the element up first and test it: indexing an empty collection and
    ' reading .innerText raises "Object variable or With block variable not set".
    Set titleNode = oHtml.querySelector(".productcart")
    If titleNode Is Nothing Then Set titleNode = oHtml.querySelector(".base")
    If titleNode Is Nothing Then
        MsgBox "Could not find the product name on " & gg, vbExclamation
        Exit Sub
    End If

    ' Keep only the first line of the element text, without the line break.
    txttitle = titleNode.innerText
    starts = InStr(1, txttitle, Chr(10))
    If starts > 0 Then txttitle = Left$(txttitle, starts - 1)
    txttitle = Trim$(Replace(txttitle, Chr(13), vbNullString))

    starts = InStr(1, Text, "spConfig =")
    If starts = 0 Then
        ' No pack options on this page: write one "Single" row.
        priceAt = InStr(1, Text, "productPrice")
        If priceAt = 0 Then
            MsgBox "No price information found on " & gg, vbExclamation
            Exit Sub
        End If
        ws.Cells(counter1, 1) = txttitle
        ws.Cells(counter1, 2) = "Single"
        ' +14 skips the key text and the two characters that follow it.
        ws.Cells(counter1, 3) = ReadNumberBeforeComma(Text, priceAt + 14)
        ws.Cells(counter1, 4) = "JJFox"
        ws.Cells(counter1, 5) = Now()
        ws.Cells(counter1, 7) = gg ' link to the page
        counter1 = counter1 + 1
    Else
        ' Narrow the text to the span between the first two "spConfig" markers;
        ' if there is no second marker, use everything up to the end.
        endS = InStr(starts + 1, Text, "spConfig")
        If endS = 0 Then endS = Len(Text) + 1
        Text = Mid(Text, starts, endS - starts)

        ' Number of pack options = occurrences of "label" minus one. The first
        ' occurrence is not treated as an option (presumably it is the
        ' attribute's own label - verify against the page) and is skipped.
        bb = (Len(Text) - Len(Replace(Text, "label", ""))) / Len("label") - 1
        starts = InStr(1, Text, "label") + 1
        Text = Mid(Text, starts, Len(Text))

        For i = 1 To bb
            ' Locate every piece first so a malformed block never leaves a
            ' half-written row or feeds a zero position into Mid/InStr.
            starts = InStr(1, Text, "label")
            If starts = 0 Then Exit For
            labelEnd = InStr(starts, Text, " \")
            If labelEnd < starts + 8 Then Exit For
            priceAt = InStr(starts + 1, Text, "oldPrice")
            If priceAt = 0 Then Exit For

            ws.Cells(counter1, 1) = txttitle
            ' +8 skips the key text and the three characters that follow it.
            ws.Cells(counter1, 2) = Mid(Text, starts + 8, labelEnd - (starts + 8))
            ' +11 skips the key text and the three characters that follow it.
            ws.Cells(counter1, 3) = ReadNumberBeforeComma(Text, priceAt + 11)
            ws.Cells(counter1, 4) = "JJFox"
            ws.Cells(counter1, 5) = Now()
            ws.Cells(counter1, 7) = gg ' link to the page
            counter1 = counter1 + 1

            ' Continue searching after the price that was just consumed.
            Text = Mid(Text, priceAt + 1, Len(Text))
        Next i
    End If
End Sub

Private Function ReadNumberBeforeComma(ByRef source As String, ByVal valueStart As Long) As Double
    ' Returns the leading number of the text that starts at valueStart and
    ' runs up to (not including) the next comma, or to the end of the text.
    Dim commaPos As Long
    If valueStart < 1 Or valueStart > Len(source) Then Exit Function
    commaPos = InStr(valueStart, source, ",")
    If commaPos = 0 Then commaPos = Len(source) + 1
    ReadNumberBeforeComma = Val(Mid$(source, valueStart, commaPos - valueStart))
End Function
Function GetHTML(url As String) As String
    ' Issues a synchronous GET for url through MSXML2.ServerXMLHTTP.6.0 and
    ' returns the response body as text.
    Dim request As Object
    Set request = CreateObject("MSXML2.ServerXMLHTTP.6.0")
    request.Open "GET", url, False
    request.send
    GetHTML = request.responseText
End Function
The prices and labels are pulled dynamically from a script tag whose content you can parse as JSON with a JSON parser. You need to grab the name from the HTML, however.
With a little knowledge of html and css, it is easy enough to define a css pattern to target the script node of interest with:
.fieldset [type='text/x-magento-init']
That looks for a child script with type attribute having attribute value text/x-magento-init, and a parent with class fieldset.
I have used a loop that is a tiny bit less efficient (you won't notice the difference):
For i = 1 To optionsCollection.Count
Simply because I know the collection is small and to allow me to index into two variables with a single loop.
Json library:
I use jsonconverter.bas. Download raw code from here and add to standard module called JsonConverter . Remove the top Attribute line from the copied code.
You then need to go:
VBE > Tools > References > Add references to:
Microsoft Scripting Runtime
Microsoft HTML Object Library
Microsoft XML Library.
In VBA for json the [] denotes a collection and the {} represents a dictionary.
Option Explicit
Public Sub GetCigarData()
    ' Required references (VBE > Tools > References):
    '   Microsoft Scripting Runtime
    '   Microsoft HTML Object Library
    '   Microsoft XML Library
    ' Also needs the JsonConverter standard module.
    '
    ' Downloads the product page, parses the JSON held in the
    ' text/x-magento-init script under ".fieldset", and writes a
    ' Name / Size / Price table to Sheet1 starting at A1.
    Dim target As Worksheet
    Dim request As MSXML2.XMLHTTP60
    Dim page As MSHTML.HTMLDocument
    Dim spConfig As Object

    Set target = ThisWorkbook.Worksheets("Sheet1")
    Set request = New MSXML2.XMLHTTP60
    Set page = New MSHTML.HTMLDocument

    request.Open "GET", "https://www.jjfox.co.uk/cohiba-robusto-621.html", False
    request.setRequestHeader "User-Agent", "Mozilla/5.0"
    request.send
    page.body.innerHTML = request.responseText

    ' Walk down the parsed JSON to the spConfig dictionary.
    Dim scriptNode As Object
    Set scriptNode = page.querySelector(".fieldset [type='text/x-magento-init']")
    Set spConfig = JsonConverter.ParseJson(scriptNode.innerHTML)("#product_addtocart_form")("configurable")("spConfig")

    Dim priceMap As Scripting.Dictionary, attributeMap As Scripting.Dictionary
    Dim sizeOptions As Collection
    Set priceMap = spConfig("optionPrices")
    Set attributeMap = spConfig("attributes")
    ' Only the first attribute's option list is used.
    Set sizeOptions = attributeMap(attributeMap.Keys(0))("options")

    Dim table() As Variant, columnTitles() As Variant
    Dim rowIndex As Long, productName As String, sizeOption As Variant
    ReDim table(1 To sizeOptions.Count, 1 To 3)
    productName = page.querySelector(".base").innerText

    ' The n-th option is paired with the n-th entry of optionPrices.
    For Each sizeOption In sizeOptions
        rowIndex = rowIndex + 1
        table(rowIndex, 1) = productName
        table(rowIndex, 2) = sizeOption("label")
        table(rowIndex, 3) = priceMap(priceMap.Keys(rowIndex - 1))("finalPrice")("amount")
    Next

    columnTitles = Array("Name", "Size", "Price")
    target.Cells(1, 1).Resize(1, UBound(columnTitles) + 1) = columnTitles
    target.Cells(2, 1).Resize(UBound(table, 1), UBound(table, 2)) = table
End Sub
Read about css selectors:
https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
I want to import restaurant data (restaurant name, phone number, website and address) into Excel. Unfortunately, I am also getting the sponsored results, and I am not getting the website or the full address, because those are on the inner page that opens when you click on the restaurant name. With some help from other platforms I have put together the code below, but it is not working as intended. Please help me fix the issue in my code. Website: https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=
Here is my code:
Sub GetInfo()
    ' Reads Yelp search result pages (30 results per page offset) and writes,
    ' for every element whose class contains "searchResult", the heading text
    ' to column A and the first matching inline-block text to column B of the
    ' active sheet. Lookups that fail are skipped silently.
    Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
    Dim request As New XMLHTTP60, listing As New HTMLDocument, card As New HTMLDocument
    Dim hits As Object, pageNo&, idx&, outRow As Long

    For pageNo = 0 To 1 ' this is where you change the last number for the pages to traverse
        request.Open "GET", URL & pageNo * 30, False
        request.send
        listing.body.innerHTML = request.responseText

        Set hits = listing.querySelectorAll("[class*='searchResult']")
        For idx = 0 To hits.Length - 1
            ' Re-parse the single result so selectors only see this card.
            card.body.innerHTML = hits.Item(idx).outerHTML

            ' Any selector that finds nothing raises an error on the property
            ' read; the row counter still advances and the cell stays empty.
            On Error Resume Next
            outRow = outRow + 1
            Cells(outRow, 1) = card.querySelector("[class*='heading--h3'] > a").innerText
            Cells(outRow, 2) = card.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
            ' Columns 3 and 4 (address and its next sibling) are intentionally not filled.
            ' Website and full address are meant to come from the page behind the result link.
            Cells(outRow, 5) = card.querySelector("[class*='container'] > website").href
            Cells(outRow, 6) = card.querySelector("[class*='container'] > fulladdress").innerText
            On Error GoTo 0
        Next idx
    Next pageNo
End Sub
You can use the free API to get the top 50 from the business_search endpoint. Pass sort parameter in query string to get top rated.
Use a json parser, such as jsonconverter.bas to handle the response. After installing the code from that link in a standard module called JsonConverter, go to VBE > Tools > References > Add a reference to Microsoft Scripting Runtime.
API instructions are here. You need to set up a test app, which requires some basic user info, and verify your email. You will then receive an API key for authentication which is passed in the authorization header as shown below.
There is other info returned which you can parse if wanted.
Option Explicit
Public Sub GetTopRestuarants()
    ' Queries the Yelp business search endpoint for up to 50 restaurants in
    ' San Francisco (sorted by rating) and writes name, phone, url and display
    ' address to the active sheet, headers in row 1 and data from row 2.
    '
    ' Needs the JsonConverter standard module and a valid API key in place of
    ' "yourAPIkey". Raises a run-time error when the request does not return
    ' HTTP 200, so an authentication or quota failure is reported clearly.
    Dim json As Object, headers(), results(), r As Long
    Dim item As Object, subItem As Variant, address As String

    headers = Array("Restaurant name", "phone", "website", "address")

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://api.yelp.com/v3/businesses/search?term=restaurant&location=san-francisco&limit=50&sort_by=rating", False
        .setRequestHeader "Authorization", "Bearer yourAPIkey"
        .send
        If .Status <> 200 Then
            Err.Raise vbObjectError + 513, "GetTopRestuarants", _
                      "Yelp API request failed: " & .Status & " " & .statusText
        End If
        Set json = JsonConverter.ParseJson(.responseText)("businesses")
    End With

    ActiveSheet.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    ' Nothing to size the result array with when the search returns no hits.
    If json.Count = 0 Then Exit Sub

    ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
    For Each item In json
        r = r + 1
        results(r, 1) = item("name")
        results(r, 2) = item("phone")
        ' NOTE(review): "url" is the field the API returns here; confirm it is
        ' the link wanted for the "website" column.
        results(r, 3) = item("url")

        ' display_address is a list of address lines; join them with spaces.
        address = vbNullString
        For Each subItem In item("location")("display_address")
            address = address & Chr$(32) & subItem
        Next
        results(r, 4) = Trim$(address)
    Next

    ActiveSheet.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Example top 20 of 50 returned:
Caveat emptor
Note that specifying the sort_by is a suggestion (not strictly enforced) to Yelp's search, which considers multiple input parameters to return the most relevant results. For example, the rating sort is not strictly sorted by the rating value, but by an adjusted rating value that takes into account the number of ratings, similar to a Bayesian average. This is to prevent skewing results to businesses with a single review.
This is one of the ways to parse the results from its inner pages. I can't access that webpage anymore to assist you further. However, give it a shot; I expect it will work:
Sub GetInfo()
    ' Walks Yelp search result pages, follows every non-sponsored result link
    ' and writes Name / Phone / Address / Website to columns A:D of the active
    ' sheet (headers in row 1, data from row 2).
    '
    ' Result links whose href contains "/adredir?" are skipped.
    ' The search offset is page * 30, so the first page is offset 0.
    Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
    Const base$ = "https://www.yelp.com"
    Const PAGES_TO_READ& = 3 'this is where you change the number of pages for this script to traverse
    Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument
    Dim links As Object, oTitle$, newUrl$
    Dim oPhone As Object, oWeb As Object, oAddress As Object
    Dim R&, I&, page&

    [A1:D1] = [{"Name","Phone","Address","Website"}]

    ' Start at page 0: beginning at 1 would request start=30 first and never
    ' fetch the first page of results.
    For page = 0 To PAGES_TO_READ - 1
        With Http
            .Open "GET", URL & page * 30, False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .send
            Html.body.innerHTML = .responseText
        End With

        Set links = Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
        For I = 0 To links.Length - 1
            If Not InStr(links.item(I).getAttribute("href"), "/adredir?") > 0 Then
                oTitle = links.item(I).innerText
                ' Relative links come back prefixed with "about:" because the
                ' document was built from a string; swap that for the site root.
                newUrl = Replace(links.item(I).getAttribute("href"), "about:", base)

                With Http
                    .Open "GET", newUrl, False
                    .setRequestHeader "User-Agent", "Mozilla/5.0"
                    .send
                    Htmldoc.body.innerHTML = .responseText
                End With

                R = R + 1
                Cells(R + 1, 1) = oTitle

                Set oPhone = Htmldoc.querySelector(".biz-phone")
                If Not oPhone Is Nothing Then Cells(R + 1, 2) = oPhone.innerText

                Set oAddress = Htmldoc.querySelector(".map-box-address")
                If Not oAddress Is Nothing Then
                    Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
                End If

                Set oWeb = Htmldoc.querySelector(".biz-website > a")
                If Not oWeb Is Nothing Then Cells(R + 1, 4) = oWeb.innerText
            End If
        Next I
    Next page
End Sub
Btw, the ads have been kicked out.
In this thread ([Link]: "Scraping table from local HTML with unicode characters"), QHarr helped me to scrape a table from a local HTML file.
I have a html file at this Link
And I used the same code and edited a little for the variables 'startTableNumber' and 'endTableNumber' and 'numColumns'
Public Sub Test()
    ' Loads a local UTF-8 HTML file and writes the text of every second
    ' <table> element, from startTableNumber to endTableNumber, to the active
    ' sheet. Values fill row 2 onwards, numColumns cells per row.
    Dim fStream As ADODB.Stream, html As HTMLDocument
    Set html = New HTMLDocument
    Set fStream = New ADODB.Stream
    With fStream
        .Charset = "UTF-8"
        .Open
        .LoadFromFile "C:\Users\Future\Desktop\Sample 2.html"
        html.body.innerHTML = .ReadText
        .Close
    End With

    Dim hTables As Object, i As Long, cellIndex As Long
    Dim startTableNumber As Long, endTableNumber As Long, numColumns As Long
    startTableNumber = 91
    endTableNumber = 509
    numColumns = 14

    Set hTables = html.getElementsByTagName("table")
    ' Never index past the last table actually present in the document.
    If endTableNumber > hTables.Length - 1 Then endTableNumber = hTables.Length - 1

    ' The target cell is derived from the running index, so the row width is
    ' driven by numColumns alone.
    For i = startTableNumber To endTableNumber Step 2
        Cells(2 + cellIndex \ numColumns, 1 + cellIndex Mod numColumns) = hTables(i).innerText
        cellIndex = cellIndex + 1
    Next
End Sub
But the table data I got was scattered. Furthermore, I would like to find a flexible way for the code to work out those variables itself, without my assigning them manually.
I hope to find solution using selenium. Hope also not to receive negative rep. I have done my best to clarify the issue
Regards
So, as I said in my comments you need to study how the data appears in the later table tags and perform a mapping to get the correct ordering. The following writes out the table. As I also mentioned, this is not robust and only the methodology may possibly be transferable to other documents.
In your case you wouldn't be reading from file but would use
Set tables = driver.FindElementsByCss("table[width='100%'] table:first-child")
You would then For Each over the web elements in the collection, adjusting the syntax as required, e.g. .Text instead of .innerText. There may be a few other adaptations for selenium due to its indexing of webElements, but everything you need should be evident below.
VBA:
Option Explicit
Public Sub ParseInfo()
    ' Loads a local UTF-8 HTML file, selects the first-child tables nested in
    ' tables of width 100%, and writes one worksheet row per record to Sheet2.
    ' Headers go to row 1; records start at row 3.
    '
    ' Each record spans 26 consecutive selected tables starting at index 89;
    ' every second one holds a field, and fieldSlot says which output column
    ' (0-based) that field belongs in. Column 0 ("Notes") is left blank.
    Dim doc As HTMLDocument, innerTables As Object, target As Worksheet
    Dim reader As ADODB.Stream
    Dim columnTitles(), fieldSlot(), record(13)
    Dim firstTable As Long, fieldNo As Long, outRow As Long

    Set target = ThisWorkbook.Worksheets("Sheet2")
    Set doc = New HTMLDocument
    Set reader = New ADODB.Stream

    reader.Charset = "UTF-8"
    reader.Open
    reader.LoadFromFile "C:\Users\User\Desktop\test.html"
    doc.body.innerHTML = reader.ReadText
    reader.Close

    Set innerTables = doc.querySelectorAll("table[width='100%'] table:first-child")

    columnTitles = Array("Notes", "Type", "Enrollment status", "Governorate of birth", "Year", "Month", "Day", "Date of Birth", "Religion", _
                         "Nationality", "Student Name", "National Number", "Student Code", "M")
    fieldSlot = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)

    target.Cells(1, 1).Resize(1, UBound(columnTitles) + 1) = columnTitles

    outRow = 3
    For firstTable = 89 To 504 Step 26
        record(0) = vbNullString
        For fieldNo = 0 To 12
            record(fieldSlot(fieldNo)) = innerTables.item(2 * fieldNo + firstTable).innerText
        Next
        target.Cells(outRow, 1).Resize(1, UBound(record) + 1) = record
        outRow = outRow + 1
    Next
End Sub
Public Sub D_Galoplar()
    ' For every id listed in column A of sheet "Y" (rows 2 up to the row before
    ' the cell containing "boş"), POSTs to the galopTab endpoint, writes the
    ' returned "at_Galoplar" table to the active sheet, adds the id in a new
    ' column A, converts the time columns with worksheet formulas and appends
    ' rows A2:N<last> as values to sheet "Galop".
    '
    ' The active sheet is used as a scratch area and is cleared for every id.
    Dim Asay(1 To 250)
    Dim Jsay(1 To 100)
    Dim elem As Object, trow As Object
    Dim R&, C&, s$
    Dim q As Long, w As Long, i As Long, k As Long, divisor As Long

    Application.ScreenUpdating = False

    For q = 2 To Sheets("Y").Columns("A:A").Find(What:="boş").Row - 1
        Asay(q - 1) = Sheets("Y").Range("A" & q)
    Next q
    ' Column C is read into Jsay but not used further in this procedure.
    For q = 2 To Sheets("Y").Columns("C:C").Find(What:="boş").Row - 1
        Jsay(q - 1) = Sheets("Y").Range("C" & q)
    Next q

    For w = 1 To 250
        Cells.Delete Shift:=xlUp
        Range("A1").Select
        ' Unfilled array slots are Empty, which compares as less than 1.
        If Asay(w) < 1 Then Exit For

        With New XMLHTTP60
            .Open "POST", "https://yenibeygir.com/at/getatdetaytab", False
            .setRequestHeader "content-type", "application/x-www-form-urlencoded; charset=UTF-8"
            .send "tab=galopTab&id=" & Asay(w)
            s = .responseText
        End With

        ' Restart at the top of the freshly cleared sheet for every id. Without
        ' this the row counter keeps growing across ids, so each table lands
        ' below a larger band of blank rows that is then processed and copied.
        R = 0: C = 0
        With New HTMLDocument
            .body.innerHTML = s
            For Each elem In .getElementsByClassName("at_Galoplar")(0).Rows
                For Each trow In elem.Cells
                    C = C + 1: Cells(R + 1, C) = trow.innerText
                Next trow
                C = 0: R = R + 1
            Next elem
        End With

        Cells.UnMerge
        Range("A1").Select
        ' Only a header row (or nothing) came back: skip to the clean-up.
        If Range("A1048576").End(xlUp).Row < 2 Then GoTo ATLA2

        Columns("A:A").Insert
        For i = 2 To Range("B1048576").End(xlUp).Row - 1
            Range("A" & i) = Asay(w)
        Next i

        ' O2:T2 hold the same formula with divisors 4, 6, ..., 14 (and 400,
        ' 600, ..., 1400 for text values); each reads the cell 9 columns left.
        For k = 0 To 5
            divisor = 4 + 2 * k
            Cells(2, 15 + k).FormulaR1C1 = "=IF(ISBLANK(RC[-9]),""-"",IF(ISNUMBER(RC[-9]),RC[-9]/" & divisor & _
                ",((LEFT(RC[-9],1)*6000)+(MID(RC[-9],3,2)*100)+(RIGHT(RC[-9],1)*10))/" & divisor * 100 & "))"
        Next k

        Range("O2:T2").Copy
        Range("O2:O" & Range("A1048576").End(xlUp).Row).PasteSpecial Paste:=xlPasteFormulas
        ' Freeze the formulas to values before the columns are moved.
        Cells.Copy
        Cells.PasteSpecial Paste:=xlPasteValues
        Columns("O:T").Cut Columns("F:K")
        Range("A2:N" & Range("A1048576").End(xlUp).Row).Copy
        Sheets("Galop").Range("A" & Sheets("Galop").Range("A1048576").End(xlUp).Row + 1).PasteSpecial Paste:=xlPasteValues
ATLA2:
        Cells.Delete Shift:=xlUp
    Next w

    Application.CutCopyMode = False
    Application.ScreenUpdating = True
End Sub
I want to get a lot of data with the For Next cycle, but after a while the page hangs. How can I reset objects at the end of each cycle?
Asay numbers
10182
10221
10279
10303
10316
10325
10360
10370
10680
11598
11629
11715
11745
12335
12385
12533
12559
13154
13393
13635
13641
13669
13673
14027
14057
14062
14228
14619
14674
14687
14743
14770
14778
15197
15217
15323
15382
15507
15775
15828
16077
16335
16510
17149
17513
17867
18532
37964
60176
66067
66255
66581
66582
66896
66998
67056
67309
67356
67379
67473
68008
68012
68162
68298
68312
68320
68332
68333
68353
68383
68545
68702
68775
68922
69445
69606
69817
69963
69968
69985
69986
70048
70202
71372
(boş)
The slowdown may be due to network throttling if you are hitting the site too many times in quick succession. This is particularly likely given your access method. It would be better to see if an API is available for bulk access to the info. You are likely going through many networks to get to this page as well. It may be possible to get some basic info about delays from the TRACERT command at a command prompt.
You are doing a POST so remember there is a fair amount of server side stuff going on as well.
You don't need to set elem to Nothing as it only exists during your For Loop. Same for tRow.
Putting .getElementsByClassName("at_Galoplar")(0).Rows into a variable will provide faster referencing.
Write the results to an array first and then dump the array out to the sheet in one go will provide significant improvement in speed.
Using the New keyword can lead to unexpected behaviour. You can create one instance of HTMLDocument and work with that, provided you have good error handling in place. I have had occasional cases in a loop where I have had to set HTMLDocument to Nothing before looping back round.
Personally, I would cheat and re-write this to leverage that you can issue GET requests to get the same info. I use a class to hold the XMLHTTP object, and an array to hold the results. I write the results out in one go. This takes a few seconds to run for me. The asay numbers are in Sheet1 range A1:A84.
Class module clsHTTP
Option Explicit
' Single request object shared by every call made through this instance.
Private request As Object

Private Sub Class_Initialize()
    ' Create the MSXML2.XMLHTTP object once, when the class is instantiated.
    Set request = CreateObject("MSXML2.XMLHTTP")
End Sub

Public Function GetString(ByVal url As String) As String
    ' Performs a synchronous GET for url and returns the raw response body
    ' converted to a VBA string with StrConv(..., vbUnicode).
    request.Open "GET", url, False
    ' A fixed date in the past is sent as If-Modified-Since (presumably so a
    ' cached copy is not served - confirm against the target server).
    request.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
    request.send
    GetString = StrConv(request.responseBody, vbUnicode)
End Function
Standard module 1
Option Explicit
Public Sub DGaloplar()
    ' Reads the ids in Sheet1!A1:A84, requests the galopTab fragment for each
    ' one through clsHTTP, collects every non-header row of the "at_Galoplar"
    ' table into one array (id in the first column) and writes headers to
    ' C1 and the collected rows from C2 in a single operation.
    '
    ' An id whose response contains no "at_Galoplar" table is skipped, so one
    ' empty response does not abort the run and discard the collected rows.
    Dim asays(), ws As Worksheet, asay As Long, html As HTMLDocument
    Dim http As clsHTTP, url As String, headers(), numberOfRequests As Long
    headers = Array("Asay", "Tarih", "Sehir", "Kg", "Jokey", "400", "600", "800", "1000", "1200", "1400", "Ç", "Pist", "Durum")
    Set http = New clsHTTP
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set html = New HTMLDocument
    asays = Application.Transpose(ws.Range("A1:A84").Value) 'Load asay values from sheet 1

    ' The result array is sized for numTableRows rows per request in total.
    Const numTableRows As Long = 11
    Const numTableColumns As Long = 15
    Const BASE_URL As String = "https://yenibeygir.com/at/getatdetaytab/?tab=galopTab&id="
    numberOfRequests = UBound(asays)

    Dim results(), headerRow As Boolean, tRow As Object, tCell As Object
    Dim r As Long, c As Long, hTable As HTMLTable
    ReDim results(1 To numTableRows * numberOfRequests, 1 To numTableColumns)

    Application.ScreenUpdating = False
    For asay = 1 To numberOfRequests
        url = BASE_URL & asays(asay)
        html.body.innerHTML = http.GetString(url)
        Set hTable = html.querySelector(".at_Galoplar")

        If Not hTable Is Nothing Then
            ' The first <tr> is the table's own header row and is not copied.
            headerRow = True
            For Each tRow In hTable.getElementsByTagName("tr")
                If Not headerRow Then
                    c = 2: r = r + 1
                    results(r, 1) = asays(asay)
                    For Each tCell In tRow.getElementsByTagName("td")
                        results(r, c) = tCell.innerText
                        c = c + 1
                    Next
                End If
                headerRow = False
            Next
        End If
    Next

    With ws
        .Cells(1, 3).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 3).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
    Application.ScreenUpdating = True
End Sub
References:
Microsoft HTML Object Library
In general Set elem = Nothing is what you need.
In your code you are assigning the variables within a for-each loop, thus even if you set them to Nothing later, there would not be a performance bonus.
Try setting those object to Nothing, like below:
Set elem = Nothing
Set trow = Nothing
I am not sure whether you need variable declarations in your loop, you can take them out of the loop, this might save some time.
But I think your HTTP requests are taking so long, not any VBA code.
UPDATE
Try setting Application.EnableEvents and Application.ScreenUpdating to False at the beginning of the macro and setting them back to True at the end.
I am trying to gather all of the property data from this website:
http://taxsales.lgbs.com/
There are currently 7,000+ properties, but on any given view of the page I can only see 15 - 20 depending on screen resolution.
I have figured out, roughly, how to search through the HTML for the property names and details. Because the initial search has a warning screen, I'm using the following code to "click" the "Agree" button to see the subsequent search page. You can also see that I don't know how to find only the property details, and am instead taking (basically) all of the HTML from the whole site and sifting through it later in excel.
Questions:
1) Is there any way to see data for all of the properties at once? -or- How can I "page" through each portion of the results to eventually collect all of them?
2) How can I collect only the data for property address, sale date, sale type, etc.?
Sub HTML_scrape()
    ' Opens the tax-sales map page in Internet Explorer, clicks the first
    ' "btn btn-primary" button if one is present, then writes the outerHTML of
    ' every element on the page to column A of "Sheet1" (one element per row)
    ' and replaces "#" and double quotes in that column with single quotes.
    Dim IE As Object
    Dim objCollection As Object
    Dim objCollection2 As Object
    Dim r As Long ' Long, not Integer: a page can hold more than 32,767 elements
    Dim v As Object
    Dim DQ As String

    ' Create InternetExplorer Object
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = True

    ' Using the URL with "full" map
    IE.navigate "http://taxsales.lgbs.com/map?lat=39.576604&lon=-96.72178200000002&zoom=4&offset=0&ordering=sale_date,address_full,uid&sale_type=SALE,RESALE,STRUCK%20OFF,FUTURE%20SALE&in_bbox=-137.2217809271164,15.247775193567845,-56.221783072883625,57.63696077532424"

    ' Wait while IE loading...
    Do While (IE.Busy Or IE.READYSTATE <> 4)
        Application.Wait DateAdd("s", 2, Now)
    Loop

    ' Click the Agree button, but only when the warning is actually shown.
    Set objCollection = IE.document.getElementsByClassName("btn btn-primary")
    If objCollection.Length > 0 Then
        objCollection(0).Click
        Do While (IE.Busy Or IE.READYSTATE <> 4)
            Application.Wait DateAdd("s", 2, Now)
        Loop
    End If

    ' Collect every element on the page; narrowing this to the property
    ' details is left to later filtering in the worksheet.
    Set objCollection2 = IE.document.getElementsByTagName("*")
    r = 1
    For Each v In objCollection2
        Sheets("Sheet1").Range("A" & r).Value = v.outerHTML
        r = r + 1
    Next

    DQ = Chr(34)
    With Sheets("Sheet1").Columns("A")
        .Replace What:="#", Replacement:="'"
        .Replace What:=DQ, Replacement:="'"
    End With
End Sub
Property Sale Date HTML:
<li ng-if="listing.property.sale_date" class="ng-binding ng-scope"><label>Sale Date:</label> 4/5/18 9:00 AM</li>
Property Address HTML:
<a ng-click="listing.addressClick()" class="ng-binding"> 02863 Stouton St, Philadelphia PA 19134-3515 </a>
Next Button HTML:
Next
Here's a hacky approach to use web requests (xhr) instead. Looking at the requests for the page it looks like it returns a JSON object. I'm parsing this JSON object and dumping it to a range on the first sheet. Here's some code that should get you started.
'You'll need the following references:
'MSXML v6.0
'Microsoft Scripting Runtime
'JSON project from: https://github.com/VBA-tools/VBA-JSON
Public Sub Scraper()
    ' Pages through the property_sales JSON API and dumps the records to the
    ' first worksheet: field names in row 1, one record per row below.
    '
    ' Each response is parsed with JsonConverter; "results" holds the records
    ' and "next" the URL of the following page. At most MAX_PAGES pages,
    ' LAST_FIELD + 1 fields per record and LAST_ROW records are kept.
    ' Nested objects are stored as their JSON text.
    '
    ' NOTE(review): the start URL carries offset=10 - confirm that skipping
    ' the first records is intended.
    Const MAX_PAGES As Long = 11
    Const LAST_FIELD As Long = 20
    Const LAST_ROW As Long = 5000

    Dim JSON As Object
    Dim responses As Object
    Dim itemdict As Variant
    Dim item As Variant
    Dim page As Long
    Dim j As Long
    Dim k As Long
    Dim httpStatus As Long
    Dim body As String
    Dim myarray As Variant: ReDim myarray(0 To LAST_FIELD, 0 To LAST_ROW)
    Dim url As String: url = "http://taxsales.lgbs.com/api/property_sales/?in_bbox=-139.04111793750002%2C7.97834134877145%2C-54.40244606250002%2C61.39968867373271&offset=10&ordering=sale_date%2Caddress_full%2Cuid&sale_type=SALE%2CRESALE%2CSTRUCK+OFF%2CFUTURE+SALE"

    For page = 1 To MAX_PAGES
        With New MSXML2.XMLHTTP60
            ' Third argument False = synchronous; the default is asynchronous,
            ' in which case the response is not yet available after .send.
            .Open "GET", url, False
            .setRequestHeader "accept", "application/json, text/plain, */*"
            .send
            httpStatus = .Status
            body = .responseText
        End With
        If httpStatus <> 200 Then Exit For

        'Parse the response into a JSON dict
        Set JSON = JsonConverter.ParseJson(body)
        Set responses = JSON("results") 'Get the results collection

        For Each itemdict In responses
            'add headers once, in their own row (row 0), taken from the first record
            If k = 0 Then
                j = 0
                For Each item In itemdict
                    If j <= LAST_FIELD Then myarray(j, 0) = item
                    j = j + 1
                Next
                k = 1
            End If
            If k > LAST_ROW Then Exit For

            'add values
            j = 0
            For Each item In itemdict
                If j <= LAST_FIELD Then
                    If IsObject(itemdict(item)) Then
                        ' Dictionaries/collections cannot go into a cell as-is.
                        myarray(j, k) = JsonConverter.ConvertToJson(itemdict(item))
                    Else
                        myarray(j, k) = itemdict(item)
                    End If
                End If
                j = j + 1
            Next
            k = k + 1
        Next
        If k > LAST_ROW Then Exit For

        ' Stop when the API reports no further page.
        If Not JSON.Exists("next") Then Exit For
        If IsNull(JSON("next")) Then Exit For
        url = JSON("next") ' the next URL to send a GET request
    Next

    If k = 0 Then Exit Sub ' nothing was retrieved

    ' Trim unused rows, then write k rows (header + records) by all fields.
    ReDim Preserve myarray(0 To LAST_FIELD, 0 To k - 1)
    ThisWorkbook.Sheets(1).Range("A1").Resize(k, LAST_FIELD + 1).Value = TransposeArray(myarray)
End Sub
'using this function as worksheetfunction.transpose causing issues
Public Function TransposeArray(myarray As Variant) As Variant
    ' Returns a new two-dimensional array in which rows and columns of
    ' myarray are swapped: result(x, y) = myarray(y, x).
    '
    ' The lower and upper bounds of each input dimension are carried over to
    ' the opposite dimension of the result, so 0-based and 1-based arrays both
    ' work and the result does not depend on the module's Option Base.
    Dim X As Long
    Dim Y As Long
    Dim Xlower As Long
    Dim Xupper As Long
    Dim Ylower As Long
    Dim Yupper As Long
    Dim tempArray As Variant

    Xlower = LBound(myarray, 2)
    Xupper = UBound(myarray, 2)
    Ylower = LBound(myarray, 1)
    Yupper = UBound(myarray, 1)

    ReDim tempArray(Xlower To Xupper, Ylower To Yupper)
    For X = Xlower To Xupper
        For Y = Ylower To Yupper
            tempArray(X, Y) = myarray(Y, X)
        Next
    Next

    TransposeArray = tempArray
End Function
Output: