I was having trouble with coding and I could no longer work.
https://tradingeconomics.com/
I posted the coding logic I was working on below. I imported the excel sheet to Portugal of the countries shown on the homepage.
But,
<a id="ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_LinkButton1" class="btn-group btn-group-sm" href="javascript:__doPostBack('ctl00$ContentPlaceHolder1$defaultUC1$CurrencyMatrixAllCountries1$LinkButton1','')">
<button type = "button" class = "btn btn-default">
<i class = "glyphicon glyphicon-plus"> </ i>
</ button>
</a>
How can I code doPostBack to finish my work here? I looked through stackoverflow homepage and tried various trial and error, but I could not finish it.
Option Explicit
Public Sub New_Listing()
Application.ScreenUpdating = False
Dim IE As New InternetExplorer
Const MAX_WAIT_SEC As Long = 5
Dim http As New MSXML2.XMLHTTP60
Dim html As New HTMLDocument
Dim ws As Worksheet
Dim sResponse0 As String
Dim g As Integer
Set http = CreateObject("MSXML2.XMLHTTP")
Set ws = ThisWorkbook.Worksheets("Sheet1")
Dim url As String
url = "https://tradingeconomics.com/"
With IE
.Visible = True
.navigate "https://tradingeconomics.com/"
While .Busy Or .readyState < 4: DoEvents: Wend
End With
Dim tarTable As HTMLTable
Dim hTable As HTMLTable
For Each tarTable In IE.document.getElementsByTagName("table")
If InStr(tarTable.ID, "ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1") <> 0 Then
Set hTable = tarTable
End If
Next
Dim startRow As Long
Dim tRow As Object, tCell As Object, tr As Object, td As Object, r As Long, c As Long
r = startRow
With ws
Set tRow = hTable.getElementsByTagName("tr")
ReDim arr0(tRow.Length - 1, 0)
For Each tr In tRow
r = r + 1
Set tCell = tr.getElementsByTagName("td")
If tCell.Length > UBound(arr0, 2) Then
ReDim Preserve arr0(tRow.Length - 1, tCell.Length)
End If
c = 1
For Each td In tCell
arr0(r - 1, c - 1) = td.innerText
c = c + 1
Next td
Next tr
Dim k As Integer
Dim i As Integer
k = 0
For i = LBound(arr0, 1) To UBound(arr0, 1)
.Cells(2 + k, 2) = arr0(i, 0)
.Cells(2 + k, 3) = arr0(i, 1)
.Cells(2 + k, 4) = arr0(i, 2)
.Cells(2 + k, 5) = arr0(i, 3)
.Cells(2 + k, 6) = arr0(i, 4)
.Cells(2 + k, 7) = arr0(i, 5)
.Cells(2 + k, 8) = arr0(i, 6)
.Cells(2 + k, 9) = arr0(i, 7)
k = k + 1
Next i
End With
With IE
.Visible = True
.document.querySelector("a.btn-group btn-group-sm[href='javascript:__doPostBack('ctl00$ContentPlaceHolder1$defaultUC1$CurrencyMatrixAllCountries1$LinkButton1','')']").Click
End With
Set tRow = Nothing: Set tCell = Nothing: Set tr = Nothing: Set td = Nothing
Set hTable = Nothing: Set tarTable = Nothing
Application.ScreenUpdating = True
End Sub
I have completed my work up to Portugal, how can I fix it to get the next data, Czech Republic ?, I would be very grateful if you could give me details of how to modify the coding. I have learned vba very soon, so I have a lot of difficulties.
enter image description here
The Czech Republic value is part of a values set which is returned from the server after clicking a button in the web page itself, you'll have to simulate a click on this button by the VBA code, wait for IE to fetch the result and then continue with your code:
As this is the HTML of the button for fetching the next set of values:
<button type="button" class="btn btn-default">...
Use this to simulate a click and wait for the result fetched:
IE.document.getElementsByClassName("btn-default")(0).Click
Application.Wait(Now + TimeValue("0:00:05"))
After it you will be able to read the Czech Republic and the next set of values by the VBA code.
Internet Explorer
Here with IE - get's all results (the whole World) in one go including links.
Condition based wait with timeout:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, ws As Worksheet, clipboard As Object, t As Date
Const MAX_WAIT_SEC As Long = 5
Const URL = "https://tradingeconomics.com/"
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
With IE
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
.querySelector("[value=world]").Selected = True
.querySelector("select").FireEvent "onchange"
t = Timer
Do
DoEvents
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_ParameterContinent").Value <> "world"
clipboard.SetText .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").outerHTML
clipboard.PutInClipboard
End With
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub
Explicit wait based:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, ws As Worksheet, clipboard As Object
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Const URL = "https://tradingeconomics.com/"
With IE
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
.querySelector("[value=world]").Selected = True
.querySelector("select").FireEvent "onchange"
Application.Wait Now + TimeSerial(0, 0, 5)
clipboard.SetText .getElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").outerHTML
clipboard.PutInClipboard
End With
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub
Selenium basic
This is easy enough using selenium basic and allowing enough time for the postBack to update the page. After selenium basic install add reference via VBE> Tools > References > Selenium Type Library. More selenium info [here]. The below gets all the World data in one go.
Option Explicit
Public Sub GetInfo()
Dim d As WebDriver, ws As Worksheet, clipboard As Object
Set d = New ChromeDriver
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Const URL = "https://tradingeconomics.com/"
With d
.Start "Chrome"
.get URL
.FindElementByCss("[value=world]").Click
Application.Wait Now + TimeSerial(0, 0, 5)
clipboard.SetText .FindElementById("ctl00_ContentPlaceHolder1_defaultUC1_CurrencyMatrixAllCountries1_GridView1").Attribute("outerHTML")
clipboard.PutInClipboard
.Quit
ws.Cells(1, 1).PasteSpecial
End With
End Sub
Related
I am using VBA to extract the data from HTML in <span code which is under <Div which is under <li which is under <ul
I'm trying to extract the "date and matter" from HTML. "Date" should be in A column and "Matter" should be in B Column in Excel.
The drawback of my code is, it is pulling all the Date and matter into single cell.
Sub GetDat()
Dim IE As New InternetExplorer, html As HTMLDocument
Dim elem As Object, data As String
With IE
.Visible = True
.navigate "https://www.MyURL/sc/wo/Worders/index?id=76888564"
Do While .readyState <> READYSTATE_COMPLETE: Loop
Set html = .document
End With
data = ""
For Each elem In html.getElementsByClassName("simple-list")(0).getElementsByTagName("li")
data = data & " " & elem.innerText
Next elem
Range("A1").Value = data
IE.Quit
End Sub
The output that I need is shown in the image:
HTML:
You could grab two nodeLists, one for dates and one for matters, and then loop those writing out to sheet. Match dates based on data-bind attribute value; matters on classname:
Dim dates As Object, matters As Object, i As Long, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set dates = ie.document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
Set matters = ie.document.querySelectorAll(".wo-notes")
With ws
For i = 0 To dates.Length - 1
.Cells(i + 1, 1) = dates.Item(i).innertext
.Cells(i + 1, 2) = matters.Item(i).innertext
Next
End With
Example reading values from column C:
Option Explicit
Public Sub GetMatters()
Dim ws As Worksheet, lastRow As Long, urls(), results(), ie As SHDocVw.InternetExplorer, r As Long
Set ie = New SHDocVw.InternetExplorer
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.Rows.Count, "C").End(xlUp).Row
urls = Application.Transpose(ws.Range("C2:C" & lastRow).Value)
ReDim results(1 To 1000, 1 To 2)
With ie
.Visible = True
For i = LBound(urls) To UBound(urls)
.navigate2 "https://www.MyURL/sc/wo/Worders/index?id=" & urls(i)
While .Busy Or .readyState <> 4: DoEvents: Wend
Dim dates As Object, matters As Object, i As Long
Set dates = .document.querySelectorAll("[data-bind^='text:createdDate']") '.wo-notes-col-1 [data-bind^='text:createdDate']
Set matters = .document.querySelectorAll(".wo-notes")
For i = 0 To dates.Length - 1
r = r + 1
results(r, 1) = dates.Item(i).innertext
results(r, 2) = matters.Item(i).innertext
Next
Set dates = Nothing: Set matter = dates
Next
.Quit
End With
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
References:
document.querySelectorAll
css selectors
Day two of researching this. I'm just not getting it. The web page is public:
https://register.fca.org.uk/ShPo_FirmDetailsPage?id=001b000000MfF1EAAV
Manually I pgdn x 2 to get to the button [+] Individuals, click it then pgdn x 1 to get to the "results per page" drop down and change it to 500. then copy and paste the results into excel
this is the code that I found on this site "Selecting a dropdown list when inserting data from web (VBA)" answered by QHarr which I tried to adapt and failed miserably. I put "HELP" where I think I should be making the changes but I'm just guessing
Public Sub MakeSelectiongGetData()
Dim IE As New InternetExplorer
Const URL = "https://register.fca.org.uk/ShPo_FirmDetailsPage?id=001b000000Mfe5TAAR#ShPo_FirmDetailsPage"
'Const optionText As String = "RDVT11"
Application.ScreenUpdating = False
With IE
.Visible = True
.navigate URL
While .Busy Or .readyState < 4: DoEvents: Wend
Dim a As Object
Set a = .document.getElementById("HELP")
Dim currentOption As Object
For Each currentOption In a.getElementsByTagName("HELP")
If InStr(currentOption.innerText, optionText) > 0 Then
currentOption.Selected = "HELP"
Exit For
End If
Next currentOption
.document.getElementById("HELP").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Dim nTable As HTMLTable
Do: On Error Resume Next: Set nTable = .document.getElementById("HELP"): On Error GoTo 0: DoEvents: Loop While nTable Is Nothing
Dim nRow As Object, nCell As Object, r As Long, c As Long
With ActiveSheet
Dim nBody As Object
Set nBody = nTable.getElementsByTagName("tbody")(0).getElementsByTagName("tr")
.Cells(1, 1) = nBody(0).innerText
For r = 2 To nBody.Length - 1
Set nRow = nBody(r)
For Each nCell In nRow.Cells
c = c + 1: .Cells(r + 1, c) = nCell.innerText
Next nCell
c = 0
Next r
End With
.Quit
End With
Application.ScreenUpdating = True
End Sub
So I have included your changes and am here.
Public Sub MakeSelections()
Dim IE As New InternetExplorer
With IE
.Visible = True
.Navigate2 "https://register.fca.org.uk/ShPo_FirmDetailsPage?id=001b000000MfF1EAAV"
While .Busy Or .readyState < 4: DoEvents: Wend
.document.querySelector("[href*=FirmIndiv]").Click '<==click the + for indiv
.document.querySelector("#IndividualSearchResults_length[value='500']").Selected = True
End With
Dim nTable As HTMLTable
Do: On Error Resume Next: Set nTable =IE.document.getElementById("IndividualSearchResults"): On Error GoTo 0: DoEvents: Loop While nTable Is Nothing
Dim nRow As Object, nCell As Object, r As Long, c As Long
With ActiveSheet
Dim nBody As Object
Set nBody = nTable.getElementsByTagName("Name")(0) _
.getElementsByTagName("ShG1_IRN_c") _
.getElementsByTagName("ShGl_IndividualStatus__c") _
.getElementsByTagName("ShPo_Registerstatus__c") _
.getElementsByTagName("Id") _
.getElementsByTagName("RecordTypeId") _
.getElementsByTagName("CurrencyIsoCode") _
.Cells(1, 1) = nBody(0).innerText
For r = 2 To nBody.Length - 1
Set nRow = nBody(r)
For Each nCell In nRow.Cells
c = c + 1: .Cells(r + 1, c) = nCell.innerText
Next nCell
c = 0
Next r
End With
End Sub
You can use css attribute = value selectors to target the + for individuals and also to make the option selection for 500
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub MakeSelections()
Dim IE As New InternetExplorer
With IE
.Visible = True
.Navigate2 "https://register.fca.org.uk/ShPo_FirmDetailsPage?id=001b000000MfF1EAAV"
While .Busy Or .readyState < 4: DoEvents: Wend
.document.querySelector("[href*=FirmIndiv]").Click '<==click the + for indiv
.document.querySelector("#IndividualSearchResults_length [value='500']").Selected = True
Dim event_onchange As Object
Set event_onchange = .document.createEvent("HTMLEvents")
event_onchange.initEvent "change", True, False
.document.querySelector("[name=IndividualSearchResults_length]").dispatchEvent event_onchange
Application.Wait Now + TimeSerial(0, 0, 5)
Dim clipboard As Object, ws As Worksheet
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
Set ws = ThisWorkbook.Worksheets("Sheet1")
clipboard.SetText .document.querySelector("#IndividualSearchResults").outerHTML
clipboard.PutInClipboard
ws.Cells(1, 1).PasteSpecial
.Quit
End With
End Sub
This selector, [href*=FirmIndiv], is an attribute = value selector with contains (*) modifier. It looks for the matches for href attributes that contain the substring FirmIndiv in the href value. querySelector all method of HTMLDocument *(ie.Document) will return the first match found.
You can see the match here:
The selector for the option tag element (the parent select tag for result counts contains child option tag elements):
#IndividualSearchResults_length [value='500']
It uses an id (#) selector to target the div parent, of the parent select element, by its id value IndividualSearchResults_length, then uses a descendant combinator (" ") followed by attribute = value selector to specify the option element with value = 500.
You can see that here:
Selenium basic version:
Option Explicit
Public Sub MakeChanges()
'VBE > Tools > References > Selenium Type Library
'Download: https://github.com/florentbr/SeleniumBasic/releases/tag/v2.0.9.0
Dim d As WebDriver
Set d = New ChromeDriver
Const url = "https://register.fca.org.uk/ShPo_FirmDetailsPage?id=001b000000MfF1EAAV"
With d
.Start "Chrome"
.get url
.FindElementByCss("[href*=FirmIndiv]").Click
.FindElementByCss("[name=IndividualSearchResults_length]").WaitDisplayed True, 10000
.FindElementByCss("[name=IndividualSearchResults_length]").AsSelect.SelectByValue "500"
Stop '<==delete me later
.Quit
End With
End Sub
I am trying to pull data from a web-site, I want to copy the '10' x 5'unit (class name is "unit_size medium") ' in row 1 for which I am able to copy data successfully but I also want promo (Class name is "promo_offers") '1st Month Free!' in row 2, the problem is this promo is given for specific cells only. hence the data is misleading and I am getting promo in 1st 4 cells and then getting error. However, I want to copy promo for only those units where promo information is given else the cell should be blank or any other value needs to be set. Below is the code...
Please suggest how to frame the code.
Sub GetClassNames()
Dim html As HTMLDocument
Dim objIE As Object
Dim element As IHTMLElement
Dim ie As InternetExplorer
Dim elements As IHTMLElementCollection
Dim result As String 'string variable that will hold our result link
Dim count As Long
Dim erow As Long
'initiating a new instance of Internet Explorer and asigning it to objIE
Set objIE = New InternetExplorer
'make IE browser visible (False would allow IE to run in the background)
objIE.Visible = True
'navigate IE to this web page (a pretty neat search engine really)
objIE.navigate "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
'wait here a few seconds while the browser is busy
Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop
count = 0
Set html = objIE.document
Set elements = html.getElementsByClassName("unit_size medium")
For Each element In elements
If element.className = "unit_size medium" Then
erow = Sheet2.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = html.getElementsByClassName("unit_size medium")(count).innerText
Cells(erow, 2) = html.getElementsByClassName("promo_offers")(count).innerText
count = count + 1
End If
Next element
End Sub
I would simply wrap in an On Error Resume Next when attempting to access the element. Have a place already reserved for it in an output array so if not present the place remains empty.
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
While .Busy Or .readyState < 4: DoEvents: Wend
Dim listings As Object, listing As Object, headers(), results(), r As Long, c As Long
headers = Array("size", "features", "promo", "in store", "web")
Set listings = .document.getElementById("small_units_accordion_panel").getElementsByTagName("li")
'.unit_size medium, .features, .promo_offers, .board_rate_wrapper p, .board_rate
ReDim results(1 To listings.Length, 1 To UBound(headers) + 1)
For Each listing In listings
r = r + 1
On Error Resume Next
results(r, 1) = listing.getElementsByClassName("unit_size medium")(0).innerText
results(r, 2) = listing.getElementsByClassName("features")(0).innerText
results(r, 3) = listing.getElementsByClassName("promo_offers")(0).innerText
results(r, 4) = listing.getElementsByClassName("board_rate")(0).innerText
results(r, 5) = listing.getElementsByClassName("price")(0).innerText
On Error GoTo 0
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
.Quit
End With
End Sub
All boxes:
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetData()
Dim ie As New InternetExplorer, ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 "https://www.allstorageonline.com/storage-units/texas/amarillo/all-storage-hardy-115423/#utm_source=GoogleLocal&utm_medium=WWLocal&utm_campaign=115423"
While .Busy Or .readyState < 4: DoEvents: Wend
Dim listings As Object, listing As Object, headers(), results()
Dim r As Long, list As Object, item As Object
headers = Array("size", "features", "promo", "in store", "web")
Set list = .document.getElementsByClassName("main_unit")
'.unit_size medium, .features, .promo_offers, .board_rate_wrapper p, .board_rate
Dim rowCount As Long
rowCount = .document.querySelectorAll(".main_unit li").Length
ReDim results(1 To rowCount, 1 To UBound(headers) + 1)
For Each listing In list
For Each item In listing.getElementsByTagName("li")
r = r + 1
On Error Resume Next
results(r, 1) = item.getElementsByClassName("unit_size medium")(0).innerText
results(r, 2) = item.getElementsByClassName("features")(0).innerText
results(r, 3) = item.getElementsByClassName("promo_offers")(0).innerText
results(r, 4) = item.getElementsByClassName("board_rate")(0).innerText
results(r, 5) = item.getElementsByClassName("price")(0).innerText
On Error GoTo 0
Next
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
.Quit
End With
End Sub
I had been successfully pulling mutual fund performance data from Marketwatch.com using the following code:
Dim A As Long
Dim B As Long
Dim C As Long
Dim Z As Long
For Z = 1 To 35
Range("A1").Select
ActiveCell.Offset((37 + (Z * 10)), 0).Select
If ActiveCell.Value = "" Then
Exit For
Else
End If
Dim oHTML As Object
Dim oTable As Object
Dim x As Long
Dim Y As Long
Dim vData As Variant
Set oHTML = CreateObject("HTMLFile")
With CreateObject("WinHTTP.WinHTTPRequest.5.1")
.Open "GET", "http://www.marketwatch.com/investing/fund/vfinx", False
.send
oHTML.body.innerhtml = .responsetext
End With
For Each oTable In oHTML.Getelementsbytagname("table")
If oTable.classname = "fundstable" Then
ReDim vData(1 To oTable.Rows.Length, 1 To oTable.Rows(1).Cells.Length)
For x = 1 To UBound(vData)
For Y = 1 To UBound(vData, 2)
vData(x, Y) = oTable.Rows(x - 1).Cells(Y - 1).innertext
Next Y
Next x
With ActiveCell.Offset(1, 0)
.Resize(UBound(vData), UBound(vData, 2)).Value = vData
End With
Exit For
End If
Next oTable
Next Z
Unfortunately, Marketwatch has added a Captcha to stop bots (i.e. me) from scraping their data. I don't know of anyway around this, so I figured I'd try another site.
I looked at Morningstar: http://performance.morningstar.com/fund/performance-return.action?t=VFINX®ion=usa&culture=en_US
It appears that the table I want on that page would be: "table.r_table3 width955px print97" or just "r_table3 width955px print97", but neither one seems to work for me.
Any ideas?
Thanks!
The data is loaded by javascript and won't be available via XMLHTTP request as scripts won't have run to load content.
You can use that second link, for example, with IE and introduce a wait to ensure info is loaded. I show getting the table with that class name at index 1. You can change the index here:
ele.item(1).outerHTML
So, for the next table use clipboard.SetText ele.item(2).outerHTML .
You can also loop the .Length of ele to get each table but ensure you write out to a different cell when you paste:
Dim i As Long
For i = 0 To ele.Length-1
clipboard.SetText ele.item(i).outerHTML
'Etc
Next
VBA:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, clipboard As Object
Dim ele As Object, ws As Worksheet, t As Date, tableCount As Long
Const MAX_WAIT_SEC As Long = 5
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
With IE
.Visible = True
.navigate "http://performance.morningstar.com/fund/performance-return.action?t=VFINX®ion=usa&culture=en_US"
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
t = Timer
Do
DoEvents
On Error Resume Next
Set ele = .querySelectorAll(".r_table3.print97")
tableCount = ele.Length
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While tableCount < 3
If Not ele Is Nothing Then
clipboard.SetText ele.item(1).outerHTML
clipboard.PutInClipboard
ws.Cells(1, 1).PasteSpecial
End If
End With
.Quit
End With
End Sub
I am trying to pull data from some 500 urls of a website. All the pages are same in structure. I am facing a problem with understanding the HTML of this particular site
https://www.coworker.com/s-f/6033/united-states_hawaii_honolulu_impact-hub-honolulu
I want to extract Name, Address, Tel and website. My current code:
Sub GetData()
Dim wsSheet As Worksheet, Rows As Long, links As Variant, IE As Object, link As Variant
Set wb = ThisWorkbook
Set wsSheet = wb.Sheets("Sheet1")
Set IE = New InternetExplorer
Rows = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
links = wsSheet.Range("A1:A" & Rows)
With IE
.Visible = True
For Each link In links
.navigate (link)
While .Busy Or .readyState <> 4: DoEvents: Wend
Next
End With
End Sub
Here you go. Without more links to test with this is very fragile. It relies heavily on consistent styling across pages.
XHR Looping link list:
Option Explicit
Public Sub GetInfo()
Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument
Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
Application.ScreenUpdating = False
With wsSheet
Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
If Rows = 1 Then
ReDim links(1 To 1, 1 To 1)
links(1, 1) = wsSheet.Range("A1")
Else
links = wsSheet.Range("A1:A" & Rows).Value
End If
Dim r As Long
For link = LBound(links, 1) To UBound(links, 1)
r = r + 1
Set html = GetHTML(links(link, 1))
On Error Resume Next
Dim aNodeList As Object: Set aNodeList = html.querySelectorAll(".col-xs-12.pade_none.muchroom_mail")
.Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
.Cells(r, 3) = "Address: " & aNodeList.item(0).innerText
.Cells(r, 4) = "Tel: " & aNodeList.item(1).innerText
.Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
On Error GoTo 0
Next link
End With
Application.ScreenUpdating = True
End Sub
Public Function GetHTML(ByVal url As String) As HTMLDocument
Dim sResponse As String, html As New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", url, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With html
.body.innerHTML = sResponse
End With
Set GetHTML = html
End Function
Output:
References (VBE>Tools>References):
HTML object Library
Internet Explorer:
Option Explicit
Public Sub GetInfo()
Dim wsSheet As Worksheet, Rows As Long, links(), link As Long, wb As Workbook, html As HTMLDocument, ie As InternetExplorer
Set wb = ThisWorkbook: Set wsSheet = wb.Sheets("Sheet1")
Application.ScreenUpdating = False
With wsSheet
Rows = .Cells(.Rows.Count, "A").End(xlUp).Row
If Rows = 1 Then
ReDim links(1, 1)
links(1, 1) = wsSheet.Range("A1")
Else
links = wsSheet.Range("A1:A" & Rows).Value
End If
Dim r As Long
Set ie = New InternetExplorer
ie.Visible = True
For link = LBound(links, 1) To UBound(links, 1)
ie.navigate links(link, 1)
While ie.Busy Or ie.readyState < 4: DoEvents: Wend
' Application.Wait Now + TimeSerial(0, 0, 10)
On Error Resume Next
r = r + 1: Set html = ie.document
.Cells(r, 2) = "Name: " & html.querySelector(".col-sm-9.col-md-9.col-xs-12.pade_none.kohub_space_headings h2").innerText
.Cells(r, 3) = "Address: " & html.querySelector(".col-xs-12.pade_none.muchroom_mail").innerText
.Cells(r, 4) = "Tel: " & html.querySelector(".fa.fa-phone.fa-rotate-270 ~ a").innerText
.Cells(r, 5) = "Website: " & html.querySelector(".website-link-text a[href]").getAttribute("href")
On Error GoTo 0
Next link
ie.Quit
End With
Application.ScreenUpdating = True
End Sub
References (VBE>Tools>References):
HTML object Library
Microsoft Internet Controls