I am trying to gather all of the property data from this website:
http://taxsales.lgbs.com/
There are currently 7,000+ properties, but on any given view of the page I can only see 15 - 20 depending on screen resolution.
I have figured out, roughly, how to search through the HTML for the property names and details. Because the initial search has a warning screen, I'm using the following code to "click" the "Agree" button to see the subsequent search page. You can also see that I don't know how to find only the property details, and am instead taking (basically) all of the HTML from the whole site and sifting through it later in excel.
Questions:
1) Is there any way to see data for all of the properties at once? -or- How can I "page" through each portion of the results to eventually collect all of them?
2) How can I collect only the data for property address, sale date, sale type, etc.?
Sub HTML_scrape()
Dim IE As Object
Dim objElement As Object
Dim objCollection As Object
Dim objCollection2 As Object
Dim r As Integer
Dim v As Object
' Create InternetExplorer Object
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
' Using the URL with "full" map
IE.navigate "http://taxsales.lgbs.com/map?lat=39.576604&lon=-96.72178200000002&zoom=4&offset=0&ordering=sale_date,address_full,uid&sale_type=SALE,RESALE,STRUCK%20OFF,FUTURE%20SALE&in_bbox=-137.2217809271164,15.247775193567845,-56.221783072883625,57.63696077532424"
' Wait while IE loading...
Do While (IE.Busy Or IE.READYSTATE <> 4)
Application.Wait DateAdd("s", 2, Now)
Loop
' Click the Agree Button
Set objCollection = IE.document.getElementsByClassName("btn btn-primary")
objCollection(0).Click
Do While (IE.Busy Or IE.READYSTATE <> 4)
Application.Wait DateAdd("s", 2, Now)
Loop
' Find all tags and collect the associated data
' This should only find the relevant property data, but I can not
' figure out how to only take the information within a
' <a class="ng-binding" ng-click="listing.addressClick()"> tag
Set objCollection2 = IE.document.getElementsByTagName("*")
r = 1
For Each v In objCollection2
Sheets("Sheet1").Range("A" & r).Value = v.outerHTML
r = r + 1
Next
With Sheets("Sheet1").Columns("A")
Dim DQ As String
DQ = Chr(34)
.Replace What:="#", Replacement:="'"
.Replace What:=DQ, Replacement:="'"
End With
End Sub
Property Sale Date HTML:
<li ng-if="listing.property.sale_date" class="ng-binding ng-scope"><label>Sale Date:</label> 4/5/18 9:00 AM</li>
Property Address HTML:
<a ng-click="listing.addressClick()" class="ng-binding"> 02863 Stouton St, Philadelphia PA 19134-3515 </a>
Next Button HTML:
Next
Here's a hacky approach to use web requests (xhr) instead. Looking at the requests for the page it looks like it returns a JSON object. I'm parsing this JSON object and dumping it to a range on the first sheet. Here's some code that should get you started.
'You'll need the following references:
'MSXML v6.0
'Microsoft Scripting Runtime
'JSON project from: https://github.com/VBA-tools/VBA-JSON
Public Sub Scraper()
Dim webrequest As MSXML2.XMLHTTP60
Dim JSON As Object
Dim responses As Object
Dim itemdict As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim item As Variant
Dim myarray As Variant: ReDim myarray(0 To 20, 0 To 5000)
Dim url As String: url = "http://taxsales.lgbs.com/api/property_sales/?in_bbox=-139.04111793750002%2C7.97834134877145%2C-54.40244606250002%2C61.39968867373271&offset=10&ordering=sale_date%2Caddress_full%2Cuid&sale_type=SALE%2CRESALE%2CSTRUCK+OFF%2CFUTURE+SALE"
For i = 0 To 10 'Do a loop to get SOME of the data, probably need a different loop here
With New MSXML2.XMLHTTP60
.Open "GET", url
.setRequestHeader "accept", "application/json, text/plain, */*"
.send
'Parse the response into a JSON dict
Set JSON = JsonConverter.ParseJson(.responseText)
url = JSON("next") ' the next URl to send a GET request
Set responses = JSON("results") 'Get the results Dict
On Error Resume Next ' getting an error, just ignoring for now
For Each itemdict In responses
j = 0
'add headers
If k = 0 Then
For Each item In itemdict
myarray(j, k) = item
j = j + 1
Next
End If
'add values
For Each item In itemdict
myarray(j, k) = itemdict(item)
j = j + 1
Next
k = k + 1
Next
On Error GoTo 0
End With
Next
ReDim Preserve myarray(0 To 20, 0 To k - 1)
ThisWorkbook.Sheets(1).Range("A1:T" & k - 1).Value = TransposeArray(myarray)
End Sub
'using this function as worksheetfunction.transpose causing issues
Public Function TransposeArray(myarray As Variant) As Variant
Dim X As Long
Dim Y As Long
Dim Xupper As Long
Dim Yupper As Long
Dim tempArray As Variant
Xupper = UBound(myarray, 2)
Yupper = UBound(myarray, 1)
ReDim tempArray(Xupper, Yupper)
For X = 0 To Xupper
For Y = 0 To Yupper
tempArray(X, Y) = myarray(Y, X)
Next
Next
TransposeArray = tempArray
End Function
Output:
Related
I have tried many proven methods from various posts to get some data from a web page without success. I am able to get a list of linked items on the opening page but once I navigate to any other page, I draw a blank with the code below.
When I run the code, I get no results in Cats.
Sub Main()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim Cats As MSHTML.IHTMLElementCollection
Dim Cat As MSHTML.IHTMLElement
Dim NextHref As String
Dim NextURL As String
XMLReq.Open "GET", URL, False
XMLReq.send
If XMLReq.Status <> 200 Then
MsgBox "Problem"
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLReq.responseText
Set XMLReq = Nothing
Set Cats = HTMLDoc.getElementsByClassName("ng-tns-c329-5 product-grid--tile ng-star-inserted")
Debug.Print Cats.Length 'Returns 0
'For Each Cat In Cats
' NextHref = Cat.getAttribute("href")
' NextURL = URL & Mid(NextHref, InStr(NextHref, ":") + 2)
' ListItemsInCats Cat.innerText, NextURL
'Next Cat
End Sub
Expanded Element structure
Collased structure
Thanks for any assistance.
The problem with the website you are trying to scrape from is that:
In XMLHTTP Request method - The product details are dynamic content that is pulled from Fetch/XHR which XMLHTTP does not run, XMLHTTP only gives you the HTML document as it is without any script running.
In Internet Explorer method - The webpage is considered ready before the product details are actually loaded so the usual loop check for Busy and ReadyState is not sufficient.
The code below uses Internet Explorer and to resolve the issue mentioned above, I have put up some checks (Which is not perfect I believe but it works so far in my testing) that will wait until the first product has been loaded before proceeding to pull the product details:
Private Sub GetBakeryProducts()
Const URL As String = "https://www.woolworths.com.au/shop/browse/bakery"
Dim ieObj As InternetExplorer
Set ieObj = New InternetExplorer
ieObj.navigate URL
ieObj.Visible = True
Do While ieObj.Busy Or ieObj.readyState <> READYSTATE_COMPLETE
DoEvents
Loop
Do While ieObj.document.getElementsByClassName("productCarousel-header").Length = 0
DoEvents
Loop
Dim ieDoc As MSHTML.HTMLDocument
Set ieDoc = ieObj.document
Dim productList As Object
Set productList = ieDoc.getElementsByClassName("product-grid--tile")
'==== Test if the website has finish loading the 1st product details
On Error Resume Next
Dim testStatus As String
Do
Err.Clear
testStatus = productList(0).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
Loop Until Err.Number = 0
'====
Dim outputArr() As String
ReDim outputArr(1 To productList.Length, 1 To 2) As String
Dim outputIndex As Long
Dim i As Long
For i = 0 To productList.Length - 1
If productList(i).getElementsByClassName("shelfProductTile-descriptionLink").Length <> 0 Then
If Err.Number <> 0 Then
Err.Clear
Exit For
End If
Dim productName As String
Dim productPrice As String
productName = productList(i).getElementsByClassName("shelfProductTile-descriptionLink")(0).innerText
productPrice = Replace(productList(i).getElementsByClassName("price")(0).innerText, vbNewLine, vbNullString)
outputIndex = outputIndex + 1
outputArr(outputIndex, 1) = productName
outputArr(outputIndex, 2) = productPrice
End If
Next i
ReDim Preserve outputArr(1 To outputIndex, 1 To 2) As String
ieObj.Quit
Set ieObj = Nothing
ThisWorkbook.Sheets("Sheet1").Range("A1").Resize(outputIndex, UBound(outputArr, 2)).Value = outputArr
End Sub
Running this will pull the data from the website and paste the output starting from cell A1 in Sheet1, please change the worksheet name and range as you see fits.
I found the code below which works nicely and I think I can repurpose it for my needs, but does not include the precipitation. I'm relatively new to HTML so having trouble understanding what each line of code's purpose is. I've gone to the website and looked at the elements and console but can't find "p[data-testid='wxPhrase']" or the word 'children' or 'child'.
I presumed precipitation was just another child so tried adding lines like these after editing the column headers in the first sub:
Results(r + 1, 3) = Children(r).FirstChild.innerText
Results(r + 1, 4) = Children(r).PreviousSibling.PreviousSibling.PreviousSibling.FirstChild.innerText
but it gives Run-time error '438': Object doesn't support this property or method. I appreciate very much some help and education. Thanks, in advance!
Sub MiamiWeather()
Dim Data As Variant
Data = MiamiWeatherData
Range("A1:B1").Value = Array("Date", "Temperature")
Range("A2").Resize(UBound(Data), 2).Value = Data
End Sub
Function MiamiWeatherData()
Const URL = "https://weather.com/weather/tenday/l/3881cd527264bc7c99b6b541473c0085e75aa026b6bd99658c56ad9bb55bd96e"
Dim responseText As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
responseText = .responseText
End With
Dim Document As HTMLDocument
Set Document = CreateObject("HTMLFILE")
Document.body.innerHTML = responseText
Dim Children As IHTMLDOMChildrenCollection
Set Children = Document.querySelectorAll("p[data-testid='wxPhrase']")
Dim Results As Variant
ReDim Results(1 To Children.Length, 1 To 2)
Dim r As Long
For r = 0 To Children.Length - 1
Results(r + 1, 1) = Children(r).PreviousSibling.PreviousSibling.FirstChild.innerText
Results(r + 1, 2) = Children(r).PreviousSibling.FirstChild.innerText
Next
MiamiWeatherData = Results
End Function
Assuming you want the percentage, you need to resize the array to hold an extra dimension then add an extra selector within the loop. That selector can select by attribute = value and will need to work off .Children(r).PreviousSibling. Assuming, you have a still maintained Microsoft set-up you can chain querySelector at this point as shown below.
For older versions e.g., <= Windows 7 then use Results(r + 1, 3) = Children(r).PreviousSibling.Children(2).Children(0).Children(1).innerText
Option Explicit
Public Sub MiamiWeather()
Dim Data As Variant
Data = MiamiWeatherData
Range("A1:C1").value = Array("Date", "Temperature", "Precipitation")
Range("A2").Resize(UBound(Data), 3).value = Data
End Sub
Function MiamiWeatherData()
Const URL = "https://weather.com/weather/tenday/l/3881cd527264bc7c99b6b541473c0085e75aa026b6bd99658c56ad9bb55bd96e"
Dim responseText As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
responseText = .responseText
End With
Dim Document As HTMLDocument
Set Document = CreateObject("HTMLFILE")
Document.body.innerHTML = responseText
Dim Children As IHTMLDOMChildrenCollection
Set Children = Document.querySelectorAll("p[data-testid='wxPhrase']")
Dim Results As Variant
ReDim Results(1 To Children.length, 1 To 3)
Dim r As Long
For r = 0 To Children.length - 1
Results(r + 1, 1) = Children(r).PreviousSibling.PreviousSibling.FirstChild.innerText
Results(r + 1, 2) = Children(r).PreviousSibling.FirstChild.innerText
Results(r + 1, 3) = Children(r).PreviousSibling.querySelector("[data-testid=PercentageValue]").innerText
Next
MiamiWeatherData = Results
End Function
With Google Translate, the French to English is not returning the correct values when using the code below.
If I manually use the Google Translate app I get the correct translation.
For example;
From code of "salle de l'émetteur", returns "director's room".
From the Google Translate app, correctly returns "transmitter room".
If I inspect the elements in the Google Translate app, I see the correct translation at
span class="tlid-translation translation" lang = "en"
I can't figure out how to get this "innertext" from 'span title class'
Is there a way to get the translation from
span class="tlid-translation translation" lang = "en"'
instead of
objDivs = objHTML.getElementsByTagName("div"), objDiv.className = "t0"
Public Function Translate(strInput As String, strSourceLng As String, strTargetLng As String) As String
Dim strURL As String
Dim objHTTP As Object
Dim objHTML As Object
Dim objDivs As Object
Dim objDiv As Object
Dim strTranslated As String
' send query to web page
strURL = "https://translate.google.com/m?hl=" & strSourceLng & _
"&sl=" & strSourceLng & _
"&tl=" & strTargetLng & _
"&ie=UTF-8&prev=_m&q=" & strInput
Set objHTTP = CreateObject("MSXML2.ServerXMLHTTP") 'late binding
objHTTP.Open "GET", strURL, False
objHTTP.setRequestHeader "User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"
objHTTP.send ""
' create an html document
Set objHTML = CreateObject("htmlfile")
With objHTML
.Open
.Write objHTTP.responseText
.Close
End With
Set objDivs = objHTML.getElementsByTagName("div")
For Each objDiv In objDivs
If objDiv.className = "t0" Then
strTranslated = objDiv.innerText
If strTranslated <> "" Then
Translate = strTranslated
End If
End If
Next objDiv
Set objHTML = Nothing
Set objHTTP = Nothing
End Function
If you use early bound html document i.e. MSHTML.HTMLDocument then you get access to querySelector and can try using css selectors to target that element
e.g.
Dim html As MSHTML.HTMLDocument
Set html = New MSHTML.HTMLDocument
html.body.innerHTML = objHTTP.responseText
Debug.Print html.querySelector(".translation[lang=en]").innerText
Requires VBA>Tools>References> Add reference to Microsoft HTML Object Library.
This assumes the translation is in the .innerText. It would help to share the relevant part of the response showing the translation.
Thanks QHarr,
I tried your recommendation, but get error 91 "Object variable or With block not set"... I must be missing something or it's just not clear to me.
Maybe the translation issue is with "XMLHTTP" because it seems to return the same translation results when you use try the =Translate(A1,"fr","en") to a cell in Excel.
I did however try your recommendation Excel vba Translate IE.Document empty
and this does return the correct translated response "salle de l'émetteur" now = "transmitter room" with ie.navigate2 "https://translate.google.com/#" & "fr" & '/' & "en" & "/" & str". However, this method is too slow to translate worksheets, since I need to translate worksheets 60 rows x 36 cols.
I tweaked your code up so that it will open IE and then loop through each cell with just the one IE (Goggle Translate) window open, instead of open/close the window for each cell data translation. Works super quick in comparison. I just need to figure out timing issues because it's doesn't always put the correct data to the cells when it fetches the translation from Google Translate.
I sometimes get a 'run time error' at MyStrings = ie.document.querySelector (".translation").innerText because it's empty. So I added a dwell time to the process. It seems that <0.5 sec not long enough but seems to work at >0.8 sec per translated cell.
At least it's a step in the right direction. I'll keep playing with the timing to optimize the process, unless there's some other workable solutions or recommendations.
So after play around with this, my crud code below seems to be doing the job and seem to translate faster...so far!
Function Translate_fr_en()
'***This function loops through an array called Data0 loaded with all the values of the worksheet "Translate_Sheet".
' It translates each row of select columns to a mirror image temp worksheet called "Temp".
' It skip past columns not requiring translations since the values are numeric.
Dim str, strTranslated, TempStr As String
Dim ws As Worksheet
Dim i, ii As Integer
Dim col As Integer
Dim LastRow As Integer
Dim ie As Object
Dim Flag As Boolean
Set wb = ThisWorkbook
Set ws = wb.Sheets("Temp")
Set ie = CreateObject("InternetExplorer.Application")
rcount = 3 'Temp worksheet row counter
ii = 0 'array row counter
lrow = wb.Sheets("Translate_Sheet").UsedRange.rows.count 'worksheet requiring translation
LastRow = lrow - 2
With ie
For col = 1 To 36
Select Case col
Case 1, 5, 6, 9, 10, 11, 18, 21, 31, 32, 33, 34, 36 'Selected columns containing text to translate...other columns not need translation because of numeric values
'Translate columns with text values requiring translation
Do Until ii = LastRow '# of rows
i = col - 1
str = Data0(i, ii) '"Translate_Worksheet" worksheet data preloaded into Data0() array
If str = "" Then 'If blank row than do nothing and skip translation
ii = ii + 1 'Array row counter
Else 'then translate columns with text
skip:
.navigate2 "https://translate.google.com/#" & "fr" & "/" & "en" & "/" & str
While .busy Or .readystate < 4: DoEvents: Wend
If Flag = False Then
Application.Wait (Now + timevlue("0:00:01") / 1.3) 'wait 0.769 second for the first tanslation otherwise returns blank value
Flag = True
End If
strTranslated = ie.document.querySelector(".translation").innerText 'get translated text
'String comparison...if translated text has "..." at the end of string do again.
If strTranslated = CStr(TempStr & "...") Then
GoTo skip
End If
ws.Cells(rcount, col) = strTranslated 'load cell with translation
TempStr = ws.Cells(rcount, col) 'TempStr is temp string to hold value for comparison
rcount = rcount + 1
End If
Loop
ii = 0
rcount = 3
Case Else 'load Temp worksheet columns with numeric values not requiring translation
ii = 0
rcount = 3
Do Until ii = LastRow
str = Data0(i, ii) 'worksheet data
ws.Cells(rcount, col) = str
ii = ii + 1
rcount = rcount + 1
Loop
ii = 0
rcount = 3
End Select
Next col
End With
Set ie = Nothing
End Function
I am trying to parse a HTML table (it has no id but class name) from a website. However, since it has no id, I have difficulty in printing the contents from the table. But I couldn't figure it out.
Edited
Here you can see the image of Excel file. GTIP Numbers are located at column A. My aim is when the VBA code runs, these GTIP numbers from column A are forwarded to the Search Field named "GTİP Ara" in the website https://www.isib.gov.tr/urun-arama/. As a result, the companies having the selected GTIP will be returned to the columns next specified row.
For the third row, GTIP number "841013000000" has forwarded to the Search Field named GTIP Ara and as a result; Company 2, ... Company 9 are returned to neighbor columns.
Sometimes GTIP numbers return nothing since none of the companies are holding specified number.
For example: 841410819000 will return the companies but 841112101000 will return an error "Aradığınız Sonuç Bulunamadı!". That's why I am trying to add an if statement but it doesn't work properly.
Now, because of a mistake somewhere in my code block; the returned values are same for each GTIP, here you can see the result on second image.
Sub GrabLastNames()
Dim objIE As InternetExplorer
Dim ele As Object
Dim y As Integer
Set objIE = New InternetExplorer
objIE.Visible = True
objIE.navigate "https://www.isib.gov.tr/urun-arama"
Do While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Loop
y = 2
For i = 2 To 269
objIE.document.getElementById("gtip-ara").Value = _
Sheets("Sheet1").Range("A:A").Cells(i, 1).Value
objIE.document.getElementById("ara").Click
'If objIE.document.getElementsByClassName("error").getElementsByTagName("span").Value <> "Aradığınız Sonuç Bulunamadı!" Then
For Each ele In objIE.document.getElementsByClassName("urun-arama-table table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
Sheets("Sheet1").Cells(i, y).Value = ele.Children(0).textContent
y = y + 1
Next
y = 2
Next i
End Sub
I only had one working GTIP 841410819000, so cant test this more thoroughly.
You don't really need to know the ID, the page has one table, so getting it using getElemenetsByClassName like you did, or just getElementsByTagName like in my example should work fine. My code is probably the same as yours, just with a few pauses to a: not spam the website, and b: give IE a chance to get itself together (its IE after all).
If you can provide a few more working GTIP values I can test a bit more.
' requires reference 'Microft HTML Object Library'
Sub Main()
Dim Browser As New InternetExplorer
Browser.Visible = True
Browser.navigate "https://www.isib.gov.tr/urun-arama"
Do While Browser.Busy And Not Browser.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1")
Dim TableRows As IHTMLElementCollection
Dim TableRow As IHTMLElement
Dim SourceRow As Integer
Dim ResultColumn As Integer
Application.Wait (Now + TimeValue("0:00:05"))
SourceRow = 2 ' Skip Header
Do
Debug.Print "Trying " & Sheet.Cells(SourceRow, 1).Value
Browser.Document.getElementById("gtip-ara").Value = Sheet.Cells(SourceRow, 1).Value
Browser.Document.getElementById("ara").Click
Application.Wait (Now + TimeValue("0:00:02"))
Do While Browser.Busy
DoEvents
Loop
If Browser.Document.getElementsByTagName("table").Length > 0 Then
Debug.Print " > Found Results"
Set TableRows = Browser.Document.getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
ResultColumn = 2 ' dont overwrite search term
For Each TableRow In TableRows
Sheet.Cells(SourceRow, ResultColumn).Value = TableRow.innerText
ResultColumn = ResultColumn + 1
Next TableRow
Else
Debug.Print " - No Results Found"
End If
If Sheet.Cells(SourceRow + 1, 1).Value = "" Then
Exit Do
Else
SourceRow = SourceRow + 1
End If
Application.Wait (Now + TimeValue("0:00:05"))
Loop
Browser.Quit
Set Browser = Nothing
End Sub
Update
Updated my code again, it doesn't spawn lots of windows anymore and prints only the company name (as your example did).
' requires Microsoft HTML Object Library
' requires Microsoft XML, v6.0
Sub Main()
Dim XHR As XMLHTTP60
Dim Document As HTMLDocument
Dim ResultRows As IHTMLElementCollection
Dim ResultRow As IHTMLElement
Dim Sheet As Worksheet: Set Sheet = ThisWorkbook.Worksheets("Sheet1")
Dim SheetRow As Integer
Dim SheetColumn As Integer
Dim LastRow As Integer
LastRow = Sheet.Cells(Sheet.Rows.Count, "A").End(xlUp).Row
For SheetRow = 2 To LastRow
Debug.Print "Trying GTIP:" & Sheet.Cells(SheetRow, 1).Value
Application.StatusBar = "Status: " & Right(String(Len(CStr(LastRow - 1)), "0") & CStr(SheetRow - 1), Len(CStr(LastRow - 1))) & "/" & CStr(LastRow - 1)
Set XHR = New XMLHTTP60
XHR.Open "POST", "https://www.isib.gov.tr/urun-arama", False
XHR.setRequestHeader "content-type", "application/x-www-form-urlencoded"
XHR.send "gtipkategori=" & Sheet.Cells(SheetRow, 1).Value
Set Document = New HTMLDocument
Document.body.innerHTML = XHR.responseText
If Document.getElementsByTagName("table").Length > 0 Then
Debug.Print " > Found Results"
SheetColumn = 2 ' First Column to output data into
Set ResultRows = Document.getElementsByTagName("table")(0).getElementsByTagName("tbody")(0).getElementsByTagName("tr")
For Each ResultRow In ResultRows
Sheet.Cells(SheetRow, SheetColumn).Value = ResultRow.getElementsByTagName("td")(0).innerText ' 0 - company name
SheetColumn = SheetColumn + 1
Next
Else
Debug.Print " - No Results"
End If
Set XHR = Nothing
Set Document = Nothing
Application.Wait (Now + TimeValue("0:00:01")) ' slow down requests
Next
Application.StatusBar = "Complete"
End Sub
I have an Excel workbook of around 25,000 company keywords from which I'd like to get the company website URL.
I am looking to run a VBA script which can run these keywords as a Google search, and pull the URL of the first result into a spreadsheet.
I found a similar thread.
The results of this to be hit-and-miss; some keywords return the URL in the next column, others remain blank.
It also seemed to pull the URL of Google's optimised sub-links in the first search result rather than the main website URL: Google Search Result example
I then found the below code here which I ran on a sample list of 1,000 keywords. The author of this blog stipulates that this code works for Mozilla Firefox.
I tested IE code that he has also written but this did not achieve the same results (it was adding hyperlinks consisting of descriptive text from the search results rather than the raw URL).
The Firefox code worked until the 714th row, then returned a error message
"Run time error 91: object variable or with block variable not set"
Spreadsheet layout showing successful results and row at which macro stopped
Sub GoogleURL ()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object
Dim html As Object
Dim objResultDiv As Object
Dim objH As Object
lastRow = Range(“A” & Rows.Count).End(xlUp).Row
For i = 2 To lastRow
url = “https://www.google.co.uk/search?q=” & Cells(i, 1) & “&rnd=” & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject(“MSXML2.serverXMLHTTP”)
XMLHTTP.Open “GET”, url, False
XMLHTTP.setRequestHeader “Content-Type”, “text/xml”
XMLHTTP.setRequestHeader “User-Agent”, “Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0”
XMLHTTP.send
Set html = CreateObject(“htmlfile”)
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid(“rso”)
Set objH = objResultDiv.getelementsbytagname(“h3”)(0)
Cells(i, 2).Value = objH.innerText
Set html = CreateObject(“htmlfile”)
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid(“rso”)
Set objH = objResultDiv.getelementsbytagname(“cite”)(0)
Cells(i, 3).Value = objH.innerText
DoEvents
Next
End Sub
As Firefox is a third party browser for the support scope of Microsoft, I can help you to check the VBA code for the IE browser.
You said that the VBA code given in this link for the IE browser generates the description with the link and your requirement is to store description and link in a separate column.
I tried to modify that sample code as per your requirement.
Here is the modified code from that sample.
Option Explicit
Const TargetItemsQty = 1 ' results for each keyword
Sub GWebSearchIECtl()
Dim objSheet As Worksheet
Dim objIE As Object
Dim x As Long
Dim y As Long
Dim strSearch As String
Dim lngFound As Long
Dim st As String
Dim colGItems As Object
Dim varGItem As Variant
Dim strHLink As String
Dim strDescr As String
Dim strNextURL As String
Set objSheet = Sheets("Sheet1")
Set objIE = CreateObject("InternetExplorer.Application")
objIE.Visible = True ' for debug or captcha request cases
y = 1 ' start searching for the keyword in the first row
With objSheet
.Select
.Range(.Columns("B:B"), .Columns("B:B").End(xlToRight)).Delete ' clear previous results
.Range(.Columns("C:C"), .Columns("C:C").End(xlToRight)).Delete ' clear previous results
.Range("A1").Select
Do Until .Cells(y, 1) = ""
x = 2 ' start writing results from column B
.Cells(y, 1).Select
strSearch = .Cells(y, 1) ' current keyword
With objIE
lngFound = 0
.navigate "https://www.google.com/search?q=" & EncodeUriComponent(strSearch) ' go to first search results page
Do
Do While .Busy Or Not .READYSTATE = 4: DoEvents: Loop ' wait IE
Do Until .document.READYSTATE = "complete": DoEvents: Loop ' wait document
Do While TypeName(.document.getelementbyid("res")) = "Null": DoEvents: Loop ' wait [#res] element
Set colGItems = .document.getelementbyid("res").getElementsByClassName("g") ' collection of search result [.g] items
For Each varGItem In colGItems ' process each item in collection
If varGItem.getelementsbytagname("a").Length > 0 And varGItem.getElementsByClassName("st").Length > 0 Then ' must have hyperlink and description
strHLink = varGItem.getelementsbytagname("a")(0).href ' get first hyperlink [a] found in current item
strDescr = GetInnerText(varGItem.getElementsByClassName("st")(0).innerHTML) ' get first description [span.st] found in current item
lngFound = lngFound + 1
'Debug.Print (strHLink)
'Debug.Print (strDescr)
With objSheet ' put result into cell
.Cells(y, x).Value = strDescr
.Hyperlinks.Add .Cells(y, x + 1), strHLink
.Cells(y, x).WrapText = True
x = x + 1 ' next column
End With
If lngFound = TargetItemsQty Then Exit Do ' continue with next keyword - necessary quantity of the results for current keyword found
End If
DoEvents
Next
If TypeName(.document.getelementbyid("pnnext")) = "Null" Then Exit Do ' continue with next keyword - no [a#pnnext.pn] next page button exists
strNextURL = .document.getelementbyid("pnnext").href ' get next page url
.navigate strNextURL ' go to next search results page
Loop
End With
y = y + 1 ' next row
Loop
End With
objIE.Quit
' google web search page contains the elements:
' [div#res] - main search results block
' [div.g] - each result item block within [div#res]
' [a] - hyperlink ancor(s) within each [div.g]
' [span.st] - description(s) within each [div.g]
' [a#pnnext.pn] - hyperlink ancor to the next search results page
End Sub
Function EncodeUriComponent(strText As String) As String
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
End If
EncodeUriComponent = objHtmlfile.parentWindow.encode(strText)
End Function
Function GetInnerText(strText As String) As String
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.Open
objHtmlfile.Write "<body></body>"
End If
objHtmlfile.body.innerHTML = strText
GetInnerText = objHtmlfile.body.innerText
End Function
Output in IE 11 browser:
You can try to run it on your side to see the results with large amount of data.
If you meet with any performance issue then I suggest you try it with a smaller amount of data.