I have developed code to scrape data from a website, but since I know very little about JSON I couldn't get the output I want. The code was developed for this site: https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1. Now I am replicating my code for other websites that serve JSON, like this one: https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA ; but the code is not functioning properly. Here is my code (I want it to be generic for most websites):
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
Dim headers(), ws As Worksheet, iCol As Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
Set Http = New XMLHTTP60
Set Html = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For page = 1 To 8 'To cover all pages
With Http
.Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
.send
Html.body.innerHTML = .responseText
End With
Set iCol = New Collection
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, results()
ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
r = 0
For Each col In iCol
r = r + 1
With Http
.Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
srNo = r '<unsure where this is coming from.
mobile = json("infor")("0")("Mobile")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, tel, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
Set iCol = Nothing: Set json = Nothing
ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
Next
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
Please also let me know what mistakes I am making, so that I can take care of them in the future.
Short answer:
No.
I would go so far as to say it is impossible to write something generic for most websites. One could say the generic part is the parser itself, but you need some familiarity with the JSON structure of each endpoint to direct parsing appropriately. JSON itself has a defined structural syntax and components, but what you want from those will have different access paths and require different handling to get at. Then there are the arguments that may need to be supplied and differences in output format.
What is the best scenario?
If you have a set list of urls (ideally API endpoints) you have a better chance of writing something that might last for a while, as you can familiarise yourself with the JSON returned. But how generic is this? It's really just branched code, along the lines of the sketch below.
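For illustration only, here is a minimal sketch of what that branching tends to look like; ParseNgoDarpan and ParseYelpSearch are hypothetical placeholders for the endpoint-specific parsing you would still have to write:
Option Explicit
Public Sub ScrapeKnownEndpoints()
    Dim url As Variant
    For Each url In Array( _
        "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1", _
        "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA")
        Select Case True
            Case InStr(url, "ngodarpan.gov.in") > 0
                'ParseNgoDarpan url      'hypothetical routine for this endpoint's JSON
            Case InStr(url, "yelp.com") > 0
                'ParseYelpSearch url     'hypothetical routine for this page/endpoint
            Case Else
                Debug.Print "No parser written for: " & url
        End Select
    Next
End Sub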
Re-usable stuff:
What can be generalised is the non-parser stuff, e.g. methods and classes you create that parse out the paths for the entire structure, look for key words and return you those paths; helper functions that recursively write out nested structures (a sketch follows); code that makes the request and handles failures, and so on. I would definitely recommend looking into classes for re-usable code in web-scraping.
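As an example of such a helper, here is a rough sketch assuming the VBA-JSON (JsonConverter) library, which parses JSON objects into Scripting.Dictionary objects and JSON arrays into Collections. It recursively prints the access path of every value in a parsed response so you can see which paths are available:
Option Explicit
'Requires JsonConverter.bas and a reference to Microsoft Scripting Runtime
Public Sub PrintJsonPaths(ByVal node As Variant, Optional ByVal path As String = "json")
    Dim key As Variant, i As Long
    Select Case TypeName(node)
        Case "Dictionary"                          'JSON object {}
            For Each key In node.Keys
                PrintJsonPaths node(key), path & "(""" & key & """)"
            Next
        Case "Collection"                          'JSON array []
            For i = 1 To node.Count
                PrintJsonPaths node(i), path & "(" & i & ")"
            Next
        Case Else                                  'primitive value
            Debug.Print path & " = " & node
    End Select
End Sub
'Usage, after a request: Set json = JsonConverter.ParseJson(Http.responseText): PrintJsonPaths json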
Class based examples:
I will add to this as and when
https://stackoverflow.com/a/52301153/6241235
https://codereview.stackexchange.com/questions/69009/vba-clickbot-featuring-ajax-waiting-and-element-searching
I'm getting
run-time error 424
at line 68, which is
request.Open "GET", Url, False
and I don't know how to fix it.
My previous question is posted here:
How to scrape specific part of online english dictionary?
My final goal is to get result like this;
A B
beginning bɪˈɡɪnɪŋ
behalf bɪˈhæf
behave bɪˈheɪv
behaviour bɪˈheɪvjər
belong bɪˈlɔːŋ
below bɪˈloʊ
bird bɜːrd
biscuit ˈbɪskɪt
Here's the code I wrote; it's mostly based on someone else's code I found on the internet.
' Microsoft ActiveX Data Objects x.x Library
' Microsoft XML, v3.0
' Microsoft VBScript Regular Expressions
Sub ParseHelp()
' Word reference from
Dim Url As String
Url = "https://www.oxfordlearnersdictionaries.com/definition/english/" & Cells(ActiveCell.Row, "B").Value
' Get dictionary's html
Dim Html As String
Html = GetHtml(Url)
' Check error
If InStr(Html, "<TITLE>Not Found</Title>") > 0 Then
MsgBox "404"
Exit Sub
End If
' Extract phonetic alphabet from HTML
Dim wrapPattern As String
wrapPattern = "<span class='name' (.*?)</span>"
Set wrapCollection = FindRegexpMatch(Html, wrapPattern)
' MsgBox StripHtml(CStr(wrapCollection(1)))
' Fill phonetic alphabet into cell
If Not wrapCollection Is Nothing Then
Dim wrap As String
On Error Resume Next
wrap = StripHtml(CStr(wrapCollection(1)))
If Err.Number <> 0 Then
wrap = ""
End If
Cells(ActiveCell.Row, "C").Value = wrap
Else
MsgBox "not found"
End If
End Sub
Public Function StripHtml(Html As String) As String
Dim RegEx As New RegExp
Dim sOut As String
Html = Replace(Html, "</li>", vbNewLine)
Html = Replace(Html, " ", " ")
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = "<[^>]+>"
End With
sOut = RegEx.Replace(Html, "")
StripHtml = sOut
Set RegEx = Nothing
End Function
Public Function GetHtml(Url As String) As String
Dim xmlhttp As Object
Set xmlhttp = CreateObject("MSXML2.serverXMLHTTP")
Dim converter As New ADODB.stream
' Get
request.Open "GET", Url, False
request.send
' raw bytes
converter.Open
converter.Type = adTypeBinary
converter.Write request.responseBody
' read
converter.Position = 0
converter.Type = adTypeText
converter.Charset = "utf-8"
' close
GetHtml = converter.ReadText
converter.Close
End Function
Public Function FindRegexpMatch(txt As String, pat As String) As Collection
Set FindRegexpMatch = New Collection
Dim rx As New RegExp
Dim matcol As MatchCollection
Dim mat As Match
Dim ret As String
Dim delimiter As String
txt = Replace(txt, Chr(10), "")
txt = Replace(txt, Chr(13), "")
rx.Global = True
rx.IgnoreCase = True
rx.MultiLine = True
rx.Pattern = pat
Set matcol = rx.Execute(txt)
'MsgBox "Match:" & matcol.Count
On Error GoTo ErrorHandler
For Each mat In matcol
'FindRegexpMatch.Add mat.SubMatches(0)
FindRegexpMatch.Add mat.Value
Next mat
Set rx = Nothing
' Insert code that might generate an error here
Exit Function
ErrorHandler:
' Insert code to handle the error here
MsgBox "FindRegexpMatch. " & Err.GetException()
Resume Next
End Function
Any kind of help would be greatly appreciated.
The following is an example of how to read in values from column A and write out pronunciations to column B. It uses CSS selectors to match a child node, then steps up to the parentNode to ensure the entire pronunciation is grabbed. There are a number of ways you could have matched on the parent node to get the second pronunciation. Note that I use a parent node and Replace, as the pronunciation may span multiple childNodes.
If doing this for lots of lookups, please be a good netizen and put some waits in the code (see the short example after the code below) so as to not bombard the site with requests.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, urls()
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row 'you need at least two words in column A or change the redim.
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(urls))
Set html = New MSHTML.HTMLDocument
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", "https://www.oxfordlearnersdictionaries.com/definition/english/" & urls(i), False
.send
html.body.innerHTML = .responseText
data = Replace$(Replace$(html.querySelector(".name ~ .wrap").ParentNode.innerText, "/", vbNullString), Chr$(10), Chr$(32))
results(i) = Right$(data, Len(data) - 4)
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
Required references (VBE>Tools>References):
Microsoft HTML Object Library
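As mentioned above, if you are doing a lot of lookups, a polite pause between requests can be added at the end of each loop iteration; a minimal sketch:
    Application.Wait Now + TimeSerial(0, 0, 1) 'pause roughly one second between requests
    DoEvents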
Should you go down the API route then here is a small example. You can make 1,000 free calls in a month with the Prototype account. The next best plan, depending on how many calls you wish to make, looks like the 10,001-call one (that one extra PAYG call halves the price). The number of calls you need will be affected by whether a word is a head word or needs a lemmas lookup call first. The endpoint construction you need is GET /entries/{source_lang}/{word_id}?fields=pronunciations, though that doesn't seem to filter massively. You will need a JSON parser to handle the JSON returned, e.g. github.com/VBA-tools/VBA-JSON/blob/master/JsonConverter.bas. Download the raw code from there and add it to a standard module called JsonConverter, then go to VBE > Tools > References and add a reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, words()
'If not performing lemmas lookup then must be head word e.g. behave, behalf
Const appId As String = "yourAppId"
Const appKey As String = "yourAppKey"
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row
words = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(words))
Set html = New MSHTML.HTMLDocument
Dim json As Object
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(words) To UBound(words)
.Open "GET", "https://od-api.oxforddictionaries.com/api/v2/entries/en-us/" & LCase$(words(i)) & "?fields=pronunciations", False
.setRequestHeader "app_id", appId
.setRequestHeader "app_key", appKey
.setRequestHeader "ContentType", "application/json"
.send
Set json = JsonConverter.ParseJson(.responseText)
results(i) = IIf(json("results")(1)("type") = "headword", json("results")(1)("lexicalEntries")(1)("pronunciations")(2)("phoneticSpelling"), "lemmas lookup required")
Set json = Nothing
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
I need to get the dates and temperatures from a weather website and record them in cells, but I am getting an "object variable or With block variable not set" error.
I tried to get data from the web in Excel, but I think the website is protected or something, because I keep getting an "under maintenance" page when trying to load it from Excel. I got the code below from a tutorial but I can't make it work.
Sub record()
Dim request As Object
Dim response As String
Dim html As New HTMLDocument
Dim website As String
Dim temps As Variant
'provide link
'website = "https://finance.yahoo.com/quote/EURUSD=X?p=EURUSD=X"
website = "https://www.accuweather.com/en/us/chicago/60608/september-weather/348308"
'create the object that will make the webpage request
Set request = CreateObject("MSXML2.XMLHTTP")
'go to the link
request.Open "GET", website, False
'send request for webpage
request.send
'get web response data to variable
response = StrConv(request.responseBody, vbUnicode)
'put webpage to an html object
html.body.innerHTML = response
'get temperature from specified element
'temps = html.getElementsByClassName("Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)")(0).innerText
temps = html.getElementsByClassName("high")(0).innerText
Sheets("record").Range("A1") = temps
End Sub
Sample lines from the website:
<a class="monthly-daypanel is-past">
<div class="date">2</div>
<div class="icon-container"...</div>
<div class="temp">
<div class="high">83</div>
<div class="low">83</div>
</div>
</a>
I want to get the date, high and low.
You need a User-Agent header. I would also extract the JSON string from one of the script tags (I use regex for this) and use that as the source. I add in a date comparison to work out whether each entry is a forecast or an actual value. I read the JSON string into a JSON object using the JSON library below and loop the resultant collection, storing items of interest in an array for faster writing out to the sheet at the end.
json library:
I use jsonconverter.bas. Download the raw code from here and add it to a standard module called JsonConverter. You then need to go to VBE > Tools > References and add a reference to Microsoft Scripting Runtime.
Option Explicit
Public Sub GetWeatherListings()
Dim s As String, re As Object, ws As Worksheet
Set re = CreateObject("vbscript.regexp")
Set ws = ThisWorkbook.Worksheets("Sheet1")
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.accuweather.com/en/us/chicago/60608/september-weather/348308", False
' .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responsetext
End With
Dim results(), r As Long, jsonSource As String, json As Object, item As Object
jsonSource = GetString(re, s, "dailyForecast = (.*?\])")
If jsonSource = "No match" Then Exit Sub
Set json = JsonConverter.ParseJson(jsonSource)
ReDim results(1 To json.count, 1 To 4) 'date, datetime, day > dActual, night > dActual
Dim dateTime() As String, datePart As String, forecast As Boolean
For Each item In json
r = r + 1
dateTime = Split(item("dateTime"), "T")
datePart = dateTime(LBound(dateTime))
forecast = CDate(datePart) >= Date
results(r, 1) = datePart
results(r, 2) = item("dateTime")
results(r, 3) = IIf(forecast, item("day")("dTemp"), item("day")("dActual"))
results(r, 4) = IIf(forecast, item("night")("dTemp"), item("night")("dActual"))
Next
Dim headers()
headers = Array("Date", "DateTime", "Day temp", "Night temp")
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Function GetString(ByVal re As Object, ByVal inputString As String, ByVal pattern As String) As String
Dim matches As Object
With re
.Global = True
.MultiLine = True
.IgnoreCase = True
.pattern = pattern
If .Test(inputString) Then
Set matches = .Execute(inputString)
GetString = matches(0).SubMatches(0)
Exit Function
End If
End With
GetString = "No match"
End Function
Sample of end of output:
I want to import restaurant data like restaurant name, phone number, website and address to Excel but, unfortunately, I am getting sponsored results and also not getting the website and full address, as those are on the inner page that appears when you click the hotel name. With some help from various platforms I have created the code below, but it is not working. Please rectify the issue in my code. Website: https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=
Here is my code:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument, page&, I&
For page = 0 To 1 ' this is where you change the last number for the pages to traverse
With Http
.Open "GET", URL & page * 30, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult']")
For I = 0 To .Length - 1
Htmldoc.body.innerHTML = .Item(I).outerHTML
On Error Resume Next
r = r + 1: Cells(r, 1) = Htmldoc.querySelector("[class*='heading--h3'] > a").innerText
Cells(r, 2) = Htmldoc.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
' Cells(r, 3) = Htmldoc.querySelector("[class*='container'] > address").innerText
'Cells(r, 4) = Htmldoc.querySelector("[class*='container'] > address").NextSibling.innerText
'Inner loop creation
Cells(r, 5) = Htmldoc.querySelector("[class*='container'] > website").href ' Extract from window after clicking on hotel name
Cells(r, 6) = Htmldoc.querySelector("[class*='container'] > fulladdress").innerText ' Extract from window after clicking on hotel name
On Error GoTo 0
Next I
End With
Next page
End Sub
You can use the free API to get the top 50 from the business_search endpoint. Pass the sort_by parameter in the query string to get the top rated.
Use a json parser, such as jsonconverter.bas to handle the response. After installing the code from that link in a standard module called JsonConverter, go to VBE > Tools > References > Add a reference to Microsoft Scripting Runtime.
API instructions are here. You need to set up a test app, which requires some basic user info, and verify your email. You will then receive an API key for authentication which is passed in the authorization header as shown below.
There is other info returned which you can parse if wanted.
Option Explicit
Public Sub GetTopRestaurants()
Dim json As Object, headers(), r As Long, c As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://api.yelp.com/v3/businesses/search?term=restuarant&location=san-francisco&limit=50&sort_by=rating", False
.setRequestHeader "Authorization", "Bearer yourAPIkey"
.send
Set json = JsonConverter.ParseJson(.responseText)("businesses")
headers = Array("Restaurant name", "phone", "website", "address")
Dim results(), item As Object
ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
For Each item In json
r = r + 1
results(r, 1) = item("name")
results(r, 2) = item("phone")
results(r, 3) = item("url")
Dim subItem As Variant, address As String
address = vbNullString
For Each subItem In item("location")("display_address")
address = address & Chr$(32) & subItem
Next
results(r, 4) = Trim$(address)
Next
End With
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Example top 20 of 50 returned:
Caveat emptor
Note that specifying the sort_by is a suggestion (not strictly enforced) to Yelp's search, which considers multiple input parameters to return the most relevant results. For example, the rating sort is not strictly sorted by the rating value, but by an adjusted rating value that takes into account the number of ratings, similar to a Bayesian average. This is to prevent skewing results to businesses with a single review.
This is one of the ways which will let you parse the results from its inner pages. I can't access that webpage anymore to assist you further; however, give it a shot. I suppose it will work:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Const base$ = "https://www.yelp.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim oTitle$, oPhone As Object, Htmldoc As New HTMLDocument
Dim R&, newUrl$, I&, oWeb As Object, page&, oAddress As Object
[A1:D1] = [{"Name","Phone","Address","Website"}]
For page = 1 To 3 'this is where you change the last number for this script to traverse
With Http
.Open "GET", URL & page * 30, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
For I = 0 To .Length - 1
If Not InStr(.item(I).getAttribute("href"), "/adredir?") > 0 Then
oTitle = .item(I).innerText
newUrl = Replace(.item(I).getAttribute("href"), "about:", base)
With Http
.Open "GET", newUrl, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Htmldoc.body.innerHTML = .responseText
End With
R = R + 1: Cells(R + 1, 1) = oTitle
Set oPhone = Htmldoc.querySelector(".biz-phone")
If Not oPhone Is Nothing Then
Cells(R + 1, 2) = oPhone.innerText
End If
Set oAddress = Htmldoc.querySelector(".map-box-address")
If Not oAddress Is Nothing Then
Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
End If
Set oWeb = Htmldoc.querySelector(".biz-website > a")
If Not oWeb Is Nothing Then
Cells(R + 1, 4) = oWeb.innerText
End If
End If
Next I
End With
Next page
End Sub
Btw, the ads have been kicked out.
I want to scrape data from a local web site. The code collects the table on that page, but I also want to collect the data which is hyperlinked from the table's "Name of VO/NGO" field.
This is the main table; the other fields I want are on the page that appears when you click "Name of VO/NGO".
I read online material but couldn't correct the code.
The first output should appear like this, and the list should continue in the same way for each NGO:
Option Explicit
Public Sub GetInfo()
Const URL As String = "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1"
Dim html As HTMLDocument, hTable As HTMLTable, ws As Worksheet, headers()
headers = Array("Sr No.", "Name of VO/NGO", "Address", "City","State","Telephone","Mobile No.","Website","Email")
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.Send
html.body.innerHTML = .responseText
End With
Set hTable = html.querySelector("table.dvdtbl")
Dim td As Object, tr As Object, r As Long, c As Long
r = 1
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For Each tr In hTable.getElementsByTagName("tr")
r = r + 1: c = 1
If r > 3 Then
For Each td In tr.getElementsByTagName("td")
.Cells(r - 2, c) = IIf(c = 2, "'" & td.innerText, td.innerText)
c = c + 1
Next
End If
Next
End With
End Sub
There are several things to do to achieve the results you are after.
You need to parse the id number from each link to reuse it in POST requests.
You need to parse the csrf token from this link to be used in the POST requests.
Finally, you have to use a JSON converter or the script control to dig out individual fields from that JSON response.
My following attempt can fetch you the JSON response. All you need to do now is parse the JSON to meet your requirement (a sketch of that final step is shown after the sample output below):
Sub FetchTabularInfo()
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim col As Variant, icol As New Collection
Dim csrf As Variant, I&
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1", False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For I = 0 To .Length - 1
icol.Add Split(Split(.item(I).getAttribute("onclick"), "(""")(1), """)")(0)
Next I
End With
For Each col In icol
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
End With
Debug.Print Http.responseText
Next col
End Sub
Output of the first lead:
{"status":1,"infor":{"0":{"UniqueID":"AN\/2017\/0161456","Mobile":"9476076176","Email":"anaportblair#gmail.com","ngo_url":"http:\/\/www.adityanatyaacademy.com","ngo_name":"AdityaNatyaAcademy","pan_updDocId":"220156","reg_updDocId":"221361","Off_phone1":null,"Major_Activities1":".Drama\nJatrapala\nStreetplays\nAwareness Programe"},"issues_working_db":"","operational_states_db":"ANDAMAN & NICOBAR ISLANDS, ","operational_district_db":"ANDAMAN & NICOBAR ISLANDS->South Andaman, "},"member_info":[{"SalCode":null,"FName":"ASHUTOSH KARMAKAR","MName":null,"LName":null,"DesigName":"President","EmailId":"nicorajberg#gmail.com","MobileNo":"9434262953","pan_updDocId":"223392","aadhaar_updDocId":"223393"},{"SalCode":null,"FName":"KAVERI DEBSHARMA","MName":null,"LName":null,"DesigName":"Member","EmailId":"rajeshdebsharma#gmail.com","MobileNo":"9474299901","pan_updDocId":"223400","aadhaar_updDocId":"223401"},{"SalCode":null,"FName":"SATYAJIT BAIN","MName":null,"LName":null,"DesigName":"Asisstant Secretary","EmailId"
:"anaportblair#gmail.com","MobileNo":"9434271746","pan_updDocId":"223408","aadhaar_updDocId":"223409"}],"registeration_info":[{"nr_orgName":"AdityaNatyaAcademy","nr_add":"31 M.G. Road,\nOpp. Sun Sea Resort,\nMiddle Point.","nr_city":"Port Blair","StateName":"ANDAMAN & NICOBAR ISLANDS","reg_name":"Registrar of Companies","TypeDescription":"Registered Societies (Non-Government)","nr_regNo":"888","nr_updDocId":"0","nr_actName":"Society Registration Act 1860","nr_isFcra":"N","fcrano":"","ngo_reg_date":"05-12-1995"}],"source_info":[{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To Promote Art and Culture in Andaman and Nicobar Islands.","datefrom":"2013-04-01","dateto":"2014-03-31","amount_sanctioned":"25000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Culture","datefrom":"2014-04-01","dateto":"2015-03-31","amount_sanctioned":"25000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Cult
ure","datefrom":"2015-04-01","dateto":"2016-03-31","amount_sanctioned":"35000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Culture","datefrom":"2016-04-01","dateto":"2017-03-31","amount_sanctioned":"25000"}]}
References to add to run the above script:
Microsoft HTML Object Library
Microsoft XML, v6.0
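To dig out individual fields rather than printing the raw response, a rough sketch of that final parsing step, assuming JsonConverter plus a reference to Microsoft Scripting Runtime and the structure shown in the sample output above, is to replace the Debug.Print line with something like:
Dim json As Object
Set json = JsonConverter.ParseJson(Http.responseText)
'"registeration_info" is a collection (1-based); "infor" is an object keyed by "0" in the sample above
Debug.Print json("registeration_info")(1)("nr_orgName")
Debug.Print json("registeration_info")(1)("nr_add")
Debug.Print json("infor")("0")("Mobile")
Debug.Print json("infor")("0")("Email")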
I have a cell that contains something I would like searched in Google News, and I want the code to return the number of results for that search. Currently I have the code below, which I found elsewhere on the site and which does not use Google News, but even then I sometimes get a
runtime error -2147024891 (80070005)
after 70 or so searches, and I can't run it again.
Sub HawkishSearch()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object
Dim start_time As Date
Dim end_time As Date
lastRow = Range("B" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.co.in/search?q=" & Cells(i, 2) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.XMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
If html.getElementById("resultStats") Is Nothing Then
str_text = "0 Results"
Else
str_text = html.getElementById("resultStats").innerText
End If
Cells(i, 3) = str_text
DoEvents
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
Best option (IMO) is to use the Google News API and register for an API key. You can then use a queryString including your search term and parse the JSON response to get the result count. I do that below and also populate a collection with the article titles and links. I use a JSON parser called JSONConverter.bas which you download and add to your project. You can then go to VBE > Tools > References > add a reference to Microsoft Scripting Runtime.
Sample JSON response from API:
The {} denotes a dictionary which you access by key, the [] denotes a collection which you access by index or by For Each loop over.
I use the key totalResults to retrieve the total results count from the initial dictionary returned by the API.
I then loop the collection of dictionaries (articles) and pull the story titles and URLs.
You can then inspect the results in the Locals window or print them out.
Sample of results in locals window:
Option Explicit
Public Sub GetStories()
Dim articles As Collection, article As Object
Dim searchTerm As String, finalResults As Collection, json As Object, arr(0 To 1)
Set finalResults = New Collection
searchTerm = "Obama"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://newsapi.org/v2/everything?q=" & searchTerm & "&apiKey=yourAPIkey", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
Set json = JsonConverter.ParseJson(.responseText)
End With
Debug.Print "total results = " & json("totalResults")
Set articles = json("articles")
For Each article In articles
arr(0) = article("title")
arr(1) = article("url")
finalResults.Add arr
Next
Stop '<== Delete me later
End Sub
Loop:
If deploying in a loop, you can use a class, clsHTTP, to hold the XMLHTTP object. This is more efficient than repeatedly creating and destroying it. I supply this class with a GetString method to retrieve the JSON response from the API, and a GetInfo method to parse the JSON and retrieve the results count and the API result URLs and titles.
Example of results structure in locals window:
Class clsHTTP:
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal url As String) As String
With http
.Open "GET", url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
GetString = .responseText
End With
End Function
Public Function GetInfo(ByVal json As Object) As Variant
Dim results(), counter As Long, finalResults(0 To 1), articles As Object, article As Object
finalResults(0) = json("totalResults")
Set articles = json("articles")
ReDim results(1 To articles.Count, 1 To 2)
For Each article In articles
counter = counter + 1
results(counter, 1) = article("title")
results(counter, 2) = article("url")
Next
finalResults(1) = results
GetInfo = finalResults
End Function
Standard module:
Option Explicit
Public Sub GetStories()
Dim http As clsHTTP, json As Object
Dim finalResults(), searchTerms(), searchTerm As Long, url As String
Set http = New clsHTTP
With ThisWorkbook.Worksheets("Sheet1")
searchTerms = Application.Transpose(.Range("A1:A2")) '<== Change to appropriate range containing search terms
End With
ReDim finalResults(1 To UBound(searchTerms))
For searchTerm = LBound(searchTerms, 1) To UBound(searchTerms, 1)
url = "https://newsapi.org/v2/everything?q=" & searchTerms(searchTerm) & "&apiKey=yourAPIkey"
Set json = JsonConverter.ParseJson(http.GetString(url))
finalResults(searchTerm) = http.GetInfo(json)
Set json = Nothing
Next
Stop '<==Delete me later
End Sub
Otherwise:
I would use the following, where I grab story links by their class name, get the count, and write the links to a collection.
Option Explicit
Public Sub GetStories()
Dim sResponse As String, html As HTMLDocument, articles As Collection
Const BASE_URL As String = "https://news.google.com/"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://news.google.com/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US:en", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument: Set articles = New Collection
Dim numberOfStories As Long, nodeList As Object, i As Long
With html
.body.innerHTML = sResponse
Set nodeList = .querySelectorAll(".VDXfz")
numberOfStories = nodeList.Length
Debug.Print "number of stories = " & numberOfStories
For i = 0 To nodeList.Length - 1
articles.Add Replace$(Replace$(nodeList.item(i).href, "./", BASE_URL), "about:", vbNullString)
Next
End With
Debug.Print articles.Count
End Sub
Standard Google search:
The following works for an example standard Google search, but you will not always get the same HTML structure, depending on your search term. You will need to provide some failing cases to help me determine whether there is a consistent selector method that can be applied.
Option Explicit
Public Sub GetResultsCount()
Dim sResponse As String, html As HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.google.com/search?q=mitsubishi", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Debug.Print .querySelector("#resultStats").innerText
End With
End Sub