Web scraping a hyperlinked page - html

I want to scrape data from a local web site. The code collects the table in that Page but I want to also collect the data which is hyperlinked from table "Name of VO/NGO" field.
This is the main table. Other fields I want are from the page that appears when you click "Name of VO/NGO".
I read online material but couldn't correct the code.
First output should appear like this and so on the list should be made of each NGO:
Option Explicit
Public Sub GetInfo()
Const URL As String = "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1"
Dim html As HTMLDocument, hTable As HTMLTable, ws As Worksheet, headers()
headers = Array("Sr No.", "Name of VO/NGO", "Address", "City","State","Telephone","Mobile No.","Website","Email")
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.Send
html.body.innerHTML = .responseText
End With
Set hTable = html.querySelector("table.dvdtbl")
Dim td As Object, tr As Object, r As Long, c As Long
r = 1
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For Each tr In hTable.getElementsByTagName("tr")
r = r + 1: c = 1
If r > 3 Then
For Each td In tr.getElementsByTagName("td")
.Cells(r - 2, c) = IIf(c = 2, "'" & td.innerText, td.innerText)
c = c + 1
Next
End If
Next
End With
End Sub

There are several things to do to achieve the results you are after.
You need to parse the id number from each link to reuse it in post requests.
You need to parse csrf token from this link to be used in post requests
finally, you have to use any json converter or script control to dig out individual fields from that json response.
My following attempt can fetch you the json response. All you need to do now is parse the json to meet your requirement:
Sub FetchTabularInfo()
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim col As Variant, icol As New Collection
Dim csrf As Variant, I&
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1", False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For I = 0 To .Length - 1
icol.Add Split(Split(.item(I).getAttribute("onclick"), "(""")(1), """)")(0)
Next I
End With
For Each col In icol
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
End With
Debug.Print Http.responseText
Next col
End Sub
Output of the first lead:
{"status":1,"infor":{"0":{"UniqueID":"AN\/2017\/0161456","Mobile":"9476076176","Email":"anaportblair#gmail.com","ngo_url":"http:\/\/www.adityanatyaacademy.com","ngo_name":"AdityaNatyaAcademy","pan_updDocId":"220156","reg_updDocId":"221361","Off_phone1":null,"Major_Activities1":".Drama\nJatrapala\nStreetplays\nAwareness Programe"},"issues_working_db":"","operational_states_db":"ANDAMAN & NICOBAR ISLANDS, ","operational_district_db":"ANDAMAN & NICOBAR ISLANDS->South Andaman, "},"member_info":[{"SalCode":null,"FName":"ASHUTOSH KARMAKAR","MName":null,"LName":null,"DesigName":"President","EmailId":"nicorajberg#gmail.com","MobileNo":"9434262953","pan_updDocId":"223392","aadhaar_updDocId":"223393"},{"SalCode":null,"FName":"KAVERI DEBSHARMA","MName":null,"LName":null,"DesigName":"Member","EmailId":"rajeshdebsharma#gmail.com","MobileNo":"9474299901","pan_updDocId":"223400","aadhaar_updDocId":"223401"},{"SalCode":null,"FName":"SATYAJIT BAIN","MName":null,"LName":null,"DesigName":"Asisstant Secretary","EmailId"
:"anaportblair#gmail.com","MobileNo":"9434271746","pan_updDocId":"223408","aadhaar_updDocId":"223409"}],"registeration_info":[{"nr_orgName":"AdityaNatyaAcademy","nr_add":"31 M.G. Road,\nOpp. Sun Sea Resort,\nMiddle Point.","nr_city":"Port Blair","StateName":"ANDAMAN & NICOBAR ISLANDS","reg_name":"Registrar of Companies","TypeDescription":"Registered Societies (Non-Government)","nr_regNo":"888","nr_updDocId":"0","nr_actName":"Society Registration Act 1860","nr_isFcra":"N","fcrano":"","ngo_reg_date":"05-12-1995"}],"source_info":[{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To Promote Art and Culture in Andaman and Nicobar Islands.","datefrom":"2013-04-01","dateto":"2014-03-31","amount_sanctioned":"25000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Culture","datefrom":"2014-04-01","dateto":"2015-03-31","amount_sanctioned":"25000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Cult
ure","datefrom":"2015-04-01","dateto":"2016-03-31","amount_sanctioned":"35000"},{"sourcefund":"S","deptt_name":"Directorate of Art and Culture","purpose":"To promote Art and Culture","datefrom":"2016-04-01","dateto":"2017-03-31","amount_sanctioned":"25000"}]}
Reference to add to execute the above script:
Microsoft Html Object Library
Microsoft xml, v6.0

Related

Inner Loop design for webscraping

I want to import restaurant data like Restaurant name, phone number, website & address to excel but unfortunately, I am getting sponsored results & also not getting website & full address as it in on the inner page when we click on the hotel name. I with some help at platforms have created a code using but it is not helping out. Please rectify the issue in my code. Website:https://www.yelp.com/searchcflt=restaurants&find_loc=San%20Francisco%2C%20CA&start=
Here is my code:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Dim Http As New XMLHTTP60, Html As New HTMLDocument, Htmldoc As New HTMLDocument, page&, I&
For page = 0 To 1 ' this is where you change the last number for the pages to traverse
With Http
.Open "GET", URL & page * 30, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult']")
For I = 0 To .Length - 1
Htmldoc.body.innerHTML = .Item(I).outerHTML
On Error Resume Next
r = r + 1: Cells(r, 1) = Htmldoc.querySelector("[class*='heading--h3'] > a").innerText
Cells(r, 2) = Htmldoc.querySelector("[class*='container'] > [class*='display--inline-block']").innerText
' Cells(r, 3) = Htmldoc.querySelector("[class*='container'] > address").innerText
'Cells(r, 4) = Htmldoc.querySelector("[class*='container'] > address").NextSibling.innerText
'Inner loop creation
Cells(r, 5) = Htmldoc.querySelector("[class*='container'] > website").href ' Extract from window after clicking on hotel name
Cells(r, 6) = Htmldoc.querySelector("[class*='container'] > fulladdress").innerText ' Extract from window after clicking on hotel name
On Error GoTo 0
Next I
End With
Next page
End Sub
You can use the free API to get the top 50 from the business_search endpoint. Pass sort parameter in query string to get top rated.
Use a json parser, such as jsonconverter.bas to handle the response. After installing the code from that link in a standard module called JsonConverter, go to VBE > Tools > References > Add a reference to Microsoft Scripting Runtime.
API instructions are here. You need to set up a test app, which requires some basic user info, and verify your email. You will then receive an API key for authentication which is passed in the authorization header as shown below.
There is other info returned which you can parse if wanted.
Option Explicit
Public Sub GetTopRestuarants()
Dim json As Object, headers(), r As Long, c As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://api.yelp.com/v3/businesses/search?term=restuarant&location=san-francisco&limit=50&sort_by=rating", False
.setRequestHeader "Authorization", "Bearer yourAPIkey"
.send
Set json = JsonConverter.ParseJson(.responseText)("businesses")
headers = Array("Restaurant name", "phone", "website", "address")
Dim results(), item As Object
ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
For Each item In json
r = r + 1
results(r, 1) = item("name")
results(r, 2) = item("phone")
results(r, 3) = item("url")
Dim subItem As Variant, address As String
address = vbNullString
For Each subItem In item("location")("display_address")
address = address & Chr$(32) & subItem
Next
results(r, 4) = Trim$(address)
Next
End With
With ActiveSheet
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Example top 20 of 50 returned:
Caveat emptor
Note that specifying the sort_by is a suggestion (not strictly enforced) to Yelp's search, which considers multiple input parameters to return the most relevant results. For example, the rating sort is not strictly sorted by the rating value, but by an adjusted rating value that takes into account the number of ratings, similar to a Bayesian average. This is to prevent skewing results to businesses with a single review.
This is one of the ways which will let you parse the results from it's inner pages. I can't access that webpage anymore to assist you further. However, give it a shot. I suppose it will work:
Sub GetInfo()
Const URL$ = "https://www.yelp.com/search?cflt=restaurants&find_loc=San%20Francisco%2C%20CA&start="
Const base$ = "https://www.yelp.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim oTitle$, oPhone As Object, Htmldoc As New HTMLDocument
Dim R&, newUrl$, I&, oWeb As Object, page&, oAddress As Object
[A1:D1] = [{"Name","Phone","Address","Website"}]
For page = 1 To 3 'this is where you change the last number for this script to traverse
With Http
.Open "GET", URL & page * 30, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("[class*='searchResult'] [class*='heading--h3'] > a")
For I = 0 To .Length - 1
If Not InStr(.item(I).getAttribute("href"), "/adredir?") > 0 Then
oTitle = .item(I).innerText
newUrl = Replace(.item(I).getAttribute("href"), "about:", base)
With Http
.Open "GET", newUrl, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
Htmldoc.body.innerHTML = .responseText
End With
R = R + 1: Cells(R + 1, 1) = oTitle
Set oPhone = Htmldoc.querySelector(".biz-phone")
If Not oPhone Is Nothing Then
Cells(R + 1, 2) = oPhone.innerText
End If
Set oAddress = Htmldoc.querySelector(".map-box-address")
If Not oAddress Is Nothing Then
Cells(R + 1, 3) = WorksheetFunction.Clean(oAddress.innerText)
End If
Set oWeb = Htmldoc.querySelector(".biz-website > a")
If Not oWeb Is Nothing Then
Cells(R + 1, 4) = oWeb.innerText
End If
End If
Next I
End With
Next page
End Sub
Btw, the ads have been kicked out.

Import Data in excel using JSON

I have developed a code to scrape data from a website but since I know very little about JSON I could be able to get the output as required shown in below snap:
However, I am getting all data from the web in the immediate window but want to organize these fields just like an above snap.
Here is my code:
Sub FetchTabularInfo()
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim col As Variant, icol As New Collection
Dim csrf As Variant, I&
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1", False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For I = 0 To .Length - 1
icol.Add Split(Split(.Item(I).getAttribute("onclick"), "(""")(1), """)")(0)
Next I
End With
For Each col In icol
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
End With
Debug.Print Http.responseText
Next col
End Sub
The output in the immediate window is:
The following shows you how to use a json parser. I use jsonconverter.bas. After copying the code from there into a standard module called JsonConverter, you need to go VBE>Tools>References>Add reference to Microsoft Scripting Runtime.
In the json response the {} are dictionaries accessed by key; the [] are collections accessed by index (or For Each over)
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim col As Variant, icol As New Collection
Dim csrf As Variant, i&
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1", False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
icol.Add Split(Split(.item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, headers(), results(), ws As Worksheet
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name of VGO/NGO", "Address", "City", "State", "Tel", "Mobile", "Web", "Email")
ReDim results(1 To icol.Count, 1 To UBound(headers) + 1)
For Each col In icol
r = r + 1
With Http
.Open "GET", "https://ngodarpan.gov.in/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://ngodarpan.gov.in/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
city = json("registeration_info")(1)("nr_city")
srNo = r '<unsure where this is coming from.
state = Replace$(json("registeration_info")(1)("StateName"), "amp;", vbNullString)
tel = IIf(IsNull(json("infor")("0")("Off_phone1")), vbNullString, json("infor")("0")("Off_phone1")) '<unsure where this is coming from. Need a csrf to test with
mobile = json("infor")("0")("Mobile")
website = json("infor")("0")("ngo_url")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, city, state, tel, mobile, website, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub

Import Data using JSON

I have developed a code to scrape data from a websites but since I know very little about JSON I could be able to get the output as I want, the code was developed for this web: https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1 now I am replicating my code for other websites having json like this web :https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA ; but this code is not functioning properly. Here is my code(I want it to be generic for most webs)
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
Dim headers(), ws As Worksheet, iCol As Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
Set Http = New XMLHTTP60
Set Html = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For page = 1 To 8 'To cover all pages
With Http
.Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
.send
Html.body.innerHTML = .responseText
End With
Set iCol = New Collection
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, results()
ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
r = 0
For Each col In iCol
r = r + 1
With Http
.Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
srNo = r '<unsure where this is coming from.
mobile = json("infor")("0")("Mobile")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, tel, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
Set iCol = Nothing: Set json = Nothing
ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
Next
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
Please also let me know mistakes I am doing, so that i will take care of those in future.
Short answer:
No.
I would go so far as to say it is impossible you can write something generic for most webs . One could say the generic part is the parser itself. But you need to have some familiarity with the json structure of each endpoint to appropriately direct parsing. Json itself has defined structural syntax/components, but what you want from those will have different access paths and require different handling to do so. Then there are the arguments that may need to be supplied and differences in output format.
What is the best scenario?
If you have a set list of urls (ideally API endpoints) you have a better chance of writing something that might last for a while as you can familiarise yourself with the json returned. But how generic is this? It's really just branched code.
Re-usable stuff:
Might be the non parser stuff which can be generalised e.g. methods and classes you create that parse out the paths for the entire structure and look for key words and return you those paths? Helper functions you write that might recursively write out nested structures etc. Code that makes the request and handles fails etc.... I would definitely recommend looking into classes for re-usable code in web-scraping.
Class based examples:
I will add to this as and when
https://stackoverflow.com/a/52301153/6241235
https://codereview.stackexchange.com/questions/69009/vba-clickbot-featuring-ajax-waiting-and-element-searching

Excel VBA Scraping- HTML tables are not visible

I am trying to get data from "https://in.tradingview.com/symbols/NSE-ABB/technicals/" using excel vba website scraping, eventhough I am geting response, but the body.innerHTML is not showing required table, but in chrome if I inspect the page, I am able to see the table with the name.
What is wrong with the code?
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
WriteTxtFile sResponse
With html
.body.innerHTML = sResponse
Set tElementC = .getElementsByClassName("table-1i1M26QY- maTable-27Z4Dq6Y- tableWithAction-2OCRQQ8y-")(0).getElementsByTagName("td")
End With
URL --> https://in.tradingview.com/symbols/NSE-ABB/technicals/
classname to access = "table-1i1M26QY- maTable-27Z4Dq6Y- tableWithAction-2OCRQQ8y-"
The webpage source HTML by the link provided https://in.tradingview.com/symbols/NSE-ABB/technicals/ doesn't contain the necessary data, it uses AJAX. The website has a sorta API available. The response is returned in JSON format. So you need to make some reverse engineering work first to find out how does the website works. In a browser, e. g. Chrome, press F12 to open DevTools, navigate to the webpage, go to Network tab, set the filter to XHR, it will look like as shown below:
Examine logged responses. One of them having the largest size actually contains all the necessary data:
To make such XHR you need to keep the entire payload structure also, and add the relevant headers:
In Form Data section there are a lot of quote field titles that located within the array, so you may choose which actually you need. You may find more available titles, click on Initiator link (first screenshot above), you will see JS code which initiated that XHR. Click Pretty print {} at the bottom to make the code readable. Type any title you already pulled out from Form Data in the search box, e. g. Recommend.Other, and find others next to it in the code:
Here is VBA example showing how such scraping could be done. Import JSON.bas module into the VBA project for JSON processing.
Option Explicit
Sub Test()
Dim aQuoteFieldTitles()
Dim aQuoteFieldData()
Dim sPayload As String
Dim sJSONString As String
Dim vJSON
Dim sState As String
Dim i As Long
' Put the necessary field titles into array
aQuoteFieldTitles = Array( _
"name", "description", "country", "type", "after_tax_margin", "average_volume", "average_volume_30d_calc", "average_volume_60d_calc", "average_volume_90d_calc", "basic_eps_net_income", "beta_1_year", "beta_3_year", "beta_5_year", "current_ratio", "debt_to_assets", "debt_to_equity", "dividends_paid", "dividends_per_share_fq", _
"dividends_yield", "dps_common_stock_prim_issue_fy", "earnings_per_share_basic_ttm", "earnings_per_share_diluted_ttm", "earnings_per_share_forecast_next_fq", "earnings_per_share_fq", "earnings_release_date", "earnings_release_next_date", "ebitda", "enterprise_value_ebitda_ttm", "enterprise_value_fq", "exchange", "expected_annual_dividends", _
"gross_margin", "gross_profit", "gross_profit_fq", "industry", "last_annual_eps", "last_annual_revenue", "long_term_capital", "market_cap_basic", "market_cap_calc", "net_debt", "net_income", "number_of_employees", "number_of_shareholders", "operating_margin", _
"pre_tax_margin", "preferred_dividends", "price_52_week_high", "price_52_week_low", "price_book_ratio", "price_earnings_ttm", "price_revenue_ttm", "price_sales_ratio", "quick_ratio", "return_of_invested_capital_percent_ttm", "return_on_assets", "return_on_equity", "return_on_invested_capital", "revenue_per_employee", "sector", _
"eps_surprise_fq", "eps_surprise_percent_fq", "total_assets", "total_capital", "total_current_assets", "total_debt", "total_revenue", "total_shares_outstanding_fundamental", "volume", "relative_volume", "pre_change", "post_change", "close", "open", "high", "low", "gap", "price_earnings_to_growth_ttm", "price_sales", "price_book_fq", _
"price_free_cash_flow_ttm", "float_shares_outstanding", "total_shares_outstanding", "change_from_open", "change_from_open_abs", "Perf.W", "Perf.1M", "Perf.3M", "Perf.6M", "Perf.Y", "Perf.YTD", "Volatility.W", "Volatility.M", "Volatility.D", "RSI", "RSI7", "ADX", "ADX+DI", "ADX-DI", "ATR", "Mom", "High.All", "Low.All", "High.6M", "Low.6M", _
"High.3M", "Low.3M", "High.1M", "Low.1M", "EMA5", "EMA10", "EMA20", "EMA30", "EMA50", "EMA100", "EMA200", "SMA5", "SMA10", "SMA20", "SMA30", "SMA50", "SMA100", "SMA200", "Stoch.K", "Stoch.D", "MACD.macd", "MACD.signal", "Aroon.Up", "Aroon.Down", "BB.upper", "BB.lower", "goodwill", "debt_to_equity_fq", "CCI20", "DonchCh20.Upper", _
"DonchCh20.Lower", "HullMA9", "AO", "Pivot.M.Classic.S3", "Pivot.M.Classic.S2", "Pivot.M.Classic.S1", "Pivot.M.Classic.Middle", "Pivot.M.Classic.R1", "Pivot.M.Classic.R2", "Pivot.M.Classic.R3", "Pivot.M.Fibonacci.S3", "Pivot.M.Fibonacci.S2", "Pivot.M.Fibonacci.S1", "Pivot.M.Fibonacci.Middle", "Pivot.M.Fibonacci.R1", _
"Pivot.M.Fibonacci.R2", "Pivot.M.Fibonacci.R3", "Pivot.M.Camarilla.S3", "Pivot.M.Camarilla.S2", "Pivot.M.Camarilla.S1", "Pivot.M.Camarilla.Middle", "Pivot.M.Camarilla.R1", "Pivot.M.Camarilla.R2", "Pivot.M.Camarilla.R3", "Pivot.M.Woodie.S3", "Pivot.M.Woodie.S2", "Pivot.M.Woodie.S1", "Pivot.M.Woodie.Middle", "Pivot.M.Woodie.R1", _
"Pivot.M.Woodie.R2", "Pivot.M.Woodie.R3", "Pivot.M.Demark.S1", "Pivot.M.Demark.Middle", "Pivot.M.Demark.R1", "KltChnl.upper", "KltChnl.lower", "P.SAR", "Value.Traded", "MoneyFlow", "ChaikinMoneyFlow", "Recommend.All", "Recommend.MA", "Recommend.Other", "Stoch.RSI.K", "Stoch.RSI.D", "W.R", "ROC", "BBPower", "UO", "Ichimoku.CLine", _
"Ichimoku.BLine", "Ichimoku.Lead1", "Ichimoku.Lead2", "VWMA", "ADR", "RSI[1]", "Stoch.K[1]", "Stoch.D[1]", "CCI20[1]", "ADX-DI[1]", "AO[1]", "Mom[1]", "Rec.Stoch.RSI", "Rec.WR", "Rec.BBPower", "Rec.UO", "Rec.Ichimoku", "Rec.VWMA", "Rec.HullMA9" _
)
' Field titles exactly as in the table MOVING AVERAGES
' aQuoteFieldTitles = Array("EMA5", "SMA5", "EMA10", "SMA10", "EMA20", "SMA20", "EMA30", "SMA30", "EMA50", "SMA50", "EMA100", "SMA100", "EMA200", "SMA200", "Ichimoku.BLine", "VWMA", "HullMA9")
' Compose payload
sPayload = "{""symbols"":{""tickers"":[""NSE:ABB""],""query"":{""types"":[]}},""columns"":" & JSON.Serialize(aQuoteFieldTitles) & "}"
' Retrieve JSON response
With CreateObject("MSXML2.XMLHTTP")
.Open "POST", "https://scanner.tradingview.com/india/scan", True
.setRequestHeader "content-type", "application/x-www-form-urlencoded"
.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
.setRequestHeader "content-length", Len(sPayload)
.send (sPayload)
Do Until .readyState = 4: DoEvents: Loop
sJSONString = .responseText
End With
' Parse JSON response
JSON.Parse sJSONString, vJSON, sState
' Check response validity
Select Case True
Case sState <> "Object"
MsgBox "Invalid JSON response"
Case IsNull(vJSON("data"))
MsgBox vJSON("error")
Case Else
' Output data to worksheet #1
aQuoteFieldData = vJSON("data")(0)("d")
With ThisWorkbook.Sheets(1)
.Cells.Delete
.Cells.WrapText = False
For i = 0 To UBound(aQuoteFieldTitles)
.Cells(i + 1, 1).Value = aQuoteFieldTitles(i)
.Cells(i + 1, 2).Value = aQuoteFieldData(i)
Next
.Columns.AutoFit
End With
MsgBox "Completed"
End Select
End Sub
The output for me is as follows:
BTW, the similar approach applied in other answers.
As mentioned in comments, javascript has to run on the page to update the required content. There doesn't appear to be an API freely available. You can use a browser. You need to go VBE > Tools > References > add a reference to Microsoft Internet Controls.
Option Explicit
Public Sub GetInfo()
Dim IE As InternetExplorer, ws As Worksheet, hTable As Object, tRow As Object, td As Object, r As Long, c As Long, headers()
headers = Array("name", "value", "action")
Set ws = ThisWorkbook.Worksheets("Sheet1"): Set IE = New InternetExplorer
With IE
.Visible = True
.Navigate2 "https://in.tradingview.com/symbols/NSE-ABB/technicals/"
While .Busy Or .readyState < 4: DoEvents: Wend
Set hTable = IE.document.querySelector("table + .tableWithAction-2OCRQQ8y-")
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For Each tRow In hTable.getElementsByTagName("tr")
r = r + 1: c = 1
For Each td In tRow.getElementsByTagName("td")
ws.Cells(r, c).Value = td.innerText
c = c + 1
Next td
Next tRow
.Quit
End With
End Sub

VBA - Number of Google News Search Results

I have a cell that contains something I would like searched in google news. I want the code to return the number of results for that search. Currently I have this code which I found elsewhere on the site and does not use google news but even then I sometimes get a
runtime error -2147024891 (80070005)
after 70 or so searched and I can't run again.
Sub HawkishSearch()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object
Dim start_time As Date
Dim end_time As Date
lastRow = Range("B" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.co.in/search?q=" & Cells(i, 2) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.XMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
If html.getElementById("resultStats") Is Nothing Then
str_text = "0 Results"
Else
str_text = html.getElementById("resultStats").innerText
End If
Cells(i, 3) = str_text
DoEvents
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
Best option (IMO) is to use the Google News API and register for an API key. You can then use a queryString including your search term and parse the JSON response to get the result count. I do that below and also populate a collection with the article titles and links. I use a JSON parser called JSONConverter.bas which you download and add to your project. You can then go to VBE > Tools > References > add a reference to Microsoft Scripting Runtime.
Sample JSON response from API:
The {} denotes a dictionary which you access by key, the [] denotes a collection which you access by index or by For Each loop over.
I use the key totalResults to retrieve the total results count from the initial dictionary returned by the API.
I then loop the collection of dictionaries (articles) and pull the story titles and URLs.
You can then inspect the results in the locals window or print out
Sample of results in locals window:
Option Explicit
Public Sub GetStories()
Dim articles As Collection, article As Object
Dim searchTerm As String, finalResults As Collection, json As Object, arr(0 To 1)
Set finalResults = New Collection
searchTerm = "Obama"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://newsapi.org/v2/everything?q=" & searchTerm & "&apiKey=yourAPIkey", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
Set json = JsonConverter.ParseJson(.responseText)
End With
Debug.Print "total results = " & json("totalResults")
Set articles = json("articles")
For Each article In articles
arr(0) = article("title")
arr(1) = article("url")
finalResults.Add arr
Next
Stop '<== Delete me later
End Sub
Loop:
If deploying in a loop you can use a class clsHTTP to hold the XMLHTTP object. This is more efficient than creating and destroying. I supply this class with a method GetString to retrieve the JSON response from the API, and a GetInfo method to parse the JSON and retrieve the results count and the API results URLs and Titles.
Example of results structure in locals window:
Class clsHTTP:
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal url As String) As String
With http
.Open "GET", url, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
GetString = .responseText
End With
End Function
Public Function GetInfo(ByVal json As Object) As Variant
Dim results(), counter As Long, finalResults(0 To 1), articles As Object, article As Object
finalResults(0) = json("totalResults")
Set articles = json("articles")
ReDim results(1 To articles.Count, 1 To 2)
For Each article In articles
counter = counter + 1
results(counter, 1) = article("title")
results(counter, 2) = article("url")
Next
finalResults(1) = results
GetInfo = finalResults
End Function
Standard module:
Option Explicit
Public Sub GetStories()
Dim http As clsHTTP, json As Object
Dim finalResults(), searchTerms(), searchTerm As Long, url As String
Set http = New clsHTTP
With ThisWorkbook.Worksheets("Sheet1")
searchTerms = Application.Transpose(.Range("A1:A2")) '<== Change to appropriate range containing search terms
End With
ReDim finalResults(1 To UBound(searchTerms))
For searchTerm = LBound(searchTerms, 1) To UBound(searchTerms, 1)
url = "https://newsapi.org/v2/everything?q=" & searchTerms(searchTerm) & "&apiKey=yourAPIkey"
Set json = JsonConverter.ParseJson(http.GetString(url))
finalResults(searchTerm) = http.GetInfo(json)
Set json = Nothing
Next
Stop '<==Delete me later
End Sub
'
Otherwise:
I would use the following where I grab story links by their class name. I get the count and write the links to a collection
Option Explicit
Public Sub GetStories()
Dim sResponse As String, html As HTMLDocument, articles As Collection
Const BASE_URL As String = "https://news.google.com/"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://news.google.com/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US:en", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument: Set articles = New Collection
Dim numberOfStories As Long, nodeList As Object, i As Long
With html
.body.innerHTML = sResponse
Set nodeList = .querySelectorAll(".VDXfz")
numberOfStories = nodeList.Length
Debug.Print "number of stories = " & numberOfStories
For i = 0 To nodeList.Length - 1
articles.Add Replace$(Replace$(nodeList.item(i).href, "./", BASE_URL), "about:", vbNullString)
Next
End With
Debug.Print articles.Count
End Sub
Standard Google search:
The following works an example standard google search but you will not always get the same HTML structure depending on your search term. You will need to provide some failing cases to help me determine if there is a consistent selector method that can be applied.
Option Explicit
Public Sub GetResultsCount()
Dim sResponse As String, html As HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.google.com/search?q=mitsubishi", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Set html = New HTMLDocument
With html
.body.innerHTML = sResponse
Debug.Print .querySelector("#resultStats").innerText
End With
End Sub