Parse/Scrape column from table using MSHTML.HTMLDocument

Parse/Scrape column from table using MSHTML.HTMLDocument - html

I wrote this piece of code that scrapes the whole table from the webpage in the URL variable. I would like to only scrape/parse the column by the name
"Extrapolated Vol".
My html/xml is not strong, so a solution along with an explanation would be appreciated!
https://services.tcpl.ca/cor/public/gdsr/GdsrNGTLImperial20151122.htm
Thanks
Sub ExtractAlbertaAIL()
Application.ScreenUpdating = False
Application.EnableEvents = False
Dim URL As String
Dim Request As MSXML2.XMLHTTP
Dim doc As MSHTML.HTMLDocument
Dim tr As MSHTML.HTMLGenericElement
Dim td As MSHTML.HTMLGenericElement
Dim RowNumber As Integer
Dim ColNumber As Integer
ActiveWorkbook.Worksheets("Gas Day Summary").Range("A5:H10000") = ""
Set Request = CreateObject("msxml2.xmlhttp")
If Request Is Nothing Then
MsgBox "For some reason I wasn't able to make a MSXML2.XMLHTTP object"
Exit Sub
End If
URL = "https://services.tcpl.ca/cor/public/gdsr/GdsrNGTLImperial20151122.htm"
With Request
.Open "GET", URL, False
.send
Set doc = New MSHTML.HTMLDocument
doc.body.innerHTML = .responseText
End With
RowNumber = 1
For Each tr In doc.getElementsByTagName("table").Item(2).getElementsByTagName("tr")
ColNumber = 1
For Each td In tr.getElementsByTagName("td")
Worksheets("Gas Day Summary").Cells(RowNumber, ColNumber) = td.innerText
ColNumber = ColNumber + 1
Next td
RowNumber = RowNumber + 1
Next tr
Application.ScreenUpdating = True
Application.EnableEvents = True
End Sub

Try this
Sub ExtractAlbertaAIL()
Application.ScreenUpdating = False
Application.EnableEvents = False
Dim URL As String
Dim Request As MSXML2.XMLHTTP60
Dim doc As MSHTML.HTMLDocument
Dim tr As MSHTML.HTMLGenericElement
Dim td As MSHTML.HTMLGenericElement
Dim VOLUME_SUMMARY_FOUND As Boolean
VOLUME_SUMMARY_FOUND = False
Dim RowNumber As Integer
Dim ColNumber As Integer
ActiveWorkbook.Worksheets("Gas Day Summary").Range("A5:H10000") = ""
Set Request = CreateObject("msxml2.xmlhttp")
If Request Is Nothing Then
MsgBox "For some reason I wasn't able to make a MSXML2.XMLHTTP object"
Exit Sub
End If
URL = "https://services.tcpl.ca/cor/public/gdsr/GdsrNGTLImperial20151122.htm"
With Request
.Open "GET", URL, False
.send
Set doc = New MSHTML.HTMLDocument
doc.body.innerHTML = .responseText
End With
RowNumber = 1
For Each tr In doc.getElementsByTagName("table").Item(2).getElementsByTagName("tr")
If tr.Cells(0).innerText = "VOLUME SUMMARY" Then
VOLUME_SUMMARY_FOUND = True
End If
If VOLUME_SUMMARY_FOUND = True Then
Worksheets("Gas Day Summary").Cells(RowNumber, 1) = tr.Cells(0).innerText
Worksheets("Gas Day Summary").Cells(RowNumber, 2) = tr.Cells(2).innerText
RowNumber = RowNumber + 1
End If
Next tr
Application.ScreenUpdating = True
Application.EnableEvents = True
End Sub

Related

VBA web Scraping - Shifting HTMLdoc to XML but facing errors while clicking on a button

I have an MSHTML.HTMLDocument code that:
Opens the page "https://www.ksestocks.com/HistoryHighLow"
Fills an input i.e 786
Then click on a button to fetch a table
There I catch a row and its 4 children using the following code
Sub KSE_GetHTMLDocument()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDOC As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLClasses As MSHTML.IHTMLElementCollection
Dim HTMLClass As MSHTML.IHTMLElement
Dim HTMLCel As MSHTML.IHTMLElement
Dim colNum, rowNum, RowN, C As Integer
Dim Cel As Range
IE.Visible = False
IE.Navigate "https://www.ksestocks.com/HistoryHighLow"
Do While IE.ReadyState <> READYSTATE_COMPLETE
Loop
For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
If IsEmpty(Cel.Value) = False Then
Set HTMLDOC = IE.Document
Set HTMLInput = HTMLDOC.getElementById("selscrip")
HTMLInput.Value = Trim(Cel.Value)
Debug.Print Cel.Value
HTMLDOC.getElementsByTagName("input")(0).Click
While IE.Busy Or IE.readyState < 4: DoEvents: Wend
C = 0
For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
For Each HTMLCel In HTMLClass.Children
Debug.Print HTMLCel.innerText
If C = 1 Then
Cel.Offset(0, 7).Value = HTMLCel.innerText
ElseIf C = 2 Then
Cel.Offset(0, 8).Value = HTMLCel.innerText
ElseIf C = 3 Then
Cel.Offset(0, 9).Value = HTMLCel.innerText
ElseIf C = 4 Then
Cel.Offset(0, 10).Value = HTMLCel.innerText
End If
C = C + 1
Next
End If
End If
Next
End If
Next
End Sub
The above code is working fine getting values from the website, but when I change the code to shift it to XML it stops working also internet explorer is popping up with no results every time with a new window.
where I am doing it wrong?
Is there more robust way of scraping webpage?
Please check the following code before running
Sub KSE_Get_XML()
Dim XMLp As New MSXML2.XMLHTTP60
Dim HTMLDOC As New MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLClasses As MSHTML.IHTMLElementCollection
Dim HTMLClass As MSHTML.IHTMLElement
Dim HTMLCel As MSHTML.IHTMLElement
Dim colNum, rowNum, RowN, C As Integer
XMLp.Open "GET", "https://www.ksestocks.com/HistoryHighLow", False
XMLp.send
HTMLDOC.body.innerHTML = XMLp.responseText
Dim Cel As Range
' Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
' Loop
For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
If IsEmpty(Cel.Value) = False Then
HTMLDOC.body.innerHTML = XMLp.responseText
Set HTMLInput = HTMLDOC.getElementById("selscrip")
HTMLInput.Value = Trim(Cel.Value)
Debug.Print Cel.Value
HTMLDOC.getElementsByTagName("input")(0).Click
'Application.Wait Now + TimeValue("00:00:01")
'' Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
' DoEvents
' Loop
C = 0
For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
For Each HTMLCel In HTMLClass.Children
Debug.Print HTMLCel.innerText
If C = 1 Then
Cel.Offset(0, 7).Value = HTMLCel.innerText
ElseIf C = 2 Then
Cel.Offset(0, 8).Value = HTMLCel.innerText
ElseIf C = 3 Then
Cel.Offset(0, 9).Value = HTMLCel.innerText
ElseIf C = 4 Then
Cel.Offset(0, 10).Value = HTMLCel.innerText
End If
C = C + 1
Next
End If
End If
Next
End If
Next
End Sub

Get rid of IE altogether and switch to xmlhttp requests, which is robust and less error prone. When you go for xhr, you need to issue a post http requests with appropriate parameters. This is something you can do to get the results right next to Last 3 years (1 Sep 2018 - 1 Sep 2021) from that table.
Public Sub GetContent()
Const Url = "https://www.ksestocks.com/HistoryHighLow"
Dim Http As Object, Html As HTMLDocument, Htmldoc As HTMLDocument
Dim params$, I&, R&, ws As Worksheet, searchKeyword$
Set Html = New HTMLDocument
Set Htmldoc = New HTMLDocument
Set Http = CreateObject("MSXML2.XMLHTTP")
Set ws = ThisWorkbook.Worksheets("Sheet1")
R = 2
searchKeyword = "786" 'you can use different search keywords here to get related results
params = "selscrip=" & searchKeyword
With Http
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
.setRequestHeader "content-type", "application/x-www-form-urlencoded; charset=UTF-8"
.send (params)
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("td.plain")
For I = 0 To .Length - 1
If InStr(.item(I).innerText, "Last 3 years") > 0 Then
Htmldoc.body.innerHTML = "<table>" & .item(I).ParentNode.outerHTML & "</table>"
ws.Cells(R, 1) = Htmldoc.querySelectorAll("td.plain")(1).innerText
ws.Cells(R, 2) = Htmldoc.querySelectorAll("td.plain")(2).innerText
ws.Cells(R, 3) = Htmldoc.querySelectorAll("td.plain")(3).innerText
ws.Cells(R, 4) = Htmldoc.querySelectorAll("td.plain")(4).innerText
End If
Next I
End With
End Sub
Reference to add:
1. Microsoft XML, v6.0
2. Microsoft HTML Object Library
Your search keyword would be what you see in this image.

Web Scraping: reformat infomation from an HTML table

I am trying to extract the Futures data from MRCI.com and restructure it into one continous table in an excel worksheet so I can manipulate from there.
How can I repeat the Futures Contract in each row to get the following table layout:
Table Structure
Here's my code so far:
Sub MRCIData()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim FutTable As MSHTML.IHTMLElement
Dim FutRows As MSHTML.IHTMLElementCollection
Dim FutRow As MSHTML.IHTMLElement
Dim FutCells As MSHTML.IHTMLElementCollection
Dim FutCell As MSHTML.IHTMLElement
Dim FutContracts As MSHTML.IHTMLElementCollection
Dim FutContract As MSHTML.IHTMLElement
Dim FutRowText As String
Dim MrciURLHist As String
MrciURLHist = "https://www.mrci.com/ohlc/2020/200320.php"
XMLReq.Open "GET", MrciURLHist, False
XMLReq.send
If XMLReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = XMLReq.responseText
Set XMLReq = Nothing
Set FutContracts = HTMLDoc.getElementsByClassName("note1")
For Each FutContract In FutContracts
Next
Set FutTable = HTMLDoc.getElementsByClassName("strat")(0)
Set FutRows = FutTable.getElementsByTagName("tr")
For Each FutRow In FutRows
Set FutCells = FutRow.getElementsByTagName("td")
FutRowText = ""
If InStr(FutRow.innerText, "Total Volume") = 0 Then
For Each FutCell In FutCells
FutRowText = FutRowText & vbTab & FutCell.innerText
Next
End If
Debug.Print , FutRowText
Next
End Sub

The following code looks through the table line by line and determines what future to apply to the next rows until it finds a the next one, and so on. the output is not pretty so more a proof of concept. The code now outputs the table correctly.
Sub Main(ByVal Sheet As Worksheet)
Dim oRequest As New MSXML2.XMLHTTP60
Dim oDocument As New MSHTML.HTMLDocument
Dim oRows As MSHTML.IHTMLElementCollection
Dim oRow As MSHTML.IHTMLElement
Dim oCells As MSHTML.IHTMLElementCollection
Dim oCell As MSHTML.IHTMLElement
oRequest.Open "GET", "https://www.mrci.com/ohlc/2020/200320.php", False
oRequest.send
If oRequest.Status <> 200 Then
MsgBox "Error"
Exit Sub
End If
oDocument.body.innerHTML = oRequest.responseText
Set oRequest = Nothing
Dim Skip As Boolean
Dim Current As String
Dim RowIndex As Integer
Dim ColumnIndex As Integer
Set oRows = oDocument.getElementsByClassName("strat")(0).getElementsByTagName("tr")
Current = ""
Application.ScreenUpdating = False
For Each oRow In oRows
Skip = False
If oRow.getElementsByTagName("th").Length > 0 Then
Current = oRow.innerText
Skip = True
End If
If Not Current = "" And Skip = False Then
If InStr(oRow.innerText, "Total Volume") = 0 Then
Set oCells = oRow.getElementsByTagName("td")
ColumnIndex = 2
Sheet.Cells(RowIndex, 1).Value = Current
For Each oCell In oCells
Sheet.Cells(RowIndex, ColumnIndex).Value = oCell.innerText
ColumnIndex = ColumnIndex + 1
Next oCell
RowIndex = RowIndex + 1
End If
End If
Next oRow
Application.ScreenUpdating = True
End Sub

Extract Table from Webpage in Excel using VBA

How to Extract Below table in Excel from Webpage?
Table
Company | Bonus Ratio |Announcement|Record|Ex-Bonus
Codes
Dim ie As SHDocVw.InternetExplorer
Set ie = New InternetExplorerMedium
Set ie = CreateObject("InternetExplorer.Application")
While ie.busy
DoEvents
Wend
ie.Visible = True
While ie.busy
DoEvents
Wend
Dim NavURL As String
NavURL = "https://www.moneycontrol.com/stocks/marketinfo/bonus/homebody.php?sel_year=2015"
ie.Navigate NavURL
While ie.busy
DoEvents
Wend
Set doc = ie.document
Set hTable = doc.GetElementsByTagName("table")
y = 2 'Column B in Excel
z = 7 'Row 7 in Excel
For Each td In hTable
Set hHead = tb.GetElementsByTagName("td")
For Each hh In hHead
Set hTR = hh.GetElementsByTagName("tr")
For Each tr In hTR
Webpage: https://www.moneycontrol.com/stocks/marketinfo/bonus/homebody.php?sel_year=2015
by Keeping Bonus Ratio as Same as on Webpage or Text Format
While copy it in Excel, Bonus Ratio Converts to Decimal

Your hTable is a collection as opposed to a single element. Your code should be throwing an error.
You want to target the specific table and then loop the table rows and cells within rows. You want to check if the second column is being processed so you can protect the formatting of the ratios. You also want to monitor the row number to handle the merged cells at the top.
Option Explicit
Public Sub GetInfo()
Const URL As String = "https://www.moneycontrol.com/stocks/marketinfo/bonus/homebody.php?sel_year=2015"
Dim html As HTMLDocument, hTable As HTMLTable, ws As Worksheet, headers()
headers = Array("Company", "Bonus Ratio", "Announcement", "Record", "Ex-bonus")
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.Send
html.body.innerHTML = .responseText
End With
Set hTable = html.querySelector("table.dvdtbl")
Dim td As Object, tr As Object, r As Long, c As Long
r = 1
With ws
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For Each tr In hTable.getElementsByTagName("tr")
r = r + 1: c = 1
If r > 3 Then
For Each td In tr.getElementsByTagName("td")
.Cells(r - 2, c) = IIf(c = 2, "'" & td.innerText, td.innerText)
c = c + 1
Next
End If
Next
End With
End Sub

Modifying the program for parsing

There is a program that parse a certain table from the site . Works great . I want to parse another table from the site . By the tag number “table” they are the same . I am trying to use the same program , but it gives an error : Run-time error 91 in the line :
If oRow.Cells(y).Children.Length > 0 Then
New table : http://allscores.ru/soccer/fstats.php?champ=2604&team=439&team2=420&tour=110
Old table : http://allscores.ru/soccer/new_ftour.php?champ=2604&f_team=439
New table : in the attached picture
Function extractTable(Ssilka As String, book1 As Workbook, iLoop As Long)
Dim oDom As Object, oTable As Object, oRow As Object
Dim iRows As Integer, iCols As Integer
Dim x As Integer, y As Integer
Dim data()
Dim vata()
Dim tata()
Dim oHttp As Object
Dim oRegEx As Object
Dim sResponse As String
Dim oRange As Range
Dim odRange As Range
' get page
Set oHttp = CreateObject("MSXML2.XMLHTTP")
oHttp.Open "GET", Ssilka, False
oHttp.Send
' cleanup response
sResponse = StrConv(oHttp.responseBody, vbUnicode)
Set oHttp = Nothing
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
Set oRegEx = CreateObject("vbscript.regexp")
With oRegEx
.MultiLine = True
.Global = True
.IgnoreCase = False
.Pattern = "<(script|SCRIPT)[\w\W]+?</\1>"
sResponse = .Replace(sResponse, "")
End With
Set oRegEx = Nothing
' create Document from response
Set oDom = CreateObject("htmlFile")
oDom.Write sResponse
DoEvents
' table with results, indexes starts with zero
Set oTable = oDom.getelementsbytagname("table")(3)
DoEvents
iRows = oTable.Rows.Length
iCols = oTable.Rows(1).Cells.Length
' first row and first column contain no intresting data
ReDim data(1 To iRows - 1, 1 To iCols - 1)
ReDim vata(1 To iRows - 1, 1 To iCols - 1)
ReDim tata(1 To iRows - 1, 1 To iCols - 1)
' fill in data array
For x = 1 To iRows - 1
Set oRow = oTable.Rows(x)
For y = 1 To iCols - 1
If oRow.Cells(y).Children.Length > 0 Then
data(x, y) = oRow.Cells(y).getelementsbytagname("a")(0).getattribute("href")
data(x, y) = Replace(data(x, y), "about:", "http://allscores.ru/soccer/")
vata(x, y) = oRow.Cells(y).innerText
End If
Next y
Next x
Set oRow = Nothing
Set oTable = Nothing
Set oDom = Nothing
Set oRange = book1.ActiveSheet.Cells(110, 26 + (iLoop * 21)).Resize(iRows - 1, iCols - 1)
oRange.NumberFormat = "#"
oRange.Value = data
Set odRange = book1.ActiveSheet.Cells(34, 26 + (iLoop * 21)).Resize(iRows - 1, iCols - 1)
odRange.NumberFormat = "#"
odRange.Value = vata
Set oRange = Nothing
Set odRange = Nothing
End Function

This is not particularly robust but does grab the values from the table. iLoop is not used.
Option Explicit
Public Sub test()
extractTable "http://allscores.ru/soccer/fstats.php?champ=2604&team=439&team2=420&tour=110", ThisWorkbook, 1
End Sub
Public Sub extractTable(Ssilka As String, book1 As Workbook)
Dim oDom As Object, oTable As Object
Dim oHttp As Object
Dim oRegEx As Object
Dim sResponse As String
Set oHttp = CreateObject("MSXML2.XMLHTTP")
oHttp.Open "GET", Ssilka, False
oHttp.send
sResponse = StrConv(oHttp.responseBody, vbUnicode)
Set oHttp = Nothing
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
Set oRegEx = CreateObject("vbscript.regexp")
With oRegEx
.MultiLine = True
.Global = True
.IgnoreCase = False
.Pattern = "<(script|SCRIPT)[\w\W]+?</\1>"
sResponse = .Replace(sResponse, "")
End With
Set oRegEx = Nothing
Set oDom = CreateObject("htmlFile")
oDom.Write sResponse
Set oTable = oDom.getElementsByTagName("table")(3)
Dim b As Object, a As Object
Set b = oTable.getElementsByTagName("TR") 'DispHTMLElementCollection
Dim i As Long, y As Long
With ActiveSheet
For i = 3 To 17 '17-3 gives the 15 rows of interest. Start at 3 to avoid header and empty row.
Set a = b(i).ChildNodes
For y = 1 To a.Length - 1
.Cells(i - 2, y) = a(y).innerText
Next y
Next i
End With
End Sub

how to select 6th tr's td in a html table using vba code

tr---- 0495024988
14.08.1996
04/04/130/02514/AM96/
23.01.1996
0495024988
6. tr----(here the text is there which i have copy to my excel sheet)
here i have read many post where the javascript code is given but the vba code is not there. please help me out of this.

Sub GoToWebSiteAndPlayAroundNew()
Dim appIE As Object ' InternetExplorer.Application
Dim URL As String
Dim i As Long, strText As String
Dim doc As Object, hTable As Object, hBody As Object, hTR As Object, hTD As Object
Dim tb As Object, bb As Object, tr As Object, td As Object
Dim y As Long, z As Long, wb As Excel.Workbook, ws As Excel.Worksheet
Dim sws As SHDocVw.ShellWindows
Dim IE As Object
Dim vIE As SHDocVw.InternetExplorer
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set appIE = CreateObject("InternetExplorer.Application")
URL = "http://dgft.delhi.nic.in:8100/dgft/IecPrint"
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
With appIE
.navigate URL
.Visible = True
Do While .busy: DoEvents: Loop
Do While .ReadyState <> 4: DoEvents: Loop
.document.getElementById("iec").Value = "0495024988"
.document.getElementById("name").Value = "AMB"
End With
On Error Resume Next
With appIE.document
Set elems = .getElementsByTagName("input")
For Each e In elems
If (e.getAttribute("value") = "Submit Query") Then
e.Click
Exit For
End If
Next e
End With
Set sws = New SHDocVw.ShellWindows
For Each vIE In sws
'If Left(vIE.LocationURL, 4) = "http" Then 'avoid explorer windows/etc this way
'If MsgBox("IE Window found. The URL is:" & vbCrLf & vIE.LocationURL & vbCrLf & _
'vbCrLf & "Do you want to see the html?", vbYesNo) = vbYes Then
'Show html in a msgbox
' MsgBox vIE.document.body.innerHTML
'Or put it to a file
'dim vFF as long
'vff=freefile
'open "C:\thehtml.txt" for output as #vff
'print #vff,vie.document.body.innerhtml
'close #vff
' End If
'End If
Set doc = vIE.document
Set hTable = doc.getElementsByTagName("table")
For Each tb In hTable
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
MsgBox hTR.Length
For Each tr In hTR
Set hTD = tr.getElementsByTagName("td")
MsgBox hTD.Length
y = 1 ' Resets back to column A
For Each td In hTD
ws.Cells(z, y).Value = td.innerText
y = y + 1
Next td
DoEvents
z = z + 1
Next tr
Exit For
Next bb
Exit For
Next tb
Next
End Sub

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Parse/Scrape column from table using MSHTML.HTMLDocument - html

Related

VBA web Scraping - Shifting HTMLdoc to XML but facing errors while clicking on a button

Web Scraping: reformat infomation from an HTML table

Extract Table from Webpage in Excel using VBA

Modifying the program for parsing

how to select 6th tr's td in a html table using vba code

Categories

Resources