I want to go through a document and find all center aligned text and delete it, I can setup formatted text on the find and replace tool, but when I record, it doesn't save formatting... does anyone know how to edit the basic code to do this?
also is the open office documentation compatible with libre office.
Recording in OpenOffice generates dispatcher code, which usually isn't very good. It's better to use the UNO API when writing macros. Here is some code that does what you want:
Sub DeleteCenteredLines
oDoc = ThisComponent
Dim vDescriptor, vFound
' Create a descriptor from a searchable document.
vDescriptor = oDoc.createSearchDescriptor()
' Set the text for which to search and other
With vDescriptor
.searchString = ""
.searchAll=True
End With
Dim srchAttributes(0) As New com.sun.star.beans.PropertyValue
srchAttributes(0).Name = "ParaAdjust"
srchAttributes(0).Value = com.sun.star.style.ParagraphAdjust.CENTER
vDescriptor.SetSearchAttributes(srchAttributes())
' Find the first one
vFound = oDoc.findFirst(vDescriptor)
Do While Not IsNull(vFound)
vFound.setPropertyValue("ParaAdjust", com.sun.star.style.ParagraphAdjust.LEFT)
oTC = oDoc.Text.createTextCursorByRange(vFound)
oTC.gotoStartOfParagraph(false)
oTC.gotoEndOfParagraph(true)
oTC.String = ""
oTC.goRight(1,true)
oTC.String = ""
vFound = oDoc.findNext( vFound.End, vDescriptor)
Loop
End Sub
Check out http://www.pitonyak.org/AndrewMacro.odt for examples of many common tasks. In my experience, looking for examples in this document is usually easier than trying to record macros and make sense of what was recorded.
This works for OpenOffice as well as LibreOffice. Generally the API is the same for both.
My solution which replaces strings in italic and superscript to tags.
(it is extremly slow. Maybe someone can improve it)
Sub replace_italico_sobrescrito_por_tag()
MsgBox "It takes long to run."
Dim vartemp As String
theDoc = thisComponent
iSheetsCount = theDoc.Sheets.Count
Dim theCell As Object, rText As String, textSlice As String, textItalic As Long, textSup As Integer
Dim theParEnum As Object, theParElement As Object
Dim theSubEnum As Object, theSubElement As Object
For k=0 to iSheetsCount-1
Sheet = theDoc.getSheets().getByIndex(k)
dim pX as integer, pY as integer, maxcol as integer, maxrow as integer
maxcol = 100
maxrow = 500
For pX=0 to maxrow
For pY=0 to maxcol
theCell = Sheet.GetCellByPosition(pX, pY)
theParEnum = theCell.GetText().CreateEnumeration
rText = ""
Do While theParEnum.HasMoreElements
theParElement = theParEnum.NextElement
theSubEnum = theParElement.CreateEnumeration
Do While theSubEnum.HasMoreElements
textSlice = ""
theSubElement = theSubEnum.NextElement
If theCell.Type = 2 Then
textSlice = theSubElement.String
textItalic = theSubElement.CharPosture
textSup = theSubElement.CharEscapement
Else
textSlice = theCell.String
End If
If theSubElement.CharPosture >= 1 Then
textSlice = "<i>" & textSlice & "</i>"
End If
If theSubElement.CharEscapement > 0 Then
textSlice = "<sup>" & textSlice & "</sup>"
End If
rText = rText & textSlice
Loop
Loop
theCell.String=rText
Next pY
Next pX
Next k
MsgBox "End"
End Sub
Related
I'm getting
run-time error 424
in 68th row (line)
request.Open "GET", Url, False
and I don't know how to fix it.
My previous question I posted ;
How to scrape specific part of online english dictionary?
My final goal is to get result like this;
A B
beginning bɪˈɡɪnɪŋ
behalf bɪˈhæf
behave bɪˈheɪv
behaviour bɪˈheɪvjər
belong bɪˈlɔːŋ
below bɪˈloʊ
bird bɜːrd
biscuit ˈbɪskɪt
Here's code I wrote, and it's mostly based on someone else's code I found on internet.
' Microsoft ActiveX Data Objects x.x Library
' Microsoft XML, v3.0
' Microsoft VBScript Regular Expressions
Sub ParseHelp()
' Word reference from
Dim Url As String
Url = "https://www.oxfordlearnersdictionaries.com/definition/english/" & Cells(ActiveCell.Row, "B").Value
' Get dictionary's html
Dim Html As String
Html = GetHtml(Url)
' Check error
If InStr(Html, "<TITLE>Not Found</Title>") > 0 Then
MsgBox "404"
Exit Sub
End If
' Extract phonetic alphabet from HTML
Dim wrapPattern As String
wrapPattern = "<span class='name' (.*?)</span>"
Set wrapCollection = FindRegexpMatch(Html, wrapPattern)
' MsgBox StripHtml(CStr(wrapCollection(1)))
' Fill phonetic alphabet into cell
If Not wrapCollection Is Nothing Then
Dim wrap As String
On Error Resume Next
wrap = StripHtml(CStr(wrapCollection(1)))
If Err.Number <> 0 Then
wrap = ""
End If
Cells(ActiveCell.Row, "C").Value = wrap
Else
MsgBox "not found"
End If
End Sub
Public Function StripHtml(Html As String) As String
Dim RegEx As New RegExp
Dim sOut As String
Html = Replace(Html, "</li>", vbNewLine)
Html = Replace(Html, " ", " ")
With RegEx
.Global = True
.IgnoreCase = True
.MultiLine = True
.Pattern = "<[^>]+>"
End With
sOut = RegEx.Replace(Html, "")
StripHtml = sOut
Set RegEx = Nothing
End Function
Public Function GetHtml(Url As String) As String
Dim xmlhttp As Object
Set xmlhttp = CreateObject("MSXML2.serverXMLHTTP")
Dim converter As New ADODB.stream
' Get
request.Open "GET", Url, False
request.send
' raw bytes
converter.Open
converter.Type = adTypeBinary
converter.Write request.responseBody
' read
converter.Position = 0
converter.Type = adTypeText
converter.Charset = "utf-8"
' close
GetHtml = converter.ReadText
converter.Close
End Function
Public Function FindRegexpMatch(txt As String, pat As String) As Collection
Set FindRegexpMatch = New Collection
Dim rx As New RegExp
Dim matcol As MatchCollection
Dim mat As Match
Dim ret As String
Dim delimiter As String
txt = Replace(txt, Chr(10), "")
txt = Replace(txt, Chr(13), "")
rx.Global = True
rx.IgnoreCase = True
rx.MultiLine = True
rx.Pattern = pat
Set matcol = rx.Execute(txt)
'MsgBox "Match:" & matcol.Count
On Error GoTo ErrorHandler
For Each mat In matcol
'FindRegexpMatch.Add mat.SubMatches(0)
FindRegexpMatch.Add mat.Value
Next mat
Set rx = Nothing
' Insert code that might generate an error here
Exit Function
ErrorHandler:
' Insert code to handle the error here
MsgBox "FindRegexpMatch. " & Err.GetException()
Resume Next
End Function
Any kind of help would be greatly appreciated.
The following is an example of how to read in values from column A and write out pronounciations to column B. It uses css selectors to match a child node then steps up to parentNode in order to ensure entire pronounciation is grabbed. There are a number of ways you could have matched on the parent node to get the second pronounciation. Note that I use a parent node and Replace as the pronounciation may span multiple childNodes.
If doing this for lots of lookups please be a good netizen and put some waits in the code so as to not bombard the site with requests.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, urls()
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row 'you need at least two words in column A or change the redim.
urls = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(urls))
Set html = New MSHTML.HTMLDocument
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(urls) To UBound(urls)
.Open "GET", "https://www.oxfordlearnersdictionaries.com/definition/english/" & urls(i), False
.send
html.body.innerHTML = .responseText
data = Replace$(Replace$(html.querySelector(".name ~ .wrap").ParentNode.innerText, "/", vbNullString), Chr$(10), Chr$(32))
results(i) = Right$(data, Len(data) - 4)
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
Required references (VBE>Tools>References):
Microsoft HTML Object Library
Should you go down the API route then here is a small example. You can make 1000 free calls in a month with Prototype account. The next best, depending on how many calls you wish to make looks like the 10,001 calls (that one extra PAYG call halves the price). # calls will be affected by whether word is head word or needs lemmas lookup call first. The endpoint construction you need is GET /entries/{source_lang}/{word_id}?fields=pronunciations though that doesn't seem to filter massively. You will need a json parser to handle the json returned e.g. github.com/VBA-tools/VBA-JSON/blob/master/JsonConverter.bas. Download raw code from there and add to standard module called JsonConverter. You then need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime. Remove the top Attribute line from the copied code.
Option Explicit
Public Sub WriteOutPronounciations()
Dim html As MSHTML.HTMLDocument, i As Long, ws As Worksheet
Dim data As String, lastRow As Long, words()
'If not performing lemmas lookup then must be head word e.g. behave, behalf
Const appId As String = "yourAppId"
Const appKey As String = "yourAppKey"
Set ws = ThisWorkbook.Worksheets("Sheet1")
lastRow = ws.Cells(ws.rows.Count, "A").End(xlUp).row
words = Application.Transpose(ws.Range("A1:A" & lastRow).Value)
ReDim results(1 To UBound(words))
Set html = New MSHTML.HTMLDocument
Dim json As Object
With CreateObject("MSXML2.ServerXMLHTTP")
For i = LBound(words) To UBound(words)
.Open "GET", "https://od-api.oxforddictionaries.com/api/v2/entries/en-us/" & LCase$(words(i)) & "?fields=pronunciations", False
.setRequestHeader "app_id", appId
.setRequestHeader "app_key", appKey
.setRequestHeader "ContentType", "application/json"
.send
Set json = JsonConverter.ParseJson(.responseText)
results(i) = IIf(json("results")(1)("type") = "headword", json("results")(1)("lexicalEntries")(1)("pronunciations")(2)("phoneticSpelling"), "lemmas lookup required")
Set json = Nothing
Next
End With
With ThisWorkbook.Worksheets(1)
.Cells(1, 2).Resize(UBound(results, 1), 1) = Application.Transpose(results)
End With
End Sub
I was recently assisted in scraping data from a webpage by the guys at Stackoverflow. It's a great community. I was given a function that pulls data into excel from a cell containing a url. Unfortunately I'm running into some problems because I need a loop function so that Excel does not restart all my functions once I save or refresh the page.
So far I have tried to build this, but am next to useless in VBA. Wondering if anyone can provide a little extra assistance.
Sub POSTPageViews()
Dim InputSheet As Worksheet
Dim i As Long
Dim AllWords As Range
Dim text As String
Dim OutValue As String
Dim driver As SeleniumWrapper.WebDriver
On Error Resume Next
Set driver = New SeleniumWrapper.WebDriver
driver.Start "chrome", "https://re.po.st/"
driver.Open strLocation
Set InputSheet = Active
Set WorkRng = Application.Selection
WordListSheet.Range("E1") = "All Words"
InputSheet.Activate
r = 1
Do While Cells(r, 1) <> ""
Cells(r, 1).Value = txt
OutValue = driver.findElementById("sguidtotaltable").findElementByTagName("span").text
Next i
r = r + 1
driver.stop 'Stops the browser
Loop
End Sub
But naturally it is not working... Anybody see what is wrong? Basically in Column E I have all the URLs and in column K I would like to see the accompanying values.
Thanks
Does this work (in the spirit of my comments)?
Sub POSTPageViews()
Dim driver As SeleniumWrapper.WebDriver
Set driver = New SeleniumWrapper.WebDriver
driver.Start "chrome", "https://re.po.st/"
With Worksheets("Trial")
r = 2
Do While .Cells(r, 5) <> ""
driver.Open .Cells(r, 5).Value
.Cells(r, 11) = driver.findElementById("sguidtotaltable").findElementByTagName("span").text
r = r + 1
Loop
driver.stop 'Stops the browser
End Sub
Fairly new to VBA here. I'm attempting to write a piece of VBA that will essentially copy some data from an already open excel file and paste it into a forwarded email in HTML formatting. Creating the email is working fine, the problem is it seems to get stuck in an endless loop when retrieving the data from excel. It gets stuck on the line str = str & "" & c.Value & ""
It will clear through the first for r loop, but then gets stuck repeating on the second.
Here's the code:
Function GetExcelData() As String
Dim OrderColumn As Range, OrderRow As Range, r As Range, c As Range
Dim str As String
Dim xl As Excel.Application
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
'Dim rn As Excel.Range
Set xl = GetObject(, "Excel.Application")
Set wb = xl.Workbooks("Book1")
Set ws = wb.Sheets(1)
ws.Activate
Set OrderColumn = ws.Range("A1", ws.Range("A1").End(xlDown))
str = "<table>"
For Each r In OrderColumn
str = str & "<tr>"
Set OrderRow = ws.Range(r, r.End(xlToRight))
For Each c In OrderRow
str = str & "<td>" & c.Value & "</td>"
Next c
str = str & "</tr>"
Next r
str = str & "</table>"
GetExcelData = str
End Function
Any help would be appreciated.
Thanks
Instead of composing an HTML markup for the Excel range you may consider using the Word object model for getting the job done. The WordEditor property of the Inspector class returns an instance of the Document class from the Word object model. See Chapter 17: Working with Item Bodies for more information.
Question from a amatuer scripter with informal coding background:
I've researched this on stack, msdn, random scripting websites but can't seem to glean a concrete solution. So please be advised this request for help is a last resort even if the solution is simple.
To put it simply, I'm trying to call a function that parses the last modified date of a file into an array of date formats. The filepath is the function parameter. These files are .vbs files in a client-side testing environment. This will be apparent if you look at the script.
My best guess is the "name redefined" error has something to do global variables being Dim'd in some way that's throwing the error.
Anyway, here's the calling sub:
Option Explicit
'=============================
'===Unprocessed Report========
'=============================
'*****Inputs: File Path*********************
dim strFolderPath, strFilename, strReportName, strFileExt, FullFilePath
strFolderPath = "C:\Users\C37745\Desktop\"
strFilename = "UNPROCESSED_REPORT"
strReportName = "Unprocessed"
strFileExt = ".xlsx"
'************************************
FullFilePath = strFolderPath & strFilename & strFilename & strFileExt
'************************************
Sub Include(MyFile)
Dim objFSO, oFileBeingReadIn ' define Objects
Dim sFileContents ' define Strings
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set oFileBeingReadIn = objFSO.OpenTextFile(MyFile, 1)
sFileContents = oFileBeingReadIn.ReadAll
oFileBeingReadIn.Close
ExecuteGlobal sFileContents
End Sub
Include "C:\Users\C37745\Desktop\VBStest\OtherTest\TEST_DLM.vbs"
''''''''''FOR TESTING''''''''''''''
Dim FilePath, varTEST
strFilePath = FullFilePath
varTEST = ParseDLMToArray(strFilePath)
msgbox varTESTtemp(0)
'''''''''''''''''''''''''''''''''
Here's the function I'm trying to call (or read, I guess):
Function ParseDLMtoArray(strFilePath)
Dim strFilePath, dlmDayD, dlmMonthM, dlmYearYY, dlmYearYYYY, DateFormatArray, dateDLM
Dim objFSO, File_Object
Set objFSO = CreateObject("Scripting.FileSystemObject")
Set File_Object = objFSO.GetFile(strFilePath)
dateDLM = File_object.DateLastModified
dlmDayD = Day(dateDLM)
dlmMonthM = Month(dateDLM)
dlmYearYY = Right(Year(dateDLM),2)
dlmYearYYYY = Year(dateDLM)
'Adds a leading zero if a 1-digit month is detected
If(Len(Month(dlmDayD))=1) Then
dlmmonthMM ="0"& dlmMonthM
Else
dlmMonthMM = dlmMonthM
End If
'Adds a leading zero if a 1-digit day is detected
If(Len(Day(dlmDayD))=1) Then
dlmDayDD = "0" & dlmDayD
Else
dlmDayDD = dlmDayD
End If
varDLM_mmyyyy = dlmMonthMM & dlmYearYYYY
varDLM_mmddyy = dlmMonthMM & dlmDayDD & dlmYearYY
varDLM_mmddyyyy = dlmMonthMM & dlmDayDD & dlmYearYYYY
DateFormatArray = Array( _
varDLM_mmyyyy, _
varDLM_mmddyy, _
varDLM_mmddyyyy _
)
ParseDLMtoArray = DateFormatArray
End Function
Any advice is appreciated, including general feedback on best practices if you see an issue there. Thanks!
Your
Function ParseDLMtoArray(strFilePath)
Dim strFilePath
...
tries to declare/define strFilePath again. That obviously can't be allowed, because it would be impossible to decide whether that variable should contain Empty (because of the Dim) or the argument you passed.
At a first glance at your code, you can just delete the Dim strFilePath.
Aim
I am looking to scrape 20/20 cricket scorecard data from the Cricinfo website, ideally into CSV form for data analysis in Excel
As an example the current Australian Big Bash 2011/12 scorecards are available from
Game 1: http://www.espncricinfo.com/big-bash-league-2011/engine/match/524915.html
Last Game: http://www.espncricinfo.com/big-bash-league-2011/engine/match/524935.html
Background
I am proficient in using VBA (either automating IE or using XMLHTTP and then using regular expressions) to scrape data from websites, ie
Extract values from HTML TD and Tr
In that same question a comment was posted suggesting html parsing - which I hadn't come accross before - so I have taken a look at questions such as RegEx match open tags except XHTML self-contained tags
Query
While I could write a regex to parse the cricket data below I would like advice as to how I could efficiently retrieve these results with html parsing.
Please bear in mind that my preference is a repeatable CSV format containing:
the date/name of the match
Team 1 name
the output should dump up to 11 records for Team 1 (blank records where players haven't batted, ie "Did Not Bat")
Team 2 name
the output should dump up to 11 records for Team 2 (blank records where players haven't batted)
Nirvana for me would be a solution that I could deploy using VBA or VBscript so I could fully automate my analysis, but I presume I will have to use a separate tool for the html parse.
Sample Site links and Data to be Extracted
There are 2 techniques that I use for "VBA". I will describe them 1 by one.
1) Using FireFox / Firebug Addon / Fiddler
2) Using Excel's inbuilt facility to get data from the web
Since this post will be read by many so I will even cover the obvious. Please feel free to skip whatever part you know
1) Using FireFox / Firebug Addon / Fiddler
FireFox : http://en.wikipedia.org/wiki/Firefox
Free download (http://www.mozilla.org/en-US/firefox/new/)
Firebug Addon: http://en.wikipedia.org/wiki/Firebug_%28software%29
Free download (https://addons.mozilla.org/en-US/firefox/addon/firebug/)
Fiddler : http://en.wikipedia.org/wiki/Fiddler_%28software%29
Free download (http://www.fiddler2.com/fiddler2/)
Once you have installed Firefox, install the Firebug Addon. The Firebug Addon lets you inspect the different elements in a webpage. For example if you want to know the name of a button, simply right click on it and click on "Inspect Element with Firebug" and it will give you all the details that you will need for that button.
Another example would be finding the name of a table on a website which has the data that you need scrapped.
I use Fiddler only when I am using XMLHTTP. It helps me to see the exact info being passed when you click on a button. Because of the increase in the number of BOTS which scrape the sites, most sites now, to prevent automatic scrapping, capture your mouse coordinates and pass that information and fiddler actually helps you in debugging that info that is being passed. I will not get into much details here about it as this info can be used maliciously.
Now let's take a simple example on how to scrape the URL posted in your question
http://www.espncricinfo.com/big-bash-league-2011/engine/match/524915.html
First let's find the name of the table which has that info. Simply right click on the table and click on "Inspect Element with Firebug" and it will give you the below snapshot.
So now we know that our data is stored in a table called "inningsBat1" If we can extract the contents of that table to an Excel file then we can definitely work with the data to do our analysis. Here is sample code which will dump that table in Sheet1
Before we proceed, I would recommend, closing all Excel and starting a fresh instance.
Launch VBA and insert a Userform. Place a command button and a webcrowser control. Your Userform might look like this
Paste this code in the Userform code area
Option Explicit
'~~> Set Reference to Microsoft HTML Object Library
Private Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
Private Sub CommandButton1_Click()
Dim URL As String
Dim oSheet As Worksheet
Set oSheet = Sheets("Sheet1")
URL = "http://www.espncricinfo.com/big-bash-league-2011/engine/match/524915.html"
PopulateDataSheets oSheet, URL
MsgBox "Data Scrapped. Please check " & oSheet.Name
End Sub
Public Sub PopulateDataSheets(wsk As Worksheet, URL As String)
Dim tbl As HTMLTable
Dim tr As HTMLTableRow
Dim insertRow As Long, Row As Long, col As Long
On Error GoTo whoa
WebBrowser1.navigate URL
WaitForWBReady
Set tbl = WebBrowser1.Document.getElementById("inningsBat1")
With wsk
.Cells.Clear
insertRow = 0
For Row = 0 To tbl.Rows.Length - 1
Set tr = tbl.Rows(Row)
If Trim(tr.innerText) <> "" Then
If tr.Cells.Length > 2 Then
If tr.Cells(1).innerText <> "Total" Then
insertRow = insertRow + 1
For col = 0 To tr.Cells.Length - 1
.Cells(insertRow, col + 1) = tr.Cells(col).innerText
Next
End If
End If
End If
Next
End With
whoa:
Unload Me
End Sub
Private Sub Wait(ByVal nSec As Long)
nSec = nSec + Timer
While Timer < nSec
DoEvents
Sleep 100
Wend
End Sub
Private Sub WaitForWBReady()
Wait 1
While WebBrowser1.ReadyState <> 4
Wait 3
Wend
End Sub
Now run your Userform and click on the Command button. You will notice that the data is dumped in Sheet1. See snapshot
Similarly you can scrape other info as well.
2) Using Excel's inbuilt facility to get data from the web
I believe you are using Excel 2007 so I will take that as an example to scrape the above mentioned link.
Navigate to Sheet2. Now navigate to Data Tab and click on the button "From Web" on the extreme right. See snapshot.
Enter the url in the "New Web Query Window" and click on "Go"
Once the page is uploaded, select the relevant table that you want to import by clicking on the small arrow as shown in the snapshot. Once done, click on "Import"
Excel will then ask you where you want the data to be imported. Select the relevant cell and click on OK. And you are done! The data will be imported to the cell which you specified.
If you wish you can record a macro and automate this as well :)
Here is the macro that I recorded.
Sub Macro1()
With ActiveSheet.QueryTables.Add(Connection:= _
"URL;http://www.espncricinfo.com/big-bash-league-2011/engine/match/524915.html" _
, Destination:=Range("$A$1"))
.Name = "524915"
.FieldNames = True
.RowNumbers = False
.FillAdjacentFormulas = False
.PreserveFormatting = True
.RefreshOnFileOpen = False
.BackgroundQuery = True
.RefreshStyle = xlInsertDeleteCells
.SavePassword = False
.SaveData = True
.AdjustColumnWidth = True
.RefreshPeriod = 0
.WebSelectionType = xlSpecifiedTables
.WebFormatting = xlWebFormattingNone
.WebTables = """inningsBat1"""
.WebPreFormattedTextToColumns = True
.WebConsecutiveDelimitersAsOne = True
.WebSingleBlockTextImport = False
.WebDisableDateRecognition = False
.WebDisableRedirections = False
.Refresh BackgroundQuery:=False
End With
End Sub
Hope this helps. Let me know if you still have some queries.
Sid
For anyone else interested in this I ended up using the code below based on Siddhart Rout's earlier answer
XMLHttp was significantly quicker than automating IE
the code generates a CSV file for each series to be dowloaded (held in the X variable)
the code dumps each match to a regular 29 row range (regardless of how many players batted) to facillitate easier analysis later on
Public Sub PopulateDataSheets_XML()
Dim URL As String
Dim ws As Worksheet
Dim lngRow As Long
Dim lngRecords As Long
Dim lngWrite As Long
Dim lngSpare As Long
Dim lngInnings As Long
Dim lngRow1 As Long
Dim X(1 To 15, 1 To 4) As String
Dim objFSO As Object
Dim objTF As Object
Dim xmlHttp As Object
Dim htmldoc As HTMLDocument
Dim htmlbody As htmlbody
Dim tbl As HTMLTable
Dim tr As HTMLTableRow
Dim strInnings As String
s = Timer()
Set xmlHttp = CreateObject("MSXML2.ServerXMLHTTP")
Set objFSO = CreateObject("scripting.filesystemobject")
X(1, 1) = "http://www.espncricinfo.com/indian-premier-league-2011/engine/match/"
X(1, 2) = 501198
X(1, 3) = 501271
X(1, 4) = "indian-premier-league-2011"
X(2, 1) = "http://www.espncricinfo.com/big-bash-league-2011/engine/match/"
X(2, 2) = 524915
X(2, 3) = 524945
X(2, 4) = "big-bash-league-2011"
X(3, 1) = "http://www.espncricinfo.com/ausdomestic-2010/engine/match/"
X(3, 2) = 461028
X(3, 3) = 461047
X(3, 4) = "big-bash-league-2010"
Set htmldoc = New HTMLDocument
Set htmlbody = htmldoc.body
For lngRow = 1 To UBound(X, 1)
If Len(X(lngRow, 1)) = 0 Then Exit For
Set objTF = objFSO.createtextfile("c:\temp\" & X(lngRow, 4) & ".csv")
For lngRecords = X(lngRow, 2) To X(lngRow, 3)
URL = X(lngRow, 1) & lngRecords & ".html"
xmlHttp.Open "GET", URL
xmlHttp.send
Do While xmlHttp.Status <> 200
DoEvents
Loop
htmlbody.innerHTML = xmlHttp.responseText
objTF.writeline X(lngRow, 1) & lngRecords & ".html"
For lngInnings = 1 To 2
strInnings = "Innings " & lngInnings
objTF.writeline strInnings
Set tbl = Nothing
On Error Resume Next
Set tbl = htmlbody.Document.getElementById("inningsBat" & lngInnings)
On Error GoTo 0
If Not tbl Is Nothing Then
lngWrite = 0
For lngRow1 = 0 To tbl.Rows.Length - 1
Set tr = tbl.Rows(lngRow1)
If Trim(tr.innerText) <> vbNewLine Then
If tr.Cells.Length > 2 Then
If tr.Cells(1).innerText <> "Extras" Then
If Len(tr.Cells(1).innerText) > 0 Then
objTF.writeline strInnings & "-" & lngWrite & "," & Trim(tr.Cells(1).innerText) & "," & Trim(tr.Cells(3).innerText)
lngWrite = lngWrite + 1
End If
Else
objTF.writeline strInnings & "-" & lngWrite & "," & Trim(tr.Cells(1).innerText) & "," & Trim(tr.Cells(3).innerText)
lngWrite = lngWrite + 1
Exit For
End If
End If
End If
Next
For lngSpare = 12 To lngWrite Step -1
objTF.writeline strInnings & "-" & lngWrite + (12 - lngSpare)
Next
Else
For lngSpare = 1 To 13
objTF.writeline strInnings & "-" & lngWrite + (12 - lngSpare)
Next
End If
Next
Next
Next
'Call ConsolidateSheets
End Sub
RegEx is not a complete solution for parsing HTML because it is not guaranteed to be regular.
You should use the HtmlAgilityPack to query the HTML. This will allow you to use the CSS selectors to query the HTML similar to how you do it with jQuery.
As quite a few people may see this I thought I would use it as a chance to demonstrate a few features I rarely see people using in VBA web-scraping: deleteRow, querySelector and use of clipboard to write out a table (complete with formatting and hyperlinks) to a sheet based on the table.outerHTML.
deleteRow is used to remove the unwanted rows. querySelector is used to apply faster css selectors to match on nodes. Modern browsers/html parsers are optimized for css and class selectors (which I use) are the second fastest selector type (after id).
Use of css selectors and understanding htmlTable methods/properties will allow for much greater flexibility in your web-scraping endeavours. Understanding the use of the clipboard means a simple copy paste method for transferring a table to Excel.
Execution could easily be tied to a button push and the url read in from a cell.
VBA:
Option Explicit
Public Sub test()
WriteOutTable "https://www.espncricinfo.com/series/8044/scorecard/524935/hobart-hurricanes-vs-melbourne-stars-big-bash-league-2011-12"
End Sub
Public Sub WriteOutTable(ByVal url As String)
'required VBE (Alt+F11) > Tools > References > Microsoft HTML Object Library ; Microsoft XML, v6 (your version may vary)
Dim hTable As MSHTML.HTMLTable, clipboard As Object
Dim xhr As MSXML2.xmlhttp60, html As MSHTML.htmlDocument
Set xhr = New MSXML2.xmlhttp60
Set html = New MSHTML.htmlDocument
With xhr
.Open "GET", url, False
.Send
html.body.innerHTML = .responseText
End With
Set hTable = html.querySelector(".batsman")
rowCount = hTable.Rows.Length - 1
For i = rowCount To 0 Step -1
Select Case True
Case i = rowCount Or i = rowCount - 1 Or InStr(hTable.Rows(i).outerHTML, "wicket-details") > 0
hTable.deleteRow i
End Select
Next
Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
clipboard.SetText hTable.outerHTML
clipboard.PutInClipboard
ActiveSheet.Cells(1, 1).PasteSpecial
End Sub