看看下面的例子,它展示了如何通过 XHR 检索 HTML 内容,用 RegExp 解析它并输出到工作表:
Option Explicit
Sub Test()
Dim sUrl As String
Dim sPayLoad As String
Dim sContent As String
Dim aData()
' Retrieve general info
sUrl = "http://globalenergyobservatory.org/geoid/40540"
sContent = GetXHR(sUrl)
aData = ParseData(sContent)
' Output to Sheet1
With ThisWorkbook.Sheets(1)
.Cells.Delete
Output .Cells(1, 1), aData
End With
' Retrieve performance data
sUrl = "http://globalenergyobservatory.org/geoutility.php"
sPayLoad = "type=Gas&op=performance&did=40540&StartYear=2000&EndYear=2009"
sContent = PostXHR(sUrl, sPayLoad)
aData = ParseData(sContent)
' Output to Sheet2
With ThisWorkbook.Sheets(2)
.Cells.Delete
Output .Cells(1, 1), aData
End With
MsgBox "Completed"
End Sub
Function GetXHR(sUrl As String) As String
' GET XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sUrl, False
.Send
GetXHR = .responseText
End With
End Function
Function PostXHR(sUrl As String, sPayLoad As String) As String
' POST XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "POST", sUrl, False
.SetRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.SetRequestHeader "Content-Length", Len(sPayLoad)
.Send CStr(sPayLoad)
PostXHR = .responseText
End With
End Function
Function ParseData(sContent As String) As Variant()
Dim cRows As Object
Dim oRow
Dim sRowContent As String
Dim cCells As Object
Dim oCell
Dim sCellContent As String
Dim cItems As Object
Dim oItem
Dim sSubMatch
Dim sItem As String
Dim aItems()
Dim aRows()
Dim aData()
Dim j As Long
Dim i As Long
' Create dictionary for rows
With CreateObject("Scripting.Dictionary")
' Get row nodes <tr> RegEx matches collection
Set cRows = RegExMatches(sContent, "<tr[^>]*?/>|<tr[^>]*?>(.*?)</tr>")
' Process each rows in collection
For Each oRow In cRows
' Get row HTML content
sRowContent = oRow.SubMatches(0)
' Get cell nodes <th> and <td> RegEx matches collection
Set cCells = RegExMatches(sRowContent, "<td[^>]*?/>|<t([hd])[^>]*?>([\s\S]*?)</t\1>")
' Create dictionary for cell items
With CreateObject("Scripting.Dictionary")
' Process each cell in collection
For Each oCell In cCells
' Get cell HTML content
sCellContent = oCell.SubMatches(1)
' RegEx patterns combined:
' <input type="text"[^>]*?value="([^"]*?)"[^>]*?> - textbox value
' <input type="checkbox"[^>]*?checked="checked">([^<]*?)</input> - checked box
' <input type="radio"[^>]*?checked="(checked)"/> - checked radio
' <label[^>]*?>([^<]*?)</label> - label text
' <span[^>]*?>([^<]*?)(?=<) - text within span
' <select[^>]*?>[\s\S]*?<option[^>]*?selected="selected">([^<]*?)</option> - dropdown value
' ^[^<>]* - plain text
' Get the below cell items RegEx matches collection
Set cItems = RegExMatches(sCellContent, _
"<input type=""text""[^>]*?value=(""[^""]*?"")[^>]*?>|" & _
"<input type=""checkbox""[^>]*?checked=""checked"">([^<]*?)</input>|" & _
"<input type=""radio""[^>]*?checked=""(checked)""/>|" & _
"<label[^>]*?>([^<]*?)</label>|" & _
"<span[^>]*?>([^<]*?)(?=<)|" & _
"<select[^>]*?>[\s\S]*?<option[^>]*?selected=""selected"">([^<]*?)</option>|" & _
"^([^<>]*)" _
)
' Process each item in collection
For Each oItem In cItems
' Create dictionary for item submatches
With CreateObject("Scripting.Dictionary")
' Add combined patterns submatches to dictionary
For Each sSubMatch In oItem.SubMatches
.Item(.Count) = sSubMatch
Next
' Convert dictionary to array and join array elements, then trim result string
sItem = Trim(Join(.Items(), ""))
End With
' Nonempty string add to cell items dictionary
If sItem <> "" Then .Item(.Count) = sItem
Next
Next
' Convert cell items dictionary to array of cell items
aItems = .Items()
End With
' Nonempty array of cell items add to row dictionary
If UBound(aItems) >= 0 Then .Item(.Count) = aItems
Next
' Convert rows dictionary to rows array
' Rows array contains nested cell items arrays for each matched item in row
aRows = .Items()
End With
' Convert nested arrays into 2d array for output
' ReDim the first dimension only once
ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
' Copy values for each row and cell
For j = 0 To UBound(aRows)
aItems = aRows(j)
For i = 0 To UBound(aItems)
' Expand the second dimension if necessary
If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
aData(j + 1, i + 1) = aItems(i)
Next
Next
' Return 2d array
ParseData = aData
End Function
Function RegExMatches(sText As String, sPattern As String, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
Set RegExMatches = .Execute(sText)
End With
End Function
Sub Output(objDstRng As Range, arrCells As Variant)
With objDstRng
.Parent.Select
With .Resize( _
UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _
UBound(arrCells, 2) - LBound(arrCells, 2) + 1)
.NumberFormat = "@"
.WrapText = True
.Value = arrCells
.Columns.ColumnWidth = 25
.Columns.AutoFit
.EntireRow.AutoFit
End With
End With
End Sub
对于源网页:
Sheet1 上我的输出如下:
在 Sheet2 上:
由于网页在浏览器中的实际运行方式,一般信息和性能数据是分开提取的 - 性能数据是在网页加载时从 POST XHR 作为 DHTML 添加到页面的。