【问题标题】:Extract values from entry cells on website to excel从网站上的输入单元格中提取值到 Excel
【发布时间】:2017-06-28 14:36:39
【问题描述】:

我在网页上有一个带有输入单元格和下拉菜单的表格,但我似乎找不到将其提取到 Excel 工作表的方法。

我使用了以下代码:

Sub ExtractGEO()

Dim objWeb As QueryTable

Set objWeb = ActiveSheet.QueryTables.Add( _
Connection:="URL;http://globalenergyobservatory.org/geoid/40540", _
Destination:=Range("A1"))

With objWeb

    .WebSelectionType = xlSpecifiedTables
    .WebTables = "3" ' Identify your HTML Table here
    .Refresh BackgroundQuery:=False
    .SaveData = True
End With
'
End Sub

但这只会提取标题,而不是输入单元格中写入的值,也不会提取下拉菜单上的选定值。 我完全不熟悉 HTML,所以我不明白在提取表格时如何不提取这些信息。任何帮助表示赞赏。

谢谢!

【问题讨论】:

  • Excel QueryTable 没有您需要的那么灵活。考虑 IE 自动化或 XHR 响应解析。您需要获取所有表格行的集合并处理每个单元格,包括从inputselect 标签中提取数据。

标签: excel vba drop-down-menu webpage data-extraction


【解决方案1】:

这将使您入门。这将返回网站上的所有数据。您将不得不研究捕获所需数据的不同方法。

Sub ExtractGEO()

' Requires a reference to Microsoft XML, v6.0

Dim oXH             As Object
Dim sURL            As String
Dim bodytxt         As String

On Error GoTo errorHandler

sURL = "http://globalenergyobservatory.org/geoid/40540"

Set oXH = CreateObject("msxml2.xmlhttp")

With oXH
.Open "get", sURL, False
.Send
 bodytxt = .responseText
End With

Msgbox  bodytxt

Debug.Print bodytxt

errorHandler:

Set oXH = Nothing

End Sub

如果您希望我举一个使用 IE 提取方法的示例,我希望对您有所帮助,我只想问。 Id 往往要慢得多。我上面写的方法的唯一问题是您可以拨打电话的次数有限。如果你有工作回合,你可能永远不会打。

【讨论】:

    【解决方案2】:

    看看下面的例子,它展示了如何通过 XHR 检索 HTML 内容,用 RegExp 解析它并输出到工作表:

    Option Explicit
    
    Sub Test()
    
        Dim sUrl As String
        Dim sPayLoad As String
        Dim sContent As String
        Dim aData()
    
        ' Retrieve general info
        sUrl = "http://globalenergyobservatory.org/geoid/40540"
        sContent = GetXHR(sUrl)
        aData = ParseData(sContent)
        ' Output to Sheet1
        With ThisWorkbook.Sheets(1)
            .Cells.Delete
            Output .Cells(1, 1), aData
        End With
        ' Retrieve performance data
        sUrl = "http://globalenergyobservatory.org/geoutility.php"
        sPayLoad = "type=Gas&op=performance&did=40540&StartYear=2000&EndYear=2009"
        sContent = PostXHR(sUrl, sPayLoad)
        aData = ParseData(sContent)
        ' Output to Sheet2
        With ThisWorkbook.Sheets(2)
            .Cells.Delete
            Output .Cells(1, 1), aData
        End With
    
        MsgBox "Completed"
    
    End Sub
    
    Function GetXHR(sUrl As String) As String
    
        ' GET XHR
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", sUrl, False
            .Send
            GetXHR = .responseText
        End With
    
    End Function
    
    Function PostXHR(sUrl As String, sPayLoad As String) As String
    
        ' POST XHR
        With CreateObject("MSXML2.XMLHTTP")
            .Open "POST", sUrl, False
            .SetRequestHeader "Content-Type", "application/x-www-form-urlencoded"
            .SetRequestHeader "Content-Length", Len(sPayLoad)
            .Send CStr(sPayLoad)
            PostXHR = .responseText
        End With
    
    End Function
    
    Function ParseData(sContent As String) As Variant()
    
        Dim cRows As Object
        Dim oRow
        Dim sRowContent As String
        Dim cCells As Object
        Dim oCell
        Dim sCellContent As String
        Dim cItems As Object
        Dim oItem
        Dim sSubMatch
        Dim sItem As String
        Dim aItems()
        Dim aRows()
        Dim aData()
        Dim j As Long
        Dim i As Long
    
        ' Create dictionary for rows
        With CreateObject("Scripting.Dictionary")
            ' Get row nodes <tr> RegEx matches collection
            Set cRows = RegExMatches(sContent, "<tr[^>]*?/>|<tr[^>]*?>(.*?)</tr>")
            ' Process each rows in collection
            For Each oRow In cRows
                ' Get row HTML content
                sRowContent = oRow.SubMatches(0)
                ' Get cell nodes <th> and <td> RegEx matches collection
                Set cCells = RegExMatches(sRowContent, "<td[^>]*?/>|<t([hd])[^>]*?>([\s\S]*?)</t\1>")
                ' Create dictionary for cell items
                With CreateObject("Scripting.Dictionary")
                    ' Process each cell in collection
                    For Each oCell In cCells
                        ' Get cell HTML content
                        sCellContent = oCell.SubMatches(1)
                        ' RegEx patterns combined:
                        ' <input type="text"[^>]*?value="([^"]*?)"[^>]*?> - textbox value
                        ' <input type="checkbox"[^>]*?checked="checked">([^<]*?)</input> - checked box
                        ' <input type="radio"[^>]*?checked="(checked)"/> - checked radio
                        ' <label[^>]*?>([^<]*?)</label> - label text
                        ' <span[^>]*?>([^<]*?)(?=<) - text within span
                        ' <select[^>]*?>[\s\S]*?<option[^>]*?selected="selected">([^<]*?)</option> - dropdown value
                        ' ^[^<>]* - plain text
                        ' Get the below cell items RegEx matches collection
                        Set cItems = RegExMatches(sCellContent, _
                            "<input type=""text""[^>]*?value=(""[^""]*?"")[^>]*?>|" & _
                            "<input type=""checkbox""[^>]*?checked=""checked"">([^<]*?)</input>|" & _
                            "<input type=""radio""[^>]*?checked=""(checked)""/>|" & _
                            "<label[^>]*?>([^<]*?)</label>|" & _
                            "<span[^>]*?>([^<]*?)(?=<)|" & _
                            "<select[^>]*?>[\s\S]*?<option[^>]*?selected=""selected"">([^<]*?)</option>|" & _
                            "^([^<>]*)" _
                            )
                        ' Process each item in collection
                        For Each oItem In cItems
                            ' Create dictionary for item submatches
                            With CreateObject("Scripting.Dictionary")
                                ' Add combined patterns submatches to dictionary
                                For Each sSubMatch In oItem.SubMatches
                                    .Item(.Count) = sSubMatch
                                Next
                                ' Convert dictionary to array and join array elements, then trim result string
                                sItem = Trim(Join(.Items(), ""))
                            End With
                            ' Nonempty string add to cell items dictionary
                            If sItem <> "" Then .Item(.Count) = sItem
                        Next
                    Next
                    ' Convert cell items dictionary to array of cell items
                    aItems = .Items()
                End With
                ' Nonempty array of cell items add to row dictionary
                If UBound(aItems) >= 0 Then .Item(.Count) = aItems
            Next
            ' Convert rows dictionary to rows array
            ' Rows array contains nested cell items arrays for each matched item in row
            aRows = .Items()
        End With
        ' Convert nested arrays into 2d array for output
        ' ReDim the first dimension only once
        ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
        ' Copy values for each row and cell
        For j = 0 To UBound(aRows)
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                ' Expand the second dimension if necessary
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Next
        ' Return 2d array
        ParseData = aData
    
    End Function
    
    Function RegExMatches(sText As String, sPattern As String, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
        With CreateObject("VBScript.RegExp")
            .Global = bGlobal
            .MultiLine = bMultiLine
            .IgnoreCase = bIgnoreCase
            .Pattern = sPattern
            Set RegExMatches = .Execute(sText)
        End With
    End Function
    
    Sub Output(objDstRng As Range, arrCells As Variant)
    
        With objDstRng
            .Parent.Select
            With .Resize( _
                    UBound(arrCells, 1) - LBound(arrCells, 1) + 1, _
                    UBound(arrCells, 2) - LBound(arrCells, 2) + 1)
                .NumberFormat = "@"
                .WrapText = True
                .Value = arrCells
                .Columns.ColumnWidth = 25
                .Columns.AutoFit
                .EntireRow.AutoFit
            End With
        End With
    
    End Sub
    

    对于源网页:

    Sheet1 上我的输出如下:

    在 Sheet2 上:

    由于网页在浏览器中的实际运行方式,一般信息和性能数据是分开提取的 - 性能数据是在网页加载时从 POST XHR 作为 DHTML 添加到页面的。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2017-08-04
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2023-03-16
      • 2020-07-13
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多