【问题标题】:Scraping table from website (forexfactory.com)从网站(forexfactory.com)抓取表格
【发布时间】:2021-07-01 21:11:17
【问题描述】:

需要一些帮助来从 forexfactory.com 抓取一个简单的表格。下面的代码可以工作,但格式很不稳定,而且那里的列似乎有一个不能正常工作的列。我知道我使用的代码很新手。有待改进。

Sub Pulldata2()

 Dim ieObj As InternetExplorer
 Dim appIE As Object
 Dim htmlEle As IHTMLElement
 Dim i As Integer
 Dim strSheet As String
 Dim LastRow As Long
    
 strSheet = Sheet1.Range("A3")
    
       
    Set ieObj = New InternetExplorer
    ieObj.Visible = False
    ieObj.navigate Sheet1.Range("A3").Value
    
    
    Application.ScreenUpdating = False
    Application.DisplayStatusBar = False
    Application.Wait Now + TimeValue("00:00:03")
    Sheet2.Activate
    
        
    For Each htmlEle In ieObj.document.getElementsByClassName("calendar__table")(0).getElementsByTagName("tr")
        LastRow = Cells(Rows.Count, "A").End(xlUp).Row
        i = 1 + LastRow
        With ActiveSheet
        
            .Range("A" & i).Value = htmlEle.Children(1).textContent
            .Range("B" & i).Value = htmlEle.Children(2).textContent
            .Range("C" & i).Value = htmlEle.Children(3).textContent
            .Range("D" & i).Value = htmlEle.Children(4).textContent
            .Range("E" & i).Value = htmlEle.Children(5).textContent
            .Range("F" & i).Value = htmlEle.Children(6).textContent
            .Range("G" & i).Value = htmlEle.Children(7).textContent
            .Range("H" & i).Value = htmlEle.Children(8).textContent
       
          
    End With
    i = i + 1
    On Error Resume Next
Next htmlEle
        


End Sub

目前正在发生的事情:

我想要的:

【问题讨论】:

  • 请显示示例输出行的外观。例如,您想要实际值、预测值、先前值吗?了解输出应该是什么样子是良好抓取的第一步,这将有助于我们为您提供帮助。
  • 嘿QHarr!谢谢回复。我更新了帖子以显示正在发生的事情。理想情况下,我想要输入的只是日期、货币、报告,然后是实际、预测和上一个。您可以在我的代码中看到我正在尝试跳过列。提前致谢 - 您之前曾帮助过我,非常感谢。
  • 您只需要表格中的第一个(最新)日期吗?
  • 是的 - 我不太挑剔。你说的是 TueApr 6 日期对吗?我只希望它在日期下作为单个单元格。
  • 我的意思是,你只想要那个日期的数据,对吧?

标签: excel vba internet-explorer web-scraping


【解决方案1】:

首先要注意的是内容存在于视图源中,这意味着它可以由 xmlhttp 请求,而无需借助浏览器的开销。

接下来要注意的是,每个目标列都可以由特定类定位。为了从正确的行开始,忽略标题还必须指定父类。

由于css选择器很长,但并不复杂,所以我通过循环生成它。

我使用 css 选择器返回目标列中的所有 td 节点。我根据要检索的列数使用一点数学循环该列表(注意我不是在此 nodeList 中检索日期;我已经在脚本中较早地抓住了它,然后将其添加到占位符第一列)。这样我重新创建了只包含所需列的表。


Option Explicit

Public Sub WriteTable()
    'tools > references > Microsoft HTML Object Library
    Dim html As MSHTML.HTMLDocument, xhr As Object
    
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    With xhr
        .Open "GET", "https://www.forexfactory.com/", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim dateText As String
    
    dateText = html.querySelector(".flexDatePicker").Value
    
    Dim css As String, base As String, classSet()
    
    base = ", .calendar__row > "                 'parent class to prefix each child class
    classSet = Array(".calendar__time", ".calendar__currency", ".calendar__event", _
                     ".calendar__actual", ".calendar__forecast", ".calendar__previous") 'classes to match on
    css = ".calendar__row >  " & Join$(classSet, base) 'final selector
    
    Dim tableCells As Object, r As Long, res As Long, c As Long, results()
    
    Set tableCells = html.querySelectorAll(css)
    r = 1
    
    ReDim results(1 To tableCells.Length / 6, 1 To 7) 'calc size of results array based on 6 columns + 1 for date (previously calculated)
    
    results(1, 1) = dateText
    
    For c = 0 To tableCells.Length - 1
        res = c Mod 6
        results(r, res + 2) = tableCells.Item(c).innerText
        If res = 5 Then r = r + 1                'new row in array
    Next
    
    'array fill down time and populate blank date rows
    For r = UBound(results, 1) To LBound(results, 1) + 1 Step -1
        results(r, 2) = IIf(results(r, 2) = vbNullString, results(r - 1, 2), results(r, 2))
        results(r, 1) = dateText
    Next
    
    Dim headers()
    
    headers = Array("Date", "Time", "Currency", "Event", "Actual", "Forecast", "Previous")
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
    
End Sub

阅读:

  1. CSS selectors

更传统的方法可能如下所示(尽管需要重构以降低嵌套级别):

Option Explicit

Public Sub WriteTable()
    'tools > references > Microsoft HTML Object Library
    Dim html As MSHTML.HTMLDocument, xhr As Object
    
    Set xhr = CreateObject("MSXML2.XMLHTTP")
    Set html = New MSHTML.HTMLDocument

    With xhr
        .Open "GET", "https://www.forexfactory.com/", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With
    
    Dim dateText As String
    
    dateText = html.querySelector(".flexDatePicker").Value
    
    Dim table As MSHTML.HTMLTable, r As Long, c As Long, results() As Variant, n As Long
    
    Set table = html.querySelector(".calendar__table")
  
    ReDim results(1 To table.Rows.Length - 4, 1 To 7) 'calc size of results array based on 6 columns + 1 for date (previously calculated)

    results(1, 1) = dateText
    
    Dim i As Long
    
    For r = 4 To table.Rows.Length - 1
        n = 2
        If table.Rows(r).Children.Length = 10 Then
            If table.Rows(r).Children(4).innerText <> vbNullString Then
                i = i + 1
                For c = 1 To table.Rows(r).Children.Length - 1
                    Select Case c
                    Case 1, 2, 4, 6, 7, 8
                        results(i, n) = table.Rows(r).Children(c).innerText
                        n = n + 1
                    End Select
                Next
            End If
        End If
    Next
    
    results = Application.Transpose(results)
    ReDim Preserve results(1 To 7, 1 To i)
    results = Application.Transpose(results)
    
    'array fill down time and populate blank date rows
    For r = UBound(results, 1) To LBound(results, 1) + 1 Step -1
        results(r, 2) = IIf(results(r, 2) = vbNullString, results(r - 1, 2), results(r, 2))
        results(r, 1) = dateText
    Next
    
    Dim headers() As Variant
    
    headers = Array("Date", "Time", "Currency", "Event", "Actual", "Forecast", "Previous")
    
    With ActiveSheet
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
    
End Sub

【讨论】:

  • 嗨 QHarr - 我在这一行遇到错误 dateText = html.querySelector(".flexDatePicker").Value
  • 代码对我来说运行良好
  • 我将错误图片保存在主帖中。它可能在我的参考文献中。我添加了 Microsoft HTML 对象库
  • 当您按下调试时,dateText = html.querySelector 的黄色行会突出显示吗?
  • 你可以尝试用html.getElementsByClassName("flexdatepicker__date")(0).getElementsByTagName("input").value替换
猜你喜欢
  • 2021-07-08
  • 2019-01-16
  • 1970-01-01
  • 2017-10-16
  • 1970-01-01
  • 2022-07-10
  • 2018-10-13
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多