【问题标题】:Extracting HTML code from websites by looping through list of URLs通过遍历 URL 列表从网站中提取 HTML 代码
【发布时间】:2019-09-29 01:04:58
【问题描述】:

我正在使用 Excel VBA 根据 D 列中每一行中的 URL 启动一个 IE 浏览器选项卡。然后根据预定义的类提取相关的 HTML 代码并填充到 A - C 列中。

很确定我错过了一步。该过程在 D2 处停止,并且不会继续从下一个 URL 中提取 HTML(在单元格 D3、D4 等中)。

提前感谢您的任何建议!

Sub useClassnames()
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
Dim IE As InternetExplorer
Dim html As HTMLDocument
Dim shellWins As New ShellWindows
Dim IE_TabURL As String
Dim intRowPosition As Integer

Set IE = New InternetExplorer
IE.Visible = False

intRowPosition = 2

Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True

IE.navigate Sheet1.Range("D" & intRowPosition)

While IE.Busy
    DoEvents
Wend

intRowPosition = intRowPosition + 1

While Sheet1.Range("D" & intRowPosition) <> vbNullString
    IE.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)

    While IE.Busy
        DoEvents
    Wend

    intRowPosition = intRowPosition + 1

Wend

Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading Web page…"
DoEvents
Loop

Set html = IE.document
Set elements = html.getElementsByClassName("container-bs")

Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "container-bs" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
count = count + 1
End If
Next element

Range("A2:C2000").Select
Columns("A:A").EntireColumn.AutoFit
Columns("B:B").ColumnWidth = 36
End Sub

【问题讨论】:

    标签: excel vba internet-explorer web-scraping automation


    【解决方案1】:

    你的台词

    Set html = IE.document
    Set elements = html.getElementsByClassName("container-bs")
    

    等发生在While 循环之后。它必须在里面。


    您的If 声明:

    If element.className = "container-bs"
    

    应该是多余的,因为您已经在循环该类名的集合;所以我删除了这个。


    您没有在循环中使用element,因此本质上您是在使用它来控制递增的计数器变量。这表明您可以使用更好的编码策略来检索感兴趣的项目。


    始终声明父工作表,不要依赖隐式 Activesheet 引用 - 这很容易出错。


    我希望结构更像如下(我无法考虑重构以删除 element


    Option Explicit
    Public Sub UseClassnames()
        Dim element As IHTMLElement, elements As IHTMLElementCollection, ie As InternetExplorer
        Dim html As HTMLDocument, intRowPosition As Long
    
        intRowPosition = 2
        Set ie = CreateObject("InternetExplorer.Application")
        ie.Visible = True
    
        While Sheet1.Range("D" & intRowPosition) <> vbNullString
    
            If intRowPosition = 2 Then
                ie.navigate Sheet1.Range("D" & intRowPosition)
            Else
                ie.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)
            End If
    
            While ie.Busy Or ie.readyState < 4: DoEvents: Wend
    
            Set html = ie.document
            Set elements = html.getElementsByClassName("container-bs")
    
            Dim count As Long, erow As Long
    
            count = 0
    
            For Each element In elements
                erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
                With Sheet1
                    .Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
                    .Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
                    .Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
                End With
                count = count + 1
            Next element
    
            intRowPosition = intRowPosition + 1
        Wend
        With Sheet1
            .Range("A2:C2000").Select
            .Columns("A:A").EntireColumn.AutoFit
            .Columns("B:B").ColumnWidth = 36
        End With
    End Sub
    

    【讨论】:

    • 非常感谢您的详细解释,确实很有帮助。我正在抓取的网站似乎出了点问题,我会重新测试一下。
    • 这仍然存在问题吗?
    • 我刚刚有机会尝试一下并重构以删除element。工作就像一个魅力,非常感谢你!
    • 很高兴它到达那里。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2014-02-13
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2022-06-13
    • 2012-04-08
    • 1970-01-01
    相关资源
    最近更新 更多