【问题标题】:VBA web scraping - assigning data to correct rows when some data is missingVBA网络抓取 - 当某些数据丢失时将数据分配给正确的行
【发布时间】:2016-09-08 23:07:33
【问题描述】:

下面的宏应该填充网站上的搜索参数并从结果中抓取数据 - 131 页,通常每页 5 行数据。为了测试,我注释掉了循环,所以它只有 1 页。

在 Excel 工作表中填写以下数据列:公司名称、联系人姓名、联系人电子邮件、位置、电话 (BH)、手机、服务。

我只有电话 (BH) 和手机有问题。原因是公司有时只有电话 (BH) 或移动电话,而不是两者兼有。

我循环的方式(5 个公司名称,然后是 5 个联系人姓名等)需要检测所有数据,否则我不知道这些数据应该在哪一行。

Option Explicit

Private Sub CommandButton1_Click()

Dim htmlele As IHTMLElement
Dim Link As String
Dim ie As Object
Dim page As Integer
Dim companyresult As Long, nameresult As Long, emailresult As Long, locationresult As Long, servicesresult As Long

Application.ScreenUpdating = False

Link = "http://bdav.org.au/find"

Set ie = CreateObject("InternetExplorer.Application")

ie.Navigate Link
'ie.Visible = True

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "postcode" Then htmlele.Value = "3000"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Name = "dist" Then htmlele.Value = "99"
Next htmlele

For Each htmlele In ie.Document.getElementsByTagName("input")
    If htmlele.Type = "submit" Then htmlele.Click
Next htmlele

Do Until ie.ReadyState = 4 And ie.Busy = False
    DoEvents
Loop

'Do

    page = page + 1
    Link = "http://bdav.org.au/find/" & page

    Do Until ie.ReadyState = 4 And ie.Busy = False
        DoEvents
    Loop

    With ThisWorkbook.Worksheets("results")
        For Each htmlele In ie.Document.getElementsByTagName("h2")
            companyresult = companyresult + 1
            .Range("a" & companyresult + 1).Value = htmlele.innerHTML
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                nameresult = nameresult + 1
                .Range("b" & nameresult + 1).Value = htmlele.innerHTML
            End If
        Next htmlele

        For Each htmlele In ie.Document.getElementsByTagName("a")

            If Left(htmlele.href, 7) = "mailto:" Then
                emailresult = emailresult + 1
                .Range("c" & emailresult + 1).Value = Mid(htmlele.href, 8)
            End If

        Next htmlele


        For Each htmlele In ie.Document.getElementsByTagName("label")

            If htmlele.innerHTML = "Location:" Then
                locationresult = locationresult + 1
                .Range("d" & locationresult + 1).Value = htmlele.NextSibling.innerHTML
            End If

        Next htmlele

           For Each htmlele In ie.Document.getElementsByTagName("label")

            If htmlele.innerHTML = "Services:" Then
                servicesresult = servicesresult + 1
                If Right(htmlele.NextSibling.innerHTML, 4) = "<br>" Then
                    htmlele.NextSibling.innerHTML = Left(htmlele.NextSibling.innerHTML, Len(htmlele.NextSibling.innerHTML) - 4)
                End If
                .Range("g" & servicesresult + 1).Value = Replace(htmlele.NextSibling.innerHTML, "<br>", vbLf)
            End If

        Next htmlele


    End With

'Loop

ie.Quit

Set ie = Nothing

Application.ScreenUpdating = True

End Sub

【问题讨论】:

    标签: vba web-scraping


    【解决方案1】:

    考虑继续解析 PhoneMobile 数据(假设它们存在于页面上),然后运行有条件的错误句柄以说明可能不存在的情况。具体来说,错误句柄将使用Resume Next(在问题行后继续)Error 91 用于对象变量未设置

    至于与行对齐,请尝试使用不同的迭代器,因为 phoneresult = phoneresult + 1 如果随机丢失,将不可靠。所以使用servicesresult 或任何始终存在的节点是PhoneMobile 的兄弟节点。以下不是确切的代码,但包含用于演示概念。

    Private Sub CommandButton1_Click()
    On Error Goto ErrHandle
            '...    
            For Each htmlele In ie.Document.getElementsByTagName("Phone")
                If htmlele.innerHTML = "Phone (BH):" Then
                    '...other needed code
                    .Range("h" & servicesresult + 1).Value = htmlele.innerHTML
                End If    
            Next htmlele
    
            For Each htmlele In ie.Document.getElementsByTagName("Mobile")
                If htmlele.innerHTML = "Mobile:" Then
                    '...other needed code
                    .Range("i" & servicesresult + 1).Value = htmlele.innerHTML
                End If    
            Next htmlele
            '...
            Exit Sub
    
    ErrHandle:
        ' MISSING NODE ERROR
        If Err.Number = 91 Then
            Resume Next
        ' ALL OTHER ERRORS
        Else:
            MsgBox Err.Number & " - " & Err.Description
            Exit Sub
        End If
    End Sub
    

    【讨论】:

    • 不幸的是我无法使这个解决方案工作,没有为宏生成错误 91。但是我找到了一个解决方案,我在这里发布。
    【解决方案2】:

    我找到了一个比我之前尝试过的方法更简洁的解决方案,只需 1 次循环遍历标签标签元素集合。我根据位置标签外观获取行号(此标签始终出现在公司中)。如果缺少该公司的任何其他标签,则不会放在该行中。

    Option Explicit
    
    Private Sub CommandButton1_Click()
    
    Dim htmlele As IHTMLElement
    Dim Link As String
    Dim ie As Object
    Dim page As Integer
    Dim companyresult As Long, nameresult As Long, emailresult As Long, labelresult As Long
    
    Application.ScreenUpdating = False
    
    Link = "http://bdav.org.au/find"
    
    Set ie = CreateObject("InternetExplorer.Application")
    
    ie.Navigate Link
    'ie.Visible = True
    
    Do Until ie.ReadyState = 4 And ie.Busy = False
        DoEvents
    Loop
    
    For Each htmlele In ie.Document.getElementsByTagName("input")
        If htmlele.Name = "postcode" Then htmlele.Value = "3000"
    Next htmlele
    
    For Each htmlele In ie.Document.getElementsByTagName("input")
        If htmlele.Name = "dist" Then htmlele.Value = "99"
    Next htmlele
    
    For Each htmlele In ie.Document.getElementsByTagName("input")
        If htmlele.Type = "submit" Then htmlele.Click
    Next htmlele
    
    Do Until ie.ReadyState = 4 And ie.Busy = False
        DoEvents
    Loop
    
    Do
    
        page = page + 1
        Link = "http://bdav.org.au/find/" & page
    
        ie.Navigate Link
    
        Do Until ie.ReadyState = 4 And ie.Busy = False
            DoEvents
        Loop
    
        With ThisWorkbook.Worksheets("results")
            For Each htmlele In ie.Document.getElementsByTagName("h2")
                companyresult = companyresult + 1
                .Range("a" & companyresult + 1).Value = htmlele.innerHTML
            Next htmlele
    
            For Each htmlele In ie.Document.getElementsByTagName("a")
    
                If Left(htmlele.href, 7) = "mailto:" Then
                    nameresult = nameresult + 1
                    .Range("b" & nameresult + 1).Value = htmlele.innerHTML
                End If
            Next htmlele
    
            For Each htmlele In ie.Document.getElementsByTagName("a")
    
                If Left(htmlele.href, 7) = "mailto:" Then
                    emailresult = emailresult + 1
                    .Range("c" & emailresult + 1).Value = Mid(htmlele.href, 8)
                End If
    
            Next htmlele
    
            For Each htmlele In ie.Document.getElementsByTagName("label")
    
                If htmlele.innerHTML = "Location:" Then
                    labelresult = labelresult + 1
                    .Range("d" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
                ElseIf htmlele.innerHTML = "Phone (BH):" Then
                    .Range("e" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
                ElseIf htmlele.innerHTML = "Mobile:" Then
                    .Range("f" & labelresult + 1).Value = htmlele.NextSibling.innerHTML
                ElseIf htmlele.innerHTML = "Services:" Then
                    If Right(htmlele.NextSibling.innerHTML, 4) = "<br>" Then
                        htmlele.NextSibling.innerHTML = Left(htmlele.NextSibling.innerHTML, Len(htmlele.NextSibling.innerHTML) - 4)
                    End If
                    .Range("g" & labelresult + 1).Value = Replace(htmlele.NextSibling.innerHTML, "<br>", vbLf)
                End If
    
            Next htmlele
    
        End With
    
    Loop
    
    ie.Quit
    
    Set ie = Nothing
    
    Application.ScreenUpdating = True
    
    End Sub
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2020-11-21
      • 2020-08-02
      • 1970-01-01
      • 1970-01-01
      • 2020-12-11
      • 2023-01-22
      相关资源
      最近更新 更多