【问题标题】:Web Scraping by Elements按元素抓取网页
【发布时间】:2019-04-19 05:51:02
【问题描述】:

我正在尝试从网页中抓取数据。它不适用于所有格式都与类相同的网站,标记所有内容。我收到一个错误是“下标超出范围”,它在“ReDim results(1 To rowCount, 1 To numColumns)”代码上突出显示。

我在页面上得到了答案:Web Scraping by TagName 该代码适用于https://www.neighborhoodselfstorage.net/self-storage-ocean-city-md-88769

现在我尝试使用相同的代码:https://www.stormore.net/self-storage-seattle-wa-101616#utm_source=GoogleLocal&utm_medium=WRLocal&utm_campaign=101616

请任何人帮助解决这个问题。

Option Explicit  
Public Sub GetInfo()
Dim ws As Worksheet, html As HTMLDocument, s As String
Const URL As String = "https://www.stormore.net/self-storage-seattle-wa-101616#utm_source=GoogleLocal&utm_medium=WRLocal&utm_campaign=101616"

Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
    .Open "GET", URL, False
    .setRequestHeader "User-Agent", "Mozilla/5.0"
    .send
    s = .responseText
    html.body.innerHTML = s

    Dim headers(), results(), listings As Object, amenities As String

    headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
    Set listings = html.querySelectorAll(".main li[class]")

    Dim rowCount As Long, numColumns As Long, r As Long, c As Long
    Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long

    rowCount = listings.Length
    numColumns = UBound(headers) + 1

    ReDim results(1 To rowCount, 1 To numColumns)
    Dim html2 As HTMLDocument
    Set html2 = New HTMLDocument
    For item = 0 To listings.Length - 1
        r = r + 1
        html2.body.innerHTML = listings.item(item).innerHTML
        'size,description, amenities,specials offer1 offer2, rate type, price

        results(r, 1) = Trim$(html2.querySelector(".size").innerText)
        results(r, 2) = Trim$(html.querySelector(".description").innerText)
        Set icons = html2.querySelectorAll("i[title]")

        ReDim amenitiesInfo(0 To icons.Length - 1)

        For icon = 0 To icons.Length - 1
            amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
        Next

        amenities = Join$(amenitiesInfo, ", ")

        results(r, 3) = amenities
        results(r, 4) = html2.querySelector(".offer1").innerText
        results(r, 5) = html2.querySelector(".offer2").innerText
        results(r, 6) = html2.querySelector(".rate-label").innerText
        results(r, 7) = html2.querySelector(".price").innerText
    Next

    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub

【问题讨论】:

    标签: html excel vba web web-scraping


    【解决方案1】:

    我认为您想要以下内容。

    初始错误:

    我认为,您最初的错误部分是由于 url 没有返回您在浏览器中使用相同 url 时看到的 html。我看到的内容在响应中不包含这些列表,因此行数为 0 ;因此,您的错误subscript out of range 在这一行出现错误:ReDim results(1 To rowCount, 1 To numColumns)

    所以,网址改为:

    https://www.stormore.net/self-storage-seattle-wa-101616

    下一步:

    检查 html 以了解如何生成列表行,我们注意到列表由 .main li.pure-g 干净地表示。一个额外的类需要添加到 li 以过滤掉不需要的信息。我们只想循环包含感兴趣的信息的行。

    Set listings = html.querySelectorAll(".main li.pure-g")
    

    最后:

    在检查 html 时,我们注意到并非所有行都包含所有感兴趣的项目,例如offer1offer2,因此我们将访问某些项目的尝试包装在 On Error Resume NextOn Error GoTo 0 中以掩盖错误并在输出的该列中输出“”。


    VBA:

    Option Explicit
    
    Public Sub GetInfo()
        Dim ws As Worksheet, html As HTMLDocument, s As String
        Const URL As String = "https://www.stormore.net/self-storage-seattle-wa-101616"
    
        Set ws = ThisWorkbook.Worksheets("Sheet1")
        Set html = New HTMLDocument
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", URL, False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .send
            s = .responseText
    
            html.body.innerHTML = s
    
            Dim headers(), results(), listings As Object, amenities As String
    
            headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
            Set listings = html.querySelectorAll(".main li.pure-g")
    
            Dim rowCount As Long, numColumns As Long, r As Long, c As Long
            Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long
    
            rowCount = listings.Length
            numColumns = UBound(headers) + 1
    
            ReDim results(1 To rowCount, 1 To numColumns)
            Dim html2 As HTMLDocument
            Set html2 = New HTMLDocument
            For item = 0 To listings.Length - 1
                r = r + 1
                html2.body.innerHTML = listings.item(item).innerHTML
                'size,description, amenities,specials offer1 offer2, rate type, price
    
                results(r, 1) = Trim$(html2.querySelector(".size").innerText)
                results(r, 2) = Trim$(html2.querySelector(".description").innerText)
                On Error Resume Next
                Set icons = html2.querySelectorAll("i[title]")
                ReDim amenitiesInfo(0 To icons.Length - 1)
    
                For icon = 0 To icons.Length - 1
                    amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
                Next
    
                amenities = Join$(amenitiesInfo, ", ")
    
                results(r, 3) = amenities
    
                results(r, 4) = html2.querySelector(".offer1").innerText
                results(r, 5) = html2.querySelector(".offer2").innerText
                On Error GoTo 0
                results(r, 6) = html2.querySelector(".rate-label").innerText
                results(r, 7) = html2.querySelector(".price").innerText
            Next
    
            ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
        End With
    End Sub
    

    【讨论】:

      猜你喜欢
      • 2022-01-14
      • 2018-12-27
      • 2021-09-25
      • 1970-01-01
      • 2020-06-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多