【问题标题】:Trouble printing a certain content within a loop derived from another loop(在从另一个循环派生的循环中打印某些内容时遇到问题)
【发布时间】:2019-03-01 20:45:04
【问题描述】:

我编写了一个 VBA 脚本,用于从网页中解析各个帖子的发布时间(postTime)和标题(postTitle)。虽然 postTime 在目标页面(各帖子的详情页)中也能获取,但我想从着陆页(帖子列表页)获取它,并与从目标页面收集的 postTitle 一起打印。我在脚本中定义的选择器能够抓取到所需内容。但是,我当前的尝试只是把某一个帖子的 postTime 重复打印了多次,而我想打印的是每个帖子各自的 postTime。

如何在从另一个循环派生的循环中打印项目?

到目前为止我的脚本:

' Downloads the Stack Overflow "web-scraping" tag listing, then requests every
' question linked from it and prints postTime and postTitle for each one via
' Debug.Print.
'
' Known problem (the subject of the question): postTime is read with
' querySelector inside the first loop, so it is overwritten with the same
' first match on every pass and the second loop prints that single value for
' every question.
Sub CollectData()
    Const baseUrl = "https://stackoverflow.com"
    Dim Http As New XMLHTTP60, Html As New HTMLDocument
    Dim post As Object, itemlist$, linklist As Variant
    Dim qualifiedLink$, nlink As Variant, postTime$, postTitle$

    ' Fetch the listing page (third argument False = synchronous request) and
    ' load the response markup into the in-memory document.
    With Http
        .Open "GET", "https://stackoverflow.com/questions/tagged/web-scraping", False
        .send
        Html.body.innerHTML = .responseText
    End With

    ' One node per question link on the listing page.
    Set post = Html.querySelectorAll(".summary .question-hyperlink")

    ' NOTE(review): I is not declared anywhere in this Sub.
    For I = 0 To post.Length - 1
        ' querySelector (singular) yields only the first matching element, so
        ' this assigns the same text on every iteration regardless of I.
        postTime = Html.querySelector(".user-action-time").innerText
        ' Takes the part of the href that follows "about:" and appends it to
        ' baseUrl. NOTE(review): presumably the prefix appears because the
        ' markup was loaded through innerHTML without a base URL - confirm.
        qualifiedLink = baseUrl & Split(post(I).getAttribute("href"), "about:")(1)
        ' Accumulate the URLs as one space-separated string.
        itemlist = itemlist & IIf(itemlist = "", "", " ") & qualifiedLink
    Next I

    ' Turn the space-separated string back into an array of URLs.
    linklist = Split(itemlist, " ")

    For Each nlink In linklist
        ' Request the question page; Html is reused, so the listing markup is
        ' replaced by the markup of the current question.
        With Http
            .Open "GET", nlink, False
            .send
            Html.body.innerHTML = .responseText
        End With
        postTitle = Html.querySelector("h1[itemprop='name'] a").innerText
        ' postTime still holds whatever the first loop assigned last, so the
        ' same value is printed for every title.
        Debug.Print postTime, postTitle
    Next nlink
End Sub

【问题讨论】:

    标签: excel vba web-scraping


    【解决方案1】:

    您需要在第一个循环中使用 querySelectorAll 并按索引取值,以确保获得各个帖子各自不同的发布时间。我会将这些时间存储在一个集合(Collection)中,并在最后一个循环中通过索引访问它们:

    Option Explicit
    
    ' Prints, for each question on the Stack Overflow "web-scraping" tag page,
    ' the time shown on the listing followed by the title taken from the
    ' question's own page. The listing times are kept in a Collection and looked
    ' up by position while the question pages are visited.
    Public Sub CollectData()
        Const baseUrl = "https://stackoverflow.com"
        Dim Http As New XMLHTTP60, Html As New HTMLDocument
        Dim questionLinks As Object, times As Object
        Dim joinedUrls As String, urlParts As Variant
        Dim absoluteUrl As String, stamp As String, heading As String
        Dim i As Long, n As Long
    
        Set times = New Collection
    
        ' Load the listing page synchronously into the in-memory document.
        Http.Open "GET", "https://stackoverflow.com/questions/tagged/web-scraping", False
        Http.send
        Html.body.innerHTML = Http.responseText
    
        Set questionLinks = Html.querySelectorAll(".summary .question-hyperlink")
    
        For i = 0 To questionLinks.Length - 1
            ' Pick the i-th time node so every question gets its own value.
            stamp = Html.querySelectorAll(".user-action-time").item(i).innerText
            times.Add stamp
    
            ' Keep the part of the href after "about:" and make it absolute.
            absoluteUrl = baseUrl & Split(questionLinks(i).getAttribute("href"), "about:")(1)
    
            ' Build a space-separated list of URLs.
            If joinedUrls = "" Then
                joinedUrls = absoluteUrl
            Else
                joinedUrls = joinedUrls & " " & absoluteUrl
            End If
        Next i
    
        urlParts = Split(joinedUrls, " ")
    
        ' Split returns a zero-based array while the Collection is one-based,
        ' hence the n + 1 when reading the stored time.
        For n = LBound(urlParts) To UBound(urlParts)
            Http.Open "GET", urlParts(n), False
            Http.send
            Html.body.innerHTML = Http.responseText
    
            heading = Html.querySelector("h1[itemprop='name'] a").innerText
            Debug.Print times(n + 1), heading
        Next n
    End Sub
    

    更好的做法是:先将 querySelectorAll 返回的时间节点列表存储在一个变量中,而不是在循环中反复调用 querySelectorAll,这样效率更高:

    Option Explicit
    
    ' Prints, for each question on the Stack Overflow "web-scraping" tag page,
    ' the time shown on the listing followed by the title taken from the
    ' question's own page (one Debug.Print line per question).
    '
    ' The early-bound XMLHTTP60 and HTMLDocument types require project
    ' references to "Microsoft XML, v6.0" and "Microsoft HTML Object Library".
    Public Sub CollectData()
        Const baseUrl = "https://stackoverflow.com"
        Dim Http As New XMLHTTP60, Html As New HTMLDocument
        Dim post As Object, timesList As Object, titleNode As Object
        Dim times As Collection, links As Collection
        Dim href As String, postTitle As String
        Dim i As Long, pairCount As Long
    
        Set times = New Collection
        Set links = New Collection
    
        ' Load the listing page synchronously into the in-memory document.
        With Http
            .Open "GET", "https://stackoverflow.com/questions/tagged/web-scraping", False
            .send
            Html.body.innerHTML = .responseText
        End With
    
        Set post = Html.querySelectorAll(".summary .question-hyperlink")
        ' Query the time nodes once instead of once per loop iteration.
        Set timesList = Html.querySelectorAll(".user-action-time")
    
        ' Links and times are paired by position. Stop at the shorter list so a
        ' mismatch between the two selectors cannot index past the end.
        pairCount = post.Length
        If timesList.Length < pairCount Then pairCount = timesList.Length
    
        For i = 0 To pairCount - 1
            href = post.item(i).getAttribute("href")
            ' Relative hrefs come back prefixed with "about:" here; strip the
            ' prefix only when present instead of assuming Split yields two parts.
            If Left$(href, 6) = "about:" Then href = Mid$(href, 7)
            ' Only site-relative paths need the base URL prepended.
            If Left$(href, 1) = "/" Then href = baseUrl & href
    
            ' Store URLs and times in parallel collections; this avoids joining
            ' the URLs into a string only to split them again afterwards.
            links.Add href
            times.Add timesList.item(i).innerText
        Next i
    
        ' Both collections are one-based and were filled in the same order, so
        ' the same index addresses a question's URL and its listing time.
        For i = 1 To links.Count
            With Http
                .Open "GET", links(i), False
                .send
                Html.body.innerHTML = .responseText
            End With
    
            ' querySelector returns Nothing when the page has no matching title
            ' node; print an empty title rather than raising a runtime error.
            Set titleNode = Html.querySelector("h1[itemprop='name'] a")
            If titleNode Is Nothing Then
                postTitle = vbNullString
            Else
                postTitle = titleNode.innerText
            End If
    
            Debug.Print times(i), postTitle
        Next i
    End Sub
    

    【讨论】:

    • 是的,这就是我想尝试但无法理解的方法。谢谢 QHarr。
    猜你喜欢
    • 2016-04-23
    • 2022-01-21
    • 2016-08-08
    • 1970-01-01
    • 1970-01-01
    • 2021-03-07
    • 2017-03-28
    • 1970-01-01
    • 2020-03-15
    相关资源
    最近更新 更多