【问题标题】:Scraping website data with options in combo box VBA使用组合框 VBA 中的选项抓取网站数据
【发布时间】:2021-02-10 22:00:53
【问题描述】:

您好,我正在尝试从本网站获取产品名称(Cohiba Robusto)、产品尺寸(单支雪茄、3 支装、25 支箱)和价格(33.65 英镑、90 英镑、730 英镑):https://www.jjfox.co.uk/cohiba-robusto-621.html

我想得到这样的东西:

我正在使用下面的代码,它给出了一个错误(“对象变量或未设置变量”)。

将不胜感激。

Sub getproducts()

Sheets("JJFox").Select

Dim oHtml       As HTMLDocument
Dim oElement    As Object

Dim Elements As IHTMLElementCollection
Dim Document As HTMLDocument

Set oHtml = New HTMLDocument


'Cells(1, 6) = Time()
lastrow = ActiveSheet.Cells(Rows.Count, "A").End(xlUp).Row
cnt = lastrow + 1

counter1 = cnt

Dim gg As String

gg = "https://www.jjfox.co.uk/cohiba-robusto-621.html"


Dim objHTTP As New WinHttp.WinHttpRequest
url = gg
    objHTTP.Open "POST", url, False
    objHTTP.setRequestHeader "Content-Type", "application/json"
    objHTTP.send ("{""key"":null,""from"":""me@me.com"",""to"":null,""cc"":null,""bcc"":null,""date"":null,""subject"":""My Subject"",""body"":null,""attachments"":null}")
   
 oHtml.body.innerHTML = objHTTP.responseText
 'Cells(rw, 2) = oHtml.getElementsByTagName("description").innerText

   '    If Not .Document.querySelector("button[aria-label='Close']") Is Nothing Then
     '       .Document.querySelector("button[aria-label='Close']").Click
      '  End If
      
      
    txttitle = oHtml.getElementsByClassName("productcart")(0).innerText
txttitlehtml = oHtml.getElementsByClassName("packsize")(0).innerHTML


txttitle = Mid(txttitle, 1, InStr(1, txttitle, Chr(10)))
'Debug.Print txttitlehtml
'txttitle2 = oHtml.getElementsByClassName("price")(0).innerText

Dim Text As String
Text = GetHTML(gg)


starts = InStr(1, Text, "spConfig =")
endS = InStr(starts + 1, Text, "spConfig")

If starts = 0 Then


    Cells(counter1, 1) = txttitle
    Cells(counter1, 2) = "Single"
    starts = InStr(starts + 1, Text, "productPrice")
    endl = InStr(starts + 1, Text, ",")
    Cells(counter1, 3) = Val(Mid(Text, starts + 14, endl - (starts + 14)))
    Cells(counter1, 4) = "JJFox"
    Cells(counter1, 5) = Now()
     
    Cells(counter1, 7) = gg ' link to the page
    counter1 = counter1 + 1
   
Else


Text = Mid(Text, starts, endS - starts)
'Debug.Print Text
'find how many pack options are avaialble

myTxt = Text
countTxt = "label"

bb = (Len(myTxt) - Len(replace(myTxt, countTxt, ""))) / Len(countTxt) - 1
'End find////////////////////////////////////

varlabel = "class=" & Chr(34) & "label" & Chr(34)


starts = InStr(1, Text, "label") + 1
Text = Mid(Text, starts, Len(Text))
        
        For i = 1 To bb
        
        
        starts = InStr(1, Text, "label")
        
        If InStr(starts, Text, "label") Then
        
        'Show the element's properties
           
        
                Cells(counter1, 1) = txttitle
                Cells(counter1, 2) = Mid(Text, starts + 8, InStr(starts, Text, " \") - (starts + 8))
                
                                       
                        starts = InStr(starts + 1, Text, "oldPrice")
                        endl = InStr(starts + 1, Text, ",")
                        

                Cells(counter1, 3).FormulaR1C1 = Val(Mid(Text, starts + 11, endl - (starts + 11)))
                'Debug.Print Val(Mid(Text, startS + chrs, 6))
                Cells(counter1, 4) = "JJFox"
                Cells(counter1, 5) = Now()
                starts = starts + 1
                Text = Mid(Text, starts, Len(Text))
                Cells(counter1, 7) = gg ' link to the page
                counter1 = counter1 + 1
           End If
        
        Next i
            
End If
'Cells(2, 6) = Time()
End Sub



Function GetHTML(url As String) As String
     With CreateObject("MSXML2.ServerXMLHTTP.6.0")
        .Open "GET", url, False
        .send
        GetHTML = .responseText
    End With
End Function

【问题讨论】:

    标签: excel vba web-scraping


    【解决方案1】:

    价格和标签是从脚本标签中动态提取的,您可以使用 json 解析器将其内容解析为 json。但是,您需要从 html 中获取名称。

    只要对 html 和 css 有一点了解,就很容易定义一个 css 模式来定位感兴趣的脚本节点:

    .fieldset [type='text/x-magento-init']
    

    这会查找具有 type 属性且属性值为 text/x-magento-init 的子 script,以及具有类 fieldset 的父。

    我使用的效率有点低(你不会注意到):

    For i = 1 To optionsCollection.Count
    

    仅仅是因为我知道集合很小,并且允许我用一个循环索引两个变量。


    Json 库:

    我使用 jsonconverter.bas。从here 下载原始代码并添加到名为 JsonConverter 的标准模块中。从复制的代码中删除顶部的 Attribute 行。

    然后你需要去:

    VBE > 工具 > 参考 > 添加参考:

    Microsoft Scripting Runtime
    Microsoft HTML Object Library
    Microsoft XML Library. 
    

    在 VBA for json 中,[] 表示集合,{} 表示字典。


    Option Explicit
    
    Public Sub GetCigarData()
        '<  VBE > Tools > References:
        'Microsoft Scripting Runtime
        'Microsoft HTML Object Library
        'Microsoft XML Library
        
        Dim json As Object, html As MSHTML.HTMLDocument, xhr As MSXML2.XMLHTTP60, ws As Worksheet
    
        Set ws = ThisWorkbook.Worksheets("Sheet1")
        Set xhr = New MSXML2.XMLHTTP60
        Set html = New MSHTML.HTMLDocument
        
        With xhr
            .Open "GET", "https://www.jjfox.co.uk/cohiba-robusto-621.html", False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .send
             html.body.innerHTML = .responseText
        End With
    
        Set json = jsonConverter.ParseJson(html.querySelector(".fieldset [type='text/x-magento-init']").innerHTML)("#product_addtocart_form")("configurable")("spConfig")
         
        Dim prices As Scripting.Dictionary, options As Scripting.Dictionary, optionsCollection As Collection
        
        Set prices = json("optionPrices")
        Set options = json("attributes")
        Set optionsCollection = options(options.Keys(0))("options")
        
        Dim results() As Variant, headers() As Variant, i As Long, name As String
        ReDim results(1 To optionsCollection.Count, 1 To 3)
        
        name = html.querySelector(".base").innerText
    
        For i = 1 To optionsCollection.Count
             results(i, 1) = name
             results(i, 2) = optionsCollection.item(i)("label")
             results(i, 3) = prices(prices.Keys(i - 1))("finalPrice")("amount")
        Next
        
        headers = Array("Name", "Size", "Price")
        
        With ws
            .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
            .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results 
        End With
       
    End Sub
    

    了解 CSS 选择器:

    1. https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors

    【讨论】:

    • 非常感谢@QHarr。不幸的是,我无法运行代码,因为我在 Set json = jsoncoverter.parseJson(...) 中的 jsonConverter 上收到“变量未定义”错误。我已经按照上面的建议添加了所有参考资料,但它仍然无法正常工作。
    • @QHarr 我昨晚也尝试使用数据查找 JSON,但未成功。感谢您指出这种可能性。学习新方法总是很棒。
    • @SurenGrigoryan 请仔细阅读 QHarr 所写的内容。他链接了你可以获得 jsonConverter 并描述了如何使用它。这是一个解析 JSON 的库。 JSON 是一种类似于 XML 的数据交换格式。对于 Excel,这只是一个字符串。然而,Tim Hall 在 VBA 中开发了 jsonConverter。您必须将链接的 .bas 文件的代码复制到一个名为 JsonConverter 的新模块中。
    • 谢谢@Zwenn,由于某种原因我错过了。现在添加了它的工作正常。
    • @SurenGrigoryan 很好:-) 您可以将 QHarr 的答案标记为解决方案。其他用户可以从正确答案中受益。
    猜你喜欢
    • 2015-01-19
    • 1970-01-01
    • 1970-01-01
    • 2020-03-12
    • 1970-01-01
    • 2018-11-21
    • 2022-01-14
    相关资源
    最近更新 更多