【问题标题】:Web Scraping excel VBA [closed]Web Scraping excel VBA [关闭]
【发布时间】:2019-11-27 12:18:54
【问题描述】:

我想从电子商务网站抓取产品标题、价格、卖家和图片网址。结果需要复制到 A-D 列的活动工作表中。下面的代码最初是由@QHarr 之一为亚马逊开发的。任何帮助更新它以获得所需的结果?

我已经包含了结果的样子。谢谢

Public Sub WriteOutProductInfo()

'VBE>Tools>References> Microsoft HTML Object Library
Dim html As MSHTML.HTMLDocument

Set html = New MSHTML.HTMLDocument

With CreateObject("MSXML2.XMLHTTP")
    ' change the url for the page of amazon from where to copy data
    .Open "GET", "https://www.daraz.lk/catalog/?from=input&q=sarees&ppath=31186:3287", False
    .setRequestHeader "User-Agent", "Mozilla/5.0"
    .send
    html.body.innerHTML = .responseText
End With

' 1. declare additional headers as variable
  
Dim headers(), titles As Object, prices As Object, original_prices As Object
Dim seller As Object

headers = Array("Title", "Price")

With html
    Set titles = .querySelectorAll(".c3gUW0,.c13VH6")
    Set prices = .querySelectorAll(".------------")
End With

Dim results(), r As Long, priceInfo As String

ReDim results(1 To titles.Length, 1 To UBound(headers) + 1)

For r = 0 To titles.Length - 1
    results(r + 1, 1) = titles.Item(r).innerText        
Next

Dim ws As Worksheet

Set ws = ThisWorkbook.Worksheets("Sheet1")

With ws
    .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
    .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With

End sub

【问题讨论】:

  • 已编辑请取消标记为保留

标签: html excel vba web-scraping


【解决方案1】:

由于不同的原因,您发布的宏有点令人毛骨悚然。使用它来实现您的目标:

Sub WriteOutProductInfo()

'This macro works on the sheet it's startetd from

Dim browser As Object
Dim url As String
Dim nodesAllOffers As Object
Dim nodeOneOffer As Object
Dim currentRow As Long

  'Row, for title and price for current offer
  currentRow = ActiveSheet.Cells(Rows.Count, 1).End(xlUp).Row + 1
  'Make sure that line 2 is the start line if nothing has been entered in the table yet
  If currentRow = 1 Then currentRow = 2

  'Your sample url
  'You can loop over various urls from a table with the following code
  '(Loop not included here)
  url = "https://www.daraz.lk/catalog/?from=input&q=sarees&ppath=31186:3287"

  'Initialize Internet Explorer, set visibility,
  'call URL and wait until page is fully loaded
  Set browser = CreateObject("internetexplorer.application")
  browser.Visible = False
  browser.navigate url
  Do Until browser.ReadyState = 4: DoEvents: Loop

  'Get all html elements with the css class "c3KeDq" in a node collection
  'These are all offers of the called page, which contain in each case
  'the connected information (title and price)
  '
  'This is a sample of the html code for one offer
  '<div class="c3KeDq">
  '  <div class="c3vCyH">
  '  </div>
  '  <div class="c16H9d">
  '    <a age="0" href="//www.daraz.lk/products/ladies-office-and-saree-wear-brown-i102604104-s1009196133.html?search=1"
  '       title="Ladies Office And Saree Wear - Brown">
  '      Ladies Office And Saree Wear - Brown
  '    </a>
  '  </div>
  '  <div class="c3gUW0">
  '    <span class="c13VH6">
  '      Rs. 1,150
  '    </span>
  '  </div>
  '  <div class="c3lr34">
  '  </div>
  '  <div class="c15YQ9">
  '    <span class="c2i43- c1enUu" title="Sri Lanka">
  '      Sri Lanka
  '    </span>
  '  </div>
  '  <div class="c2attd">
  '    <div class="c31VUX">
  '      <button age="0" type="button" class="ant-btn c1xzE_ ant-btn-primary ant-btn-lg">
  '        ADD TO CART
  '      </button>
  '    </div>
  '  </div>
  '</div>
  Set nodesAllOffers = browser.document.getElementsByClassName("c3KeDq")
  If Not nodesAllOffers Is Nothing Then
    'If we got the node collection
    'We step through all offers and pull the titel and price out
    For Each nodeOneOffer In nodesAllOffers
      'Get title
      ActiveSheet.Cells(currentRow, 1).Value = getValueByClassName(nodeOneOffer, "c16H9d")
      'Get price
      ActiveSheet.Cells(currentRow, 2).Value = getValueByClassName(nodeOneOffer, "c13VH6")
      'Next offer (row)
      currentRow = currentRow + 1
    Next nodeOneOffer
  End If

  'Clean up
  browser.Quit
  Set browser = Nothing
  Set nodesAllOffers = Nothing
  Set nodeOneOffer = Nothing
End Sub

还有这个功能:

Private Function getValueByClassName(htmlNode As Object, cssClassName As String) As String

'This function works with a single node
'If htmlNode is a node collection, you will got only the first value

Dim nodeByClassName As Object
Dim resultString As String

  'Try to get wanted node
  Set nodeByClassName = htmlNode.getElementsByClassName(cssClassName)(0)
  If Not nodeByClassName Is Nothing Then
    'If we got the class node
    'Take innertext
    resultString = Trim(nodeByClassName.innertext)
  End If

  'Clean up
  Set nodeByClassName = Nothing

  'Return result string
  getValueByClassName = resultString
End Function

【讨论】:

  • 谢谢!如果您能提供更多帮助,有几件事: 1. 我想要 C 列中图像的 url,我通过它的属性“c1ZEkM”尝试了它,但没有用。 2.如果页面不止一个,我如何循环浏览所有页面,例如在这种情况下,有 3 页 daraz.lk/ac/?from=input&q=sarees
  • @Damian、Geert Bellekens、Ryan Wildry、SIM、QHarr。我已经对问题进行了更改,但我仍然被禁止发帖。你能看看吗。谢谢
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2021-10-15
  • 1970-01-01
  • 2020-05-20
  • 1970-01-01
  • 2020-10-07
相关资源
最近更新 更多