XHR:
所有信息都可以通过XMLHTTP (XHR) request 获得——比打开浏览器要快得多。
我首先使用.main li[class] 的css 选择器检索行数。
"." 是 class selector,li 是 type selector,[class] 是 attribute selector。中间的空格" " 是descendant combinator。这指定我要检索所有 li 标签/类型元素,具有类属性,其父类名称为 main。
这匹配如下:
如您所见,这给了我行数;要从中检索结果集信息的父 li 元素的数量。
li elements 的集合作为节点列表由querySelectorAll 返回。我无法遍历此列表,将getElementsByClassName / querySelector 应用于单个节点,因为li 元素没有公开我可以使用的方法。
现在,由于我没有使用浏览器,我不得不依赖HTMLDocument 对象可用的方法。与浏览器不同,当通过 VBA 自动化时,我无法访问它们支持的有限 pseudo class selectors,这将允许我使用诸如 :nth-of-type 之类的选择器语法来访问各个行。这是使用 VBA 进行网络抓取的一个令人讨厌的限制。
那么,我们能做些什么呢?好吧,在这种情况下,我可以将每个节点的innerHTML 转储到另一个HTMLDocument 变量html2,这样我就可以访问该对象的querySelector/querySelectorAll 方法。 HTML 将仅限于当前的li。
如果我们查看有问题的 HTML:
我们可以看到li 元素是一般的兄弟元素。他们坐在同一水平线上。当我循环我的nodeList listings 时,我将innerHTML 从当前节点转移到html2;我的第二个HTMLDocument 变量。
值得注意的是,我可能已经使用 children 对每个列表进行了下降,例如:
listings.item(i).Children(2)......
然后我可以在 newLines 等上进行拆分,以便访问所有信息。我认为我给定的方法更快,更健壮。
VBA:
Option Explicit
Public Sub GetInfo()
Dim ws As Worksheet, html As HTMLDocument, s As String
Const URL As String = "https://www.neighborhoodselfstorage.net/self-storage-delmar-md-f1426"
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
s = .responseText
html.body.innerHTML = s
Dim headers(), results(), listings As Object, amenities As String
headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
Set listings = html.querySelectorAll(".main li[class]")
Dim rowCount As Long, numColumns As Long, r As Long, c As Long
Dim icons As Object, icon As Long, amenitiesInfo(), i As Long, item As Long
rowCount = listings.Length
numColumns = UBound(headers) + 1
ReDim results(1 To rowCount, 1 To numColumns)
Dim html2 As HTMLDocument
Set html2 = New HTMLDocument
For item = 0 To listings.Length - 1
r = r + 1
html2.body.innerHTML = listings.item(item).innerHTML
'size,description, amenities,specials offer1 offer2, rate type, price
results(r, 1) = Trim$(html2.querySelector(".size").innerText)
results(r, 2) = Trim$(html2.querySelector(".description").innerText)
Set icons = html2.querySelectorAll("i[title]")
ReDim amenitiesInfo(0 To icons.Length - 1)
For icon = 0 To icons.Length - 1
amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
Next
amenities = Join$(amenitiesInfo, ", ")
results(r, 3) = amenities
results(r, 4) = html2.querySelector(".offer1").innerText
results(r, 5) = html2.querySelector(".offer2").innerText
results(r, 6) = html2.querySelector(".rate-label").innerText
results(r, 7) = html2.querySelector(".price").innerText
Next
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Internet Explorer:
假设没有从给定的 url 重定向。在这里,我使用 :nth-of-type 伪类选择器来定位列表的每一行。这些行是 li (list) 元素,其中包含每个框列表的信息。我建立了一个 css 选择器字符串,它指定行,然后是我所追求的行中的元素。我将该字符串传递给querySelector 或querySelectorAll,后者返回匹配的元素。
Option Explicit
Public Sub UseIE()
Dim ie As New InternetExplorerm, ws As Worksheet
Const Url As String = "https://www.neighborhoodselfstorage.net/self-storage-delmar-md-f142"
Set ws = ThisWorkbook.Worksheets("Sheet1")
With ie
.Visible = True
.Navigate2 Url
While .Busy Or .readyState < 4: DoEvents: Wend
Dim headers(), results(), listings As Object, listing As Object, amenities As String
headers = Array("Size", "Description", "Amenities", "Offer1", "Offer2", "RateType", "Price")
Set listings = .document.querySelectorAll(".main li[class]")
Dim rowCount As Long, numColumns As Long, r As Long, c As Long
Dim icons As Object, icon As Long, amenitiesInfo(), i As Long
rowCount = listings.Length
numColumns = UBound(headers) + 1
ReDim results(1 To rowCount, 1 To numColumns)
For Each listing In listings
r = r + 1
'size,description, amenities,specials offer1 offer2, rate type, price
With .document
results(r, 1) = Trim$(.querySelector(".main li:nth-of-type(" & r & ") .size").innerText)
results(r, 2) = Trim$(.querySelector(".main li:nth-of-type(" & r & ") .description").innerText)
Set icons = .querySelectorAll("." & Join$(Split(listing.className, Chr$(32)), ".") & ":nth-of-type(" & r & ") i[title]")
ReDim amenitiesInfo(0 To icons.Length - 1)
For icon = 0 To icons.Length - 1
amenitiesInfo(icon) = icons.item(icon).getAttribute("title")
Next
amenities = Join$(amenitiesInfo, ",")
results(r, 3) = amenities
results(r, 4) = .querySelector(".main li:nth-of-type(" & r & ") .offer1").innerText
results(r, 5) = .querySelector(".main li:nth-of-type(" & r & ") .offer2").innerText
results(r, 6) = .querySelector(".main li:nth-of-type(" & r & ") .rate-label").innerText
results(r, 7) = .querySelector(".main li:nth-of-type(" & r & ") .price").innerText
End With
Next
.Quit
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
参考资料(VBE > 工具 > 参考资料):
- Microsoft HTML 对象库
- Microsoft Internet 控件