【发布时间】:2021-07-03 08:11:07
【问题描述】:
我正在提取黄页(Yellow Pages)网站的数据,提取本身没有问题,问题出在页面导航上。代码可以正常从第 1 页导航到第 2 页,但在尝试导航到第 3 页时,却返回到了第 1 页并再次提取数据。数据提取是正常的,问题只在导航。
这是我发现的问题,我认为是问题所在,但不知道如何解决。
当页面导航到第 2 页时,原来带有“emptyPageButton”类的元素变成了“上一页(Previous)”链接,而它的类名与“下一页(Next)”链接的类名完全相同,因此代码没有前进到第 3 页,而是返回到了第 1 页。如果我指定提取 10 页,它会在第 1 页和第 2 页之间来回切换,把这两页各提取 5 次。
我已经做了几次尝试,但都不起作用。我可以到达第 2 页,然后返回第 1 页
使用类名(getElementsByClassName)的写法:可以到达第 2 页,然后返回到第 1 页
''' Stop once pageNumber reaches the page count entered in Sheet20, range J9
If pageNumber >= Replace(Worksheets("Sheet20").Range("J9").Value, "", "+") Then Exit Do
' Take the FIRST element carrying these classes and treat it as the "next page" link.
' NOTE(review): in the page-2 markup shown in this post, the first element with these
' classes is the "<< Previous" link, which would explain the jump back to page 1 - confirm
' against the live page.
Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(0)
' Earlier attempts with other indexes / child lookups, kept for reference:
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(0).children (0)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1).children (0)
'Set nextPageElement = HTML.getElementsByClassName("ypbtn btn-theme pageButton ")(1).children (1)
'Set nextPageElement = HTML.getElementsByClassName("view_more_section_noScroll ")(0).getElementsByTagName("a")(1)
' No matching element found: leave the paging loop
If nextPageElement Is Nothing Then Exit Do
nextPageElement.Click 'next web page
' Fixed 5-second pause after the click before continuing
Application.Wait Now + TimeValue("00:00:05")
使用 querySelector 的写法:可以到达第 2 页,然后返回到第 1 页
''' Stop once pageNumber reaches the page count entered in Sheet20, range J9
If pageNumber >= Replace(Worksheets("Sheet20").Range("J9").Value, "", "+") Then Exit Do
' querySelector returns only the first element matching the selector.
' NOTE(review): in the page-2 markup shown in this post, the first ".pageButton" inside
' ".view_more_section_noScroll" is the "<< Previous" link, not "Next >>" - confirm against
' the live page.
Set nextPageElement = HTML.querySelector(".view_more_section_noScroll .pageButton")
If Not nextPageElement Is Nothing Then
' A link was found: click it and pause a fixed 5 seconds
nextPageElement.Click
Application.Wait Now + TimeValue("00:00:05")
Else:
' No link found: leave the paging loop
Exit Do
End If
第 1 页的片段
<div class="view_more_section_noScroll">
<div class="emptyPageButton"></div>
<span class="pageCount">
<span class="bold">
1 /
</span>
<span class="">
37</span>
</span>
<a href="/search/si/2/car+dealership/Toronto+ON" data-analytics="{"event_name":"click - load_more - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"next_serp"}"
class="ypbtn btn-theme pageButton">Next
>></a>
</div>
第 2 页及以后的代码段
<div class="view_more_section_noScroll">
<a href="/search/si/1/car+dealership/Toronto+ON" data-analytics="{"event_name":"click - previous_page - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"previous_serp"}"
class="ypbtn btn-theme pageButton"><< Previous</a>
<span class="pageCount">
<span class="bold">
2 /
</span>
<span class="">
37</span>
</span>
<a href="/search/si/3/car+dealership/Toronto+ON" data-analytics="{"event_name":"click - load_more - Serp ","lk_se_id":"f32f0ee7-8492-46dd-87da-7b621c162879_Y2FyIGRlYWxlcnNoaXA_VG9yb250byBPTg","lk_name":"next_serp"}"
class="ypbtn btn-theme pageButton">Next
>></a>
</div>
问题,有人可以建议导航的正确类或 querySelector 是什么吗?
提前致谢。
'''########################## 2021 年 8 月 4 日更新 ########### ##########
完整的代码很长,为了便于阅读我删减了很多,因为唯一的问题是页面导航。下面这段代码应该能说明我想要做什么。目前它会覆盖之前提取的结果——这是因为我在删减时误删了部分代码,请暂时忽略这一点,这里只需关注页面导航的问题。
Private Sub YellowPagesCa()
    ' Scrapes listing links from yellowpages.ca search results, page by page, into
    ' column A of the "YellowPages" worksheet, using an automated Internet Explorer.
    '
    ' For every result page (up to MAX_PAGES) the href of each listing's name link is
    ' appended below the last used cell of column A; "-" is written when a listing has
    ' no name link. The procedure then follows the "Next >>" link to the next page.
    '
    ' Requires references to "Microsoft Internet Controls" and "Microsoft HTML Object
    ' Library", and a UserForm named "Complete", which is shown when scraping finishes.

    Const START_URL As String = "https://www.yellowpages.ca/search/si/1/car+dealer/Toronto+ON"
    Const MAX_PAGES As Long = 5
    Const LISTING_CLASS As String = "listing_right_section"
    Const LISTING_LINK_CLASS As String = "listing__name--link listing__link jsListingName"
    ' Both the "<< Previous" and the "Next >>" links carry the class "pageButton", and from
    ' page 2 onwards "Previous" comes first in the markup. Selecting the first ".pageButton"
    ' therefore goes back a page. Only the "Next" link has "next_serp" in its data-analytics
    ' attribute, so match on that instead.
    Const NEXT_LINK_SELECTOR As String = _
        ".view_more_section_noScroll a.pageButton[data-analytics*='next_serp']"

    Dim HTML As htmlDocument
    Dim objIE As Object
    Dim pageNumber As Long          ' 1-based number of the page currently being scraped
    Dim nextPageElement As Object   ' the "Next >>" anchor, or Nothing on the last page
    Dim elements As Object          ' collection of listing containers on the current page
    Dim element As Object           ' one listing container
    Dim linkElement As Object       ' the listing's name link, or Nothing if absent
    Dim listingUrl As String
    Dim wsSheet As Worksheet

    Set wsSheet = ThisWorkbook.Worksheets("YellowPages")

    '+++++ Internet Explorer ++++++
    Set objIE = New InternetExplorer
    objIE.Visible = True
    objIE.navigate START_URL

    ' Wait until the first page has finished loading
    Do While objIE.Busy = True Or objIE.readyState <> 4
        DoEvents
    Loop

    pageNumber = 1
    Do
        ' Re-read the document on every pass: each navigation produces a new one
        Set HTML = objIE.document

        '''############### DATA EXTRACTION ##############
        Set elements = HTML.getElementsByClassName(LISTING_CLASS)
        For Each element In elements
            DoEvents
            Set linkElement = element.getElementsByClassName(LISTING_LINK_CLASS)(0)
            If linkElement Is Nothing Then
                listingUrl = "-"
            Else
                listingUrl = linkElement.href
            End If
            ' Append below the last used cell in column A
            wsSheet.Cells(wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row + 1, "A").Value = listingUrl
        Next element

        '''############### PAGE NAVIGATION ##############
        ' Stop once the requested number of pages has been scraped
        If pageNumber >= MAX_PAGES Then Exit Do

        Set nextPageElement = HTML.querySelector(NEXT_LINK_SELECTOR)
        ' No "Next" link means this was the last result page
        If nextPageElement Is Nothing Then Exit Do

        nextPageElement.Click
        ' Give the browser time to start the navigation before polling its state
        Application.Wait Now + TimeValue("00:00:05")
        Do While objIE.Busy = True Or objIE.readyState <> 4
            DoEvents
        Loop

        pageNumber = pageNumber + 1
    Loop

    objIE.Quit ' end and clear browser
    Set objIE = Nothing
    Set HTML = Nothing
    Set nextPageElement = Nothing
    Set linkElement = Nothing
    Set element = Nothing
    Set elements = Nothing

    Complete.show
End Sub
【问题讨论】:
标签: excel vba web-scraping screen-scraping