【发布时间】:2019-06-20 10:07:09
【问题描述】:
我已经开发了一个从网站上抓取数据的代码,但是由于我对 JSON 知之甚少,所以我可以根据需要获得输出,因此该代码是为这个网站开发的:https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1 现在我正在复制我的其他具有 json 的网站的代码,例如:https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA;但此代码无法正常运行。这是我的代码(我希望它对大多数网络来说都是通用的)
Option Explicit
Public Sub FetchTabularInfo()
Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
Dim headers(), ws As Worksheet, iCol As Collection
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
Set Http = New XMLHTTP60
Set Html = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
For page = 1 To 8 'To cover all pages
With Http
.Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
.send
Html.body.innerHTML = .responseText
End With
Set iCol = New Collection
With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
For i = 0 To .Length - 1
iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
Next i
End With
Dim r As Long, results()
ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
r = 0
For Each col In iCol
r = r + 1
With Http
.Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
.send
csrf = .responseText
End With
csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)
Dim json As Object
With Http
.Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
.setRequestHeader "X-Requested-With", "XMLHttpRequest"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
.send "id=" & col & "&csrf_test_name=" & csrf
Set json = JsonConverter.ParseJson(.responseText)
Dim orgName As String, address As String, srNo As Long, city As String
Dim state As String, tel As String, mobile As String, website As String, email As String
On Error Resume Next
orgName = json("registeration_info")(1)("nr_orgName")
address = json("registeration_info")(1)("nr_add")
srNo = r '<unsure where this is coming from.
mobile = json("infor")("0")("Mobile")
email = json("infor")("0")("Email")
On Error GoTo 0
Dim arr()
arr = Array(srNo, orgName, address, tel, email)
For i = LBound(headers) To UBound(headers)
results(r, i + 1) = arr(i)
Next
End With
Next col
Set iCol = Nothing: Set json = Nothing
ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
Next
End Sub
Public Function GetLastRow(ByVal sh As Worksheet) As Long
On Error Resume Next
GetLastRow = sh.Cells.Find(What:="*", _
After:=sh.Range("A1"), _
Lookat:=xlPart, _
LookIn:=xlFormulas, _
SearchOrder:=xlByRows, _
SearchDirection:=xlPrevious, _
MatchCase:=False).Row
On Error GoTo 0
End Function
还请让我知道我正在做的错误,以便我将来处理这些错误。
【问题讨论】:
-
哦!所以我必须为每个网站一次又一次地设计..
-
我很害怕。这是网络抓取的一部分。可重用的东西是您部署的技术。
标签: json excel vba web-scraping