【问题标题】:Import Data using JSON使用 JSON 导入数据
【发布时间】:2019-06-20 10:07:09
【问题描述】:

我已经开发了一个从网站上抓取数据的代码,但是由于我对 JSON 知之甚少,所以我可以根据需要获得输出,因此该代码是为这个网站开发的:https://ngodarpan.gov.in/index.php/home/statewise_ngo/76/35/1 现在我正在复制我的其他具有 json 的网站的代码,例如:https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA;但此代码无法正常运行。这是我的代码(我希望它对大多数网络来说都是通用的)

Option Explicit

Public Sub FetchTabularInfo()
    Dim Http As XMLHTTP60, Html As HTMLDocument, col As Variant, csrf As Variant, i&, page As Long
    Dim headers(), ws As Worksheet, iCol As Collection

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("SrNo", "Name", "Address", "Mobile", "Email")
    Set Http = New XMLHTTP60
    Set Html = New HTMLDocument

    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    For page = 1 To 8 'To cover all pages

        With Http
            .Open "GET", "https://www.yelp.com/search?cflt=hvac&find_loc=San+Francisco%2C+CA" & CStr(page), Falsev 'Last letter of URL is page number whose range will be given in outerloop
            .send
            Html.body.innerHTML = .responseText
        End With
        Set iCol = New Collection
        With Html.querySelectorAll(".table tr a[onclick^='show_ngo_info']")
            For i = 0 To .Length - 1
                iCol.Add Split(Split(.Item(i).getAttribute("onclick"), "(""")(1), """)")(0)
            Next i
        End With

        Dim r As Long, results()
        ReDim results(1 To iCol.Count, 1 To UBound(headers) + 1)
        r = 0
        For Each col In iCol
            r = r + 1
            With Http
                .Open "GET", "https://www.yelp.com/index.php/ajaxcontroller/get_csrf", False
                .send
                csrf = .responseText
            End With

            csrf = Split(Replace(Split(csrf, ":")(1), """", ""), "}")(0)

            Dim json As Object
            With Http
                .Open "POST", "https://www.yelp.com/index.php/ajaxcontroller/show_ngo_info", False
                .setRequestHeader "X-Requested-With", "XMLHttpRequest"
                .setRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
                .send "id=" & col & "&csrf_test_name=" & csrf
                Set json = JsonConverter.ParseJson(.responseText)

                Dim orgName As String, address As String, srNo As Long, city As String
                Dim state As String, tel As String, mobile As String, website As String, email As String

                On Error Resume Next
                orgName = json("registeration_info")(1)("nr_orgName")
                address = json("registeration_info")(1)("nr_add")

                srNo = r                         '<unsure where this is coming from.

                mobile = json("infor")("0")("Mobile")

                email = json("infor")("0")("Email")
                On Error GoTo 0

                Dim arr()
                arr = Array(srNo, orgName, address, tel, email)
                For i = LBound(headers) To UBound(headers)
                    results(r, i + 1) = arr(i)
                Next
            End With
        Next col
        Set iCol = Nothing: Set json = Nothing
        ws.Cells(GetLastRow(ws) + 1, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    Next
End Sub

Public Function GetLastRow(ByVal sh As Worksheet) As Long
    On Error Resume Next
    GetLastRow = sh.Cells.Find(What:="*", _
                               After:=sh.Range("A1"), _
                               Lookat:=xlPart, _
                               LookIn:=xlFormulas, _
                               SearchOrder:=xlByRows, _
                               SearchDirection:=xlPrevious, _
                               MatchCase:=False).Row
    On Error GoTo 0
End Function

还请让我知道我正在做的错误,以便我将来处理这些错误。

【问题讨论】:

  • 哦!所以我必须为每个网站一次又一次地设计..
  • 我很害怕。这是网络抓取的一部分。可重用的东西是您部署的技术。

标签: json excel vba web-scraping


【解决方案1】:

简答:

没有。

我什至可以说你不可能为大多数网络编写通用的。可以说通用部分是解析器本身。但是你需要对每个端点的 json 结构有一定的了解才能正确地直接解析。 Json 本身已经定义了结构化语法/组件,但是您想要从中获得的内容将具有不同的访问路径,并且需要不同的处理。然后是可能需要提供的参数和输出格式的差异。

最佳方案是什么?

如果您有一组 url(理想情况下是 API 端点)列表,那么您就有更好的机会编写可能会持续一段时间的内容,因为您可以熟悉返回的 json。但这有多通用?它实际上只是分支代码。

可重复使用的东西:

可能是可以概括的非解析器内容,例如您创建的方法和类解析出整个结构的路径并查找关键字并返回这些路径?您编写的帮助函数可能会递归地写出嵌套结构等。发出请求和处理失败的代码等......我绝对建议在网络抓取中查看可重用代码的类。

基于类的示例:

我会在什么时候添加这个

  1. https://stackoverflow.com/a/52301153/6241235
  2. https://codereview.stackexchange.com/questions/69009/vba-clickbot-featuring-ajax-waiting-and-element-searching

【讨论】:

猜你喜欢
  • 2018-07-08
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2013-06-01
  • 2021-10-11
  • 2014-12-28
  • 1970-01-01
相关资源
最近更新 更多