【问题标题】:Get data from URL using Excel VBA使用 Excel VBA 从 URL 获取数据
【发布时间】:2019-03-18 14:46:13
【问题描述】:

我想从URL 中提取数据。

我想要 Excel 列中的数据。

标题名称(图 1 和图 2)
街道地址
地址位置
邮政编码
地址区域
地址国家

图片 1

图 2

这是我的代码。

url = Sheets("ExtData").Range("A" & N).Value
With CreateObject("MSXML2.XMLHTTP")
    .Open "GET", url, False
    .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
    .send
    sResponse = StrConv(.responseBody, vbUnicode)
End With

Set html = New HTMLDocument
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
With html
    .body.innerHTML = sResponse

    Set titles = .querySelectorAll(".jcn [title]")
    Set addresses = .querySelectorAll(".desk-add.jaddt")

    Til = titles.Item(i).outerHTML
    Add = addresses.Item(i).innerText
    Sheets("ExtData").Range("B" & N) = .getElementsByClassName("Title").Item(0)
    Sheets("ExtData").Range("C" & N) = .getElementById("comp_add").outerHTML
    Range("A" & N + 1).Select
End With

【问题讨论】:

    标签: excel vba web-scraping


    【解决方案1】:

    您可以使用 css 类选择器

    Option Explicit
    Public Sub GetInfo()
        Dim html As HTMLDocument
        Set html = New HTMLDocument                  '<  VBE > Tools > References > Microsoft Scripting Runtime
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", "https://www.justdial.com/Agartala/Abhay-Varieties-Nor-Banamalipu/9999PX381-X381-141028162716-U1Z5_BZDET", False
            .send
            html.body.innerHTML = .responseText
        End With
        Debug.Print html.querySelector(".ph_hdr").innerText
    End Sub
    

    如果你想要单独的行,那么使用

    Dim items() As String, i As Long
    items = Split(html.querySelector(".ph_hdr").innerText, ", ")
    For i = LBound(items) To UBound(items)
        If items(i) <> vbNullString Then
            Activesheet.Cells(i + 1, 1) = items(i)
        End If
    Next
    

    一个奇怪的情况是我通常会grab all the script tags with json using a css selector of script\[type='application/ld+json'\] and loop that looking for the info1。然而,尽管当我查看写出的文本文件时存在信息,但当我使用 DOM 解析器时,我找不到该信息。所以,尽管我很讨厌用 html 来提倡正则表达式,但这里有一个正则表达式解决方案:

    Option Explicit
    Public Sub GetInfo()
        Dim html As HTMLDocument, s As String, re As Object
        Set re = CreateObject("vbscript.regexp")
        Set html = New HTMLDocument                  '<  VBE > Tools > References > Microsoft Scripting Runtime
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", "https://www.justdial.com/Agartala/Abhay-Varieties-Nor-Banamalipu/9999PX381-X381-141028162716-U1Z5_BZDET", False
            .send
            s = .responseText
            html.body.innerHTML = s
            Debug.Print html.querySelector(".fn").innerText
            Debug.Print Trim$(Replace$(GetString(re, s, "title>(.*)<"), Chr$(34), vbNullString))
            Debug.Print Trim$(Replace$(GetString(re, s, "streetAddress"":(.*"")"), Chr$(34), vbNullString))
            Debug.Print Trim$(Replace$(GetString(re, s, "addressLocality"":(.*"")"), Chr$(34), vbNullString))
            Debug.Print Trim$(Replace$(GetString(re, s, "postalCode"":(.*"")"), Chr$(34), vbNullString))
            Debug.Print Trim$(Replace$(GetString(re, s, "addressRegion"":(.*"")"), Chr$(34), vbNullString))
            Debug.Print Trim$(Replace$(GetString(re, s, "addressCountry"":(.*"")"), Chr$(34), vbNullString))
        End With
    End Sub
    
    Public Function GetString(ByVal re As Object, ByVal inputString As String, ByVal pattern As String) As Variant
        Dim matches As Object
    
        With re
            .Global = True
            .MultiLine = True
            .IgnoreCase = True
            .pattern = pattern
            If .test(inputString) Then
                Set matches = .Execute(inputString)
                GetString = matches(0).SubMatches(0)
                Exit Function
            End If
        End With
        GetString = "No match"
    End Function
    
    1. 我会使用 Instr 寻找 "address":

    【讨论】:

    • 感谢 Qharr 先生,您的解决方案非常完美...... Debug.Print html.querySelector(".fn").innerText 我想要的标题是 Abhay Varieties,但我想要完整的标题行看起来像“Abhay Varieties - 阿加尔塔拉的化妆品经销商 - Justdial”,先生,你能在我的图片中看到吗?我在第一张图片中制作了红线框
    • 今晚晚些时候我会为你补充
    • 请问先生如何从 URL 中提取完整标题
    • 我已将其添加进去。您是否运行了上面的最新编辑?底层版本
    猜你喜欢
    • 1970-01-01
    • 2020-07-14
    • 2013-11-27
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2019-02-08
    • 2021-05-06
    相关资源
    最近更新 更多