【问题标题】:XPath Query: need to get text from HTML [closed]XPath 查询:需要从 HTML 中获取文本 [关闭]
【发布时间】:2018-05-08 18:25:37
【问题描述】:

我正在尝试从这个 HTML 中提取文本“(TLE) THE LEARNING EXPERIENCE”。我尝试了许多 XPath,但似乎都不起作用。下面代码中的 XPath 在 Octoparse 中有效,但在我的代码中却取不到任何结果。

// Loads the search page with HtmlAgilityPack and prints the inner text of every
// node matched by the XPath expression below.
static void Main(string[] args)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument document = web.Load("http://w1.lara.state.mi.us/ChildCareSearch"); //The URL to get the Info From

    // SelectNodes returns null (not an empty collection) when the XPath matches nothing,
    // so the result has to be checked before it is enumerated.
    var matchedNodes = document.DocumentNode.SelectNodes("//main[@id='main']/div[@class='container']/div[@class='row']/div[@class='container']/div[@class='col-sm-12']/div[2]/fieldset[@class='form-horizontal']/div[@id='SearchResultsContainer']/div[@class='pq-grid-center']/div[@class='pq-grid-cont-outer']/div[@class='pq-grid-cont']/div[@class='pq-grid-cont-inner']/table[@id='ExitTable']/tbody/tr[@class='pq-grid-row  pq-grid-oddRow '][1]/td[@id='ExitTable']/a[@class='blue-link']");

    if (matchedNodes == null)
    {
        Console.WriteLine("No nodes matched the XPath expression.");
    }
    else
    {
        foreach (var node in matchedNodes)
        {
            Console.WriteLine(node.InnerText);
        }
    }

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}

【问题讨论】:

    标签: c# html xpath


    【解决方案1】:

    您的 XPath 在代码中不起作用。为什么?查询本身是有效的。顺便说一句,您可以用一款好用的浏览器(开发者工具)直接生成 XPath,例如:

    /html/body/div[1]/main/div/div/div/div/div[2]/fieldset/div[14]/div[2]/div[2]/div/div/table/tbody/tr[2]/td[3]/a
    

    好的,但为什么这个 XPath 也不起作用?我们一步步来看。首先,看看加载器实际加载到了什么:使用调试器查看 HtmlDocument 的内容。如果在其中搜索“(TLE) THE LEARNING EXPERIENCE”,你会发现根本找不到。怎么回事?会不会是页面通过 Ajax 加载数据?这很容易验证:打开浏览器开发者工具的“网络”面板并观察请求。

    哦!你很幸运。我们刚好找到了一个 POST 请求,它返回一个包含所有内容的 JSON!有现成的 API 可用,正是爬虫开发者梦寐以求的!

    {"Data":[{"CdcLicNbr":"DC630295306","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC630295306\"\u003e(TLE) THE LEARNING EXPERIENCE\u003c/a\u003e","CdcAddr":"23500 ORCHARD LAKE RD. ","CdcCity":"FARMINGTON HILLS","CdcZip":"48336","CdcLicName":"TLE AT FARMINGTON HILLS LIMITED LIABILITY COMPANY","CdcType":"DC","CdcCnty":63,"CntyDesc":"OAKLAND","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC330299143","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC330299143\"\u003e100 ACRE WOOD DAYCARE CENTER\u003c/a\u003e","CdcAddr":"1340 ONONDAGA RD. ","CdcCity":"HOLT","CdcZip":"48842","CdcLicName":"100 ACRE WOOD DAYCARE","CdcType":"DC","CdcCnty":33,"CntyDesc":"INGHAM","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC630347309","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC630347309\"\u003e1ST ADVANTAGE LEARNING CENTER\u003c/a\u003e","CdcAddr":"26555 JOHN R ROAD ","CdcCity":"MADISON HEIGHTS","CdcZip":"48071","CdcLicName":"1ST ADVANTAGE LEARNING CENTER, LLC","CdcType":"DC","CdcCnty":63,"CntyDesc":"OAKLAND","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC820289306","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC820289306\"\u003e2 DAY\u0027S CHILD LEARNING CENTER\u003c/a\u003e","CdcAddr":"15075 MEYERS ","CdcCity":"DETROIT","CdcZip":"48227","CdcLicName":"2 DAY\u0027S CHILD LEARNING CENTER, INC.","CdcType":"DC","CdcCnty":82,"CntyDesc":"WAYNE","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DF820310739","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DF820310739\"\u003e2 SWEETS ANGELS FAMILY DAYCARE\u003c/a\u003e","CdcAddr":"9701 EVERTS STREET ","CdcCity":"DETROIT","CdcZip":"48224","CdcLicName":"SMITH-HALE, TRACY DENISE","CdcType":"DF","CdcCnty":82,"CntyDesc":"WAYNE","FacilityType":"CHILD CARE FAMILY HOME (CAPACITY 1-6)"},{"CdcLicNbr":"DC390277285","CdcName":"\u003ca 
class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390277285\"\u003e21ST CENTURY - MILWOOD MAGNET\u003c/a\u003e","CdcAddr":"2916 KONKLE ","CdcCity":"KALAMAZOO","CdcZip":"49001","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303300","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303300\"\u003e21ST CENTURY/KCIS - HILLSIDE MIDDLE SCHOOL\u003c/a\u003e","CdcAddr":"1941 ALAMO AVE. ","CdcCity":"KALAMAZOO","CdcZip":"49006","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303302","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303302\"\u003e21ST CENTURY/KCIS - LINDEN GROVE MIDDLE SCHOOL\u003c/a\u003e","CdcAddr":"4241 ARBORETUM PKWY. ","CdcCity":"KALAMAZOO","CdcZip":"49006","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303299","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303299\"\u003e21ST CENTURY/KCIS - MAPLE STREET MAGNET\u003c/a\u003e","CdcAddr":"922 W. MAPLE ST. ","CdcCity":"KALAMAZOO","CdcZip":"49008","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DF410384743","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DF410384743\"\u003e3 MUSKETEERS DAYCARE\u003c/a\u003e","CdcAddr":"1388 LANCASTER AVE NW ","CdcCity":"GRAND RAPIDS","CdcZip":"49504","CdcLicName":"BIGELOW, ADELE","CdcType":"DF","CdcCnty":41,"CntyDesc":"KENT","FacilityType":"CHILD CARE FAMILY HOME (CAPACITY 1-6)"}],"TotalCount":9248,"CurrentPage":1}
    

    这样,我们就把网页抓取问题转换成了对 JSON 的解析和遍历。

    Try it online

    /// <summary>
    /// Posts the grid paging parameters to the search endpoint, deserializes the JSON
    /// reply and prints the name of the first facility with its anchor markup removed.
    /// </summary>
    public static void Main()
    {
        // We use the POST url that the page itself calls to fill its grid.
        var url = "http://w1.lara.state.mi.us/ChildCareSearch/Home/SearchResults";

        // A WebClient is enough to execute the POST request. We don't need HAP anymore.
        using (WebClient client = new WebClient())
        {
            // You can change these parameters to get another page or more elements.
            var reqparm = new System.Collections.Specialized.NameValueCollection
            {
                {"pq_datatype", "JSON"}, {"pq_curpage", "1"}, {"pq_rpp", "10"}
            };
            var responsebytes = client.UploadValues(url, "POST", reqparm);
            var json = Encoding.UTF8.GetString(responsebytes);

            // Everything is deserialized here. Look into Json.NET if you don't need a whole object.
            var root = JsonConvert.DeserializeObject<RootObject>(json);

            // An empty body or a search without hits leaves nothing to index into,
            // so report that instead of failing on Data[0].
            if (root == null || root.Data == null || root.Data.Count == 0)
            {
                Console.WriteLine("The search returned no results.");
                return;
            }

            var name = TrimAnchor(root.Data[0].CdcName);
            Console.WriteLine(name);
        }
    }
    
    /// <summary>
    /// Extracts the inner text from a value wrapped in a single HTML anchor,
    /// e.g. <c>&lt;a class="blue-link" href="..."&gt;NAME&lt;/a&gt;</c> becomes <c>NAME</c>.
    /// </summary>
    /// <param name="node">The raw value, expected to be anchor markup; may be null or plain text.</param>
    /// <returns>
    /// The text between the opening and closing anchor tags; the input unchanged when it
    /// is not anchor markup; an empty string when the input is null or empty.
    /// </returns>
    public static string TrimAnchor(string node)
    {
        if (string.IsNullOrEmpty(node))
        {
            return string.Empty;
        }

        // Only strip when the value really starts with an <a ...> tag. Otherwise a plain
        // name that merely contains '>' (e.g. "A > B") would lose everything before it.
        var probe = node.TrimStart();
        var startsWithAnchor = probe.Length >= 3
            && probe.StartsWith("<a", StringComparison.OrdinalIgnoreCase)
            && (probe[2] == '>' || char.IsWhiteSpace(probe[2]));
        if (!startsWithAnchor)
        {
            return node;
        }

        // Drop the opening tag (everything up to the first '>') and the closing tag.
        var openingTagEnd = node.IndexOf('>');
        return node.Substring(openingTagEnd + 1).Replace("</a>", "");
    }
    
    /// <summary>
    /// One entry of the "Data" array in the search response; property names mirror
    /// the JSON keys of the sample response so the deserializer can bind them.
    /// </summary>
    public class Datum
    {
        /// <summary>Value of the "CdcLicNbr" key (e.g. "DC630295306"); also appears in the profile link.</summary>
        public string CdcLicNbr { get; set; }
        /// <summary>Facility name as returned by the service: an HTML anchor wrapping the display name, not plain text.</summary>
        public string CdcName { get; set; }
        /// <summary>Value of the "CdcAddr" key (e.g. "23500 ORCHARD LAKE RD. ").</summary>
        public string CdcAddr { get; set; }
        /// <summary>Value of the "CdcCity" key (e.g. "FARMINGTON HILLS").</summary>
        public string CdcCity { get; set; }
        /// <summary>Value of the "CdcZip" key, kept as a string (e.g. "48336").</summary>
        public string CdcZip { get; set; }
        /// <summary>Value of the "CdcLicName" key; presumably the licensee name (TODO confirm).</summary>
        public string CdcLicName { get; set; }
        /// <summary>Value of the "CdcType" key ("DC" or "DF" in the sample response).</summary>
        public string CdcType { get; set; }
        /// <summary>Numeric value of the "CdcCnty" key (e.g. 63 next to CntyDesc "OAKLAND"); presumably a county code (TODO confirm).</summary>
        public int CdcCnty { get; set; }
        /// <summary>Value of the "CntyDesc" key (e.g. "OAKLAND").</summary>
        public string CntyDesc { get; set; }
        /// <summary>Value of the "FacilityType" key (e.g. "CHILD CARE CENTER").</summary>
        public string FacilityType { get; set; }
    }
    
    /// <summary>
    /// Top-level shape of the JSON returned by the search endpoint:
    /// a "Data" array plus the "TotalCount" and "CurrentPage" values.
    /// </summary>
    public class RootObject
    {
        /// <summary>Entries of the "Data" array, one <see cref="Datum"/> per returned row.</summary>
        public List<Datum> Data { get; set; }
        /// <summary>Value of the "TotalCount" key (9248 in the sample); presumably the number of records across all pages (TODO confirm).</summary>
        public int TotalCount { get; set; }
        /// <summary>Value of the "CurrentPage" key (1 in the sample response).</summary>
        public int CurrentPage { get; set; }
    }
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2016-05-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多