您的 XPath 无法正常工作。为什么?查询本身是有效的。顺便说一句,您可以用一款好用的浏览器像这样生成它:
/html/body/div[1]/main/div/div/div/div/div[2]/fieldset/div[14]/div[2]/div[2]/div/div/table/tbody/tr[2]/td[3]/a
好的,但为什么这个 XPath 同样不起作用?那我们就来看一看。首先,看看加载器实际加载了什么:使用调试器查看 HtmlDocument。如果您在其中搜索“(TLE) THE LEARNING EXPERIENCE”,会发现根本找不到它。怎么回事?嗯,会不会是页面使用 Ajax 加载数据?这很容易验证:打开浏览器开发者工具的“网络”面板并观察请求。
哦!您很幸运。我们找到了一个 POST 请求,它返回一个包含全部内容的 JSON!API 是爬虫开发者的梦想!
{"Data":[{"CdcLicNbr":"DC630295306","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC630295306\"\u003e(TLE) THE LEARNING EXPERIENCE\u003c/a\u003e","CdcAddr":"23500 ORCHARD LAKE RD. ","CdcCity":"FARMINGTON HILLS","CdcZip":"48336","CdcLicName":"TLE AT FARMINGTON HILLS LIMITED LIABILITY COMPANY","CdcType":"DC","CdcCnty":63,"CntyDesc":"OAKLAND","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC330299143","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC330299143\"\u003e100 ACRE WOOD DAYCARE CENTER\u003c/a\u003e","CdcAddr":"1340 ONONDAGA RD. ","CdcCity":"HOLT","CdcZip":"48842","CdcLicName":"100 ACRE WOOD DAYCARE","CdcType":"DC","CdcCnty":33,"CntyDesc":"INGHAM","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC630347309","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC630347309\"\u003e1ST ADVANTAGE LEARNING CENTER\u003c/a\u003e","CdcAddr":"26555 JOHN R ROAD ","CdcCity":"MADISON HEIGHTS","CdcZip":"48071","CdcLicName":"1ST ADVANTAGE LEARNING CENTER, LLC","CdcType":"DC","CdcCnty":63,"CntyDesc":"OAKLAND","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC820289306","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC820289306\"\u003e2 DAY\u0027S CHILD LEARNING CENTER\u003c/a\u003e","CdcAddr":"15075 MEYERS ","CdcCity":"DETROIT","CdcZip":"48227","CdcLicName":"2 DAY\u0027S CHILD LEARNING CENTER, INC.","CdcType":"DC","CdcCnty":82,"CntyDesc":"WAYNE","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DF820310739","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DF820310739\"\u003e2 SWEETS ANGELS FAMILY DAYCARE\u003c/a\u003e","CdcAddr":"9701 EVERTS STREET ","CdcCity":"DETROIT","CdcZip":"48224","CdcLicName":"SMITH-HALE, TRACY DENISE","CdcType":"DF","CdcCnty":82,"CntyDesc":"WAYNE","FacilityType":"CHILD CARE FAMILY HOME (CAPACITY 1-6)"},{"CdcLicNbr":"DC390277285","CdcName":"\u003ca class=\"blue-link\" 
href=\"/ChildCareSearch/Home/FacilityProfile/DC390277285\"\u003e21ST CENTURY - MILWOOD MAGNET\u003c/a\u003e","CdcAddr":"2916 KONKLE ","CdcCity":"KALAMAZOO","CdcZip":"49001","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303300","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303300\"\u003e21ST CENTURY/KCIS - HILLSIDE MIDDLE SCHOOL\u003c/a\u003e","CdcAddr":"1941 ALAMO AVE. ","CdcCity":"KALAMAZOO","CdcZip":"49006","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303302","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303302\"\u003e21ST CENTURY/KCIS - LINDEN GROVE MIDDLE SCHOOL\u003c/a\u003e","CdcAddr":"4241 ARBORETUM PKWY. ","CdcCity":"KALAMAZOO","CdcZip":"49006","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DC390303299","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DC390303299\"\u003e21ST CENTURY/KCIS - MAPLE STREET MAGNET\u003c/a\u003e","CdcAddr":"922 W. MAPLE ST. ","CdcCity":"KALAMAZOO","CdcZip":"49008","CdcLicName":"KALAMAZOO PUBLIC SCHOOLS","CdcType":"DC","CdcCnty":39,"CntyDesc":"KALAMAZOO","FacilityType":"CHILD CARE CENTER"},{"CdcLicNbr":"DF410384743","CdcName":"\u003ca class=\"blue-link\" href=\"/ChildCareSearch/Home/FacilityProfile/DF410384743\"\u003e3 MUSKETEERS DAYCARE\u003c/a\u003e","CdcAddr":"1388 LANCASTER AVE NW ","CdcCity":"GRAND RAPIDS","CdcZip":"49504","CdcLicName":"BIGELOW, ADELE","CdcType":"DF","CdcCnty":41,"CntyDesc":"KENT","FacilityType":"CHILD CARE FAMILY HOME (CAPACITY 1-6)"}],"TotalCount":9248,"CurrentPage":1}
这样,我们就把一次网页抓取变成了对 JSON 的解析与遍历。
Try it online
/// <summary>
/// Sends the paging parameters to the search endpoint with a POST request,
/// deserializes the JSON reply and prints the name of the first returned row.
/// </summary>
public static void Main()
{
    // The page fills its table from this POST endpoint, so we call it directly.
    var url = "http://w1.lara.state.mi.us/ChildCareSearch/Home/SearchResults";

    // A WebClient is enough to execute the POST request; HAP is not needed anymore.
    using (WebClient client = new WebClient())
    {
        // Change these parameters to get another page or more elements.
        var reqparm = new System.Collections.Specialized.NameValueCollection
        {
            {"pq_datatype", "JSON"}, {"pq_curpage", "1"}, {"pq_rpp", "10"}
        };
        var responsebytes = client.UploadValues(url, "POST", reqparm);
        var json = Encoding.UTF8.GetString(responsebytes);

        // Everything is deserialized here. Look into Json.NET if you do not need the whole object.
        var root = JsonConvert.DeserializeObject<RootObject>(json);

        // An empty or "null" reply deserializes to null, and a page may contain no rows;
        // indexing Data[0] blindly would crash in both cases.
        if (root == null || root.Data == null || root.Data.Count == 0)
        {
            Console.Error.WriteLine("The search returned no results.");
            return;
        }

        var name = TrimAnchor(root.Data[0].CdcName);
        Console.WriteLine(name);
    }
}
/// <summary>
/// Returns the text wrapped by an HTML anchor such as
/// <c>&lt;a class="blue-link" href="..."&gt;NAME&lt;/a&gt;</c>.
/// </summary>
/// <param name="node">The markup to strip; may be null, empty, or plain text.</param>
/// <returns>
/// The input with its opening anchor tag and every <c>&lt;/a&gt;</c> removed.
/// Input that does not start with an anchor tag is returned unchanged, and
/// null or empty input yields an empty string.
/// </returns>
public static string TrimAnchor(string node)
{
    if (string.IsNullOrEmpty(node))
    {
        return string.Empty;
    }

    // Plain text may legitimately contain '>' (for example "A > B"), so the
    // opening tag is only stripped when the value really begins with "<a".
    string candidate = node.TrimStart();
    bool startsWithAnchor =
        candidate.Length > 2
        && candidate.StartsWith("<a", System.StringComparison.OrdinalIgnoreCase)
        && (candidate[2] == '>' || char.IsWhiteSpace(candidate[2]));
    if (!startsWithAnchor)
    {
        return node;
    }

    int openingTagEnd = node.IndexOf('>');
    if (openingTagEnd < 0)
    {
        // Unterminated opening tag: there is nothing reliable to strip.
        return node;
    }

    return node.Substring(openingTagEnd + 1).Replace("</a>", "");
}
/// <summary>
/// One entry of the <c>Data</c> array in the search response.
/// Property names match the JSON keys one-to-one.
/// </summary>
public class Datum
{
    /// <summary>Value of the "CdcLicNbr" key, e.g. "DC630295306" (presumably the licence number).</summary>
    public string CdcLicNbr { get; set; }

    /// <summary>
    /// Value of the "CdcName" key. In the sample response this is an HTML anchor
    /// wrapping the facility name, not the bare name.
    /// </summary>
    public string CdcName { get; set; }

    /// <summary>Value of the "CdcAddr" key, e.g. "23500 ORCHARD LAKE RD. ".</summary>
    public string CdcAddr { get; set; }

    /// <summary>Value of the "CdcCity" key, e.g. "FARMINGTON HILLS".</summary>
    public string CdcCity { get; set; }

    /// <summary>Value of the "CdcZip" key, e.g. "48336".</summary>
    public string CdcZip { get; set; }

    /// <summary>Value of the "CdcLicName" key (presumably the licensee's name).</summary>
    public string CdcLicName { get; set; }

    /// <summary>Value of the "CdcType" key; the sample response contains "DC" and "DF".</summary>
    public string CdcType { get; set; }

    /// <summary>Value of the "CdcCnty" key, a number such as 63 in the sample response.</summary>
    public int CdcCnty { get; set; }

    /// <summary>Value of the "CntyDesc" key, e.g. "OAKLAND".</summary>
    public string CntyDesc { get; set; }

    /// <summary>Value of the "FacilityType" key, e.g. "CHILD CARE CENTER".</summary>
    public string FacilityType { get; set; }
}
/// <summary>
/// Top-level object of the search response: one page of rows plus paging counters.
/// Property names match the JSON keys one-to-one.
/// </summary>
public class RootObject
{
    /// <summary>Rows carried by the "Data" array of the response.</summary>
    public List<Datum> Data { get; set; }

    /// <summary>Value of the "TotalCount" key (9248 in the sample response).</summary>
    public int TotalCount { get; set; }

    /// <summary>Value of the "CurrentPage" key (1 in the sample response).</summary>
    public int CurrentPage { get; set; }
}