我正在尝试抓取一个 aspx 网站，但无法超越 page2答案

【问题标题】：I am trying to scrape a aspx website but unable to go beyond page2我正在尝试抓取一个 aspx 网站，但无法超越 page2
【发布时间】：2016-03-04 17:31:32
【问题描述】：

我正在尝试抓取一个 aspx 网站：https://www.aae.org/patients/find.aspx。出于测试目的，请使用 33133 作为邮政编码和 100 作为半径。

最初我通过遍历搜索页面来收集个人资料链接，我成功地在第一页获得了前 20 个链接，但无法超出第 1 页，消息来源说 - '我们很抱歉，页面或文件找不到你要找的'

请看我下面的代码：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv

profile_links = []

def result_checker(self):
    No_results = self.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(No_results):
        print (str(No_results).replace("['","").replace(".']","")+" for other zipcodes")
        time.sleep(10)
        sys.exit()
    else:
        pass

def Get_data(zipcode, radius):
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
                'Host':'www.tcms.com',
                'Origin':'https://www.aae.org',
                'Referer':'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
        ('EktronClientManager',EktronClientManager),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))

    encodedFields = urllib.parse.urlencode(formData)
    f1 = myopener.open(url, encodedFields)
    source = f1.read()
    target = open('sample.txt','w')
    target.write(str(source))
    target.close()
    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" and "AddressID" in each:
            print (each)
            profile_links.append("https://www.aae.org/patients/"+str(each))
        else:
            pass

    j = 2
    soup2 = BeautifulSoup(source,'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$'+str(j)
        print (pages,'\n---------------')
        formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                    ('__EVENTARGUMENT',pages),
                    ('__VIEWSTATE',viewstate),
                    ('__EVENTVALIDATION',eventvalidation),
                    ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))

        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        target = open('sample.txt','w')
        target.write(str(source2))
        target.close()
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" and "AddressID" in each1:
                print (each1)
                profile_links.append("https://www.aae.org/patients/"+str(each1))
            else:
                pass
        soup3 = BeautifulSoup(source2,'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j+=1

if __name__ == "__main__":
    #Get_data('38132', 5)
    Get_data('33133', 100)

【问题讨论】：

有什么建议吗？
@Greg，感谢您的编辑建议，请看看我的代码并找出我做错了什么？
代码有点长，很难快速找到问题所在。抓取 ASP.NET 网站是一件令人头疼的事……您是否确保在请求之间保存并传递您的 cookie？
我找不到任何有关 FancyURLopener 处理会话/cookie 的信息。加载第一页是有道理的（因为没有会话并且服务器尚未向您发送任何 cookie），但如果您不发回 cookie，则其他页面将无法正常工作。
有道理！ cookie jar 解决了问题吗？

标签： python web-scraping urllib python-3.5

【解决方案1】：

是的 Greg Sadetsky，您对 cookie 的看法是绝对正确的，需要创建一个会话然后传递所有带有所需数据参数的 POST 请求。

在 Requests lib 的帮助下，我能够创建一个会话来存储可以跨请求使用的 cookie。

import requests
from bs4 import BeautifulSoup
from requests import Request, Session
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    s = requests.Session()
    r = s.get(url)
    #print (r.text.encode('utf-8'))
    soup = BeautifulSoup(r.content,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']
    params = {'EktronClientManager':EktronClientManager,
              '__VIEWSTATE':viewstate,
              '__EVENTVALIDATION':eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius':radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode':zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind':'SEARCH'}
    r2 = s.post(url,data=params)
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" and "AddressID" in each:
            print (each)
            All_links.append("https://www.aae.org/patients/"+str(each))
    #print (r2.content)
    soup1 = BeautifulSoup(r2.content,'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
    j = 2
    while j < 7:
        page = 'Page$'+str(j)
        print (page)
        params1 = {'__EVENTTARGET':'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT':page,
                   'EktronClientManager':EktronClientManager,
                   '__VIEWSTATE':viewstate,
                   '__EVENTVALIDATION':eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search'}
        r3 = s.post(url,data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" and "AddressID" in each1:
                print (each1)
                All_links.append("https://www.aae.org/patients/"+str(each1))
        soup2 = BeautifulSoup(r3.content,'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j+=1

Get_data(33133, 100)

【讨论】：