【发布时间】:2016-03-04 17:31:32
【问题描述】:
我正在尝试抓取一个 aspx 网站:https://www.aae.org/patients/find.aspx。出于测试目的,请使用 33133 作为邮政编码和 100 作为半径。
最初我通过遍历搜索页面来收集个人资料链接,我成功地在第一页获得了前 20 个链接,但无法超出第 1 页,消息来源说 - '我们很抱歉,页面或文件找不到你要找的'
请看我下面的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv
profile_links = []
def result_checker(self):
No_results = self.xpath('//td[@colspan="3"]//p//text()')
if "No results" in str(No_results):
print (str(No_results).replace("['","").replace(".']","")+" for other zipcodes")
time.sleep(10)
sys.exit()
else:
pass
def Get_data(zipcode, radius):
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
'Connection':'keep-alive',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Host':'www.tcms.com',
'Origin':'https://www.aae.org',
'Referer':'https://www.aae.org/patients/find.aspx'}
class MyOpener(urllib.request.FancyURLopener):
version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'https://www.aae.org/patients/find.aspx'
f = myopener.open(url)
soup = BeautifulSoup(f,'lxml')
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
EktronClientManager = soup.select("#EktronClientManager")[0]['value']
formData = (
('__EVENTVALIDATION', eventvalidation),
('__VIEWSTATE', viewstate),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
('EktronClientManager',EktronClientManager),
('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))
encodedFields = urllib.parse.urlencode(formData)
f1 = myopener.open(url, encodedFields)
source = f1.read()
target = open('sample.txt','w')
target.write(str(source))
target.close()
source1 = html.fromstring(source)
result_checker(source1)
links = source1.xpath("//table[@class='Results']//tr//a//@href")
for each in links:
if "MemberID" and "AddressID" in each:
print (each)
profile_links.append("https://www.aae.org/patients/"+str(each))
else:
pass
j = 2
soup2 = BeautifulSoup(source,'lxml')
viewstate = soup2.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
while j < 5:
pages = 'Page$'+str(j)
print (pages,'\n---------------')
formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
('__EVENTARGUMENT',pages),
('__VIEWSTATE',viewstate),
('__EVENTVALIDATION',eventvalidation),
('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))
encodedFields1 = urllib.parse.urlencode(formData1)
f2 = myopener.open(url, encodedFields1)
source2 = f2.read()
target = open('sample.txt','w')
target.write(str(source2))
target.close()
source3 = html.fromstring(source2)
links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
for each1 in links2:
if "MemberID" and "AddressID" in each1:
print (each1)
profile_links.append("https://www.aae.org/patients/"+str(each1))
else:
pass
soup3 = BeautifulSoup(source2,'lxml')
viewstate = soup3.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
j+=1
if __name__ == "__main__":
#Get_data('38132', 5)
Get_data('33133', 100)
【问题讨论】:
-
有什么建议吗?
-
@Greg,感谢您的编辑建议,请看看我的代码并找出我做错了什么?
-
代码有点长,很难快速找到问题所在。抓取 ASP.NET 网站是一件令人头疼的事……您是否确保在请求之间保存并传递您的 cookie?
-
我找不到任何有关 FancyURLopener 处理会话/cookie 的信息。加载第一页是有道理的(因为没有会话并且服务器尚未向您发送任何 cookie),但如果您不发回 cookie,则其他页面将无法正常工作。
-
有道理! cookie jar 解决了问题吗?
标签: python web-scraping urllib python-3.5