【发布时间】:2015-06-05 16:15:02
【问题描述】:
我做过一些搜索(例如这个关于 Delta 的类似问题),但没能找到我需要的内容。我被卡住了:结果页面无法正常工作或加载……或者说,我不清楚它到底是哪一步没有做对。希望能得到一些这方面的见解。
我能够让爬虫(spider)通过免责声明页面(我想是这样,我甚至不能 100% 确定该如何检查它是否成功)。但是到了搜索页面,我就不知道该怎么做了。我的尝试如下。另外,这是我刚加入 Stack Overflow 后发的第一个帖子,如果我把代码格式弄乱了,非常抱歉。
from scrapy.spider import Spider
from scrapy.http import FormRequest
from time import sleep
class ccSpider(Spider):
    """Walk the courts.state.md.us case search: disclaimer -> search -> results.

    Each callback prints the URL it landed on, looks for a marker string to
    confirm that the expected page was reached and, if it was, submits the
    next form. When the marker is missing the callback stops the chain by
    returning None.
    """

    name = "courtsSpider"
    allowed_domains = ["courts.state.md.us"]
    start_urls = ["http://casesearch.courts.state.md.us"]

    # Throttle through Scrapy itself. Calling time.sleep() inside a callback
    # blocks the Twisted reactor, stalling every other in-flight request.
    download_delay = 1

    @staticmethod
    def _page_title(response):
        """Return the stripped <title> text of *response*, or '' if absent."""
        titles = response.xpath('//title/text()').extract()
        return titles[0].strip() if titles else ''

    def parse(self, response):
        """Accept the disclaimer on the start page.

        Returns a FormRequest that submits the form named 'main' with
        disclaimer=Y, or None when the disclaimer text is not on the page.
        """
        self.log('\n\n[Parse is Starting...]')
        print(response.url)
        # response.body is raw bytes; a bytes literal keeps this membership
        # test valid on Python 3 and is identical to a str on Python 2.
        if b"I have read" not in response.body:
            print("Disclaimer Page not Accessed\n\n")
            return None
        print("Disclaimer Page Accessed\n\n")
        return FormRequest.from_response(
            response,
            formname='main',
            formdata={'disclaimer': 'Y'},
            callback=self.parseSearchPage,
        )

    def parseSearchPage(self, response):
        """Fill in and submit the search criteria form.

        Returns a FormRequest that submits the form named 'inquiryForm' with
        a fixed first/last name, or None when the search page marker text is
        not on the page.
        """
        self.log('\n\n[Accessing Search Criteria Page...]')
        print(response.url)
        if b"Default is person" not in response.body:
            print("Search Page not Accessed\n\n")
            return None
        print("Search Page Accessed\n\n")
        return FormRequest.from_response(
            response,
            formname='inquiryForm',
            formdata={
                'lastName': 'SMITH',
                'firstName': 'JOHN',
                # Additional form fields, currently not submitted:
                # 'company': 'N',
                # 'middleName': '',
                # 'exactMatch': 'N',
                # 'site': '00',
                # 'courtSystem': 'B',
                # 'filingStart': '',
                # 'filingEnd': '',
                # 'filingData': '',
                # 'caseId': ''
            },
            callback=self.parseResultsPages,
        )

    def parseResultsPages(self, response):
        """Report whether the search results page was reached.

        Prints the page title in both cases so that an unexpected page can be
        identified; a page without a <title> prints an empty title instead of
        raising IndexError. Always returns None.
        """
        self.log('\n\n[Accessing Search Results Page...]')
        print(response.url)
        title = self._page_title(response)
        if b"items found" not in response.body:
            print("Results Page not Accessed\n\n")
            print("Title of Page: " + title)
            return None
        print("Results Page Accessed\n\n")
        # Expected to be the title of the search results page.
        print(title)
        return None
【问题讨论】:
标签: python html forms web-scraping scrapy