【发布时间】:2015-01-16 03:53:05
【问题描述】:
class AljazeeraSpider(XMLFeedSpider):
name = "aljazeera"
allowed_domains = ["aljazeera.com"]
start_urls = [
'http://www.aljazeera.com/',
]
def parse(self, response):
hxs = HtmlXPathSelector(response) # The xPath selector
titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
if not titles:
MailNotify().send_mail("Aljazeera", "Scraper Report")
items = []
for titles in titles:
item = NewsItem()
item['title'] = escape(''.join(titles.select('a/text()').extract()))
item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/@href').extract()))
item['description'] = ''
item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
items.append(item)
return items
def parse_detail(self, response):
item = response.meta['item']
sel = HtmlXPathSelector(response)
detail = sel.select('//td[@class = "DetailedSummary"]')
item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
item['location'] = ''
published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
item['published_date'] = escape(''.join(published_date.select('text()').extract()))
return item
我目前正在使用 Scrapy 来抓取网站。我对python中的unittest有一些了解。但是,我如何编写单元测试来检查链接是否正常工作,item['location']、item['details'] 是否返回值?我学过 Scrapy 合约,但是什么都看不懂。那么,这种情况下怎么写 unittest 呢?
【问题讨论】:
标签: python unit-testing web-scraping scrapy scrapy-spider