【发布时间】:2014-03-10 02:31:46
【问题描述】:
我正在尝试抓取一个网站并仅从具有元 noindex 的页面进行解析。 发生的情况是爬虫爬取了第一级,但完成了第一页。它似乎没有遵循链接。 以下是我的代码:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class mydomainSpider(CrawlSpider):
name = "0resultsTest"
allowed_domains = ["www.mydomain.com"]
start_urls = ["http://www.mydomain.com/cp/3944"]
rules = (
Rule(SgmlLinkExtractor(allow=(),deny=()), callback="parse_items", follow= True,),
)
def _response_downloaded(self, response):
sel = HtmlXPathSelector(response)
if sel.xpath('//meta[@content="noindex"]'):
return super(mydomainSpider, self).parse_items(response)
return
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//html')
items = []
for site in sites:
item = Website()
item['url'] = response.url
item['referer'] = response.request.headers.get('Referer')
item['title'] = site.xpath('/html/head/title/text()').extract()
item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
items.append(item)
yield items
【问题讨论】:
标签: python web-crawler scrapy