【发布时间】:2016-06-13 00:52:05
【问题描述】:
我正在构建一个简单的蜘蛛来抓取结构化网站并下载 *.txt 文件。除了自定义 FilesPipeline 类之外,我已经设法让一切正常工作。
我的目标是根据 url 位置将 *.txt 文件下载到目录中。如果我直接编辑 Scrapy 类,我可以实现我的目标(如下所示)
files.py -> FilesPipeline::file_path()
...
# return 'full/%s%s' % (media_guid, media_ext)
return url.split('example.com/')[1]
我想正确地重载课程但没有成功。我不确定我应该做些什么不同的事情。蜘蛛运行时不会出现警告或错误,但不会下载文件。
settings.py
ITEM_PIPELINES = {
'myspider.pipelines.MySpiderFilesPipeline': 1,
'myspider.pipelines.MySpiderPipeline': 300,
}
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from myspider.items import MySpiderItem
class SpideySpider(CrawlSpider):
name = 'spidey'
allowed_domains = ['example.com']
start_urls = ['http://example.com/']
rules = (
Rule(LinkExtractor(allow='', restrict_xpaths='//tr/td/a', deny_extensions='html'), callback='parse_item', follow=True),
)
def parse_item(self, response):
links = response.xpath('//tr/td/a')
for link in links:
i = MySpiderItem()
i['title'] = response.xpath('//title/text()').extract()
i['href'] = link.xpath('@href').extract()
i['text'] = link.xpath('text()').extract()
i["current_url"] = response.url
referring_url = response.request.headers.get('Referer', None)
i['referring_url'] = referring_url
i['depth'] = response.meta['depth']
if i['text'][0]:
if re.match('^#.*\.txt$', i['text'][0]) is not None:
i['file_urls'] = [ response.urljoin(i['href'][0]) ]
yield i
pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline, FSFilesStore
import json
import re
class MySpiderPipeline(object):
def __init__(self):
self.file = open('items.json', 'wb')
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if re.match('^#.*\.html$', item['text'][0]) is not None:
valid = False
raise DropItem("HTML File")
if re.match('^#.*\.txt$', item['text'][0]) is not None:
pass
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
class MySpiderFilesPipeline(FilesPipeline):
_url_breakstring = "example.com/"
def get_media_requests(self, item, info):
return [Request(x) for x in item.get(self.files_urls_field, [])]
def file_path(self, request, response=None, info=None):
return url.split(_url_breakstring)[1]
# media_guid = hashlib.sha1(to_bytes(url)).hexdigest() # change to request.url after deprecation
# media_ext = os.path.splitext(url)[1] # change to request.url after deprecation
# return 'full/%s%s' % (media_guid, media_ext)
【问题讨论】:
标签: python inheritance web-scraping scrapy scrapy-spider