【问题标题】:Emailing items and logs with Scrapy使用 Scrapy 通过电子邮件发送项目和日志
【发布时间】:2013-04-28 07:41:51
【问题描述】:

我正在尝试让 Scrapy 在爬虫完成或中断时向我发送电子邮件。已经有一个用于发送统计信息的内置扩展,但我想将蜘蛛的错误附加为<spidername>-errors.log,将抓取的项目附加为<spidername>-items.json

我已将回调连接到每个信号,但由于某种原因,只有最后一个在触发:

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import cStringIO as StringIO
except ImportError:
    from StringIO import StringIO

class StatusMailer(object):
    """Scrapy extension that mails per-spider items and errors on close.

    Scraped items and spider error tracebacks are accumulated in in-memory
    buffers keyed by attachment file name; when the spider closes, every
    buffer is attached to a single e-mail sent to the configured recipients.
    """

    def __init__(self, recipients, mail, crawler):
        """Store the recipients and mail sender and set up empty buffers."""
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        # One buffer per attachment name, created lazily on first write.
        self.files = defaultdict(StringIO)
        self.mail = mail
        self.recipients = recipients

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings and hook up its signals.

        Raises NotConfigured when the STATUSMAILER_RCPTS setting is empty.
        """
        settings = crawler.settings
        rcpts = settings.getlist("STATUSMAILER_RCPTS")
        if not rcpts:
            raise NotConfigured

        mailer = cls(rcpts, MailSender.from_settings(settings), crawler)

        handlers = (
            (mailer.item_scraped, signals.item_scraped),
            (mailer.spider_error, signals.spider_error),
            (mailer.spider_closed, signals.spider_closed),
        )
        for handler, sig in handlers:
            crawler.signals.connect(handler, signal=sig)

        return mailer

    def item_scraped(self, item, response, spider):
        """Append the JSON-encoded item, one per line, to '<spider>.json'."""
        buf = self.files[spider.name + '.json']
        buf.write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        """Append the failure's traceback to '<spider>-errors.log'."""
        buf = self.files[spider.name + '-errors.log']
        buf.write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        """Send one e-mail with every accumulated buffer attached.

        The buffers are handed over as-is, i.e. positioned wherever their
        last write left them. Returns whatever ``self.mail.send`` returns.
        """
        subject = "Crawler for %s finished" % spider.name

        attachments = []
        for filename, buf in self.files.items():
            attachments.append((filename, 'text/plain', buf))

        return self.mail.send(
            to=self.recipients,
            subject=subject,
            body="",
            attachs=attachments,
        )

有什么方法可以从 Scrapy 中访问导出的项目和蜘蛛的错误(可能在这些消息打印到控制台之前制作某种挂钩来拦截这些消息)?

【问题讨论】:

    标签: python email scrapy


    【解决方案1】:

    嗯,看起来问题比我想象的要简单得多。在把内容全部写入 StringIO 实例之后,必须先将它们“倒回”(rewind,即调用 seek(0))到开头;否则读取附件时会从缓冲区末尾开始,得到的内容是空的:

    def spider_closed(self, spider):
        """Rewind every buffer, then mail them all as text attachments.

        Each buffer in ``self.files`` is repositioned to its start so that
        a later read yields everything written to it. Returns whatever
        ``self.mail.send`` returns.
        """
        for buf in self.files.values():
            buf.seek(0)

        attachments = [
            (filename, 'text/plain', buf)
            for filename, buf in self.files.items()
        ]

        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=attachments,
        )
    

    对于任何感兴趣的人,这是我写的电子邮件扩展(extension):

    import gzip
    import datetime
    
    from scrapy import signals
    from scrapy.mail import MailSender
    from scrapy.exceptions import NotConfigured
    from scrapy.utils.serialize import ScrapyJSONEncoder
    
    from collections import defaultdict
    
    try:
        from cStringIO import cStringIO as StringIO
    except ImportError:
        from StringIO import StringIO
    
    def format_size(size):
        """Return *size* (a byte count) as a human-readable string.

        The value is divided by 1024 until it drops below 1024 or the largest
        known unit is reached, then formatted with one decimal place, e.g.
        ``"1.5 KB"``. Sizes beyond the largest unit are still reported in that
        unit instead of falling off the end of the loop and returning None.
        """
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']

        for unit in units[:-1]:
            if size < 1024.0:
                return "%3.1f %s" % (size, unit)

            size /= 1024.0

        # Anything this large is expressed in the biggest unit available.
        return "%3.1f %s" % (size, units[-1])
    
    class GzipCompressor(gzip.GzipFile):
        """Gzip writer backed by an in-memory PlainCompressor buffer.

        Data written to this object is compressed into the backing buffer;
        ``read()`` returns the compressed bytes so the object can be passed
        as a mail attachment. ``extension`` and ``mimetype`` describe the
        resulting attachment.
        """
        extension = '.gz'
        mimetype = 'application/gzip'

        def __init__(self):
            # Keep our own handle on the backing buffer: GzipFile drops its
            # reference to ``fileobj`` once the gzip stream is closed.
            self._sink = PlainCompressor()
            super(GzipCompressor, self).__init__(fileobj=self._sink, mode='w')

        def read(self, *args, **kwargs):
            """Finish the gzip stream and return the compressed data.

            The stream has to be closed first: until then the compressor may
            still hold buffered data and the gzip trailer has not been
            written, so the bytes in the backing buffer would not form a
            valid archive. Closing a GzipFile does not close the ``fileobj``
            it was given, so the buffer stays readable. No further writes
            are possible after the first call.
            """
            if not self.closed:
                self.close()

            return self._sink.read(*args, **kwargs)
    
    class PlainCompressor(StringIO):
        """Uncompressed in-memory buffer exposing the compressor interface.

        ``extension`` is the suffix appended to the attachment name (none
        here) and ``mimetype`` is the attachment's content type.
        """
        extension = ''
        mimetype = 'text/plain'
    
        def read(self, *args, **kwargs):
            """Read from the beginning of the buffer.

            The position is reset to 0 before every read, so data written so
            far is returned regardless of where the last write left off.
            """
            self.seek(0)
    
            # Call the base-class implementation directly on this instance.
            return StringIO.read(self, *args, **kwargs)
    
        @property
        def size(self):
            """Length of the buffer's full contents, as given by getvalue()."""
            return len(self.getvalue())
    
    class StatusMailer(object):
        """Scrapy extension that e-mails a crawl report when a spider closes.

        While the spider runs, scraped items are collected into
        '<spider>-items.json' (one JSON document per line) and received
        requests plus error tracebacks into '<spider>.log'. On close, both
        buffers are attached to a summary e-mail, optionally gzip-compressed.
        """

        def __init__(self, recipients, mail, compressor, crawler):
            """Store collaborators and create the lazily-filled buffer map.

            ``compressor`` is a zero-argument factory (PlainCompressor or
            GzipCompressor) used to create one buffer per attachment name.
            """
            self.recipients = recipients
            self.mail = mail
            self.encoder = ScrapyJSONEncoder(crawler=crawler)
            self.files = defaultdict(compressor)

            self.num_items = 0
            self.num_errors = 0

        @classmethod
        def from_crawler(cls, crawler):
            """Build the extension from crawler settings and connect signals.

            Raises NotConfigured when STATUSMAILER_RECIPIENTS is empty or
            STATUSMAILER_COMPRESSION is set to something not starting with
            'gz' (case-insensitive).
            """
            recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
            compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

            if not compression:
                compressor = PlainCompressor
            elif compression.lower().startswith('gz'):
                compressor = GzipCompressor
            else:
                raise NotConfigured

            if not recipients:
                raise NotConfigured

            mail = MailSender.from_settings(crawler.settings)
            instance = cls(recipients, mail, compressor, crawler)

            crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
            crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
            crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
            crawler.signals.connect(instance.request_received, signal=signals.request_received)

            return instance

        def item_scraped(self, item, response, spider):
            """Append the JSON-encoded item as its own line and count it."""
            # The trailing newline keeps consecutive items separable; without
            # it the attachment is one unparseable run of JSON objects.
            self.files[spider.name + '-items.json'].write(self.encoder.encode(item) + '\n')
            self.num_items += 1

        def spider_error(self, failure, response, spider):
            """Append the failure's traceback to the spider log and count it."""
            traceback_text = failure.getTraceback()

            # Make sure the next log entry starts on a fresh line.
            if not traceback_text.endswith('\n'):
                traceback_text += '\n'

            self.files[spider.name + '.log'].write(traceback_text)
            self.num_errors += 1

        def request_received(self, request, spider):
            """Append the request's string form to the spider log."""
            self.files[spider.name + '.log'].write(str(request) + '\n')

        def spider_closed(self, spider, reason):
            """Mail the crawl statistics with all collected buffers attached.

            Returns whatever ``self.mail.send`` returns.
            """
            files = [
                (name + compressed.extension, compressed.mimetype, compressed)
                for name, compressed in self.files.items()
            ]

            # self.files is a defaultdict, so a plain lookup never raises
            # KeyError and would silently create an empty buffer; test for
            # membership instead.
            items_name = spider.name + '-items.json'
            if items_name in self.files:
                size = self.files[items_name].size
            else:
                size = 0

            body='''Crawl statistics:
    
     - Spider name: {0}
     - Spider finished at: {1}
     - Number of items scraped: {2}
     - Number of errors: {3}
     - Size of scraped items: {4}'''.format(
                spider.name,
                datetime.datetime.now(),
                self.num_items,
                self.num_errors,
                format_size(size)
            )

            return self.mail.send(
                to=self.recipients,
                subject='Crawler for %s: %s' % (spider.name, reason),
                body=body,
                attachs=files
            )
    

    将其添加到您的settings.py

    # Register the StatusMailer extension. Replace 'your_package.extensions'
    # with the module path where the StatusMailer class actually lives.
    # NOTE(review): 80 is presumably the extension's load-order value; confirm
    # against the Scrapy EXTENSIONS setting documentation.
    EXTENSIONS = {
        'your_package.extensions.StatusMailer': 80
    }
    

    并配置它:

    # Addresses the report is sent to. StatusMailer.from_crawler raises
    # NotConfigured (the extension is not enabled) while this list is empty.
    STATUSMAILER_RECIPIENTS = []
    # Any value starting with 'gz' (case-insensitive) gzips the attachments;
    # an unset/empty value sends them as plain text. Any other value makes
    # from_crawler raise NotConfigured.
    STATUSMAILER_COMPRESSION = 'gzip'
    #STATUSMAILER_COMPRESSION = None
    
    # Mail settings handed to MailSender.from_settings(crawler.settings).
    # NOTE(review): presumably SMTP host, port and credentials; confirm the
    # exact semantics against the Scrapy mail settings documentation.
    MAIL_HOST = 'smtp.gmail.com'
    MAIL_PORT = 587
    MAIL_USER = ''
    MAIL_PASS = ''
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2021-06-16
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多