1.导入包

scrapy item_loader

2.提取数据页面

scrapy item_loader

3.item.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

# import scrapy
#
#
# class BolespiderItem(scrapy.Item):
#     url = scrapy.Field()
#     title = scrapy.Field()
#     time = scrapy.Field()
#     sort = scrapy.Field()
#     content = scrapy.Field()
#     praise = scrapy.Field()
#     collect = scrapy.Field()
#     comment = scrapy.Field()
import scrapy, re
from scrapy.contrib.loader import ItemLoader
from datetime import datetime
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst


def convert_time(value):
    """Parse a ``YYYY/MM/DD`` date string into a ``datetime``.

    ``value`` is a single string taken from the extracted list, not the
    list itself. Every ``.`` character and any surrounding whitespace are
    removed before parsing.

    Args:
        value: Raw date text for one extracted entry.

    Returns:
        The parsed ``datetime``. If the cleaned text does not match
        ``%Y/%m/%d``, the current local time is returned instead.
    """
    cleaned = value.replace('.', '').strip()
    try:
        return datetime.strptime(cleaned, '%Y/%m/%d')
    except ValueError:
        # Best-effort fallback for unparsable dates. Only ValueError is
        # caught so that KeyboardInterrupt/SystemExit and genuine
        # programming errors are no longer silently swallowed.
        return datetime.now()


def convert_sort(value):
    """Blank out entries that contain the comment marker.

    Args:
        value: One extracted string.

    Returns:
        An empty string when ``value`` contains ``'评论 '`` (the marker
        including its trailing space); otherwise ``value`` unchanged.
    """
    if '评论 ' not in value:
        return value
    return ""


def convert_praise(value):
    """Extract the first integer found in a praise-count string.

    Per the original note, the extracted values may be just an empty
    string, or empty strings mixed with a number.

    Args:
        value: One extracted string.

    Returns:
        ``None`` when ``value`` is empty or whitespace-only; otherwise the
        first run of digits as an ``int``, or ``0`` when the non-blank
        text contains no digits.
    """
    if value.strip() == "":
        # NOTE(review): blank entries deliberately yield None rather than 0,
        # presumably so the loader discards them and a later numeric entry
        # is the one that survives -- confirm against the loader's behavior.
        return None
    first_number = re.search(r'\d+', value)
    return int(first_number.group()) if first_number else 0


def convert_collect(value):
    """Extract the first integer found in a collect-count string.

    Args:
        value: One extracted string.

    Returns:
        The first run of digits in ``value`` as an ``int``, or ``0`` when
        ``value`` contains no digits.
    """
    # Use a regex to pull out the number.
    match = re.search(r'\d+', value)
    if match is None:
        return 0
    return int(match.group())


def convert_comment(value):
    """Extract the first integer found in a comment-count string.

    Args:
        value: One extracted string.

    Returns:
        The first run of digits in ``value`` as an ``int``, or ``0`` when
        ``value`` contains no digits.
    """
    digit_runs = re.findall(r'\d+', value)
    return int(digit_runs[0]) if digit_runs else 0


class BolespiderItem(scrapy.Item):
    """Item definition whose fields carry their own loader processors.

    Each field declares an ``output_processor`` and, where the raw text
    needs cleaning, an ``input_processor`` that maps one of the module's
    ``convert_*`` helpers over every extracted value.
    """

    # Stored as extracted; the first value is kept.
    title = scrapy.Field(output_processor=TakeFirst())

    # Each value is parsed by convert_time; the first result is kept.
    time = scrapy.Field(input_processor=MapCompose(convert_time), output_processor=TakeFirst())

    # Each value is filtered by convert_sort; the results are joined.
    sort = scrapy.Field(input_processor=MapCompose(convert_sort), output_processor=Join())

    # All extracted fragments are joined into one string.
    content = scrapy.Field(output_processor=Join())

    # Counters: each value is reduced by its convert_* helper; the first
    # result is kept.
    praise = scrapy.Field(input_processor=MapCompose(convert_praise), output_processor=TakeFirst())
    collect = scrapy.Field(input_processor=MapCompose(convert_collect), output_processor=TakeFirst())
    comment = scrapy.Field(input_processor=MapCompose(convert_comment), output_processor=TakeFirst())

    # Extracted URL fragments are joined into one string.
    detail_url = scrapy.Field(output_processor=Join())

相关文章:

  • 2021-11-26
猜你喜欢
  • 2022-12-23
  • 2022-12-23
  • 2021-04-26
  • 2022-01-08
  • 2021-10-17
  • 2021-11-15
  • 2021-11-23
相关资源
相似解决方案