项目简介
爬取趣头条新闻(http://home.qutoutiao.net/pages/home.html),具体内容:1、列表页(json):标题、简介、封面图、来源、发布时间
2、详情页(html):详细内容和图片
目录结构
生成的数据文件-单条记录
主要代码说明
爬虫:
#爬取趣头条列表和详情页
qutoutiao.spiders.qutoutiaos.QutoutiaosSpider
管道文件:
#封面图片处理类
qutoutiao.imagepipelines.CoverImagePipeline
#内容图片处理类
qutoutiao.imagepipelines.ContentImagePipeline
#数据处理类
qutoutiao.pipelines.QutoutiaoPipeline
中间件:
#请求头设置类-这里只设置了user agent
qutoutiao.middlewares.RandomUserAgent
#代理设置类
qutoutiao.middlewares.RandomProxy
自定义:
#配置文件
qutoutiao.qttconfig.py
#工具类
qutoutiao.qttutils.QttUtils
创建项目
cd /home/chaoge/mypython/crawler/
scrapy startproject qutoutiao
创建爬虫类即(qutoutiao.spiders.qutoutiaos.QutoutiaosSpider)
cd qutoutiao/qutoutiao/spiders
scrapy genspider qutoutiaos "api.1sapp.com"
执行
scrapy crawl qutoutiaos
#scrapy crawl qutoutiaos --nolog  #不显示log
#scrapy crawl qutoutiaos -o qutoutiaos_log.json  #将爬取到的item(不是log)导出到qutoutiaos_log.json;如需保存log请使用 --logfile
代码实现
qutoutiao.qttconfig.py
# 爬取域名(趣头条)
DOMAIN = 'http://home.qutoutiao.net/pages/home.html'

# Root directory under which the scraped data is stored.
DATA_STORE = '/home/chaoge/mypython/crawler/qutoutiao/data'

# List API endpoint; the spider appends the query string to it.
# Example of a complete list URL:
# http://api.1sapp.com/content/outList?cid=255&tn=1&page=1&limit=10
LIST_API = 'http://api.1sapp.com/content/outList?'

# Number of records requested per list page.
LIST_LIMIT = 10

# Categories to crawl: the API category id ("cid") and its display name.
CATEGORY_INFO = [
    {"cid": 255, "name": "推荐"},
    {"cid": 1, "name": "热点"},
    {"cid": 6, "name": "娱乐"},
    {"cid": 5, "name": "养生"},
    {"cid": 2, "name": "搞笑"},
    {"cid": 7, "name": "科技"},
    {"cid": 8, "name": "生活"},
    {"cid": 10, "name": "财经"},
    {"cid": 9, "name": "汽车"},
]
qutoutiao.qttutils.py
# -*- coding: utf-8 -*-
# 趣头条工具类
import time
import os
import shutil
from qutoutiao import qttconfig as QttConfig
class QttUtils:
    """Helper utilities for the qutoutiao crawler."""

    @staticmethod
    def getStorePath(action='create', base_dir=None):
        """Return today's storage directory, ``<base>/<year>/<month>/<day>``.

        Args:
            action: ``'create'`` (default) makes sure the directory exists;
                ``'remove'`` deletes the directory tree if it exists. Any
                other value only computes the path.
            base_dir: Root directory. Defaults to ``QttConfig.DATA_STORE``.

        Returns:
            The dated directory path as a string. It is returned even when
            the directory has just been removed.
        """
        if base_dir is None:
            base_dir = QttConfig.DATA_STORE
        store_path = base_dir + time.strftime("/%Y/%m/%d", time.localtime())

        if action == 'remove':
            if os.path.exists(store_path):
                # rmtree (not os.rmdir) because the directory may not be empty.
                shutil.rmtree(store_path)
        elif action == 'create':
            # exist_ok avoids the check-then-create race when several
            # pipelines ask for the same directory.
            os.makedirs(store_path, exist_ok=True)
        return store_path
qutoutiao.settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for qutoutiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qutoutiao'

SPIDER_MODULES = ['qutoutiao.spiders']
NEWSPIDER_MODULE = 'qutoutiao.spiders'

# Log file (commented out, so Scrapy's default logging applies).
#LOG_FILE = "qutoutiao.log"
# Log level.
#LOG_LEVEL = "DEBUG"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qutoutiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36\',
# \'Accept\': \'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\',
# \'Accept-Language\': \'en\',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    # 'qutoutiao.middlewares.QutoutiaoSpiderMiddleware': 543,
    # Disable the off-site filter so requests are not restricted to the
    # spider's allowed_domains. The former "scrapy.contrib.spidermiddleware"
    # package no longer exists in current Scrapy releases; this is the
    # current import path of the same middleware.
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'qutoutiao.middlewares.RandomUserAgent': 100,
    'qutoutiao.middlewares.RandomProxy': 200,
}
# User-Agent pool used by the RandomUserAgent middleware.
# Each entry is the header VALUE only: a "User-Agent:" prefix inside the
# value would be sent to the server as part of the user-agent string.
USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
]

# Proxy pool used by the RandomProxy middleware.
# 'ip_port' is required; 'user_password' ("user:password") is only present
# for proxies that need authentication.
PROXIES = [
    {'ip_port': '121.42.140.113:16816', 'user_password': 'username-xxxx:password-xxxx'},
    {'ip_port': '117.90.137.181:9000'},
    {'ip_port': '117.90.2.151:9000'},
    {'ip_port': '114.235.23.147:9000'},
]
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# \'scrapy.extensions.telnet.TelnetConsole\': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qutoutiao.imagepipelines.CoverImagePipeline': 301,  # cover image download
    'qutoutiao.imagepipelines.ContentImagePipeline': 302,  # content image download
    'qutoutiao.pipelines.QutoutiaoPipeline': 400,  # data processing
}

# Directory where downloaded images are stored.
IMAGES_STORE = "/home/chaoge/mypython/crawler/qutoutiao/tmp/images"

# Thumbnail settings
#IMAGES_THUMBS = {
#    'small': (50, 50),
#    'big': (270, 270),
#}

# Minimum image size: images below 110x110 are meant to be ignored.
# Scrapy's setting names are IMAGES_MIN_HEIGHT / IMAGES_MIN_WIDTH (plural
# "IMAGES"); the singular spelling used before is not a Scrapy setting.
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
# The original names are kept as aliases in case other code reads them.
IMAGE_MIN_HEIGHT = IMAGES_MIN_HEIGHT
IMAGE_MIN_WIDTH = IMAGES_MIN_WIDTH
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = \'httpcache\'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = \'scrapy.extensions.httpcache.FilesystemCacheStorage\'
qutoutiao.items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QutoutiaoItem(scrapy.Item):
    """Fields collected for one qutoutiao news article."""

    # Article id
    aid = scrapy.Field()
    # Source (publisher) name
    source_name = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Detail page URL
    url = scrapy.Field()
    # Introduction / summary
    introduction = scrapy.Field()
    # Cover image(s)
    cover = scrapy.Field()
    # Publish time
    publish_time = scrapy.Field()
    # Category id
    cid = scrapy.Field()
    # Article content
    content = scrapy.Field()
    # Images found in the content
    content_images = scrapy.Field()
qutoutiao.middlewares.py
# -*- coding: utf-8 -*-
import base64
import random


class RandomUserAgent(object):
    """Downloader middleware that gives each request a random User-Agent.

    The pool is read from the ``USER_AGENTS`` setting when Scrapy builds the
    middleware through ``from_crawler``.
    """

    def __init__(self, user_agents=None):
        """Store the pool of user-agent strings (empty when not given)."""
        self.user_agents = list(user_agents or ())

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USER_AGENTS`` setting."""
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        """Set a random ``User-Agent`` header unless the request has one."""
        if not self.user_agents:
            # Nothing to choose from; random.choice would raise IndexError.
            return
        request.headers.setdefault('User-Agent', random.choice(self.user_agents))
        #request.headers.setdefault('Host','html2.qktoutiao.com')
        #request.headers.setdefault('Referer','http://home.qutoutiao.net/pages/home.html')


class RandomProxy(object):
    """Downloader middleware that routes each request through a random proxy.

    The pool is read from the ``PROXIES`` setting: a list of dicts with an
    ``ip_port`` key and an optional ``user_password`` key ("user:password").
    """

    def __init__(self, proxies=None):
        """Store the pool of proxy descriptions (empty when not given)."""
        self.proxies = list(proxies or ())

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXIES`` setting."""
        return cls(crawler.settings.get('PROXIES') or ())

    def process_request(self, request, spider):
        """Pick a proxy, set ``request.meta['proxy']`` and, if needed, auth."""
        if not self.proxies:
            return
        proxy = random.choice(self.proxies)
        request.meta['proxy'] = 'http://' + proxy['ip_port']

        credentials = proxy.get('user_password')
        if credentials:
            # b64encode returns bytes; decode it, because str() on bytes
            # would yield the literal "b'...'" and corrupt the header.
            token = base64.b64encode(credentials.encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + token
qutoutiao.imagepipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

from qutoutiao.qttutils import QttUtils


def _move_to_store(images_store, relative_path, store_path):
    """Move one downloaded image into ``store_path`` and return its new path.

    ``relative_path`` is the path reported by the images pipeline, relative
    to ``images_store``; only its file name is kept in the destination.
    """
    new_image = store_path + "/" + os.path.split(str(relative_path))[1]
    # shutil.move also works when source and destination are on different
    # filesystems, where os.rename raises OSError.
    shutil.move(images_store + "/" + relative_path, new_image)
    return new_image


class CoverImagePipeline(ImagesPipeline):
    """Download an item's cover images and move them to the dated data dir."""

    # IMAGES_STORE value from the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['cover']``."""
        for image_url in item['cover'] or ():
            yield scrapy.Request(url=image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['cover']`` with the final paths of the images.

        ``results`` is iterated as ``(ok, info)`` pairs; only entries with a
        truthy ``ok`` are used, and their ``info['path']`` is moved.
        """
        store_path = QttUtils.getStorePath()
        # NOTE(review): the flattened original does not show whether this
        # assignment was inside ``if image_path:``; it is done unconditionally
        # here, so failed downloads leave an empty list - confirm.
        item['cover'] = [
            _move_to_store(self.IMAGES_STORE, downloaded['path'], store_path)
            for ok, downloaded in results
            if ok
        ]
        return item


class ContentImagePipeline(ImagesPipeline):
    """Download the images of an item's content and move them likewise."""

    # IMAGES_STORE value from the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['content_images']``."""
        for image_url in item['content_images'] or ():
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['content_images']`` with ``(new_path, url)`` pairs.

        ``url`` is the ``info['url']`` value reported for the download, kept
        next to the new local path.
        """
        store_path = QttUtils.getStorePath()
        # NOTE(review): same placement assumption as in CoverImagePipeline.
        item['content_images'] = [
            (
                _move_to_store(self.IMAGES_STORE, downloaded['path'], store_path),
                downloaded['url'],
            )
            for ok, downloaded in results
            if ok
        ]
        return item
qutoutiao.pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don\'t forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from qutoutiao.qttutils import QttUtils
class QutoutiaoPipeline(object):
    """Item pipeline that writes every item as one JSON line.

    The output file is ``qutoutiao.json`` inside the directory returned by
    ``QttUtils.getStorePath()``.
    """

    def __init__(self):
        """Open the output file for writing."""
        store_path = QttUtils.getStorePath()
        json_path = store_path + "/" + "qutoutiao.json"
        # Binary mode: process_item encodes each line to UTF-8 itself.
        # Note that ``self.filename`` holds the open file object, not a name.
        self.filename = open(json_path, "wb")

    def process_item(self, item, spider):
        """Serialize ``item`` to a UTF-8 JSON line, write it, return ``item``."""
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(line.encode("utf-8"))
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.filename.close()
qutoutiao.spiders.qutoutiaos.py
# -*- coding: utf-8 -*-
#web site:http://home.qutoutiao.net/pages/home.html
import scrapy
#通过CrawlSpider,Rule类爬取
#-*-from scrapy.spiders import CrawlSpider,Rule-*-
#-*-from scrapy.linkextractors import LinkExtractor-*-
from qutoutiao.items import QutoutiaoItem
import json
import re
from qutoutiao import qttconfig as QttConfig
#-*-class QutoutiaosSpider(CrawlSpider):-*-
class QutoutiaosSpider(scrapy.Spider):
    """Crawl the qutoutiao list API and every article's detail page.

    One list request is issued per category in ``QttConfig.CATEGORY_INFO``;
    each list record becomes a ``QutoutiaoItem`` that is completed with the
    detail page's content and image URLs.
    """

    name = 'qutoutiaos'
    allowed_domains = ['api.1sapp.com']

    categoryInfo = QttConfig.CATEGORY_INFO
    limit = QttConfig.LIST_LIMIT

    # First list page of every category. QttConfig.LIST_LIMIT is used
    # directly because a comprehension inside a class body cannot see the
    # class attribute ``limit``.
    start_urls = [
        QttConfig.LIST_API
        + "cid=%s&tn=1&page=1&limit=%s" % (category['cid'], QttConfig.LIST_LIMIT)
        for category in categoryInfo
    ]

    def parse(self, response):
        """Parse a list response and request each article's detail page.

        Yields one ``scrapy.Request`` per record; the partially filled item
        travels in ``meta['meta_item']``.
        """
        # The category id is recovered from the request URL.
        match = re.search(r'(.*)cid=(\d+)', response.url)
        cid = match.group(2) if match else 0

        records = json.loads(response.text)['data']['data']
        copied_fields = (
            'source_name',   # source
            'title',         # title
            'url',           # detail page URL
            'introduction',  # introduction
            'cover',         # cover image
            'publish_time',  # publish time
        )
        for record in records:
            item = QutoutiaoItem()
            for field in copied_fields:
                item[field] = record[field]
            item['cid'] = cid
            yield scrapy.Request(
                url=item['url'],
                meta={'meta_item': item},
                callback=self.detail_parse,
            )

    def detail_parse(self, response):
        """Complete the item from ``response.meta`` with content and images."""
        meta_item = response.meta['meta_item']
        content_selector = response.xpath('//div[@class="content"]')

        content = content_selector.extract_first()
        if content is None:
            # Previously an IndexError here; skip the item with a log line.
            self.logger.warning('No content block found in %s', response.url)
            return

        # Leading "." keeps the query inside the content div; a bare "//img"
        # would match every image of the whole page.
        meta_item['content_images'] = content_selector.xpath(
            './/img/@src|.//img/@data-src'
        ).extract()
        meta_item['content'] = content
        yield meta_item