企查查网站中汇聚了有关注册企业的详细信息,为了更好的查询企业相关信息,本人对网站中安徽省境内的企业进行了爬取,其中遇到的问题和使用的技术如下:
1、遇到的问题:
1>企查查PC版数据只显示前500页,为了尽可能最大化爬取网站数据,本次爬取按照市级分别爬取,共爬取安徽省境内16个地级市约80000条企业信息;
2>在爬取网站数据时,若爬取速度过快,会出现手动验证功能,为了解决手动验证,同时为了避免封号,直接采用随机更换IP代理,IP代理可以在《89免费代理》网站获取免费代理账号,网址为:http://www.89ip.cn/,可以一次性获取30个代理IP,如果不够用,
可以多次提取,然后构建代理池。本人试用后发现,该网站的免费代理比西刺代理和快代理网站的免费代理要好很多,如下图:
2、使用的技术:
1>请求模块:requests请求,为了避免反爬,采用随机代理,同时使用fake_useragent随机产生user-agent;
2>解析库:使用xpath和正则表达式
3>提速优化:采用多线程,同时对爬取的数据进行一次性保存,避免磁盘频繁IO;
3、核心代码如下:
import csv
import os
import random
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
from queue import Queue
from threading import Thread

from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    """Multi-threaded crawler for company listings on qichacha.com.

    Seeds a queue with per-city listing-page URLs (the site caps each search
    at 500 pages), then worker threads fetch every listing page, follow each
    company's detail page, and append the merged record to a CSV file.
    """

    def __init__(self):
        # Listing URL template: province code, city code, page number.
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            # Fixed: the original value contained stray spaces
            # ('https: // www.qichacha.com /') and was not a valid Referer.
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest',
        }

    def random_ua(self):
        """Return a random User-Agent string for anti-bot evasion."""
        return UserAgent().random

    def random_proxy(self):
        """Return proxies validated against the target site (optional path)."""
        return ProxySpider().get_training_ip('https://www.qichacha.com/')

    def put_url(self):
        """Scrape province/city link codes from the home page and enqueue the
        first 500 listing pages for every city."""
        self.headers['User-Agent'] = self.random_ua()
        home = 'https://www.qichacha.com/'
        html = requests.get(home, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            # href looks like ..._PROV_CITY; keep the trailing code parts.
            link = r.split('_')[1:]
            for page in range(1, 501):
                page_url = self.url.format(link[0], link[1], page)
                print(page_url)
                self.q.put(page_url)

    def get_data(self):
        """Worker loop: pull listing URLs off the queue until it is empty,
        scraping every company row on each page."""
        while True:
            if self.q.empty():
                break
            url = self.q.get()
            self.headers['User-Agent'] = self.random_ua()
            proxies = random.choice(IP_LIST)
            try:
                html = requests.get(url, headers=self.headers, proxies=proxies,
                                    timeout=3).content.decode('utf-8', 'ignore')
            except Exception:
                # Request failed (dead proxy / timeout): requeue and move on.
                self.q.put(url)
                continue
            parse_html = etree.HTML(html)
            # xpath() always returns a list; the original's `is not None`
            # check was always true — test emptiness implicitly via the loop.
            company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
            for company in company_list:
                try:
                    self.save_data(self._parse_row(company))
                except Exception:
                    # Record the failing page URL. writerow() needs a
                    # sequence: the original passed the bare string, which
                    # wrote one column per character.
                    with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                        csv.writer(f).writerow([url])
                    continue

    def _parse_row(self, company):
        """Extract one listing row plus its detail page into a flat list.

        Raises IndexError when an expected field is absent; the caller logs
        the page and skips the row.
        """
        company_name = company.xpath('./td[2]/a/text()')[0].strip()
        company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
        company_type, company_industry, company_business_scope = \
            self.get_company_info(company_link)
        company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
        company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split(':')[-1].strip()
        company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split(':')[-1].strip()
        company_email = company.xpath('./td[2]/p[2]/text()')[0].split(':')[-1].strip()
        company_phone = company.xpath('td[2]/p[2]/span/text()')[0].split(':')[-1].strip()
        company_address = company.xpath('td[2]/p[3]/text()')[0].split(':')[-1].strip()
        company_status = company.xpath('td[3]/span/text()')[0].strip()
        company_dict = {
            '公司名称': company_name,
            '公司链接': company_link,
            '公司类型': company_type,
            '所属行业': company_industry,
            '经营范围': company_business_scope,
            '公司法人': company_person,
            '注册资本': company_money,
            '注册时间': company_time,
            '邮箱': company_email,
            '电话': company_phone,
            '地址': company_address,
            '是否存续': company_status,
        }
        print(company_dict)
        return [company_name, company_link, company_type, company_industry,
                company_business_scope, company_person, company_money,
                company_time, company_email, company_phone,
                company_address, company_status]

    def get_company_info(self, company_link, max_retries=5):
        """Fetch a company detail page and extract type / industry / scope.

        Retries with a fresh proxy while the anti-bot page is served, but at
        most `max_retries` times — the original looped forever when every
        proxy was blocked. Returns ('无', '无', '无') on failure.
        """
        headers = {'User-Agent': UserAgent().random}
        html = ''
        for _ in range(max_retries):
            try:
                html = requests.get(company_link, headers=headers,
                                    proxies=random.choice(IP_LIST),
                                    timeout=3).content.decode('utf-8', 'ignore')
            except Exception:
                continue
            if '企业类型' in html:
                break
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except IndexError:
            return '无', '无', '无'

    def save_data(self, info):
        """Append one record (a sequence of 12 fields) to the output CSV."""
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(info)

    def main(self):
        """Reset the output file, write the header row, seed the queue and
        run 10 worker threads to completion."""
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(
                ['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人',
                 '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        t_list = []
        for _ in range(10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()
为了提高爬取速率,同时让实现更规范,下面使用scrapy框架进行爬取,代码如下:
1、items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class QichachaItem(scrapy.Item): company_name = scrapy.Field() company_person = scrapy.Field() company_money = scrapy.Field() company_establish = scrapy.Field() company_email = scrapy.Field() company_phone = scrapy.Field() company_address = scrapy.Field() company_risk = scrapy.Field() company_status = scrapy.Field() company_type = scrapy.Field() company_trade = scrapy.Field() company_business_scope = scrapy.Field() company_link = scrapy.Field() company_city = scrapy.Field()
2、qichacha.py
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import QichachaItem


class QichachaSpider(scrapy.Spider):
    """Crawls Anhui company listings city by city.

    The site caps every search at 500 result pages, hence one crawl per
    city code. Throttling is left to DOWNLOAD_DELAY / AutoThrottle in
    settings.py — the original called time.sleep() inside start_requests
    and the callbacks, which blocks Scrapy's event loop entirely.
    """

    name = 'qichacha'
    allowed_domains = ['www.qichacha.com']
    base_url = 'https://www.qichacha.com/gongsi_area.html?prov=AH&city={}&p={}'
    city_code_list = [340100, 340200, 340300, 340400, 340500, 340600, 340700,
                      340800, 341000, 341100, 341200, 341300, 341500, 341600,
                      341700, 341800]
    city_name_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市',
                      '铜陵市', '安庆市', '黄山市', '滁州市', '阜阳市', '宿州市',
                      '六安市', '亳州市', '池州市', '宣城市']
    base_company_url = 'https://www.qichacha.com{}'

    def start_requests(self):
        # Pass only the city name through meta; the item is created per
        # company row in parse_page (see note there).
        for city_code, city_name in zip(self.city_code_list, self.city_name_list):
            for page in range(1, 501):
                yield scrapy.Request(
                    url=self.base_url.format(city_code, page),
                    meta={'city': city_name},
                    callback=self.parse_page,
                )

    @staticmethod
    def _tail(text):
        """Part after the last ':' of an optional string, stripped; '-' if absent.

        Guards against extract_first() returning None, which the original
        dereferenced with .split() and crashed on sparse rows.
        """
        return text.split(':')[-1].strip() if text else '-'

    def parse_page(self, response):
        """Parse one listing page; yield a detail-page request per company."""
        city = response.meta['city']
        for company in response.xpath('//*[@id="searchlist"]/table/tbody/tr'):
            # A FRESH item per row. The original reused a single item
            # instance across all rows and all pending requests, so
            # concurrent callbacks overwrote each other's fields and the
            # output contained duplicated records.
            item = QichachaItem()
            item['company_city'] = city
            item['company_name'] = company.xpath('td[2]/a/text()').extract_first()
            link = company.xpath('td[2]/a/@href').extract_first()
            if not link:
                continue  # malformed row: no detail link to follow
            item['company_link'] = self.base_company_url.format(link)
            item['company_person'] = company.xpath('td[2]/p[1]/a/text()').extract_first()
            item['company_money'] = self._tail(company.xpath('td[2]/p[1]/span[1]/text()').extract_first())
            item['company_establish'] = self._tail(company.xpath('td[2]/p[1]/span[2]/text()').extract_first())
            item['company_email'] = self._tail(company.xpath('td[2]/p[2]/text()').extract_first())
            item['company_phone'] = self._tail(company.xpath('td[2]/p[2]/span/text()').extract_first())
            item['company_address'] = self._tail(company.xpath('td[2]/p[3]/text()').extract_first())
            item['company_status'] = self._tail(company.xpath('td[3]/span/text()').extract_first())
            yield scrapy.Request(
                url=item['company_link'],
                meta={'item': item},
                callback=self.parse_company,
            )

    @staticmethod
    def _first_or_dash(pattern, html):
        """First regex capture in html (DOTALL), stripped, or '-' if absent."""
        found = re.findall(pattern, html, re.S)
        return found[0].strip() if found else '-'

    def parse_company(self, response):
        """Fill detail-page fields on the partially-built item and emit it."""
        item = response.meta['item']
        html = response.text
        item['company_risk'] = self._first_or_dash(r'<h2>经营风险.*?<span>(.*?)</span>', html)
        item['company_type'] = self._first_or_dash(r'企业类型</td> <td class="">(.*?)</td>', html)
        item['company_trade'] = self._first_or_dash(r'所属行业</td> <td class="">(.*?)</td>', html)
        item['company_business_scope'] = self._first_or_dash(
            r'经营范围</td> <td class="" colspan="3">(.*?)</td>', html)
        yield item
3、pipelines.py
# -*- coding: utf-8 -*-

# Item pipelines — remember to register them in ITEM_PIPELINES in settings.py.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

from .settings import *


class QichachaPipeline(object):
    """Debug pipeline: echoes each scraped record to stdout."""

    def process_item(self, item, spider):
        print([item['company_name'], item['company_person'], item['company_money'],
               item['company_establish'], item['company_email'], item['company_phone'],
               item['company_address'], item['company_risk'], item['company_status'],
               item['company_type'], item['company_trade'], item['company_link'],
               item['company_city'], item['company_business_scope']])
        return item


class MysqlPipeline(object):
    """Persists every item as one row of the `qichachatab` MySQL table."""

    def open_spider(self, spider):
        # One connection per crawl; closed in close_spider.
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT,
                                  user=MYSQL_USER, password=MYSQL_PWD,
                                  database=MYSQL_DB, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # NOTE(review): positional `values(...)` relies on the table's column
        # order matching info_list — consider naming the columns explicitly.
        ins = 'insert into qichachatab values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        info_list = [item['company_name'], item['company_city'], item['company_person'],
                     item['company_money'], item['company_establish'], item['company_email'],
                     item['company_phone'], item['company_address'], item['company_risk'],
                     item['company_status'], item['company_type'], item['company_trade'],
                     item['company_link'], item['company_business_scope']]
        try:
            self.cursor.execute(ins, info_list)
            self.db.commit()
        except pymysql.MySQLError:
            # Roll back the failed insert so one bad row does not leave the
            # transaction dirty or abort the whole crawl (the original let
            # the exception propagate with no rollback).
            self.db.rollback()
            spider.logger.exception('insert failed for %s', item.get('company_name'))
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
4、middlewares.py
# -*- coding: utf-8 -*-

# Spider and downloader middlewares.
# See: https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

import redis
from fake_useragent import UserAgent
from scrapy import signals

from .proxies import ProxypoolSpider
from .settings import *


class QichachaSpiderMiddleware(object):
    """Project-template spider middleware (pass-through behavior)."""

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Return None to continue processing this response.
        return None

    def process_spider_output(self, response, result, spider):
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Return None (or an iterable of Request/dict/Item objects).
        pass

    def process_start_requests(self, start_requests, spider):
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QichachaDownloaderMiddleware(object):
    """Project-template downloader middleware (pass-through behavior)."""

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Return None to continue processing this request.
        return None

    def process_response(self, request, response, spider):
        # Must return a Response (continue), a Request (retry) or raise.
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgentDownloaderMiddleware(object):
    """Sets a random User-Agent header on every outgoing request."""

    def process_request(self, request, spider):
        # (Original had the parameter misspelled as `requset`.)
        request.headers['User-Agent'] = UserAgent().random


class RandomProxyDownloaderMiddleware(object):
    """Assigns a healthy proxy (score 90-100) from a Redis-backed pool and
    demotes proxies that cause download errors."""

    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                              password=REDIS_PWD)

    def process_request(self, request, spider):
        proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if not proxy_list:
            # Pool exhausted: refill it, then RE-READ the scored members.
            # The original kept the stale empty list and crashed in
            # random.choice(); it also never imported `random` at all.
            ProxypoolSpider().get_proxy()
            proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if proxy_list:
            request.meta['proxy'] = random.choice(proxy_list)[0].decode('utf-8')

    def process_response(self, request, response, spider):
        print(response.status, ": ", request.url)
        # Must return the response so it reaches the spider. The original
        # returned the request, re-scheduling every page forever and never
        # delivering a single response downstream.
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta['proxy']
        print('异常来了')
        # Demote the failing proxy, drop it from the request, and retry so
        # process_request picks a fresh one.
        self.db.zincrby(REDIS_PROXY_KEY, -1, cur_proxy)
        del request.meta['proxy']
        return request
5、settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the Qichacha project.
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

BOT_NAME = 'Qichacha'

SPIDER_MODULES = ['Qichacha.spiders']
NEWSPIDER_MODULE = 'Qichacha.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Per-request delay in seconds, chosen once per process start.
# BUG FIX: the original read `DOWNLOAD_DELAY = time.sleep(random.uniform(10, 20))`,
# which blocked for 10-20 s at import time and then set the delay to None
# (time.sleep returns None). Scrapy additionally randomizes the actual wait
# to 0.5x-1.5x of this value (RANDOMIZE_DOWNLOAD_DELAY is on by default).
DOWNLOAD_DELAY = random.uniform(10, 20)
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Override the default request headers:
# NOTE(review): the Cookie below is a captured session and will expire —
# refresh it (or log in programmatically) before relying on this crawl.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Cookie': 'UM_distinctid=16bb7adb9252d8-09b3389bed6ae2-3a65420e-1fa400-16bb7adb92636e; zg_did=%7B%22did%22%3A%20%2216bb7adbb84740-04a7e287a3fa12-3a65420e-1fa400-16bb7adbb85669%22%7D; _uab_collina=156215474498922246746771; zg_63e87cf22c3e4816a30bfbae9ded4af2=%7B%22sid%22%3A%201562193465906%2C%22updated%22%3A%201562193465917%2C%22info%22%3A%201562193465914%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D; QCCSESSID=lnr0huo5t5s058h9tmlso56nu1; acw_tc=65e21c2915648350628141700eeaf85114e84964375db3a9f1b718d751; CNZZDATA1254842228=845561946-1562153840-https%253A%252F%252Fwww.baidu.com%252F%7C1565064428; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1565048078,1565048449,1565048590,1565067806; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201565067805845%2C%22updated%22%3A%201565069298085%2C%22info%22%3A%201564658796236%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22sp0.baidu.com%22%2C%22cuid%22%3A%20%22d48f6830513b318400fcc23636a23a7f%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1565069298',
    'Referer': 'https://www.qichacha.com/',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Qichacha.middlewares.QichachaSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
    # 'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    # 'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Qichacha.pipelines.QichachaPipeline': 300,
    'Qichacha.pipelines.MysqlPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings (used by MysqlPipeline)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DB = 'qichachadb'
MYSQL_USER = 'root'
MYSQL_PWD = '123456'

# Redis connection settings (used by RandomProxyDownloaderMiddleware)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PWD = '123456'
REDIS_DB = 0
REDIS_PROXY_KEY = 'proxy'

# Logging
LOG_LEVEL = 'WARNING'
# LOG_FILE = 'qichacha.log'

# Feed export encoding
FEED_EXPORT_ENCODING = 'utf-8'
6、run.py
from scrapy import cmdline

# Launch the spider exactly as `scrapy crawl qichacha` would from the shell.
cmdline.execute(['scrapy', 'crawl', 'qichacha'])