企查查网站中汇聚了有关注册企业的详细信息,为了更好的查询企业相关信息,本人对网站中安徽省境内的企业进行了爬取,其中遇到的问题和使用的技术如下:
1、遇到的问题:
1>企查查PC版数据只显示前500页,为了尽可能最大化爬取网站数据,本次爬取按照市级分别爬取,共爬取安徽省境内16个地级市约80000条企业信息;
2>在爬取网站数据时,若爬取速度过快,会出现手动验证功能,为了解决手动验证,同时为了避免封号,直接采用随机更换IP代理,IP代理可以在《89免费代理》网站获取免费代理账号,网址为:http://www.89ip.cn/,可以一次性获取30个代理IP,如果不够用,
可以多次提取,然后构建代理池。本人试用后发现,该网站的免费代理比西刺代理和快代理网站的免费代理要好很多,如下图:
2、使用的技术:
1>请求模块:requests请求,为了避免反爬,采用随机代理,同时使用fake_useragent随机产生user-agent;
2>解析库:使用xpath和正则表达式
3>提速优化:采用多线程,同时对爬取的数据进行一次性保存,避免磁盘频繁IO;
3、核心代码如下:
import csv
import os
import random
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
from queue import Queue
from threading import Thread

from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    """Multi-threaded crawler for company listings on qichacha.com.

    Seeds a queue with per-city listing-page URLs (the site caps each search
    at 500 pages), then worker threads fetch every listing page, follow each
    company's detail page, and append the merged record to a CSV file.
    """

    def __init__(self):
        # Listing URL template: province code, city code, page number.
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            # Fixed: the original value contained stray spaces
            # ('https: // www.qichacha.com /') and was not a valid Referer.
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest',
        }

    def random_ua(self):
        """Return a random User-Agent string for anti-bot evasion."""
        return UserAgent().random

    def random_proxy(self):
        """Return proxies validated against the target site (optional path)."""
        return ProxySpider().get_training_ip('https://www.qichacha.com/')

    def put_url(self):
        """Scrape province/city link codes from the home page and enqueue the
        first 500 listing pages for every city."""
        self.headers['User-Agent'] = self.random_ua()
        home = 'https://www.qichacha.com/'
        html = requests.get(home, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            # href looks like ..._PROV_CITY; keep the trailing code parts.
            link = r.split('_')[1:]
            for page in range(1, 501):
                page_url = self.url.format(link[0], link[1], page)
                print(page_url)
                self.q.put(page_url)

    def get_data(self):
        """Worker loop: pull listing URLs off the queue until it is empty,
        scraping every company row on each page."""
        while True:
            if self.q.empty():
                break
            url = self.q.get()
            self.headers['User-Agent'] = self.random_ua()
            proxies = random.choice(IP_LIST)
            try:
                html = requests.get(url, headers=self.headers, proxies=proxies,
                                    timeout=3).content.decode('utf-8', 'ignore')
            except Exception:
                # Request failed (dead proxy / timeout): requeue and move on.
                self.q.put(url)
                continue
            parse_html = etree.HTML(html)
            # xpath() always returns a list; the original's `is not None`
            # check was always true — test emptiness implicitly via the loop.
            company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
            for company in company_list:
                try:
                    self.save_data(self._parse_row(company))
                except Exception:
                    # Record the failing page URL. writerow() needs a
                    # sequence: the original passed the bare string, which
                    # wrote one column per character.
                    with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                        csv.writer(f).writerow([url])
                    continue

    def _parse_row(self, company):
        """Extract one listing row plus its detail page into a flat list.

        Raises IndexError when an expected field is absent; the caller logs
        the page and skips the row.
        """
        company_name = company.xpath('./td[2]/a/text()')[0].strip()
        company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
        company_type, company_industry, company_business_scope = \
            self.get_company_info(company_link)
        company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
        company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split(':')[-1].strip()
        company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split(':')[-1].strip()
        company_email = company.xpath('./td[2]/p[2]/text()')[0].split(':')[-1].strip()
        company_phone = company.xpath('td[2]/p[2]/span/text()')[0].split(':')[-1].strip()
        company_address = company.xpath('td[2]/p[3]/text()')[0].split(':')[-1].strip()
        company_status = company.xpath('td[3]/span/text()')[0].strip()
        company_dict = {
            '公司名称': company_name,
            '公司链接': company_link,
            '公司类型': company_type,
            '所属行业': company_industry,
            '经营范围': company_business_scope,
            '公司法人': company_person,
            '注册资本': company_money,
            '注册时间': company_time,
            '邮箱': company_email,
            '电话': company_phone,
            '地址': company_address,
            '是否存续': company_status,
        }
        print(company_dict)
        return [company_name, company_link, company_type, company_industry,
                company_business_scope, company_person, company_money,
                company_time, company_email, company_phone,
                company_address, company_status]

    def get_company_info(self, company_link, max_retries=5):
        """Fetch a company detail page and extract type / industry / scope.

        Retries with a fresh proxy while the anti-bot page is served, but at
        most `max_retries` times — the original looped forever when every
        proxy was blocked. Returns ('无', '无', '无') on failure.
        """
        headers = {'User-Agent': UserAgent().random}
        html = ''
        for _ in range(max_retries):
            try:
                html = requests.get(company_link, headers=headers,
                                    proxies=random.choice(IP_LIST),
                                    timeout=3).content.decode('utf-8', 'ignore')
            except Exception:
                continue
            if '企业类型' in html:
                break
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except IndexError:
            return '无', '无', '无'

    def save_data(self, info):
        """Append one record (a sequence of 12 fields) to the output CSV."""
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(info)

    def main(self):
        """Reset the output file, write the header row, seed the queue and
        run 10 worker threads to completion."""
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(
                ['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人',
                 '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        t_list = []
        for _ in range(10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()
        for t in t_list:
            t.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()
为了提高爬取速率,同时让实现更规范,下面使用scrapy框架进行爬取,代码如下:
1、items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class QichachaItem(scrapy.Item): company_name = scrapy.Field() company_person = scrapy.Field() company_money = scrapy.Field() company_establish = scrapy.Field() company_email = scrapy.Field() company_phone = scrapy.Field() company_address = scrapy.Field() company_risk = scrapy.Field() company_status = scrapy.Field() company_type = scrapy.Field() company_trade = scrapy.Field() company_business_scope = scrapy.Field() company_link = scrapy.Field() company_city = scrapy.Field()
2、qichacha.py
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import QichachaItem


class QichachaSpider(scrapy.Spider):
    """Crawls Anhui company listings city by city.

    The site caps every search at 500 result pages, hence one crawl per
    city code. Throttling is left to DOWNLOAD_DELAY / AutoThrottle in
    settings.py — the original called time.sleep() inside start_requests
    and the callbacks, which blocks Scrapy's event loop entirely.
    """

    name = 'qichacha'
    allowed_domains = ['www.qichacha.com']
    base_url = 'https://www.qichacha.com/gongsi_area.html?prov=AH&city={}&p={}'
    city_code_list = [340100, 340200, 340300, 340400, 340500, 340600, 340700,
                      340800, 341000, 341100, 341200, 341300, 341500, 341600,
                      341700, 341800]
    city_name_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市',
                      '铜陵市', '安庆市', '黄山市', '滁州市', '阜阳市', '宿州市',
                      '六安市', '亳州市', '池州市', '宣城市']
    base_company_url = 'https://www.qichacha.com{}'

    def start_requests(self):
        # Pass only the city name through meta; the item is created per
        # company row in parse_page (see note there).
        for city_code, city_name in zip(self.city_code_list, self.city_name_list):
            for page in range(1, 501):
                yield scrapy.Request(
                    url=self.base_url.format(city_code, page),
                    meta={'city': city_name},
                    callback=self.parse_page,
                )

    @staticmethod
    def _tail(text):
        """Part after the last ':' of an optional string, stripped; '-' if absent.

        Guards against extract_first() returning None, which the original
        dereferenced with .split() and crashed on sparse rows.
        """
        return text.split(':')[-1].strip() if text else '-'

    def parse_page(self, response):
        """Parse one listing page; yield a detail-page request per company."""
        city = response.meta['city']
        for company in response.xpath('//*[@id="searchlist"]/table/tbody/tr'):
            # A FRESH item per row. The original reused a single item
            # instance across all rows and all pending requests, so
            # concurrent callbacks overwrote each other's fields and the
            # output contained duplicated records.
            item = QichachaItem()
            item['company_city'] = city
            item['company_name'] = company.xpath('td[2]/a/text()').extract_first()
            link = company.xpath('td[2]/a/@href').extract_first()
            if not link:
                continue  # malformed row: no detail link to follow
            item['company_link'] = self.base_company_url.format(link)
            item['company_person'] = company.xpath('td[2]/p[1]/a/text()').extract_first()
            item['company_money'] = self._tail(company.xpath('td[2]/p[1]/span[1]/text()').extract_first())
            item['company_establish'] = self._tail(company.xpath('td[2]/p[1]/span[2]/text()').extract_first())
            item['company_email'] = self._tail(company.xpath('td[2]/p[2]/text()').extract_first())
            item['company_phone'] = self._tail(company.xpath('td[2]/p[2]/span/text()').extract_first())
            item['company_address'] = self._tail(company.xpath('td[2]/p[3]/text()').extract_first())
            item['company_status'] = self._tail(company.xpath('td[3]/span/text()').extract_first())
            yield scrapy.Request(
                url=item['company_link'],
                meta={'item': item},
                callback=self.parse_company,
            )

    @staticmethod
    def _first_or_dash(pattern, html):
        """First regex capture in html (DOTALL), stripped, or '-' if absent."""
        found = re.findall(pattern, html, re.S)
        return found[0].strip() if found else '-'

    def parse_company(self, response):
        """Fill detail-page fields on the partially-built item and emit it."""
        item = response.meta['item']
        html = response.text
        item['company_risk'] = self._first_or_dash(r'<h2>经营风险.*?<span>(.*?)</span>', html)
        item['company_type'] = self._first_or_dash(r'企业类型</td> <td class="">(.*?)</td>', html)
        item['company_trade'] = self._first_or_dash(r'所属行业</td> <td class="">(.*?)</td>', html)
        item['company_business_scope'] = self._first_or_dash(
            r'经营范围</td> <td class="" colspan="3">(.*?)</td>', html)
        yield item
3、pipelines.py
# -*- coding: utf-8 -*-

# Item pipelines — remember to register them in ITEM_PIPELINES in settings.py.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

from .settings import *


class QichachaPipeline(object):
    """Debug pipeline: echoes each scraped record to stdout."""

    def process_item(self, item, spider):
        print([item['company_name'], item['company_person'], item['company_money'],
               item['company_establish'], item['company_email'], item['company_phone'],
               item['company_address'], item['company_risk'], item['company_status'],
               item['company_type'], item['company_trade'], item['company_link'],
               item['company_city'], item['company_business_scope']])
        return item


class MysqlPipeline(object):
    """Persists every item as one row of the `qichachatab` MySQL table."""

    def open_spider(self, spider):
        # One connection per crawl; closed in close_spider.
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT,
                                  user=MYSQL_USER, password=MYSQL_PWD,
                                  database=MYSQL_DB, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # NOTE(review): positional `values(...)` relies on the table's column
        # order matching info_list — consider naming the columns explicitly.
        ins = 'insert into qichachatab values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        info_list = [item['company_name'], item['company_city'], item['company_person'],
                     item['company_money'], item['company_establish'], item['company_email'],
                     item['company_phone'], item['company_address'], item['company_risk'],
                     item['company_status'], item['company_type'], item['company_trade'],
                     item['company_link'], item['company_business_scope']]
        try:
            self.cursor.execute(ins, info_list)
            self.db.commit()
        except pymysql.MySQLError:
            # Roll back the failed insert so one bad row does not leave the
            # transaction dirty or abort the whole crawl (the original let
            # the exception propagate with no rollback).
            self.db.rollback()
            spider.logger.exception('insert failed for %s', item.get('company_name'))
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
4、middlewares.py
# -*- coding: utf-8 -*-

# Spider and downloader middlewares.
# See: https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

import redis
from fake_useragent import UserAgent
from scrapy import signals

from .proxies import ProxypoolSpider
from .settings import *


class QichachaSpiderMiddleware(object):
    """Project-template spider middleware (pass-through behavior)."""

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Return None to continue processing this response.
        return None

    def process_spider_output(self, response, result, spider):
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Return None (or an iterable of Request/dict/Item objects).
        pass

    def process_start_requests(self, start_requests, spider):
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QichachaDownloaderMiddleware(object):
    """Project-template downloader middleware (pass-through behavior)."""

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Return None to continue processing this request.
        return None

    def process_response(self, request, response, spider):
        # Must return a Response (continue), a Request (retry) or raise.
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgentDownloaderMiddleware(object):
    """Sets a random User-Agent header on every outgoing request."""

    def process_request(self, request, spider):
        # (Original had the parameter misspelled as `requset`.)
        request.headers['User-Agent'] = UserAgent().random


class RandomProxyDownloaderMiddleware(object):
    """Assigns a healthy proxy (score 90-100) from a Redis-backed pool and
    demotes proxies that cause download errors."""

    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB,
                              password=REDIS_PWD)

    def process_request(self, request, spider):
        proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if not proxy_list:
            # Pool exhausted: refill it, then RE-READ the scored members.
            # The original kept the stale empty list and crashed in
            # random.choice(); it also never imported `random` at all.
            ProxypoolSpider().get_proxy()
            proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if proxy_list:
            request.meta['proxy'] = random.choice(proxy_list)[0].decode('utf-8')

    def process_response(self, request, response, spider):
        print(response.status, ": ", request.url)
        # Must return the response so it reaches the spider. The original
        # returned the request, re-scheduling every page forever and never
        # delivering a single response downstream.
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta['proxy']
        print('异常来了')
        # Demote the failing proxy, drop it from the request, and retry so
        # process_request picks a fresh one.
        self.db.zincrby(REDIS_PROXY_KEY, -1, cur_proxy)
        del request.meta['proxy']
        return request
5、settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the Qichacha project.
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

BOT_NAME = 'Qichacha'

SPIDER_MODULES = ['Qichacha.spiders']
NEWSPIDER_MODULE = 'Qichacha.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Per-request delay in seconds, chosen once per process start.
# BUG FIX: the original read `DOWNLOAD_DELAY = time.sleep(random.uniform(10, 20))`,
# which blocked for 10-20 s at import time and then set the delay to None
# (time.sleep returns None). Scrapy additionally randomizes the actual wait
# to 0.5x-1.5x of this value (RANDOMIZE_DOWNLOAD_DELAY is on by default).
DOWNLOAD_DELAY = random.uniform(10, 20)
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Override the default request headers:
# NOTE(review): the Cookie below is a captured session and will expire —
# refresh it (or log in programmatically) before relying on this crawl.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Cookie': 'UM_distinctid=16bb7adb9252d8-09b3389bed6ae2-3a65420e-1fa400-16bb7adb92636e; zg_did=%7B%22did%22%3A%20%2216bb7adbb84740-04a7e287a3fa12-3a65420e-1fa400-16bb7adbb85669%22%7D; _uab_collina=156215474498922246746771; zg_63e87cf22c3e4816a30bfbae9ded4af2=%7B%22sid%22%3A%201562193465906%2C%22updated%22%3A%201562193465917%2C%22info%22%3A%201562193465914%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D; QCCSESSID=lnr0huo5t5s058h9tmlso56nu1; acw_tc=65e21c2915648350628141700eeaf85114e84964375db3a9f1b718d751; CNZZDATA1254842228=845561946-1562153840-https%253A%252F%252Fwww.baidu.com%252F%7C1565064428; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1565048078,1565048449,1565048590,1565067806; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201565067805845%2C%22updated%22%3A%201565069298085%2C%22info%22%3A%201564658796236%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22sp0.baidu.com%22%2C%22cuid%22%3A%20%22d48f6830513b318400fcc23636a23a7f%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1565069298',
    'Referer': 'https://www.qichacha.com/',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Qichacha.middlewares.QichachaSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
    # 'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    # 'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Qichacha.pipelines.QichachaPipeline': 300,
    'Qichacha.pipelines.MysqlPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MySQL connection settings (used by MysqlPipeline)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DB = 'qichachadb'
MYSQL_USER = 'root'
MYSQL_PWD = '123456'

# Redis connection settings (used by RandomProxyDownloaderMiddleware)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PWD = '123456'
REDIS_DB = 0
REDIS_PROXY_KEY = 'proxy'

# Logging
LOG_LEVEL = 'WARNING'
# LOG_FILE = 'qichacha.log'

# Feed export encoding
FEED_EXPORT_ENCODING = 'utf-8'
6、run.py
from scrapy import cmdline

# Launch the spider exactly as `scrapy crawl qichacha` would from the shell.
cmdline.execute(['scrapy', 'crawl', 'qichacha'])