一、说明
1.1 开发环境说明
开发环境--PyCharm
爬虫框架--Scrapy
开发语言--Python 3.6
安装第三方库--Scrapy、pymysql、matplotlib
数据库--MySQL-5.5(监听地址--127.0.0.1:3306,用户名--root,密码--root,数据库--anjuke)
1.2 程序简要说明
本程序以安居客-深圳为例,其他城市使用的是一样的页面结构,爬取其他城市时基本上只需修改start_urls和rules中的url即可移植
本程序实现安居客新楼盘和二手房的信息爬取,还存在一些小问题,但算基本可用
程序的总体思路是:使用CrawlSpider爬虫----从start_urls开始爬行----爬行到的url如果符合某个rule就会自动调用回调函数----回调函数使用xpath解析和获取楼盘信息item----pipe将传过来的item写入数据库--report脚本从数据库中读出数据生成图表
新楼盘和二手房爬取的区别是,新楼盘没有反爬虫机制。二手房一是限制了访问频率,如果超过某个频率就需要输入验证码才能访问(我这里通过限制5秒发一个请求进行处理),二是二手房信息页面经过javascript处理,禁用javascript时信息处于div[4],启用javascript时信息被移动到div[3],scrapy默认是不运行javascript的,所以需要使用禁用javascript时的路径才能获取信息。
项目源码已上传github:https://github.com/PrettyUp/Anjuke
二、创建数据库表结构
sql创建代码:
# Host: localhost (Version: 5.5.53)
# Date: 2018-06-06 18:27:08
# Generator: MySQL-Front 5.3 (Build 4.234)

/*!40101 SET NAMES utf8 */;

#
# Structure for table "sz_loupan_info" -- one row per new estate (loupan)
# scraped by anjuke_sz_spider.py; all columns nullable on purpose since
# many detail pages lack some fields.
#

CREATE TABLE `sz_loupan_info` (
  `loupan_name` varchar(255) DEFAULT NULL,
  `loupan_status` varchar(255) DEFAULT NULL,
  `loupan_price` int(11) DEFAULT NULL,
  `loupan_discount` varchar(255) DEFAULT NULL,
  `loupan_layout` varchar(255) DEFAULT NULL,
  `loupan_location` varchar(255) DEFAULT NULL,
  `loupan_opening` varchar(255) DEFAULT NULL,
  `loupan_transfer` varchar(255) DEFAULT NULL,
  `loupan_type` varchar(255) DEFAULT NULL,
  `loupan_age` varchar(255) DEFAULT NULL,
  `loupan_url` varchar(255) DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;

#
# Structure for table "sz_sh_house_info" -- one row per second-hand house
# scraped by anjuke_sz_sh_spider.py.
#

CREATE TABLE `sz_sh_house_info` (
  `house_title` varchar(255) DEFAULT NULL,
  `house_cost` varchar(255) DEFAULT NULL,
  `house_code` varchar(255) DEFAULT NULL,
  `house_public_time` varchar(255) DEFAULT NULL,
  `house_community` varchar(255) DEFAULT NULL,
  `house_location` varchar(255) DEFAULT NULL,
  `house_build_years` varchar(255) DEFAULT NULL,
  `house_kind` varchar(255) DEFAULT NULL,
  `house_layout` varchar(255) DEFAULT NULL,
  `house_size` varchar(255) DEFAULT NULL,
  `house_face_to` varchar(255) DEFAULT NULL,
  `house_point` varchar(255) DEFAULT NULL,
  `house_price` varchar(255) DEFAULT NULL,
  `house_first_pay` varchar(255) DEFAULT NULL,
  `house_month_pay` varchar(255) DEFAULT NULL,
  `house_decorate_type` varchar(255) DEFAULT NULL,
  `house_agent` varchar(255) DEFAULT NULL,
  `house_agency` varchar(255) DEFAULT NULL,
  `house_url` varchar(255) DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
三、程序实现
3.1 使用scrapy创建项目
打开cmd,切换到PyCharm工程目录,执行:
scrapy startproject Anjuke
3.2 使用pycharm打开项目并安装好所需的第三方库
直接在pycharm中安装scrapy、pymysql和matplotlib(其他依赖库会自动安装);另外注意,安装scrapy完成后复制一份cmdline.py到项目主目录下
3.3 创建程序所需文件
anjuke_sz_spider.py----楼盘信息爬取脚本
anjuke_sz_sh_spider.py----二手房信息爬取脚本
anjuke_sz_report.py----楼盘信息报告图表生成脚本
anjuke_sz_sh_report.py----二手房信息报告图表生成脚本
项目目录结构如下:
3.4 配置好scrapy调试运行环境
为anjuke_sz_spider.py和anjuke_sz_sh_spider.py配置好运行参数
3.5 各文件实现
settings.py
# Scrapy settings for the Anjuke project.

BOT_NAME = 'Anjuke'

SPIDER_MODULES = ['Anjuke.spiders']
NEWSPIDER_MODULE = 'Anjuke.spiders'

# Present a desktop Firefox UA so the site serves the normal pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0'

# Ignore robots.txt (deliberate choice for this crawl).
ROBOTSTXT_OBEY = False

# One request every 5 seconds keeps the crawl below the rate limit that
# otherwise makes the second-hand-house pages demand a captcha.
DOWNLOAD_DELAY = 5

COOKIES_ENABLED = False

# Both pipelines are enabled globally; each one must filter by spider.
ITEM_PIPELINES = {
    'Anjuke.pipelines.AnjukeSZPipeline': 300,
    'Anjuke.pipelines.AnjukeSZSHPipeline': 300,
}
items.py
import scrapy


class AnjukeSZItem(scrapy.Item):
    """One new-estate (loupan) record; fields mirror MySQL table sz_loupan_info."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # NOTE(review): loupan_id is declared but never populated by the spider and
    # has no matching DB column — presumably reserved for future use; confirm.
    loupan_id = scrapy.Field()
    loupan_name = scrapy.Field()
    loupan_status = scrapy.Field()
    loupan_price = scrapy.Field()
    loupan_discount = scrapy.Field()
    loupan_layout = scrapy.Field()
    loupan_location = scrapy.Field()
    loupan_opening = scrapy.Field()
    loupan_transfer = scrapy.Field()
    loupan_type = scrapy.Field()
    loupan_age = scrapy.Field()
    loupan_url = scrapy.Field()


class AnjukeSZSHItem(scrapy.Item):
    """One second-hand-house record; fields mirror MySQL table sz_sh_house_info."""
    house_title = scrapy.Field()
    house_cost = scrapy.Field()
    house_code = scrapy.Field()
    house_public_time = scrapy.Field()
    house_community = scrapy.Field()
    house_location = scrapy.Field()
    house_build_years = scrapy.Field()
    house_kind = scrapy.Field()
    house_layout = scrapy.Field()
    house_size = scrapy.Field()
    house_face_to = scrapy.Field()
    house_point = scrapy.Field()
    house_price = scrapy.Field()
    house_first_pay = scrapy.Field()
    house_month_pay = scrapy.Field()
    house_decorate_type = scrapy.Field()
    house_agent = scrapy.Field()
    house_agency = scrapy.Field()
    house_url = scrapy.Field()
pipelines.py
import pymysql


class AnjukeSZPipeline(object):
    """Persists new-estate (loupan) items into MySQL table sz_loupan_info."""

    def __init__(self):
        self.db = pymysql.connect("localhost", "root", "root", "anjuke", charset="utf8")
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Both pipelines are enabled for every spider in settings.py.
        # Only handle items from the loupan spider; pass everything else
        # through untouched (the original raised KeyError on SH items).
        if spider.name != 'anjuke_sz':
            return item
        # Parameterized query: the driver escapes quotes in scraped text
        # and this avoids SQL injection via page content.
        sql = ("insert into sz_loupan_info(loupan_name,loupan_status,loupan_price,"
               "loupan_discount,loupan_layout,loupan_location,loupan_opening,"
               "loupan_transfer,loupan_type,loupan_age,loupan_url) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        # The spider stores '' when the price node is missing; store NULL
        # instead of crashing on int('').
        price = int(item['loupan_price']) if item['loupan_price'] else None
        self.cursor.execute(sql, (
            item['loupan_name'], item['loupan_status'], price,
            item['loupan_discount'], item['loupan_layout'], item['loupan_location'],
            item['loupan_opening'], item['loupan_transfer'], item['loupan_type'],
            item['loupan_age'], item['loupan_url']))
        self.db.commit()
        return item

    def __del__(self):
        # Guard: self.db does not exist if connect() itself raised.
        if hasattr(self, 'db'):
            self.db.close()


class AnjukeSZSHPipeline(object):
    """Persists second-hand-house items into MySQL table sz_sh_house_info."""

    def __init__(self):
        self.db = pymysql.connect("localhost", "root", "root", "anjuke", charset="utf8")
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Skip items that belong to the loupan spider (see note above in
        # AnjukeSZPipeline: both pipelines receive every item).
        if spider.name != 'anjuke_sz_sh':
            return item
        sql = ("insert into sz_sh_house_info(house_title,house_cost,house_code,"
               "house_public_time,house_community,house_location,house_build_years,"
               "house_kind,house_layout,house_size,house_face_to,house_point,"
               "house_price,house_first_pay,house_month_pay,house_decorate_type,"
               "house_agent,house_agency,house_url) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(sql, (
            item['house_title'], item['house_cost'], item['house_code'],
            item['house_public_time'], item['house_community'], item['house_location'],
            item['house_build_years'], item['house_kind'], item['house_layout'],
            item['house_size'], item['house_face_to'], item['house_point'],
            item['house_price'], item['house_first_pay'], item['house_month_pay'],
            item['house_decorate_type'], item['house_agent'], item['house_agency'],
            item['house_url']))
        self.db.commit()
        return item

    def __del__(self):
        if hasattr(self, 'db'):
            self.db.close()
anjuke_sz_spider.py
import unicodedata

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from Anjuke.items import AnjukeSZItem


class AnjukeSpider(scrapy.spiders.CrawlSpider):
    """Crawls new-estate (loupan) detail pages of anjuke.com Shenzhen.

    Listing pages matched by the first rule are followed for pagination;
    detail pages matched by the second rule are parsed by parse_item.
    A detail page has several layout variants depending on sale status
    (sold out / waiting / on sale), hence the three get_*_item helpers,
    each of which falls back through alternative XPath locations.
    """

    name = 'anjuke_sz'
    # Bug fix: Scrapy honours `allowed_domains`; the original attribute
    # `allow_domains` was silently ignored.
    allowed_domains = ["anjuke.com"]
    start_urls = [
        'https://sz.fang.anjuke.com/loupan/all/p1/',
    ]
    rules = [
        # Raw strings so the regex backslashes are not eaten by Python.
        Rule(LinkExtractor(allow=(r"https://sz\.fang\.anjuke\.com/loupan/all/p\d{1,}"))),
        Rule(LinkExtractor(allow=(r"https://sz\.fang\.anjuke\.com/loupan/\d{1,}")),
             follow=False, callback='parse_item'),
    ]

    def is_number(self, s):
        """Return True if *s* parses as a float or a unicode numeral."""
        try:
            float(s)
            return True
        except ValueError:
            pass
        try:
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            pass
        return False

    def get_sellout_item(self, response):
        """Collect selector nodes from a sold-out (售罄) estate page."""
        nodes = {}
        nodes['loupan_name_nodes'] = response.xpath('//*[@id="j-triggerlayer"]/text()')
        nodes['loupan_status_nodes'] = response.xpath('/html/body/div[1]/div[3]/div/div[2]/i/text()')
        nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()')
        if nodes['loupan_price_nodes']:
            if self.is_number(nodes['loupan_price_nodes'].extract()[0].strip()):
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[4]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[1]/span/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[2]/span/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
            else:
                # dd[2]/span held non-numeric text: price actually lives in dd[1].
                nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()')
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[1]/span/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[2]/span/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/div[1]/ul[1]/li/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        else:
            nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()')
            if nodes['loupan_price_nodes']:
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[1]/span/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/p[2]/span/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/div[1]/ul[1]/li/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
            else:
                # Last-resort variant: the info block sits one div later.
                nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[1]/p/em/text()')
                if nodes['loupan_price_nodes']:
                    nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                    nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[2]/div/text()')
                    nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[3]/span/text()')
                    nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/span/text()')
                    nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/span/text()')
                    nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/div[1]/ul[1]/li/span/text()')
                    nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
                else:
                    nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[2]/span/text()')
                    nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                    nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[3]/div/text()')
                    nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[4]/span/text()')
                    nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/span/text()')
                    nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/span/text()')
                    nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[3]/div[1]/ul[1]/li/span/text()')
                    nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        if not nodes['loupan_location_nodes']:
            nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()')
        return self.struct_loupan_item(nodes)

    def get_sellwait_item(self, response):
        """Collect selector nodes from a waiting-for-sale (待售) estate page."""
        nodes = {}
        nodes['loupan_name_nodes'] = response.xpath('//*[@id="j-triggerlayer"]/text()')
        nodes['loupan_status_nodes'] = response.xpath('/html/body/div[1]/div[3]/div/div[2]/i/text()')
        nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()')
        if nodes['loupan_price_nodes']:
            nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
            nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/div/text()')
            nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[4]/span/text()')
            nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/text()')
            nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/text()')
            nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
            nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        else:
            nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[2]/span/text()')
            if nodes['loupan_price_nodes']:
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[3]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[4]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[5]/p[1]/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[5]/p[2]/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
            else:
                nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()')
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        if not nodes['loupan_location_nodes']:
            nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()')
        return self.struct_loupan_item(nodes)

    def get_common_item(self, response):
        """Collect selector nodes from an ordinary on-sale estate page."""
        nodes = {}
        nodes['loupan_name_nodes'] = response.xpath('//*[@id="j-triggerlayer"]/text()')
        nodes['loupan_status_nodes'] = response.xpath('/html/body/div[1]/div[3]/div/div[2]/i/text()')
        nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()')
        if nodes['loupan_price_nodes']:
            nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/text()')
            nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/div/text()')
            nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[4]/span/text()')
            nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/text()')
            nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/text()')
            nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
            nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        else:
            nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[1]/p/em/text()')
            if nodes['loupan_price_nodes']:
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[2]/a[1]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[3]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[2]/dl/dd[4]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[5]/p[1]/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[5]/p[2]/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
            else:
                nodes['loupan_price_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()')
                nodes['loupan_discount_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/text()')
                nodes['loupan_layout_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[4]/div/text()')
                nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[5]/span/text()')
                nodes['loupan_opening_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[1]/span/text()')
                nodes['loupan_transfer_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/p[2]/span/text()')
                nodes['loupan_type_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[1]/span/text()')
                nodes['loupan_age_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[4]/div/ul[1]/li[2]/span/text()')
        if not nodes['loupan_location_nodes']:
            nodes['loupan_location_nodes'] = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()')
        return self.struct_loupan_item(nodes)

    def struct_loupan_item(self, loupan_nodes):
        """Build an AnjukeSZItem from collected nodes.

        Every field falls back to '' when its node list is empty, so the
        pipeline never hits a missing key.  (Bug fix: the original never
        set loupan_name when the node was missing, causing a KeyError
        downstream.)
        """
        def first_text(key):
            # First matched text node, stripped; '' when nothing matched.
            node = loupan_nodes.get(key)
            return node.extract()[0].strip() if node else ''

        loupan_item = AnjukeSZItem()
        loupan_item['loupan_name'] = first_text('loupan_name_nodes')
        loupan_item['loupan_status'] = first_text('loupan_status_nodes')
        loupan_item['loupan_price'] = first_text('loupan_price_nodes')
        loupan_item['loupan_discount'] = first_text('loupan_discount_nodes')
        loupan_item['loupan_layout'] = first_text('loupan_layout_nodes')
        loupan_item['loupan_location'] = first_text('loupan_location_nodes')
        loupan_item['loupan_opening'] = first_text('loupan_opening_nodes')
        loupan_item['loupan_transfer'] = first_text('loupan_transfer_nodes')
        loupan_item['loupan_type'] = first_text('loupan_type_nodes')
        loupan_item['loupan_age'] = first_text('loupan_age_nodes')
        return loupan_item

    def parse_item(self, response):
        """Dispatch to the variant parser matching the estate's sale status."""
        status_texts = response.xpath('/html/body/div[1]/div[3]/div/div[2]/i/text()').extract()
        # Bug fix: the original indexed [0] unconditionally and crashed on
        # pages without a status badge.
        status = status_texts[0].strip() if status_texts else ''
        if status == '售罄':
            loupan_item = self.get_sellout_item(response)
        elif status == '待售':
            loupan_item = self.get_sellwait_item(response)
        else:
            loupan_item = self.get_common_item(response)
        loupan_item['loupan_url'] = response.url
        return loupan_item
anjuke_sz_sh_spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from Anjuke.items import AnjukeSZSHItem


class AnjukeSpider(scrapy.spiders.CrawlSpider):
    """Crawls second-hand-house detail pages of anjuke.com Shenzhen.

    The site rate-limits these pages (captcha beyond a threshold); the
    project relies on DOWNLOAD_DELAY = 5 in settings.py to stay under it.
    XPaths target the no-JavaScript layout, since Scrapy does not run JS.
    """

    name = 'anjuke_sz_sh'
    # Bug fix: Scrapy honours `allowed_domains`; the original attribute
    # `allow_domains` was silently ignored.
    allowed_domains = ["anjuke.com"]
    start_urls = [
        'https://shenzhen.anjuke.com/sale/p1',
    ]
    rules = [
        # Raw strings so the regex backslashes are not eaten by Python.
        Rule(LinkExtractor(allow=(r"https://shenzhen\.anjuke\.com/sale/p\d{1,}"))),
        Rule(LinkExtractor(allow=(r"https://shenzhen\.anjuke\.com/prop/view/A\d{1,}")),
             follow=False, callback='parse_item'),
    ]

    # Common prefix of the house-info columns in the no-JS page layout.
    _INFO = '/html/body/div[1]/div[2]/div[4]/div[1]/div[3]/div/div[1]/div/'

    def get_house_item(self, response):
        """Collect the selector nodes for one house detail page."""
        nodes = {}
        nodes['house_title_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[3]/h3/text()')
        nodes['house_cost_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[4]/div[1]/div[1]/span[1]/em/text()')
        nodes['house_code_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[4]/div[1]/div[3]/h4/span[2]/text()')
        nodes['house_community_nodes'] = response.xpath(self._INFO + 'div[1]/dl[1]/dd/a/text()')
        nodes['house_location_nodes'] = response.xpath(self._INFO + 'div[1]/dl[2]/dd/p')
        nodes['house_build_years_nodes'] = response.xpath(self._INFO + 'div[1]/dl[3]/dd/text()')
        nodes['house_kind_nodes'] = response.xpath(self._INFO + 'div[1]/dl[4]/dd/text()')
        nodes['house_layout_nodes'] = response.xpath(self._INFO + 'div[2]/dl[1]/dd/text()')
        nodes['house_size_nodes'] = response.xpath(self._INFO + 'div[2]/dl[2]/dd/text()')
        nodes['house_face_to_nodes'] = response.xpath(self._INFO + 'div[2]/dl[3]/dd/text()')
        nodes['house_point_nodes'] = response.xpath(self._INFO + 'div[2]/dl[4]/dd/text()')
        nodes['house_price_nodes'] = response.xpath(self._INFO + 'div[3]/dl[1]/dd/text()')
        nodes['house_first_pay_nodes'] = response.xpath(self._INFO + 'div[3]/dl[2]/dd/text()')
        nodes['house_month_pay_nodes'] = response.xpath('//*[@id="reference_monthpay"]/text()')
        nodes['house_decorate_type_nodes'] = response.xpath(self._INFO + 'div[3]/dl[4]/dd/text()')
        nodes['house_agent_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[4]/div[2]/div/div[1]/div[1]/div/div/text()')
        nodes['house_agency_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[4]/div[2]/div/div[1]/div[5]/div/p[1]/a/text()')
        if not nodes['house_agency_nodes']:
            # Some listings show the agency as plain text, not a link.
            nodes['house_agency_nodes'] = response.xpath('/html/body/div[1]/div[2]/div[4]/div[2]/div/div[1]/div[5]/div/p/text()')
        return self.struct_house_item(nodes)

    def struct_house_item(self, house_nodes):
        """Build an AnjukeSZSHItem from collected nodes; '' for missing fields."""
        def first_text(key):
            # First matched text node, stripped; '' when nothing matched.
            node = house_nodes.get(key)
            return node.extract()[0].strip() if node else ''

        house_item = AnjukeSZSHItem()
        house_item['house_title'] = first_text('house_title_nodes')
        house_item['house_cost'] = first_text('house_cost_nodes')
        code_text = first_text('house_code_nodes')
        if code_text:
            # The span reads "<code>,<publish time>"; split once on the comma.
            parts = code_text.split(',')
            house_item['house_code'] = parts[0]
            # Bug fix: pages without the comma used to raise IndexError here.
            house_item['house_public_time'] = parts[1] if len(parts) > 1 else ''
        else:
            house_item['house_code'] = ''
            house_item['house_public_time'] = ''
        house_item['house_community'] = first_text('house_community_nodes')
        if house_nodes['house_location_nodes']:
            # The location <p> mixes several text nodes: take its full string
            # value and squeeze out the layout whitespace.
            house_item['house_location'] = house_nodes['house_location_nodes'].xpath('string(.)').extract()[0].strip().replace('\t', '').replace('\n', '')
        else:
            house_item['house_location'] = ''
        house_item['house_build_years'] = first_text('house_build_years_nodes')
        house_item['house_kind'] = first_text('house_kind_nodes')
        house_item['house_layout'] = first_text('house_layout_nodes').replace('\t', '').replace('\n', '')
        house_item['house_size'] = first_text('house_size_nodes')
        house_item['house_face_to'] = first_text('house_face_to_nodes')
        house_item['house_point'] = first_text('house_point_nodes')
        house_item['house_price'] = first_text('house_price_nodes')
        house_item['house_first_pay'] = first_text('house_first_pay_nodes')
        house_item['house_month_pay'] = first_text('house_month_pay_nodes')
        house_item['house_decorate_type'] = first_text('house_decorate_type_nodes')
        house_item['house_agent'] = first_text('house_agent_nodes')
        house_item['house_agency'] = first_text('house_agency_nodes')
        return house_item

    def parse_item(self, response):
        """Parse one house detail page into an item tagged with its URL."""
        house_item = self.get_house_item(response)
        house_item['house_url'] = response.url
        return house_item
anjuke_sz_report.py
import matplotlib.pyplot as plt
import numpy as np
import pymysql


class AjukeSZReport():
    """Bar chart of average new-estate prices per district, read from MySQL."""

    def __init__(self):
        self.db = pymysql.connect('127.0.0.1', 'root', 'root', 'anjuke', charset='utf8')
        self.cursor = self.db.cursor()

    def export_result_piture(self):
        # NOTE(review): method keeps its original (misspelled) public name
        # so existing callers do not break.
        district = ['南山', '宝安', '福田', '罗湖', '光明', '龙华', '龙岗', '坪山', '盐田', '大鹏', '深圳', '惠州', '东莞']
        x = np.arange(len(district))
        house_price_avg = []
        for district_temp in district:
            if district_temp == '深圳':
                # City-wide average, excluding "周边" (surrounding-area) listings.
                sql = "select avg(loupan_price) from sz_loupan_info where loupan_location not like '%周边%' and loupan_price > 5000"
                self.cursor.execute(sql)
            else:
                # Parameterized LIKE instead of manual string splicing.
                sql = "select avg(loupan_price) from sz_loupan_info where loupan_location like %s and loupan_price > 5000"
                self.cursor.execute(sql, ('%' + district_temp + '%',))
            results = self.cursor.fetchall()
            avg = results[0][0]
            # Bug fix: avg() returns NULL for districts with no rows; plot 0
            # instead of crashing in matplotlib.
            house_price_avg.append(float(avg) if avg is not None else 0)
        # Must be set before drawing so the Chinese tick labels render.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        bars = plt.bar(x, house_price_avg)
        plt.xticks(x, district)
        # Label each bar with its (rounded) average price.
        for bar, value in zip(bars, house_price_avg):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                     '%d' % value, ha='center', va='bottom')
        plt.show()

    def __del__(self):
        # Guard: self.db does not exist if connect() itself raised.
        if hasattr(self, 'db'):
            self.db.close()


if __name__ == '__main__':
    anjukeSZReport = AjukeSZReport()
    anjukeSZReport.export_result_piture()
anjuke_sz_sh_report.py
import matplotlib.pyplot as plt
import numpy as np
import pymysql


class AjukeSZSHReport():
    """Bar chart of average second-hand-house unit prices per district."""

    def __init__(self):
        self.db = pymysql.connect('127.0.0.1', 'root', 'root', 'anjuke', charset='utf8')
        self.cursor = self.db.cursor()

    def export_result_piture(self):
        # NOTE(review): method keeps its original (misspelled) public name
        # so existing callers do not break.
        district = ['南山', '宝安', '福田', '罗湖', '光明', '龙华', '龙岗', '坪山', '盐田', '大鹏', '深圳', '惠州', '东莞']
        x = np.arange(len(district))
        house_price_avg = []
        for district_temp in district:
            if district_temp == '深圳':
                # City-wide figure, excluding "周边" (surrounding-area) listings.
                sql = "select house_price from sz_sh_house_info where house_location not like '%周边%'"
                self.cursor.execute(sql)
            else:
                # Parameterized LIKE instead of manual string splicing.
                sql = "select house_price from sz_sh_house_info where house_location like %s"
                self.cursor.execute(sql, ('%' + district_temp + '%',))
            house_price_sum = 0
            house_num = 0
            for result in self.cursor.fetchall():
                # house_price is stored as text; the leading space-separated
                # token looks like the numeric unit price — TODO confirm format.
                token = result[0].split(' ')[0]
                try:
                    house_price_sum += int(token)
                except ValueError:
                    continue  # skip rows whose price field is not numeric
                house_num += 1
            # Bug fix: districts with no sample rows used to divide by zero.
            house_price_avg.append(house_price_sum / house_num if house_num else 0)
        # Must be set before drawing so the Chinese tick labels render.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        bars = plt.bar(x, house_price_avg)
        plt.xticks(x, district)
        # Label each bar with its (rounded) average price.
        for bar, value in zip(bars, house_price_avg):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                     '%d' % value, ha='center', va='bottom')
        plt.show()

    def __del__(self):
        # Guard: self.db does not exist if connect() itself raised.
        if hasattr(self, 'db'):
            self.db.close()


if __name__ == '__main__':
    anjukeSZReport = AjukeSZSHReport()
    anjukeSZReport.export_result_piture()
其他文件未做修改,保持自动生成时的模样不动即可
四、项目结果演示
anjuke_sz_spider.py收集部份楼盘数据截图
anjuke_sz_report.py生成图表截图:
anjuke_sz_sh_spider.py收集部份二手房数据截图:
anjuke_sz_sh_report.py生成报表截图: