seacode

转自 https://blog.csdn.net/u011660391/article/details/83991867?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control

一、说明:

在网上百度下载省市区等数据,不是要积分下载就是要钱,我等穷逼既无积分又无钱,那就只有另想办法,学过几天python,用python将就抓点数据,借鉴别人一些写法,再修修补补,调试bug,基本上可以运行,并将抓取的数据保存至MySQL数据库中(抓取之前换成自己的数据库配置)。开发环境python3.6,下面附上源码,对python略懂基本,代码写得不好,如有更好的抓取方式和写法,欢迎指导。

二、代码

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=yjrowe
 
 
import sys
import os
import re
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import requests
import time 
 
# 请求页面
# Fetch a page and return its decoded body.
#
# Fix: the original retried failures via unbounded recursion (the except
# branch called http_request() again with no base case), so a persistently
# unreachable URL would retry forever and eventually hit the recursion
# limit.  Retries are now bounded by an explicit loop.
#
# Args:
#     url: absolute URL to fetch.
#     charset: encoding used to decode the response body (site pages are gbk).
#     retries: maximum number of attempts before giving up (new parameter;
#         the default keeps existing call sites working unchanged).
#
# Returns:
#     The decoded page body as str, or None if every attempt failed.
#     Callers already treat a falsy result as "skip this page".
def http_request(url, charset='utf8', retries=3):
    for _attempt in range(retries):
        try:
            print(url)
            # `header` is the module-level headers dict built in __main__.
            response = requests.get(url, headers=header, timeout=5)
            return response.content.decode(charset)
        except Exception as e:
            print(e)
            time.sleep(1)  # brief back-off before the next attempt
    return None
 
 
# 获取全国省份和直辖市
# Build the province level of the hierarchy.
#
# The generic scraping of the national index page is disabled in this
# revision (it was commented out upstream); only Hunan (code 43) is
# emitted as a hard-coded entry.  The result is persisted through
# insert_data('province', ...) and returned keyed by 0.
def province():
    print('爬取省数据中。。。')
    # The index page is still fetched, but its content is unused here —
    # the parsing loop that consumed it was removed.
    http_request(url, 'gbk')
    hunan = {
        'id': '43',
        'name': '湖南省',
        'code': '43',
        'href': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/43.html',
    }
    province_list = {0: hunan}
    insert_data('province', province_list)
    return province_list
 
 
# 获取省下级市
# Build the city level for the given provinces.
#
# The generic per-province scraping is disabled in this revision: the
# input is ignored and a single hard-coded entry for Changsha (4301) is
# returned.  Unlike the other levels, nothing is written to the database
# here (the insert_data call was disabled upstream).
def city(province_list):
    print('爬取市数据中。。。')
    changsha = {
        'id': '4301',
        'name': '长沙市',
        'province_id': 0,
        'code': '430100000000',
        'href': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/43/4301.html',
    }
    return {'4301': changsha}
 
 
# 获取市下级区县
# Scrape the county/district level ("区县") for every city.
#
# For each city entry, fetch its page, parse the rows whose CSS class is
# 'countytr', and collect {id, name, city_id, code, href} keyed by the
# 6-digit county code.  Rows without a drill-down link are skipped.
# The result is persisted via insert_data('country', ...) and returned.
#
# Improvements over the original: the td cell list is looked up once per
# row (it was re-queried with find_all('td') for every field), and the
# loop variable no longer shadows the builtin `id`.
def country(city_list):
    print('爬取区县数据中。。。')
    county_list = {}

    for city_id in city_list:
        page = http_request(city_list[city_id]['href'], 'gbk')
        if not page:
            continue
        parent_href = get_parent_url(city_list[city_id]['href'])

        soup = BeautifulSoup(page, 'html.parser')
        for row in soup.find_all(attrs={'class': 'countytr'}):
            tds = row.find_all('td')
            code = str(tds[0].text)

            links = tds[1].find_all('a')
            if not links:
                # Some county codes have no dedicated page; skip them.
                continue
            county_href = str(links[0].get('href'))

            county_list[code[0:6]] = {
                'id': code[0:6],
                'name': str(tds[1].text),
                'city_id': city_id,
                'code': code,
                'href': parent_href + county_href,
            }

    insert_data('country', county_list)
    return county_list
 
 
# 县下级街道、镇
# Scrape the township/street level ("街道、镇") for every county.
#
# For each county entry, fetch its page, parse rows of class 'towntr',
# and collect {id, name, country_id, code, href} keyed by the 9-digit
# code.  The result is persisted via insert_data('street', ...).
#
# Fixes over the original:
# - rows whose second cell has no <a> link raised IndexError and aborted
#   the whole run; they are now skipped, matching how country() already
#   handles the same situation.
# - the td cell list is looked up once per row instead of per field,
#   and the loop variable no longer shadows the builtin `id`.
def street(county_list):
    print('爬取街道数据中。。。')
    street_list = {}

    for county_id in county_list:
        page = http_request(county_list[county_id]['href'], 'gbk')
        if not page:
            continue
        parent_href = get_parent_url(county_list[county_id]['href'])

        soup = BeautifulSoup(page, 'html.parser')
        time.sleep(1)  # throttle requests to the stats.gov.cn server
        for row in soup.find_all(attrs={'class': 'towntr'}):
            tds = row.find_all('td')
            code = str(tds[0].text)

            links = tds[1].find_all('a')
            if not links:
                continue  # no drill-down page for this town

            street_list[code[0:9]] = {
                'id': code[0:9],
                'name': str(tds[1].text),
                'country_id': county_id,
                'code': code,
                'href': parent_href + str(links[0].get('href')),
            }

    insert_data('street', street_list)
    return street_list
 
 
# 社区、街道办
# Scrape the bottom, community level ("社区、街道办") for every street.
#
# For each street entry, fetch its page and parse rows of class
# 'villagetr'.  The three cells of such a row are: code, urban/rural
# category code ("城乡分类代码"), and name.  Collected as
# {id, name, street_id, code, category} keyed by the 12-digit code and
# persisted via insert_data('community', ...).
#
# Improvements over the original: the td cell list is looked up once per
# row (was re-queried per field) and the loop variable no longer shadows
# the builtin `id`.
def community(street_list):
    print('爬取社区数据中。。。')
    community_list = {}

    for street_id in street_list:
        page = http_request(street_list[street_id]['href'], 'gbk')
        if not page:
            continue

        soup = BeautifulSoup(page, 'html.parser')
        time.sleep(1)  # throttle requests to the stats.gov.cn server
        for row in soup.find_all(attrs={'class': 'villagetr'}):
            tds = row.find_all('td')
            code = str(tds[0].text)
            community_list[code[0:12]] = {
                'id': code[0:12],
                'name': str(tds[2].text),
                'street_id': street_id,
                'code': code,
                'category': str(tds[1].text),
            }

    insert_data('community', community_list)
    return community_list
 
 
# 获取上级url
# Return *href* truncated just past its final '/', i.e. the "directory"
# part of the URL (trailing slash included), suitable for prefixing the
# relative links scraped from a page.
#
# Bug fix: the original used href.replace(last_segment, ''), and
# str.replace removes the FIRST occurrence.  When the final segment's
# text also appears earlier in the URL (e.g. 'ab/ab' -> '/ab' instead of
# 'ab/'), the wrong part was deleted.  Slicing at rfind('/') always
# strips exactly the final segment.  Edge cases match the original:
# a trailing '/' returns the input unchanged; no '/' at all returns ''.
def get_parent_url(href):
    return href[:href.rfind('/') + 1]
 
 
# 插入数据
# Bulk-insert rows into *table* using the module-level MySQL connection.
#
# *data* maps arbitrary keys to row dicts; every row is assumed to have
# the same columns as the first one (true for all callers in this file).
#
# Fixes over the original:
# - values are bound with a parameterized query instead of being quoted
#   into the SQL string by hand, which broke on values containing quotes,
#   crashed on non-str values, and was an SQL-injection hazard;
# - the C-style `for k in range(len(keys))` loop is replaced by direct
#   iteration over the rows.
#
# Per-row failures are printed and skipped (same best-effort behaviour
# as before); a single commit is issued at the end.
#
# Returns False when *data* is empty, otherwise None.
def insert_data(table, data):
    if not data:
        return False

    rows = list(data.values())
    columns = list(rows[0].keys())
    column_sql = '`' + '`,`'.join(columns) + '`'
    placeholders = ','.join(['%s'] * len(columns))
    # Table/column names cannot be parameterized; they come from this
    # file's own call sites, not from user input.
    sql = 'INSERT INTO `%s` (%s) VALUES (%s)' % (table, column_sql, placeholders)

    for row in rows:
        try:
            # `cursor`/`conn` are created in the __main__ block.
            cursor.execute(sql, [row[c] for c in columns])
        except Exception as e:
            print(e)
    conn.commit()
 
 
if __name__ == '__main__':
    # NOTE(review): live database credentials are hard-coded here (and were
    # published with the original blog post) — move them to environment
    # variables or a config file, and rotate the password, before real use.
    conn = pymysql.connect(host='139.199.16.31', port=3379, user='root',
                           passwd='Yunzin@yz123', db='yz_cbe', charset='utf8')
    cursor = conn.cursor()
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

    # Root of the 2020 administrative-division code pages on stats.gov.cn.
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
    header = {
        'Cookie': 'AD_RS_COOKIE=20181108',
        # Fixed: the original User-Agent literal contained stray backslashes
        # ("\ AppleWeb\Kit", trailing "\ ") left over from copy/paste mangling.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.36',
    }

    # Scrape each administrative level top-down; each step feeds the next.
    provinceList = province()
    print(provinceList)

    cityList = city(provinceList)

    countryList = country(cityList)
    print(countryList)
    streetList = street(countryList)
    print(streetList)
    communityList = community(streetList)
    print('数据抓取完成')
-- Province level ("省"), root of the administrative-division hierarchy.
-- Populated by the script's insert_data('province', ...); the scraper
-- supplies the id as a string (e.g. '43'), which MySQL coerces to int.
CREATE TABLE `province` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `href` varchar(255) DEFAULT NULL,  -- source page URL for this province
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 

-- City level ("市"); child of province via province_id.
-- NOTE(review): the current script never inserts into this table (its
-- insert_data('city', ...) call is commented out).
CREATE TABLE `city` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `province_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,   -- full 12-digit statistical code
  `href` varchar(255) DEFAULT NULL,   -- source page URL for this city
  PRIMARY KEY (`id`),
  KEY `city_province_id_foreign` (`province_id`),
  CONSTRAINT `city_province_id_foreign` FOREIGN KEY (`province_id`) REFERENCES `province` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 



-- County/district level ("区县"); child of city via city_id.
-- ("country" follows the script's spelling; it means county.)
CREATE TABLE `country` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,  -- 6-digit county code
  `name` varchar(255) DEFAULT NULL,
  `city_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,   -- full statistical code from the page
  `href` varchar(255) DEFAULT NULL,   -- source page URL for this county
  PRIMARY KEY (`id`),
  KEY `country_city_id_foreign` (`city_id`),
  CONSTRAINT `country_city_id_foreign` FOREIGN KEY (`city_id`) REFERENCES `city` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

 


-- Township/street level ("街道、镇"); child of county via country_id.
CREATE TABLE `street` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,  -- 9-digit town code
  `name` varchar(255) DEFAULT NULL,
  `country_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,   -- full statistical code from the page
  `href` varchar(255) DEFAULT NULL,   -- source page URL for this town
  PRIMARY KEY (`id`),
  KEY `street_country_id_foreign` (`country_id`),
  CONSTRAINT `street_country_id_foreign` FOREIGN KEY (`country_id`) REFERENCES `country` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

 


-- Community level ("社区、街道办"), the leaf of the hierarchy; child of
-- street via street_id.  `id` is varchar because the 12-digit community
-- code exceeds the int(11) range used by the parent tables.
-- Fix: the COMMENT string literal was written as \'...\' — a backslash
-- before the opening quote is invalid MySQL syntax (a copy/paste
-- artifact); restored to a plain quoted string.
CREATE TABLE `community` (
  `id` varchar(100) NOT NULL,
  `name` varchar(255) DEFAULT NULL,
  `street_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `category` varchar(255) DEFAULT NULL COMMENT '城乡分类代码',
  PRIMARY KEY (`id`),
  KEY `community_street_id_foreign` (`street_id`),
  CONSTRAINT `community_street_id_foreign` FOREIGN KEY (`street_id`) REFERENCES `street` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

 

分类:

技术点:

相关文章: