转自 https://blog.csdn.net/u011660391/article/details/83991867?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-2.control
一、说明:
在网上搜索下载省市区等数据,不是要积分就是要付费。我既无积分又无钱,只好另想办法:学过几天 Python,就用 Python 将就抓点数据,借鉴了别人的一些写法,再修修补补、调试 bug,基本上可以运行,并将抓取的数据保存至 MySQL 数据库中(抓取之前请换成自己的数据库配置)。开发环境为 Python 3.6,下面附上源码。本人对 Python 只是略懂皮毛,代码写得不好,如有更好的抓取方式和写法,欢迎指导。
二、代码
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=yjrowe
"""Scrape administrative-division codes from stats.gov.cn into MySQL.

The crawl walks the hierarchy province -> city -> county -> street/town ->
community.  Each level is collected into a dict keyed by the row id and then
written to the table of the same name via ``insert_data``.

As published, the province and city levels are pinned to a single hard-coded
entry each (Hunan province / Changsha city); the general crawling code for
those two levels is kept below as commented-out reference.
"""
import sys
import os
import re
from urllib import request
from bs4 import BeautifulSoup
import pymysql
import requests
import time

# Root index page of the 2020 division-code listing.
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'

# Request headers sent with every page fetch.
header = {
    'Cookie': 'AD_RS_COOKIE=20181108',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36',
}

# Upper bound on fetch attempts per page, and the pause between attempts.
MAX_RETRIES = 5
RETRY_DELAY = 1


# Fetch a page
def http_request(url, charset='utf8', retries=MAX_RETRIES):
    """Download ``url`` and return its body decoded with ``charset``.

    A failed request or decode is retried up to ``retries`` times in total.
    If every attempt fails an empty string is returned, which the callers
    treat as "skip this page" (``if not t: continue``).
    """
    for attempt in range(1, retries + 1):
        try:
            print(url)
            response = requests.get(url, headers=header, timeout=5)
            return response.content.decode(charset)
        except (requests.RequestException, UnicodeDecodeError) as e:
            print(e)
            if attempt < retries:
                # Short pause so a struggling server is not hammered.
                time.sleep(RETRY_DELAY)
    return ''


# Provinces and municipalities
def province():
    """Store and return the province rows, keyed by province id.

    Only the hard-coded Hunan entry is produced; the commented-out code is
    the general crawl of the root index page.
    """
    print('爬取省数据中。。。')
    province_list = {}
    # t = http_request(url, 'gbk')
    # if t:
    #     soup = BeautifulSoup(t, 'html.parser')
    #     for i in soup.find_all(attrs={'class': 'provincetr'}):
    #         for a in i.find_all('a'):
    #             id = re.sub(r"\D", "", a.get('href'))
    #             province_list[id] = {'id': id, 'name': a.text, 'code': id,
    #                                  'href': url + a.get('href')}
    # Keyed by the province id (as the general code above does) so that
    # child rows can reference it.
    province_list['43'] = {
        'id': '43',
        'name': '湖南省',
        'code': '43',
        'href': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/43.html',
    }
    insert_data('province', province_list)
    return province_list


# Cities below a province
def city(province_list):
    """Store and return the city rows, keyed by the 4-digit city id.

    ``province_list`` is only used by the commented-out general crawl; the
    active code produces the single hard-coded Changsha entry.
    """
    print('爬取市数据中。。。')
    city_list = {}
    # for i in province_list:
    #     t = http_request(province_list[i]['href'], 'gbk')
    #     parent_href = get_parent_url(province_list[i]['href'])
    #     if not t:
    #         continue
    #     soup = BeautifulSoup(t, 'html.parser')
    #     for v in soup.find_all(attrs={'class': 'citytr'}):
    #         id = str(v.find_all('td')[0].text)
    #         city_href = str(v.find_all('td')[1].find_all('a')[0].get('href'))
    #         city_list[id[0:4]] = {'id': id[0:4], 'name': str(v.find_all('td')[1].text),
    #                               'province_id': i, 'code': id,
    #                               'href': parent_href + city_href}
    city_list['4301'] = {
        'id': '4301',
        'name': '长沙市',
        # Must match the id of the province row stored by province().
        'province_id': '43',
        'code': '430100000000',
        'href': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/43/4301.html',
    }
    # County rows reference city ids, so the city row has to be stored too.
    insert_data('city', city_list)
    return city_list


# Counties / districts below a city
def country(city_list):
    """Crawl every city page, store and return the county rows.

    Rows are keyed by the first 6 digits of the code.  Rows without a link
    to a lower-level page are skipped.
    """
    print('爬取区县数据中。。。')
    county_list = {}
    for city_id in city_list:
        page_url = city_list[city_id]['href']
        t = http_request(page_url, 'gbk')
        parent_href = get_parent_url(page_url)
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for row in soup.find_all(attrs={'class': 'countytr'}):
            cells = row.find_all('td')
            code = str(cells[0].text)
            links = cells[1].find_all('a')
            if not links:
                continue
            county_list[code[0:6]] = {
                'id': code[0:6],
                'name': str(cells[1].text),
                'city_id': city_id,
                'code': code,
                'href': parent_href + str(links[0].get('href')),
            }
    insert_data('country', county_list)
    return county_list


# Streets / towns below a county
def street(county_list):
    """Crawl every county page, store and return the street/town rows.

    Rows are keyed by the first 9 digits of the code.  Rows without a link
    to a lower-level page are skipped, as in ``country``.
    """
    print('爬取街道数据中。。。')
    street_list = {}
    for county_id in county_list:
        page_url = county_list[county_id]['href']
        t = http_request(page_url, 'gbk')
        parent_href = get_parent_url(page_url)
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        time.sleep(1)  # throttle between page fetches
        for row in soup.find_all(attrs={'class': 'towntr'}):
            cells = row.find_all('td')
            code = str(cells[0].text)
            links = cells[1].find_all('a')
            if not links:
                continue
            street_list[code[0:9]] = {
                'id': code[0:9],
                'name': str(cells[1].text),
                'country_id': county_id,
                'code': code,
                'href': parent_href + str(links[0].get('href')),
            }
    insert_data('street', street_list)
    return street_list


# Communities / neighbourhood offices
def community(street_list):
    """Crawl every street page, store and return the community rows.

    Rows are keyed by the first 12 digits of the code.  The three table
    cells are read as code, category and name, in that order.
    """
    print('爬取社区数据中。。。')
    community_list = {}
    for street_id in street_list:
        t = http_request(street_list[street_id]['href'], 'gbk')
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        time.sleep(1)  # throttle between page fetches
        for row in soup.find_all(attrs={'class': 'villagetr'}):
            cells = row.find_all('td')
            code = str(cells[0].text)
            community_list[code[0:12]] = {
                'id': code[0:12],
                'name': str(cells[2].text),
                'street_id': street_id,
                'code': code,
                'category': str(cells[1].text),
            }
    insert_data('community', community_list)
    return community_list


# Parent url
def get_parent_url(href):
    """Return ``href`` with everything after its last ``/`` removed.

    Only the final path segment is dropped, even if the same text also
    occurs earlier in the url.  Without any ``/`` the result is ``''``.
    """
    return href[:href.rfind('/') + 1]


# Insert rows
def insert_data(table, data):
    """Insert every row of ``data`` into ``table`` and commit.

    ``data`` maps row ids to dicts of column name -> value; the columns of
    the first row define the column list for all rows.  Values are passed
    as query parameters so the driver does the quoting.  A row that fails to
    insert is reported and skipped.  Uses the module-level ``cursor`` and
    ``conn`` created by the script entry point.

    Returns ``False`` when ``data`` is empty, ``True`` otherwise.
    """
    if not data:
        return False
    rows = list(data.values())
    columns = list(rows[0].keys())
    column_sql = ', '.join('`%s`' % name for name in columns)
    placeholders = ', '.join(['%s'] * len(columns))
    # Identifiers cannot be bound as parameters; they come from this script
    # only, never from scraped content.
    sql = 'INSERT INTO `%s` (%s) VALUES (%s)' % (table, column_sql, placeholders)
    for row in rows:
        try:
            cursor.execute(sql, [row[name] for name in columns])
        except pymysql.MySQLError as e:
            print(e)
    conn.commit()
    return True


if __name__ == '__main__':
    # NOTE(review): the connection settings below were hard-coded in the
    # published script.  They can now be overridden through environment
    # variables; replace the fallbacks with your own database configuration
    # and do not keep a real password in source control.
    conn = pymysql.connect(
        host=os.environ.get('DB_HOST', '139.199.16.31'),
        port=int(os.environ.get('DB_PORT', 3379)),
        user=os.environ.get('DB_USER', 'root'),
        passwd=os.environ.get('DB_PASSWORD', 'Yunzin@yz123'),
        db=os.environ.get('DB_NAME', 'yz_cbe'),
        charset='utf8',
    )
    cursor = conn.cursor()
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    try:
        provinceList = province()
        print(provinceList)
        cityList = city(provinceList)
        # print(cityList)
        countryList = country(cityList)
        print(countryList)
        streetList = street(countryList)
        print(streetList)
        communityList = community(streetList)
        print('数据抓取完成')
    finally:
        # Release the database resources even if the crawl aborts.
        cursor.close()
        conn.close()
-- Schema for the scraped administrative divisions.
-- Each level references its parent level through a foreign key, so the
-- tables must be created (and filled) from top to bottom.

-- Provinces and municipalities.
CREATE TABLE `province` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `href` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Cities; `province_id` points at the owning province.
CREATE TABLE `city` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `province_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `href` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `city_province_id_foreign` (`province_id`),
  CONSTRAINT `city_province_id_foreign` FOREIGN KEY (`province_id`) REFERENCES `province` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Counties / districts; `city_id` points at the owning city.
CREATE TABLE `country` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `city_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `href` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `country_city_id_foreign` (`city_id`),
  CONSTRAINT `country_city_id_foreign` FOREIGN KEY (`city_id`) REFERENCES `city` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Streets / towns; `country_id` points at the owning county row.
CREATE TABLE `street` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `country_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `href` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `street_country_id_foreign` (`country_id`),
  CONSTRAINT `street_country_id_foreign` FOREIGN KEY (`country_id`) REFERENCES `street_parent_placeholder` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Communities; the id is the 12-digit code, which does not fit an
-- unsigned int, hence the varchar primary key.
CREATE TABLE `community` (
  `id` varchar(100) NOT NULL,
  `name` varchar(255) DEFAULT NULL,
  `street_id` int(11) unsigned DEFAULT NULL,
  `code` varchar(255) DEFAULT NULL,
  `category` varchar(255) DEFAULT NULL COMMENT '城乡分类代码',
  PRIMARY KEY (`id`),
  KEY `community_street_id_foreign` (`street_id`),
  CONSTRAINT `community_street_id_foreign` FOREIGN KEY (`street_id`) REFERENCES `street` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;