一、主题式网络爬虫设计方案
1.爬虫名称:爬取新冠病毒疫情数据。
2.爬取内容:腾讯新闻网实时疫情数据。
3.网络爬虫设计方案概述:从网页源代码中找出数据对应标签,对数据进行分析和可视化处理。
二、主题页面的结构特征分析
1.主题页面的结构与特征分析:
2.HTML页面解析:在页面中按F12查看网页源代码
3.节点(标签)查找方法与遍历方法:在所需数据位置右键→查找,即可找到标签
三、网络爬虫程序设计
1.数据爬取与采集:
import datetime
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import requests

# Tencent News epidemic endpoint. The trailing `_` query parameter carries the
# current time in milliseconds (presumably a cache-buster -- TODO confirm).
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors before the body is parsed.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the JSON response is passed through json.loads again,
# i.e. it is itself a JSON-encoded string.
data = json.loads(response.json()['data'])

# Children of the first areaTree entry. Presumably these are the province-level
# records (each with a 'name' and a 'children' list) -- verify against the API.
num = data['areaTree'][0]['children']
2.数据分析与可视化:
def sum_city_field(provinces, section, field):
    """Sum one city-level figure for every province.

    Args:
        provinces: Iterable of dicts, each with a 'name' and a 'children'
            list of city dicts.
        section: Key of the city sub-dict to read ('total' or 'today').
        field: Key inside that sub-dict whose value is converted with int()
            and summed.

    Returns:
        Dict mapping province name to the summed value, in first-seen order.
        A province with no cities maps to 0.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        totals.setdefault(name, 0)
        for city in province['children']:
            totals[name] += int(city[section][field])
    return totals


def draw_bar_panel(position, totals, color, ylabel, width=0.5, xlabel=None):
    """Draw one labelled bar chart into a subplot of the current figure.

    Args:
        position: Three-digit subplot specifier passed to plt.subplot.
        totals: Dict mapping province name to the value to plot.
        color: Bar colour.
        ylabel: Text for the y-axis label.
        width: Bar width.
        xlabel: Optional text for the x-axis label; omitted when None.

    Returns:
        The axes object returned by plt.subplot.
    """
    # plt.subplot makes the new axes current, so the pyplot calls below all
    # target this panel.
    axes = plt.subplot(position)

    # Console echo of the plotted data.
    print(totals.keys())
    print(totals.values())

    # Hand matplotlib concrete lists rather than dict views.
    names = list(totals)
    nums = list(totals.values())

    plt.bar(names, nums, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(names, rotation=-60, size=50)
    # Print each value just above its bar.
    for name, value in zip(names, nums):
        plt.text(name, value, value, ha='center', va='bottom', size=35)
    return axes


# Parse confirmed cases
total_data = sum_city_field(num, 'total', 'confirm')

# Parse suspected cases (computed, but not part of the CSV or the charts)
total_suspect_data = sum_city_field(num, 'total', 'suspect')

# Parse deaths
total_dead_data = sum_city_field(num, 'total', 'dead')

# Parse recoveries
total_heal_data = sum_city_field(num, 'total', 'heal')

# Parse newly confirmed cases
total_new_data = sum_city_field(num, 'today', 'confirm')

# Export the per-province statistics. The with-block guarantees the file is
# flushed and closed before the (blocking) plot window opens.
today = datetime.date.today()
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # All dicts are built from the same `num` sequence, so they share keys.
    for name in total_data:
        row = [
            name,
            str(total_data[name]),
            str(total_dead_data[name]),
            str(total_heal_data[name]),
            str(total_new_data[name]),
        ]
        f.write(','.join(row) + '\n')

# Draw the bar charts
plt.figure(figsize=[100, 60])
# Use the SimHei font for the Chinese labels and keep a plain minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Confirmed cases
p1 = draw_bar_panel(221, total_data, 'green', "确诊人数")
print(total_data)

# Newly confirmed cases
p2 = draw_bar_panel(222, total_new_data, 'yellow', "新增确诊人数")

# Deaths
p3 = draw_bar_panel(223, total_dead_data, 'blue', "死亡人数", xlabel="地区")

# Recoveries
p4 = draw_bar_panel(224, total_heal_data, 'red', "治愈人数", width=0.3, xlabel="地区")

plt.show()
3.将以上数据汇总:
import datetime
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import requests


def sum_city_field(provinces, section, field):
    """Sum one city-level figure for every province.

    Args:
        provinces: Iterable of dicts, each with a 'name' and a 'children'
            list of city dicts.
        section: Key of the city sub-dict to read ('total' or 'today').
        field: Key inside that sub-dict whose value is converted with int()
            and summed.

    Returns:
        Dict mapping province name to the summed value, in first-seen order.
        A province with no cities maps to 0.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        totals.setdefault(name, 0)
        for city in province['children']:
            totals[name] += int(city[section][field])
    return totals


def draw_bar_panel(position, totals, color, ylabel, width=0.5, xlabel=None):
    """Draw one labelled bar chart into a subplot of the current figure.

    Args:
        position: Three-digit subplot specifier passed to plt.subplot.
        totals: Dict mapping province name to the value to plot.
        color: Bar colour.
        ylabel: Text for the y-axis label.
        width: Bar width.
        xlabel: Optional text for the x-axis label; omitted when None.

    Returns:
        The axes object returned by plt.subplot.
    """
    # plt.subplot makes the new axes current, so the pyplot calls below all
    # target this panel.
    axes = plt.subplot(position)

    # Console echo of the plotted data.
    print(totals.keys())
    print(totals.values())

    # Hand matplotlib concrete lists rather than dict views.
    names = list(totals)
    nums = list(totals.values())

    plt.bar(names, nums, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(names, rotation=-60, size=50)
    # Print each value just above its bar.
    for name, value in zip(names, nums):
        plt.text(name, value, value, ha='center', va='bottom', size=35)
    return axes


# Tencent News epidemic endpoint. The trailing `_` query parameter carries the
# current time in milliseconds (presumably a cache-buster -- TODO confirm).
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors before the body is parsed.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the JSON response is passed through json.loads again,
# i.e. it is itself a JSON-encoded string.
data = json.loads(response.json()['data'])
num = data['areaTree'][0]['children']

# Parse confirmed cases
total_data = sum_city_field(num, 'total', 'confirm')

# Parse suspected cases (computed, but not part of the CSV or the charts)
total_suspect_data = sum_city_field(num, 'total', 'suspect')

# Parse deaths
total_dead_data = sum_city_field(num, 'total', 'dead')

# Parse recoveries
total_heal_data = sum_city_field(num, 'total', 'heal')

# Parse newly confirmed cases
total_new_data = sum_city_field(num, 'today', 'confirm')

# Export the per-province statistics. The with-block guarantees the file is
# flushed and closed before the (blocking) plot window opens.
today = datetime.date.today()
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # All dicts are built from the same `num` sequence, so they share keys.
    for name in total_data:
        row = [
            name,
            str(total_data[name]),
            str(total_dead_data[name]),
            str(total_heal_data[name]),
            str(total_new_data[name]),
        ]
        f.write(','.join(row) + '\n')

# Draw the bar charts
plt.figure(figsize=[100, 60])
# Use the SimHei font for the Chinese labels and keep a plain minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Confirmed cases
p1 = draw_bar_panel(221, total_data, 'green', "确诊人数")
print(total_data)

# Newly confirmed cases
p2 = draw_bar_panel(222, total_new_data, 'yellow', "新增确诊人数")

# Deaths
p3 = draw_bar_panel(223, total_dead_data, 'blue', "死亡人数", xlabel="地区")

# Recoveries
p4 = draw_bar_panel(224, total_heal_data, 'red', "治愈人数", width=0.3, xlabel="地区")

plt.show()
四、结论
1.湖北的确诊人数、死亡人数和治愈人数均较高,新增确诊人数则以陕西较多
2.通过本次作业了解到自身短板过多,很多知识没有学习透彻,导致在过程中遇到很多问题,并且不能完整地完成本次作业