前期准备
获取39药品网所有药品ID
1.下载39药品网所有药品页面
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/10/15
@Author: Zhang Yafei
"""
import re
import logging
import requests
import os
import time
from retrying import retry
from urllib.request import urljoin
from urllib.parse import urlsplit
# from scrapy import Selector
from lxml import etree
# from fake_useragent import UserAgent
from multiprocessing import Pool
from ids import Diabetes_ids
# ua = UserAgent()
# headers = {\'User-Agent\':ua.random}
# Shared request headers: fixed desktop Chrome UA (fake_useragent usage is disabled above).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
# HTTP/HTTPS proxies, used only by the retrying downloader (Drug.retry_download).
proxies = {'http': 'http://61.135.217.7:80', 'https': 'http://171.113.156.168:8010'}
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# DOWNLOAD_DIR = os.path.join(BASE_DIR,'药品')
DOWNLOAD_DIR = os.path.join(BASE_DIR, '糖尿病')  # per-drug page folders are created under here
file_path = os.path.join(BASE_DIR, 'drug_ruls.txt')  # input URL list; 'ruls' typo kept — the data file on disk uses this name
RUN_LOG_FILE = os.path.join(BASE_DIR, 'log', 'run.log')
ERROR_LOG_FILE = os.path.join(BASE_DIR, 'log', 'error_log')
if not os.path.exists(DOWNLOAD_DIR):
    os.makedirs(DOWNLOAD_DIR)
class Logger(object):
    """Dual-channel logger used by the scraper.

    INFO messages go to RUN_LOG_FILE, ERROR messages to ERROR_LOG_FILE;
    both channels also echo to the console.  No formatter is attached, so
    messages are written verbatim (matching the original behavior).

    Improvement over the original: the two near-identical
    ``initialize_*_log`` bodies are factored into one ``_build_logger``
    helper; the public interface is unchanged.
    """

    def __init__(self):
        self.run_log_file = RUN_LOG_FILE
        self.error_log_file = ERROR_LOG_FILE
        self.run_log = None
        self.error_log = None
        self.initialize_run_log()
        self.initialize_error_log()

    @staticmethod
    def check_path_exist(log_abs_file):
        """Create the directory containing *log_abs_file* if it is missing."""
        log_path = os.path.split(log_abs_file)[0]
        if not os.path.exists(log_path):
            os.mkdir(log_path)

    def _build_logger(self, name, path, level):
        """Build a standalone Logger writing to *path* and to stderr.

        :param name: logger name (not registered with the logging manager)
        :param path: log file appended to, UTF-8
        :param level: minimum level for the logger
        :return: configured logging.Logger
        """
        self.check_path_exist(path)
        fh = logging.FileHandler(path, 'a', encoding='utf-8')
        sh = logging.StreamHandler()
        built = logging.Logger(name, level=level)
        built.addHandler(fh)
        built.addHandler(sh)
        return built

    def initialize_run_log(self):
        """(Re)create the INFO-level run logger."""
        self.run_logger = self._build_logger('run_log', self.run_log_file, logging.INFO)

    def initialize_error_log(self):
        """(Re)create the ERROR-level error logger."""
        self.error_logger = self._build_logger('error_log', self.error_log_file, logging.ERROR)

    def log(self, message, mode=True):
        """Write a log record.

        :param message: message text
        :param mode: True routes to the run log (INFO), False to the error log (ERROR)
        """
        if mode:
            self.run_logger.info(message)
        else:
            self.error_logger.error(message)
# Module-wide Logger instance shared by Drug and the helper functions below.
logger = Logger()
class Drug(object):
    """Scrape all pages for one drug on ypk.39.net.

    Constructing a Drug downloads, into DOWNLOAD_DIR/<name>[<id>]/:
      - the overview page        (self.base_url)
      - the detailed manual      (self.manual_url)
      - every experience page    (self.comment_url, 20 comments per page)
      - every consultation page  (self.ask_url, 5 questions per page)
    Files already on disk are skipped, so re-running is cheap.

    Fixes over the original:
      - the non-200 fallback now retries the URL that actually failed
        (the original always re-downloaded ``self.base_url``, silently
        saving the wrong page as manual/comment/ask content);
      - the dead lazy ``map`` over an already-evaluated list comprehension
        in ``__init__`` is replaced by direct calls;
      - ``ask()`` now sends the shared headers like every other request;
      - the repeated fetch/decode/save pattern is factored into
        ``_fetch``/``_save``.
    """

    def __init__(self, base_url):
        self.base_url = base_url
        # URL shape is http://ypk.39.net/<drug_id>/, so the id is the
        # second-to-last path segment.
        self.drug_id = self.base_url.split('/')[-2]
        self.manual_url = urljoin(base_url, 'manual')
        self.comment_url = urljoin(base_url, 'comment')
        self.ask_url = urljoin(base_url, 'ask')
        self.make_drug_dir()
        self.summary()
        self.manual()
        self.comment()
        self.ask()

    def make_drug_dir(self):
        """Resolve the drug's display name and create its download folder."""
        response = requests.get(self.base_url, headers=headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        try:
            drug_name = html.xpath('//div[@class="t1"]/h1/a/text()')[0]
        except IndexError:
            # Some pages render the name without the <a> wrapper.
            drug_name = html.xpath('//div[@class="t1"]/h1/text()')[0]
        self.drug_name = self.validateTitle(drug_name)
        self.drug_dir_path = os.path.join(DOWNLOAD_DIR, '{}[{}]'.format(self.drug_name, self.drug_id))
        if not os.path.exists(self.drug_dir_path):
            os.mkdir(self.drug_dir_path)

    def validateTitle(self, title):
        """Replace characters illegal in Windows file names (/ \\ : * ? " < > |) with '_'."""
        rstr = r"[\/\\\:\*\?\"\<\>\|]"
        return re.sub(rstr, "_", title)

    @retry(stop_max_attempt_number=3)
    def retry_download(self, url):
        """GET *url* through the proxies; retried up to three times.

        :param url: final page address
        :return: requests.Response with status 200
        :raises AssertionError: on a non-200 status (triggers a retry)
        """
        result = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        assert result.status_code == 200
        return result

    def download(self, url):
        """Best-effort download: retry_download wrapped to return None on any failure.

        :param url: page address
        :return: requests.Response or None
        """
        try:
            return self.retry_download(url)
        except Exception as e:
            print(e)
            # logger.log(url,False)
            return None

    def _fetch(self, url):
        """GET *url*; on a non-200 fall back to the retrying proxy downloader.

        :param url: page address
        :return: requests.Response or None
        """
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            response = self.download(url)
        return response

    def _save(self, response, path):
        """Decode the GB2312 payload (dropping undecodable bytes) and write it to *path*."""
        text = response.content.decode('gb2312', 'ignore')
        with open(path, 'w', encoding='gb2312') as file:
            file.write(text)

    def summary(self):
        """Download the drug overview page (skipped if already on disk)."""
        summary_path = os.path.join(self.drug_dir_path, '{}[{}]-药品概述.html'.format(self.drug_name, self.drug_id))
        if os.path.exists(summary_path):
            print('{}药品概述已经下载过了'.format(self.drug_name))
            return
        response = self._fetch(self.base_url)
        if not response:
            logger.log('{}'.format(self.base_url), False)
            return
        self._save(response, summary_path)
        logger.log('{}[{}]-药品概述下载完成'.format(self.drug_name, self.drug_id))

    def manual(self):
        """Download the detailed manual page (skipped if already on disk)."""
        manual_path = os.path.join(self.drug_dir_path, '{}[{}]-详细说明书.html'.format(self.drug_name, self.drug_id))
        if os.path.exists(manual_path):
            print('{}详细说明书已经下载过了'.format(self.drug_name))
            return
        response = self._fetch(self.manual_url)
        if not response:
            logger.log('{}'.format(self.manual_url), False)
            return
        self._save(response, manual_path)
        logger.log('{}[{}]-详细说明书下载完成'.format(self.drug_name, self.drug_id))

    def comment(self):
        """Read the experience-comment count, then fetch every experience page."""
        response = self._fetch(self.comment_url)
        if not response:
            logger.log('{}'.format(self.comment_url), False)
            return
        text = response.content.decode('gb2312', 'ignore')
        html = etree.HTML(text)
        try:
            comment_nums = int(html.xpath('//div[@class="dps"]/cite/font/text()')[0])
        except IndexError:
            logger.log('{}[{}]-用药经验页评论数为零'.format(self.drug_name, self.drug_id))
            comment_nums = 0
        # 20 comments per page; always fetch at least page 1.
        num, remainder = divmod(comment_nums, 20)
        for x in range(1, num + 2):
            self.comment_page(urljoin(self.base_url, 'comment/k0_p{}'.format(x)))

    def comment_page(self, url):
        """Download one experience page (skipped if already on disk)."""
        # NOTE(review): url[-1] is only the last character, so page 10+
        # collides with pages 0-9 names — kept for compatibility with
        # files already downloaded under this scheme.
        comment_path = os.path.join(self.drug_dir_path, '{}[{}]-用药经验{}.html'.format(self.drug_name, self.drug_id, url[-1]))
        if os.path.exists(comment_path):
            print('{}[{}]-用药经验{}已经下载过了'.format(self.drug_name, self.drug_id, url[-1]))
            return
        response = self._fetch(url)
        if not response:
            logger.log('{}'.format(url), False)
            return
        self._save(response, comment_path)
        logger.log('{}[{}]-用药经验{}下载完成'.format(self.drug_name, self.drug_id, url[-1]))

    def ask(self):
        """Read the consultation count, then fetch every consultation page."""
        response = self._fetch(self.ask_url)
        if not response:
            logger.log('{}'.format(self.ask_url), False)
            return
        text = response.content.decode('gb2312', 'ignore')
        html = etree.HTML(text)
        try:
            ask_nums = html.xpath('//span[@class="pages"]/span[@class="pgleft"]/b/text()')[0]
            ask_nums = int(re.match('.*?(\d+).*', ask_nums).group(1))
        except Exception:
            ask_nums = 0
            logger.log('{}[{}]-用药咨询页无人提问'.format(self.drug_name, self.drug_id))
        # 5 questions per page; always fetch at least page 1.
        num, remainder = divmod(ask_nums, 5)
        for x in range(1, num + 2):
            self.ask_page(urljoin(self.base_url, 'ask/p{}'.format(x)))

    def ask_page(self, url):
        """Download one consultation page (skipped if already on disk)."""
        ask_path = os.path.join(self.drug_dir_path, '{}[{}]-用药咨询{}.html'.format(self.drug_name, self.drug_id, url[-1]))
        if os.path.exists(ask_path):
            print('{}[{}]-用药咨询{}已经下载过了'.format(self.drug_name, self.drug_id, url[-1]))
            return
        response = self._fetch(url)
        if not response:
            logger.log('{}'.format(url), False)
            return
        self._save(response, ask_path)
        logger.log('{}[{}]-用药咨询{}下载完成'.format(self.drug_name, self.drug_id, url[-1]))
def transform_urls(filename):
    """Map a downloaded folder name like '阿司匹林[12345]' back to its drug URL."""
    matched_ids = re.findall(r'.*?\[(\d+)\]', filename)
    return 'http://ypk.39.net/{}/'.format(matched_ids[-1])
def check_downloaded(func):
    """Decorator: filter out URLs whose drug folder already exists in DOWNLOAD_DIR."""
    def inner(drug_urls):
        # Folder names encode the drug id, so they can be mapped back to URLs.
        done_urls = {transform_urls(name) for name in os.listdir(DOWNLOAD_DIR)}
        remaining = set(drug_urls) - done_urls
        func(list(remaining))
    return inner
def get_drug_urls():
    """Read every target drug URL (one per line) from drug_ruls.txt."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]
def get_diabetes_urls():
    """Build deduplicated overview URLs for every diabetes drug id."""
    return list({'http://ypk.39.net/{}/'.format(drug_id) for drug_id in Diabetes_ids})
def main(drug_base_url):
    """Worker entry point: constructing a Drug performs the whole scrape for one drug."""
    Drug(drug_base_url)
def validateTitle(title):
    """Sanitize *title* for use as a file name: replace / \\ : * ? " < > | with '_'."""
    forbidden = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(forbidden, "_", title)
def spider(url):
    """Download a single drug overview page and save it under its own folder.

    Alternative single-page flow to the Drug class; the pool.map line that
    would use it inside run() is commented out.
    """
    url_path = urlsplit(url)
    # URL path is '/<drug_id>/'.
    drug_id = url_path.path.strip('/')
    try:
        response = requests.get(url=url, headers=headers, timeout=3)
        # response.encoding = response.apparent_encoding
        # Pages are served as GB2312; undecodable bytes are dropped.
        response = response.content.decode('gb2312', 'ignore')
        html = etree.HTML(response)
        drug_name = html.xpath('//div[@class="t1"]/h1/text()')[0]
        drug_name = validateTitle(drug_name)
    except Exception as e:
        print(e)
        logger.log(url, False)  # False routes the message to the error log
        return
    drug_dir_path = os.path.join(DOWNLOAD_DIR, '{}[{}]'.format(drug_name, drug_id))
    if not os.path.exists(drug_dir_path):
        os.mkdir(drug_dir_path)
    drug_html_detail = os.path.join(drug_dir_path, '{}[{}].html'.format(drug_name, drug_id))
    if not os.path.exists(drug_html_detail):
        with open(drug_html_detail, 'w', encoding='gb2312') as file:
            file.write(response)
        print(drug_name, '下载成功')
@check_downloaded
def run(drug_urls):
    """Scrape every not-yet-downloaded drug URL with a pool of 5 worker processes."""
    print(drug_urls)
    print(len(drug_urls))
    pool = Pool(5)
    pool.map(main, drug_urls)  # drug_urls[7010:12000]
    # pool.map(spider,drug_urls)
    pool.close()
    pool.join()
if __name__ == '__main__':
    # Entry point: read the full URL list, then scrape whatever is missing
    # (the check_downloaded decorator on run() filters finished drugs).
    drug_urls = get_drug_urls()
    run(drug_urls)
    # urls = get_diabetes_urls()
    # run(urls)
具体实现:进程池,requests+lxml,打印日志,类,重新下载排除已下载页面
2.解析所有药品页面提取有价值信息
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/10/13
@Author: Zhang Yafei
"""
import csv
import json
import os
import re
from scrapy.selector import Selector
import logging
import pandas
# import numpy as np
BASE_DIRS = os.path.dirname(os.path.abspath(__file__))
drug_path = os.path.join(BASE_DIRS, '药品')  # root of the downloaded drug folders
dirs_list = os.listdir(drug_path)  # one folder per drug: '<name>[<id>]'
analysis_file_path = os.path.join(BASE_DIRS, 'drug_info.tsv')  # parsed output TSV
RUN_LOG_FILE = os.path.join(BASE_DIRS, 'analysis_log', 'run.log')
ERROR_LOG_FILE = os.path.join(BASE_DIRS, 'analysis_log', 'error_log')
def get_unresoved_drug_list():
    """Return drug folders present on disk but not yet recorded in drug_info.tsv."""
    data = pandas.read_csv('drug_info.tsv', sep='\t', encoding='utf-8')
    try:
        resoved_drug_list = data.apply(lambda row: '{}[{}]'.format(row['药品名称'], row['药品ID']), axis=1).tolist()
    except AttributeError as e:
        # Fallback via iterrows — presumably for the empty-frame case where
        # apply() returns an object without .tolist(); TODO confirm.
        resoved_drug_list = []
        for index, row in data.iterrows():
            drug_name = '{}[{}]'.format(row['药品名称'], row['药品ID'])
            resoved_drug_list.append(drug_name)
    unresoved_drug_list = list(set(dirs_list) - set(resoved_drug_list))
    return unresoved_drug_list
    # 1. index-based variant
    # resoved_drug_list = []
    # for row in data.index:
    #     drug_name = '{}[{}]'.format(data.iloc[row]['药品名称'],data.iloc[row]['药品ID'])
    #     resoved_drug_list.append(drug_name)
    # 2. iterrows variant
    # for index,row in data.iterrows():
    #     drug_name = '{}[{}]'.format(row['药品名称'],row['药品ID'])
    #     resoved_drus_list.append(drug_name)
    # print(dirs_list.__len__(),resoved_drug_list.__len__(),unresoved_drug_list.__len__())
def write_resoved_drag_list(drag):
    """Add *drag* to the JSON list of successfully parsed drugs (resolved_drag_list.py)."""
    if os.path.exists('resolved_drag_list.py'):
        with open('resolved_drag_list.py', 'r', encoding='utf-8') as f:
            resolved = set(json.load(f))
    else:
        resolved = set()
    resolved.add(drag)
    with open('resolved_drag_list.py', 'w', encoding='utf-8') as f:
        json.dump(list(resolved), f)
def write_error_drag_list(drag):
    """Add *drag* to the JSON list of drugs that failed to parse (error_drag_list.py)."""
    if os.path.exists('error_drag_list.py'):
        with open('error_drag_list.py', 'r', encoding='utf-8') as f:
            failed = set(json.load(f))
    else:
        failed = set()
    failed.add(drag)
    with open('error_drag_list.py', 'w', encoding='utf-8') as f:
        json.dump(list(failed), f)
class Logger(object):
    """Dual logger for the analysis phase: INFO to run.log, ERROR to error_log.

    Each channel writes to its file and echoes to the console; only the
    console handler carries the timestamp/level formatter (the file-handler
    formatter lines are deliberately commented out).
    """

    def __init__(self):
        self.run_log_file = RUN_LOG_FILE
        self.error_log_file = ERROR_LOG_FILE
        self.run_log = None
        self.error_log = None
        self.initialize_run_log()
        self.initialize_error_log()

    @staticmethod
    def check_path_exist(log_abs_file):
        # Create the log directory on first use.
        log_path = os.path.split(log_abs_file)[0]
        if not os.path.exists(log_path):
            os.mkdir(log_path)

    def initialize_run_log(self):
        """Build the INFO-level run logger (file + console)."""
        self.check_path_exist(self.run_log_file)
        fh = logging.FileHandler(self.run_log_file, 'a', encoding='utf-8')
        sh = logging.StreamHandler()
        fmt = logging.Formatter(fmt="%(asctime)s - %(levelname)s : %(message)s")
        # fh.setFormatter(fmt)
        sh.setFormatter(fmt)
        logger1 = logging.Logger('run_log', level=logging.INFO)
        logger1.addHandler(fh)
        logger1.addHandler(sh)
        self.run_logger = logger1

    def initialize_error_log(self):
        """Build the ERROR-level error logger (file + console)."""
        self.check_path_exist(self.error_log_file)
        fh = logging.FileHandler(self.error_log_file, 'a', encoding='utf-8')
        sh = logging.StreamHandler()
        fmt = logging.Formatter(fmt="%(asctime)s - %(levelname)s : %(message)s")
        # fh.setFormatter(fmt)
        sh.setFormatter(fmt)
        logger1 = logging.Logger('error_log', level=logging.ERROR)
        logger1.addHandler(fh)
        logger1.addHandler(sh)
        self.error_logger = logger1

    def log(self, message, mode=True):
        """Write *message*; mode=True → run log (INFO), False → error log (ERROR)."""
        if mode:
            self.run_logger.info(message)
        else:
            self.error_logger.error(message)
class DrugInfo(object):
    """Parse one downloaded drug folder into a flat record for drug_info.tsv.

    Extracted instance attributes:
        drug_name / drug_id              -- parsed from the folder name '<name>[<id>]'
        category / cite                  -- drug type / national standard
        company / address                -- manufacturer and its address
        license_number / approval_date   -- approval number and date
        form_drug / spec / store / period_valid
        attention_rank / indication / component / function
        usage_dosage / contraindication / special_population
        indications                      -- indication overview
        is_or_not_medical_insurance / is_or_not_infections
        related_symptoms / related_examination
        adverse_reaction / attention_matters / interaction
        pharmacological_action / revision_date
        drug_use_consult / drug_use_experience -- concatenated over all pages
    """

    def __init__(self, drug):
        # *drug* is a folder name shaped like '<name>[<id>]'.
        drug_dir = os.path.join(drug_path, drug)
        self.drug_name = re.findall('(.*?)\[\d+\]', drug)[0]
        self.drug_id = re.findall('.*?\[(\d+)\].*', drug)[0]
        self.drug_dir = drug_dir
        self.drug_use_experience = ''
        self.drug_use_consult = ''
        self.file_list = os.listdir(self.drug_dir)
        self.logger = Logger()
        self.result = True  # flipped to False by dispatch() on an unrecognized file
        self.dispatch()
        # Normalize empty aggregates to the literal '无' ("none").
        if self.drug_use_consult.__len__() == 0: self.drug_use_consult = '无'
        if self.drug_use_experience.__len__() == 0: self.drug_use_experience = '无'

    def dispatch(self):
        """Route every file in the drug folder to its parser by file-name pattern."""
        for file in self.file_list:
            if file.endswith('药品概述.html'):
                self.drug_summary(self.file_path(file))
            elif file.endswith('详细说明书.html'):
                self.drug_instruction(self.file_path(file))
            elif re.match('.*?用药咨询.*', file):
                self.drug_consultation(self.file_path(file))
            elif re.match('.*?用药经验.*', file):
                self.drug_experience(self.file_path(file))
            else:
                # Unknown file: mark the whole drug as unparsed and stop.
                self.result = False
                break

    def file_path(self, file):
        """Absolute path of *file* inside this drug's folder."""
        return os.path.join(self.drug_dir, file)

    def read_file(self, file):
        # NOTE(review): pages were saved as GB2312 but are opened with the
        # platform default encoding here — verify on UTF-8 locales.
        with open(file, 'r') as f:
            html = f.read()
        return html

    def drug_summary(self, file):
        """Parse the overview page (药品概述) into the basic attribute set."""
        html = self.read_file(file)
        selector = Selector(text=html)
        self.category = selector.xpath('//div[@class="t1"]/cite[1]/span/text()').extract_first()  # drug type
        if not self.category:
            self.category = '未知'
        self.cite = selector.xpath('//div[@class="t1"]/cite[2]/span/text()').extract_first()  # national standard
        if not self.cite:
            self.cite = '未知'
        try:
            self.company = selector.css('.t3 .company a::text').extract()[0]  # manufacturer
        except IndexError as e:
            self.company = '未知'
        try:
            self.address = selector.css('.t3 .address::text').extract()[0]  # manufacturer address
        except IndexError as e:
            self.address = '未知'
        try:
            self.license_number = selector.xpath('//ul[@class="xxs"]/li[1]/text()').extract_first().strip()  # approval number
        except AttributeError:
            self.license_number = '未知'
        try:
            self.approval_date = selector.xpath('//ul[@class="xxs"]/li[2]/text()').extract_first().strip()  # approval date
        except AttributeError:
            self.approval_date = '未知'
        try:
            self.form_drug = selector.xpath('//ul[@class="showlis"]/li[1]/text()').extract_first().strip()  # dosage form
        except AttributeError:
            self.form_drug = '未知'
        try:
            self.spec = selector.xpath('//ul[@class="showlis"]/li[2]/text()').extract_first().strip()  # specification
        except AttributeError:
            self.spec = '未知'
        try:
            self.store = selector.xpath('//ul[@class="showlis"]/li[3]/text()').extract_first().strip().strip('。')  # storage
        except AttributeError:
            self.store = '未知'
        try:
            self.period_valid = selector.xpath('//ul[@class="showlis"]/li[4]/text()').extract_first().strip('。').replace('\n', '')  # shelf life
        except AttributeError:
            self.period_valid = '未知'
        self.attention_rank = selector.css('.guanzhu cite font::text').extract_first()  # attention rank
        if not self.attention_rank:
            self.attention_rank = '未知'
        self.indication = ','.join(selector.css('.whatsthis li::text').extract())  # indications
        if self.indication == '':
            self.indication = '未知'
        usage_dosage = selector.css('.ps p:nth-child(3)::text').extract_first()  # usage and dosage
        if usage_dosage:
            self.usage_dosage = re.sub('<.*?>', '', usage_dosage).strip().replace('\n', '')
        else:
            self.usage_dosage = '未知'
        indications = selector.css('#diseaseintro::text').extract_first()  # indication overview
        if indications:
            self.indications = re.sub('<.*?>', '', indications).strip().replace('\n', '')
        else:
            self.indications = '未知'
        try:
            self.is_or_not_medical_insurance = selector.css('.syz_cons p:nth-child(2)::text').extract_first().split(':')[1]  # covered by medical insurance?
        except AttributeError as e:
            self.is_or_not_medical_insurance = '未知'
        try:
            self.is_or_not_infections = selector.css('.syz_cons p:nth-child(3)::text').extract_first().split(':')[1].strip()  # infectious?
        except AttributeError as e:
            self.is_or_not_infections = '未知'
        self.related_symptoms = ','.join(selector.css('.syz_cons p:nth-child(4) a::text').extract()[:-1])  # related symptoms
        if len(self.related_symptoms) == 0:
            self.related_symptoms = '未知'
        self.related_examination = ','.join(selector.css('.syz_cons p:nth-child(5) a::text').extract()[:-1])  # related examinations
        if len(self.related_examination) == 0:
            self.related_examination = '未知'

    def drug_instruction(self, file):
        """Parse the detailed manual page (详细说明书).

        NOTE: page structure differs between drugs, hence the layered
        fallbacks below.
        """
        html = self.read_file(file)
        selector = Selector(text=html)
        component = selector.xpath('//dt[text()="【成份】"]/following::*[1]').extract_first()
        if not component:
            self.component = '未知'
        else:
            self.component = re.sub('<.*?>', '', component).strip()  # ingredients
        contraindication = selector.xpath('//dt[text()="【禁忌】"]/following::*[1]').extract_first()
        if contraindication:
            self.contraindication = re.sub('<.*?>', '', contraindication).strip().replace('\n', '')  # contraindications
        else:
            self.contraindication = '未知'
        function = selector.xpath('//dt[text()="【功能主治】"]/following::*[1]').extract_first()
        if function:
            self.function = re.sub('<.*?>', '', function).strip()  # functions / indications
        else:
            self.function = '未知'
        try:
            self.adverse_reaction = selector.xpath('//dt[text()="【不良反应】"]/following::*[1]/p/text()').extract_first().strip('。')  # adverse reactions
        except AttributeError as e:
            try:
                # Fallback: text directly on the sibling element, no <p>.
                self.adverse_reaction = selector.xpath('//dt[text()="【不良反应】"]/following::*[1]/text()').extract_first().strip('。')
                self.adverse_reaction = re.sub('<.*?>', '', self.adverse_reaction).strip().replace('\n', '')
            except AttributeError:
                self.adverse_reaction = '未知'
        attention_matters = selector.xpath('//dt[text()="【注意事项】"]/following::*[1]').extract_first()
        if attention_matters:
            self.attention_matters = re.sub('<.*?>', '', attention_matters).strip().replace('\n', '')  # precautions
        else:
            self.attention_matters = '未知'
            self.logger.log('{}[{}]-注意事项为空'.format(self.drug_name, self.drug_id), False)
        try:
            self.interaction = selector.xpath('//dt[text()="【药物相互作用】"]/following::*[1]/p/text()').extract_first()  # drug interactions
            self.interaction = re.sub('<.*?>', '', self.interaction).strip().replace('\n', '')
        except TypeError:
            self.interaction = '未知'
        try:
            self.pharmacological_action = selector.xpath('//dt[text()="【药理作用】"]/following::*[1]/p/text()').extract_first()  # pharmacology
            self.pharmacological_action = re.sub('<.*?>', '', self.pharmacological_action).strip().replace('\n', '')
        except TypeError:
            self.pharmacological_action = '未知'
        try:
            self.revision_date = selector.xpath('//dt[text()="【说明书修订日期】"]/following::*[1]/text()').extract_first().strip()  # manual revision date
        except AttributeError:
            self.revision_date = '未知'
        try:
            self.special_population = selector.xpath('//dt[text()="【特殊人群用药】"]/following::*[1]/text()').extract_first()  # special populations
            self.special_population = re.sub('<.*?>', '', self.special_population).strip().replace('\n', '')
        except TypeError:
            self.special_population = '未知'

    def drug_consultation(self, file):
        """Append the text of one consultation page to drug_use_consult."""
        html = self.read_file(file)
        selector = Selector(text=html)
        drug_use_consult = selector.css('.dpzx_con .zx p::text').extract()
        drug_use_consult = ''.join(drug_use_consult)
        drug_use_consult = re.sub('<.*?>', '', drug_use_consult).strip().replace('\n', '')
        self.drug_use_consult += drug_use_consult

    def drug_experience(self, file):
        """Append the text of one experience page to drug_use_experience."""
        html = self.read_file(file)
        selector = Selector(text=html)
        drug_use_experience = selector.css('.pls_box .pls_mid p::text').extract()
        drug_use_experience = ''.join(drug_use_experience)
        drug_use_experience = re.sub('<.*?>', '', drug_use_experience).strip().replace('\n', '')
        self.drug_use_experience += drug_use_experience.strip()

    @staticmethod
    def write_to_fileheader():
        """Create drug_info.tsv and write the 31-column header row."""
        with open('drug_info.tsv', 'w', newline='', encoding='utf-8') as MyFile:
            writer = csv.writer(MyFile, dialect='excel-tab')
            drug_header = ['药品名称', '药品ID', '药品类型', '国家标准', '生产厂家', '厂家地址', '批准文号', '批准日期', '剂型', '规格', '储存方法', '有效期限', '关注度排名', '适应症', '成分', '功能主治', '用发用量', '禁忌症', '特殊人群用药', '适应症概况', '是否用于医保', '是否具有传染性', '相关症状', '相关检查', '不良反应', '注意事项', '药物相互作用', '药理作用', '说明书修订日期', '用药经验', '用药咨询']
            writer.writerow(drug_header)

    def write_to_file(self):
        """Append this drug's parsed record as one TSV row (column order matches the header)."""
        with open('drug_info.tsv', 'a', newline='', encoding='utf-8') as MyFile:
            writer = csv.writer(MyFile, dialect='excel-tab')
            druginfo_list = [self.drug_name, self.drug_id, self.category, self.cite, self.company, self.address, self.license_number, self.approval_date,
                             self.form_drug, self.spec, self.store, self.period_valid, self.attention_rank,
                             self.indication, self.component, self.function, self.usage_dosage, self.contraindication,
                             self.special_population, self.indications, self.is_or_not_medical_insurance, self.is_or_not_infections,
                             self.related_symptoms, self.related_examination, self.adverse_reaction, self.attention_matters,
                             self.interaction, self.pharmacological_action, self.revision_date, self.drug_use_experience,
                             self.drug_use_consult,
                             ]
            writer.writerow(druginfo_list)
            self.logger.log('{}[{}]信息写入文件完毕'.format(self.drug_name, self.drug_id))
def main(drug):
    """Parse one drug folder; record success/failure in the bookkeeping JSON files."""
    druginfo = DrugInfo(drug)
    # print(druginfo.drug_name,druginfo.drug_id)
    # print(druginfo.drug_use_experience)
    # print(druginfo.drug_use_consult)
    if druginfo.result:
        druginfo.write_to_file()
        write_resoved_drag_list(drug)
    else:
        # dispatch() hit an unrecognized file: log the drug and mark it as failed.
        druginfo.logger.log('{}[{}]'.format(druginfo.drug_name, druginfo.drug_id), False)
        write_error_drag_list(drug)
def new_data(row):
    """Split a '药品名称' value like 'name(alias)' into the name plus a new '别名' column.

    :param row: a mapping-like row (pandas Series via DataFrame.apply) with a '药品名称' key
    :return: the same row with '药品名称' stripped of the alias and '别名' filled
             (NaN when no '(' is present)

    Bug fix: the original assigned ``np.NAN`` while ``import numpy as np``
    was commented out at module level, so every alias-less name raised
    NameError; ``float('nan')`` is the same value without the dependency.
    """
    drug_name = row['药品名称']
    try:
        row['别名'] = drug_name.rsplit('(', 1)[1].strip(')')
        row['药品名称'] = drug_name.rsplit('(', 1)[0]
    except IndexError:
        # No '(' in the name → no alias.
        row['别名'] = float('nan')
    return row
def update_drug_name():
    """Insert an alias (别名) column after 药品名称 and write new_drug_info.tsv."""
    data = pandas.read_csv('drug_info.tsv', sep='\t', encoding='utf-8')
    col_name = data.columns.tolist()
    # print(dir(col_name))
    col_name.insert(col_name.index('药品名称') + 1, '别名')
    # col_name.insert(1,'别名')
    # reindex adds the new (empty) column; new_data() then fills it per row.
    data = data.reindex(columns=col_name)
    new_drug = data.apply(new_data, axis=1)
    new_drug.to_csv('new_drug_info.tsv', index=False, sep='\t', encoding='utf-8')
    print('文件保存成功')
    # print(new_drug[['药品名称','别名']])
    # for row in data[10:13].iterrows():
    #     drug_name = row['药品名称'].values
    #     drug_alias = drug_name.rsplit('(',1)[1].strip(')')
    #     print(drug_name)
    #     print(drug_alias)
    # print(data.tail(10).index)
    # print(data.iloc[:2,1:8])
    # print(data.iloc[1]['注意事项'].replace('\n',''))
    # print(data.iloc[2]['注意事项'].replace('\n',''))
    # print(data.__len__())
    # resoved_drus_list = data.apply(lambda row:'{}[{}]'.format(row['药品名称'],row['药品ID']),axis=1).tolist()
    # print(resoved_drus_list.__len__())
    # unresoved_drug_list = set(dirs_list) - set(resoved_drus_list)
    # print(unresoved_drug_list.__len__())
if __name__ == '__main__':
    # Create the TSV header once, then parse every folder not yet recorded.
    if not os.path.exists(analysis_file_path):
        DrugInfo.write_to_fileheader()
    drug_list = get_unresoved_drug_list()
    print(drug_list.__len__())
    list(map(main, drug_list))
    # with open('error_drag_list.py','r',encoding='utf-8') as f:
    #     data = json.load(f)
    #     print(data)
    # update_drug_name()
    # newdata = pandas.read_csv('new_drug_info.tsv',sep='\t',encoding='utf-8')
    # print(newdata.head())
3.下载糖尿病相关药品页面
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/10
@Author: Zhang Yafei
"""
import json
import requests
from scrapy.selector import Selector
from lxml import etree
from multiprocessing import Pool
# Drug-id path segments collected by spider(); filled in this process only.
ids_list = []
# Fixed desktop Chrome User-Agent for all listing-page requests.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
def spider(url):
    """Collect every drug id from one diabetes listing page into ids_list."""
    response = requests.get(url, headers=headers)
    # selector = Selector(response=response)
    tree = etree.HTML(response.text)
    # ids = selector.css('.search_ul li a:nth-child(1)::attr(href)').extract()
    hrefs = tree.xpath('//ul[@class="search_ul search_ul_yb"]/li/a/@href')
    ids_list.extend([href.strip('/') for href in hrefs])
if __name__ == '__main__':
    urls = ['http://ypk.39.net/tangniaobing/p{}'.format(i) for i in range(1, 135)]
    # Bug fix: the original ran pool.map(spider, urls) *and then*
    # list(map(spider, urls)).  Each pool worker extended its own copy of
    # ids_list, which was discarded when the worker exited, so the pool
    # pass only doubled the network traffic.  A single in-process pass
    # collects every id.
    list(map(spider, urls))
    with open('ids.py', 'w', encoding='utf-8') as f:
        json.dump(ids_list, f)
4.更新药品信息,拆分药名列分为药品名称列和别名列
def new_data(row):
    """Split a '药品名称' value like 'name(alias)' into the name plus a new '别名' column.

    :param row: a mapping-like row (pandas Series via DataFrame.apply) with a '药品名称' key
    :return: the same row with '药品名称' stripped of the alias and '别名' filled
             (NaN when no '(' is present)

    Bug fix: the original assigned ``np.NAN`` but ``numpy`` is never
    imported in this script, so every alias-less name raised NameError;
    ``float('nan')`` is the same value without the dependency.
    """
    drug_name = row['药品名称']
    try:
        row['别名'] = drug_name.rsplit('(', 1)[1].strip(')')
        row['药品名称'] = drug_name.rsplit('(', 1)[0]
    except IndexError:
        # No '(' in the name → no alias.
        row['别名'] = float('nan')
    return row
def update_drug_name():
    """Insert an alias (别名) column after 药品名称 and write new_drug_info.tsv."""
    data = pandas.read_csv('drug_info.tsv', sep='\t', encoding='utf-8')
    col_name = data.columns.tolist()
    # print(dir(col_name))
    col_name.insert(col_name.index('药品名称') + 1, '别名')
    # col_name.insert(1,'别名')
    # reindex adds the new (empty) column; new_data() then fills it per row.
    data = data.reindex(columns=col_name)
    new_drug = data.apply(new_data, axis=1)
    new_drug.to_csv('new_drug_info.tsv', index=False, sep='\t', encoding='utf-8')
    print('文件保存成功')
if __name__ == '__main__':
    # Rewrite the drug table with the alias column split out.
    update_drug_name()
5.抓取所有药品评论数,并构建药品评论数字典
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/10
@Author: Zhang Yafei
"""
import pandas
import os
import re
# import jieba
from multiprocessing.pool import Pool
from scrapy import Selector
import json
import numpy
import time
import csv
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
drug_path = os.path.join(BASE_DIR, '药品')  # root of the downloaded drug folders
dirs_list = os.listdir(drug_path)  # one folder per drug: '<name>[<id>]'
# Accumulators shared by all DrugInfo instances in this process.
result = {}  # folder name -> experience-comment count
k_list = []
v_list = []
class_list = []
# comment_data = pandas.read_csv('comment_num_grade.csv',encoding='utf-8')
# data = list(comment_data.药品名称.values)
# comment_data['类别'] = ''
count = 0
class DrugInfo(object):
    """Walk one drug folder and record its experience-comment count in ``result``."""

    def __init__(self, drug):
        self.drug = drug  # original folder name '<name>[<id>]'
        drug_dir = os.path.join(drug_path, drug)
        self.drug_name = re.findall('(.*?)\[\d+\]', drug)[0]
        self.drug_id = re.findall('.*?\[(\d+)\].*', drug)[0]
        self.drug_dir = drug_dir
        self.file_list = os.listdir(self.drug_dir)
        self.dispatch()

    def dispatch(self):
        """Only experience pages are parsed; the other handlers are disabled below."""
        for file in self.file_list:
            # if file.endswith('药品概述.html'):
            #     self.drug_summary(self.file_path(file))
            # if re.match('.*?用药咨询.*',file):
            #     self.drug_consultation(self.file_path(file))
            if re.match('.*?用药经验.*', file):
                self.drug_experience(self.file_path(file))

    def file_path(self, file):
        """Absolute path of *file* inside this drug's folder."""
        return os.path.join(self.drug_dir, file)

    def read_file(self, file):
        # NOTE(review): pages were saved as GB2312 but are opened with the
        # platform default encoding here — verify on UTF-8 locales.
        with open(file, 'r') as f:
            html = f.read()
        return html

    def drug_summary(self, file):
        """Record the drug's category into comment_data.

        NOTE(review): dead code while its dispatch line is commented out;
        it references module-level ``comment_data``, whose definition is
        also commented out, so calling it as-is would raise NameError.
        """
        html = self.read_file(file)
        selector = Selector(text=html)
        category = selector.xpath('//div[@class="subs"]/p/a[last()]/text()').extract_first()
        print(category)
        # class_list.append(category)
        index = comment_data.loc[comment_data.药品名称 == self.drug, '类别'].index.values[0]
        comment_data.loc[index, '类别'] = category

    def drug_experience(self, file):
        # print(file)
        """Read one experience page and record its total comment count in ``result``."""
        html = self.read_file(file)
        selector = Selector(text=html)
        drug_use_experience_num = selector.css('.dps cite font::text').extract_first()
        if not drug_use_experience_num:
            self.drug_use_experience_num = 0
        else:
            self.drug_use_experience_num = int(drug_use_experience_num)
        result[self.drug] = self.drug_use_experience_num
        print(self.drug, self.drug_use_experience_num)

    def write_to_file(self):
        """Append (drug, count) to the CSV; no-op when no experience page was parsed."""
        try:
            with open('comment_num_grade.csv', 'a', newline='', encoding='utf_8_sig') as MyFile:
                writer = csv.writer(MyFile)
                druginfo_list = [self.drug, self.drug_use_experience_num]
                writer.writerow(druginfo_list)
                print('{}写入文件完毕'.format(self.drug))
        except AttributeError:
            # drug_use_experience_num is set only by drug_experience().
            return
def write_num():
    """Dump the drug→comment-count dict to comment.py and to a sorted CSV.

    :return: DataFrame sorted by comment count, descending
    """
    with open('comment.py', 'w', encoding='utf-8') as f:
        json.dump(result, f)
    # for k,v in result.items():
    #     k_list.append(k)
    #     v_list.append(v)
    data = {'药品名称': list(result.keys()), '评论数': list(result.values())}
    df = pandas.DataFrame(data)
    comment_data = df.sort_values(by='评论数', ascending=False)
    comment_data.to_csv('comment_num_grade.csv', sep=',', encoding='utf_8_sig', mode='w', index=False)
    return comment_data
def read_num():
    """Rebuild the sorted comment-count CSV from a previously dumped comment.py.

    NOTE(review): appends into the module-level k_list/v_list, so calling
    it twice in one process duplicates rows.

    :return: DataFrame sorted by comment count, descending
    """
    with open('comment.py', 'r', encoding='utf-8') as f:
        num = json.load(f)
    for k, v in num.items():
        k_list.append(k)
        v_list.append(v)
    data = {'药品名称': k_list, '评论数': v_list}
    df = pandas.DataFrame(data)
    comment_data = df.sort_values(by='评论数', ascending=False)
    comment_data.to_csv('comment_num_grade.csv', sep=',', encoding='utf_8_sig', mode='w', index=False)
    return comment_data
def main(drug):
    """Worker: constructing DrugInfo records the drug's comment count in ``result``."""
    DrugInfo(drug)
    # try:
    #     result[d.drug] = d.drug_use_experience_num
    # except:
    #     result[d.drug] = 0
    # write_to_file(d)
if __name__ == '__main__':
    start = time.time()
    # NOTE(review): the pool variant would not work here — each worker
    # process would mutate its own copy of ``result``, discarded on exit.
    # pool = Pool(4)
    # pool.map(main,dirs_list)
    # pool.close()
    # pool.join()
    list(map(main, dirs_list))
    write_num()
    # comment_data.to_csv('new_comment_num_grade.csv',encoding='utf_8_sig',mode='w',index=False)
    print('总花费:{}秒'.format(time.time() - start))
    # comment_data = read_num()
    # print(comment_data)
    # print(len(num))
6.提取评论数量最多的前10个药品评论信息
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/10
@Author: Zhang Yafei
"""
import csv
import numpy
import pandas
import os
import re
import jieba
from scrapy import Selector
import re
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
drug_path = os.path.join(BASE_DIR,\'药品\')
dirs_list = os.listdir(drug_path)
comment_info_filename = \'first50_comment.csv\'
class DrugInfo(object):
    """Parse downloaded drug pages and extract user experience comments.

    `drug` is a directory name of the form "<名称>[<ID>]..." located under
    the module-level `drug_path`; every 用药经验 (experience) page inside it
    is parsed and each comment appended to `comment_info_filename`.
    """

    def __init__(self, drug):
        self.drug_name = re.findall(r'(.*?)\[\d+\]', drug)[0]
        self.drug_id = re.findall(r'.*?\[(\d+)\].*', drug)[0]
        self.drug_dir = os.path.join(drug_path, drug)
        self.drug_use_experience = ''
        self.file_list = os.listdir(self.drug_dir)
        self.result = True
        self.dispatch()

    def dispatch(self):
        """Route each saved page to its parser (only 用药经验 pages for now)."""
        for name in self.file_list:
            if re.match('.*?用药经验.*', name):
                self.drug_experience(self.file_path(name))

    def file_path(self, file):
        """Absolute path of a saved page inside this drug's directory."""
        return os.path.join(self.drug_dir, file)

    def read_file(self, file):
        """Return the page's text content.

        NOTE(review): no explicit encoding — relies on the platform default;
        confirm the downloader wrote files in that encoding.
        """
        with open(file, 'r') as f:
            return f.read()

    def drug_experience(self, file):
        """Parse one 用药经验 page and write every comment found to the CSV."""
        print(file)
        sel = Selector(text=self.read_file(file))
        boxes = sel.css('.pls_box')
        try:
            page = sel.css('.dpzx .pages .pgleft span::text').extract()[0]
        except IndexError:
            page = 1
        drug_url = 'http://ypk.39.net/{}/comment/k0_p{}'.format(self.drug_id, page)
        if not boxes:
            # No comments on this page: record a NaN row so the drug still appears.
            self.write_to_file(numpy.NAN, numpy.NAN, numpy.NAN, drug_url)
            return
        for box in boxes:
            self.drug_use_experience = box.css('.pls_mid p::text').extract()[0].replace(' ', '').strip('\n')
            commter_info = box.css('.pls_top cite::text').extract()[0].replace('\n', '').strip('来自').strip(' ').replace(' ', '/').rstrip('/')
            segmented = '/'.join(list(jieba.cut(self.drug_use_experience))).strip('/ /')
            time = box.css('.pls_top i::text').extract()[0].strip().strip('点评时间:')
            if not time:
                time = numpy.NAN
            self.write_to_file(commter_info, segmented, time, drug_url)

    def write_to_file(self, commter_info, cut_info, time, drug_url):
        """Append one comment row (name, id, commenter, text, tokens, time, url)."""
        with open(comment_info_filename, 'a', newline='', encoding='utf_8_sig') as MyFile:
            csv.writer(MyFile).writerow([
                self.drug_name, self.drug_id, commter_info,
                self.drug_use_experience, cut_info, time, drug_url,
            ])
            print('{}写入文件完毕'.format(drug_url))
def write_to_fileheader():
    """Create the output CSV and write its header row (overwrites any existing file)."""
    header = ['药品名称', '药品ID', '评论者信息', '评论', '分词', '评论时间', 'url']
    with open(comment_info_filename, 'w', newline='', encoding='utf_8_sig') as out:
        csv.writer(out).writerow(header)
def main(drug):
    """Parse every saved page for one drug directory."""
    DrugInfo(drug)
    print('解析完成')
def read_comment_num_first50():
    """Return local directory names for the 50 most-commented drugs.

    Reads the merged comment-count CSV, pulls each drug's numeric ID out of
    its name, and matches the IDs back against the download directories.
    :return: list of matching directory names
    """
    data = pandas.read_csv('concat_first50_comment.csv', encoding='utf-8')
    drugs = data.药品名称.values.tolist()
    drugs_id = [re.findall(r'\d+', name)[-1] for name in drugs]
    df = pandas.DataFrame({'drug_name': dirs_list})
    matches = (df[df.drug_name.str.contains(drug_id)].drug_name.values for drug_id in drugs_id)
    return [m[0] for m in matches if len(m)]
if __name__ == '__main__':
    # Write the CSV header only on the first run.
    if not os.path.exists(os.path.join(BASE_DIR, comment_info_filename)):
        write_to_fileheader()
    drugs = read_comment_num_first50()
    print(len(drugs))
    for d in drugs:
        main(d)
    print(len(drugs))
7.分析药品评论数量所占比例
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 12 19:28:09 2018
@author: Zhang Yafei
"""
import json
import os
import pandas
#from wordcloud import WordCloud as wc
#from pyecharts import WordCloud
import matplotlib as mpl
from matplotlib import pyplot as plt
#import wordcloud
import numpy as np
from PIL import Image
# Per-drug comment counts with a 类别 (category) column added by hand.
data = pandas.read_csv(\'new_comment_num_grade.csv\',encoding=\'utf-8\')
# Number of drugs in each category.
drug_type_num = data.类别.value_counts()
# Category names, ordered by drug count descending.
drug_type_names = data.类别.value_counts().index.values
# Filled by parse(): category name -> total comment count.
drug_type_dict = {}
def parse(drug_type_name):
    """Accumulate the total comment count of one drug category into drug_type_dict."""
    total = data[data['类别'] == drug_type_name].评论数.sum()
    drug_type_dict[drug_type_name] = int(total)
def plot_wordcloud(drug_dict=None):
    """Render a pyecharts word cloud of comment counts to an HTML file.

    NOTE(review): requires the commented-out `from pyecharts import WordCloud`
    import at the top of this script to be restored before calling.
    """
    source = drug_dict if drug_dict else drug_type_dict
    label = source.keys()
    attr = source.values()
    cloud = WordCloud(width=800, height=620)
    cloud.add('', label, attr, word_size_range=[20, 100])
    cloud.render('drug_comment_wordcloud.html')
def plot_wc(drug_dict=None):
    """Draw a frequency word cloud colored by a mask image and save it as a JPG.

    NOTE(review): relies on the commented-out `wordcloud` imports (`wc` and
    `wordcloud`) at the top of this script; restore them before calling.
    """
    mask = np.array(Image.open('mask1.jpg'))
    word_plot = wc(
        font_path='font/simsun.ttc',  # font able to render CJK glyphs
        mask=mask,                    # background/mask image
        max_words=200,                # cap on rendered words
        max_font_size=100,            # largest font size
    )
    frequencies = drug_dict if drug_dict else drug_type_dict
    word_plot = word_plot.generate_from_frequencies(frequencies)
    image_colors = wordcloud.ImageColorGenerator(mask)  # color scheme from mask
    word_plot.recolor(color_func=image_colors)
    plt.imshow(word_plot)
    plt.axis('off')
    plt.show()
    word_plot.to_file('comment_num.jpg')
def plot_series_pie():
    """Pie chart of comment totals per drug category via pandas.Series.plot."""
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font
    pie_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
    positive = pie_data[pie_data.评论总数 > 0]
    numbers = np.array(positive.评论总数)
    series = pandas.Series(numbers, index=positive.类别名称, name='药物类型评论数饼状图')
    series.plot.pie(figsize=(8, 8), autopct='%.2f')
def plot_mpl_pie():
    """Matplotlib pie chart of comment totals per category; saves a PNG."""
    mpl.rc('font', family='SimHei')  # render Chinese labels
    pie_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
    numbers = np.array(pie_data[pie_data.评论总数 > 0].评论总数)
    # NOTE(review): labels come from every row but values only from rows with
    # 评论总数 > 0 — lengths can disagree; confirm against the CSV contents.
    drug_type = pie_data.类别名称
    plt.pie(numbers, labels=drug_type, autopct='%.2f%%', shadow=True,
            labeldistance=1.1, startangle=90, pctdistance=0.6)
    plt.title('药物类型评论数饼状图')
    plt.savefig('药物类别与评论数量饼状图(mpl).png')
    plt.show()
def type_drug_num_pie():
    """Matplotlib pie chart of drug counts per category; saves a PNG."""
    mpl.rc('font', family='SimHei')  # render Chinese labels
    pie_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
    plt.pie(np.array(pie_data.药品数量), labels=pie_data.类别名称, autopct='%.2f%%',
            shadow=True, labeldistance=1.1, startangle=90, pctdistance=0.6)
    plt.title('药物类型药品数量数饼状图')
    plt.savefig('药物类别与药品数量饼状图(mpl).png')
    plt.show()
def wirte_to_file():
    """Persist drug_type_dict as JSON (name keeps its historical typo for callers)."""
    with open('comment_num_dict.py', 'w', encoding='utf-8') as f:
        json.dump(drug_type_dict, f)
def read_from_file():
    """Load and return the category->comment-count dict saved by wirte_to_file()."""
    with open('comment_num_dict.py', 'r', encoding='utf-8') as f:
        return json.load(f)
def write_type_num_to_file():
    """Combine per-category drug counts and comment totals into one summary CSV."""
    drug_type_dict = read_from_file()
    type_name = list(drug_type_dict.keys())
    type_num = list(drug_type_dict.values())
    # NOTE(review): assumes value_counts() order matches the saved dict's key
    # order — verify, otherwise counts pair with the wrong category.
    drug_type_nums = data.类别.value_counts().values
    frame = pandas.DataFrame({'类别名称': type_name, '药品数量': drug_type_nums, '评论总数': type_num})
    frame.to_csv('drug_type_num_sum.csv', mode='w', encoding='utf_8_sig', index=False)
def write_new_file():
    """Add proportion columns (drug-count share, comment share) to the summary CSV."""
    new_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
    new_data['药品数量所占比例'] = round(new_data.药品数量 / new_data.药品数量.sum(), 4)
    new_data['评论数量所占比例'] = round(new_data.评论总数 / new_data.评论总数.sum(), 4)
    new_data.to_csv('drug_type_num_sum.csv', mode='w', encoding='utf_8_sig', index=False)
def main():
    """Build (or load) the per-category comment totals, then draw the word cloud."""
    if os.path.exists('comment_num_dict.py'):
        # Totals were computed on a previous run; reuse them.
        plot_wc(read_from_file())
    else:
        for name in drug_type_names:
            parse(name)
        wirte_to_file()
        plot_wc()
if __name__ == '__main__':
    # 1. Compute each category's comment share and generate the word cloud.
    # main()
    # write_type_num_to_file()
    # 2. Draw the pie charts.
    # plot_series_pie()
    # plot_mpl_pie()
    # type_drug_num_pie()
    # write_new_file()
    # Fix: the suite contained only comments, which is a SyntaxError; the steps
    # above are run manually by uncommenting the one needed.
    pass
8.前50药品数据合并
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 3 20:50:12 2018
@author: Zhang Yafei
"""
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Per-collector comment-count files (CSV except two tab-separated text dumps).
data1 = pd.read_csv('comment_num_grade_zhangyafei.csv', encoding='utf-8')
data2 = pd.read_csv('comment_num_grade_wangyuxin.csv', encoding='utf-8')
data3 = pd.read_csv('comment_num_grade_liangwenqi.csv', encoding='utf-8')
data4 = pd.read_csv('comment_num_grade_zhangxinrui.csv', encoding='utf-8')
data5 = pd.read_table('macaizhen.txt', encoding='utf-8', header=None, names=['药品名称', '评论数'])
data6 = pd.read_csv('comment_num_grade_wangshuai.csv', encoding='utf-8')
data7 = pd.read_csv('comment_num_grade_wangqi.csv', encoding='utf-8')
data8 = pd.read_csv('tangao.txt', encoding='utf-8', delimiter='\t', header=None, names=['药品名称', '评论数'])
# Tag each frame with its collector's name before merging.
for _frame, _who in ((data1, '张亚飞'), (data2, '王于心'), (data3, '梁雯琪'), (data4, '张昕瑞'),
                     (data5, '马彩珍'), (data6, '王帅'), (data7, '王琪'), (data8, '唐奥')):
    _frame['who'] = _who
# Top 50 drugs by comment count across all collectors.
data_concat = (pd.concat([data1, data2, data3, data4, data5, data6, data7, data8],
                         ignore_index=True, sort=True)
               .sort_values('评论数', ascending=False)
               .reset_index()
               .drop('index', axis=1)[:50])
print(data_concat)
data_concat.who.value_counts()
data_concat.评论数.sum()
# Fix: `np.sum` was used here, but numpy is never imported in this script
# (NameError at import time); the string aggregator is equivalent.
groupby_data = data_concat.groupby(by='who')['评论数'].agg('sum')
# Detailed top-50 comment exports from each collector, merged into one frame.
_first50 = ['zhangyafei', 'zhangxinrui', 'wangqi', 'tangao',
            'wangshuai', 'wangyuxin', 'liangwenqi', 'macaizhen']
_frames = [pd.read_csv('first50_comment_{}.csv'.format(_name), encoding='utf-8') for _name in _first50]
data9, data10, data11, data12, data13, data14, data15, data16 = _frames
data_concat2 = pd.concat(_frames, ignore_index=True)
def plot_hist():
    """Histogram of per-drug comment counts for the merged top-50 data; saves a PNG."""
    matplotlib.rc('font', family='SimHei')  # render Chinese titles/labels
    plt.figure(figsize=(15, 8), dpi=80)
    x = data_concat2.药品ID.value_counts().values
    num_bins = int((max(x) - min(x)) // 10)  # roughly one bin per 10 comments
    plt.hist(x, num_bins, facecolor='blue')
    plt.xticks(range(int(min(x)), int(max(x)) + 10, 10))
    plt.grid(alpha=0.5)
    plt.title('评论总数前50名药品数量分布状况')
    plt.xlabel('评论数量')
    plt.ylabel('分布情况')
    plt.savefig('评论总数前50名药品数量分布状况1.png')
    plt.show()
def plot_bar():
    """Bar chart comparing each collector's share of the top-50 comment total."""
    matplotlib.rc('font', family='SimHei')  # render Chinese labels
    plt.figure(figsize=(11, 6), dpi=80)
    plt.bar(groupby_data.index, groupby_data.values)
    plt.xlabel('姓名')
    plt.ylabel('评论数')
    plt.title('评论数量前50名个人所占评论总数对比')
    plt.savefig('评论数量前50名个人所占评论总数对比.png')
    plt.show()
#plot_bar()
##
#plot_hist()
# df = pd.DataFrame(np.arange(24).reshape(6,4),columns=[\'A\',\'B\',\'C\',\'D\'])
## df[2] = 1
# df
# df[:1]
def label_recognition(df):
    """Return rows where annotators agree on the segmentation label.

    :param df: DataFrame with columns 分词, 分词2, 分词3
    :return: (label1, label2) — label1 holds rows where 分词 == 分词2
             (two annotators agree); label2 holds rows where all three match.
    """
    two_agree = df.分词 == df.分词2
    three_agree = two_agree & (df.分词 == df.分词3)
    return df[two_agree], df[three_agree]
if __name__ == '__main__':
    # Load the annotated sheet and keep only rows where annotators agree.
    label1 = pd.read_excel(io='first50_comment_zhangxinrui2.xlsx', encoding='utf-8')
    label, label2 = label_recognition(label1)
    # Rows where all three annotators agree go to their own workbook.
    writer = pd.ExcelWriter('three_people_same_label.xlsx')
    label2.to_excel(writer, 'diabetes')
    writer.save()
    # Two-annotator agreement, with the redundant column dropped.
    new_label = label.drop('分词2', axis=1)
    new_label.to_csv('label.csv', encoding='utf_8_sig', index=False)
9.适应症和不良反应数据字典的构建
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/1/10
@Author: Zhang Yafei
"""
import numpy
import re
from scrapy.selector import Selector
import pandas
from twisted.web.client import getPage, defer
from twisted.internet import reactor
# Show every column when printing DataFrames during debugging.
pandas.set_option(\'display.max_columns\', None)
# Filled by parse(): one dict per successfully parsed manual page.
data_list = []
# Running count of parsed pages (progress indicator only).
n = 0
# Template frame for the extracted fields; columns fixed for output order.
re_data = pandas.DataFrame(columns=[\'通用名称\', \'商品名称\', \'适应症\', \'不良反应\', \'url\'])
def parse(content, url):
    """Extract indications and adverse reactions from one drug-manual page.

    :param content: raw response body (decoded as GBK below)
    :param url: page URL, recorded alongside the extracted fields
    Appends a dict with keys 通用名称/商品名称/适应症/不良反应/url to the
    module-level data_list.
    """
    global n
    n += 1
    print(n, url)
    text = content.decode('gbk')
    selector = Selector(text=text)
    # Page structure varies between drugs; try both heading variants.
    drug_name = selector.xpath('//dt[text()="【药品名称】"]/following::*[1]').extract_first()
    if not drug_name:
        drug_name = selector.xpath('//dt[text()="【产品名称】"]/following::*[1]').extract_first()
    # Fix: some manuals omit 通用名称/商品名称 — the bare [0] here raised
    # IndexError (the previously commented-out NAN fallback hints at this).
    generic_match = re.findall('通用名称:(.*)<br>', drug_name)
    generic_name = generic_match[0] if generic_match else numpy.NAN
    trade_match = re.findall('商品名称:(.*)<br>', drug_name)
    trade_name = trade_match[0] if trade_match else numpy.NAN
    function = selector.xpath('//dt[text()="【功能主治】"]/following::*[1]').extract_first()
    if function:
        function = re.sub('<.*?>', '', function).strip()  # strip HTML tags
    else:
        function = numpy.NAN
    indiction = selector.xpath('//dt[text()="【适应症】"]/following::*[1]')
    if indiction:
        indiction = indiction.xpath('string(.)').extract_first().strip().replace('\n', '')
    else:
        indiction = numpy.NAN
    # Fall back to 功能主治 when no explicit 适应症 section exists.
    indictions = indiction if indiction is not numpy.NAN else function
    try:
        adverse_reaction = selector.xpath(
            '//dt[text()="【不良反应】"]/following::*[1]/p/text()').extract_first().strip('。')
    except AttributeError:
        try:
            adverse_reaction = selector.xpath(
                '//dt[text()="【不良反应】"]/following::*[1]/text()').extract_first().strip('。')
            adverse_reaction = re.sub('<.*?>', '', adverse_reaction).strip().replace('\n', '')
        except AttributeError:
            adverse_reaction = numpy.NAN
    data_list.append({'通用名称': generic_name, '商品名称': trade_name,
                      '适应症': indictions, '不良反应': adverse_reaction,
                      'url': url})
def stop_loop(arg):
    """DeferredList callback: stop the twisted reactor once all fetches finish."""
    reactor.stop()
def main(url_list):
    """Fetch all URLs concurrently via twisted's event loop and parse each page.

    :param url_list: iterable of manual-page URLs
    Blocks until every request has completed, then stops the reactor.
    """
    deferred_list = []
    for url in url_list:
        # Schedule the request and attach the parser as its callback.
        d = getPage(bytes(url, encoding='utf-8'))
        d.addCallback(callback=parse, url=url)
        deferred_list.append(d)
    # Stop the event loop once every request has finished (success or failure).
    defer.DeferredList(deferred_list).addBoth(stop_loop)
    reactor.run()
if __name__ == '__main__':
    # 1. Download manual pages for each unique drug ID (run once, now disabled).
    # data = pandas.read_excel('three_people_same_label.xlsx')
    # url_list = ['http://ypk.39.net/{}/manual'.format(i) for i in data.药品ID.unique().tolist()]
    # data = pandas.read_excel('drug_dict.xlsx')
    # has_url = set(data.url.tolist())
    # urls = list(set(url_list) - has_url)
    # main(urls)
    #
    # 2. Write the downloaded/parsed fields to drug_dict.xlsx (run once).
    # df = pandas.DataFrame(data=data_list)
    # df = df.loc[:, ['通用名称', '商品名称', '适应症', '不良反应', 'url']]
    # result = pandas.concat([data, df])
    # writer = pandas.ExcelWriter('drug_dict.xlsx')
    # result.to_excel(writer, 'drug_dict', index=False)
    # writer.save()
    # 3. Merge the 39 drug data with the adverse-reaction database (run once).
    # df1 = pandas.read_excel('adverse_reaction_database.xlsx')
    # df2 = pandas.read_excel('drug_dict.xlsx')
    # index = df2.通用名称.apply(lambda x: x in df1.药品通用名称.values)
    # df3 = df2.loc[index, :]
    # df4 = pandas.DataFrame(columns=['药品通用名称', '适应症', '不良反应'])
    # for k in df3.通用名称.values:
    #     data = df1[df1.药品通用名称 == k]
    #     df4 = df4.append(data, ignore_index=True)
    # writer = pandas.ExcelWriter('drug_dict2.xlsx')
    # df4.to_excel(writer, 'drug_dict', index=False)
    # writer.save()
    # 4. Read drug_dict2.xlsx and merge rows that share a generic name.
    df4 = pandas.read_excel('drug_dict2.xlsx')
    drug_list = []
    for name in df4.药品通用名称.unique():
        result = df4[df4.药品通用名称 == name]
        # Join all non-NaN values for each field with '/'.
        indiction = '/'.join(str(s) for s in result.适应症.values if s is not numpy.NAN).strip()
        adverse = '/'.join(str(s) for s in result.不良反应.values if s is not numpy.NAN).strip()
        # Fix: this local was named `dict`, shadowing the builtin.
        record = {
            '药品通用名称': name,
            '适应症': indiction,
            '不良反应': adverse,
        }
        drug_list.append(record)
    df5 = pandas.DataFrame(data=drug_list)
    df5 = df5.loc[:, ['药品通用名称', '适应症', '不良反应']]
    writer = pandas.ExcelWriter('database_dict.xlsx')
    df5.to_excel(writer, sheet_name='database_dict', index=False)
    writer.save()