数据来自 UCI 机器学习库的皮马印第安人糖尿病数据集(Pima Indians Diabetes Dataset)。
载入数据
# -*- coding: utf-8 -*-
# Load the Pima Indians diabetes CSV and split it into training and test sets.
import matplotlib
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Select the SimHei font for sans-serif text and turn off the Unicode minus
# sign so that negative tick labels stay renderable with that font.
matplotlib.rcParams['font.sans-serif'] = ['simHei']
matplotlib.rcParams['axes.unicode_minus'] = False

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values

# Columns 0-7 are used as the feature matrix, column 8 as the target vector.
X = data[:, :8]
y = data[:, 8]

# No random_state is passed here, so the split (and every score computed
# from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
建立决策树,网格搜索微调模型
# In[1] Tune the decision tree with an exhaustive grid search.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy')),
])
parameters = {
    'clf__max_depth': (3, 5, 10, 15, 20, 25, 30, 35, 40),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}

# GridSearchCV tries every parameter combination and keeps the one with the
# best cross-validated F1 score.  verbose must be a non-negative integer
# (recent scikit-learn releases reject negative values), so 0 is used to
# keep the search silent.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=0,
    scoring='f1',
)
grid_search.fit(X_train, y_train)

# Report the best score and the parameter values that achieved it.
best_parameters = grid_search.best_estimator_.get_params()
print("最好的F1值为:", grid_search.best_score_)
print('最好的参数为:')
for param_name in sorted(parameters):
    # The leading "\t" indents each parameter line; the original literal
    # was a bare "t" (the backslash had been lost).
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# In[2] Predict on the held-out test set and print the per-class metrics.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
最好的F1值为: 0.5573515325670498
最好的参数为:
tclf__max_depth: 5
tclf__min_samples_leaf: 1
tclf__min_samples_split: 2
评价模型
# In[2] Score the tuned model on the held-out test set and print the
# per-class precision / recall / F1 report.
predictions = grid_search.predict(X_test)
report = classification_report(y_test, predictions)
print(report)
precision recall f1-score support
0.0 0.74 0.89 0.81 124
1.0 0.67 0.43 0.52 68
画出决策树
# In[3] Draw the decision tree.
from sklearn import tree

# Feature names: every CSV column except the last one (the target column).
feature_name = data_set.columns.values.tolist()[:-1]

# NOTE(review): the grid-search output recorded above reports
# min_samples_leaf=1; the value 5 is kept from the original code -- confirm
# which one is intended.
DT = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=5,
)
DT.fit(X_train, y_train)

# export_graphviz matches class_names to the class labels in ascending
# order, so the name for label 0 must come first.  This assumes the
# dataset's usual encoding (0 = no diabetes, 1 = diabetes); the original
# list had the two names swapped.
class_names = ["无病", "有糖尿病"]

# Method 1 (disabled): render through pydotplus.
# sklearn.externals.six is no longer shipped with scikit-learn, so StringIO
# is taken from the standard library instead.
#
# import pydotplus
# from io import StringIO
# dot_data = StringIO()
# tree.export_graphviz(DT, out_file=dot_data, feature_names=feature_name,
#                      class_names=class_names, filled=True, rounded=True,
#                      special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("Tree.pdf")
# print('Visible tree plot saved as pdf.')

# Method 2: render through the graphviz package.
import graphviz

# The estimator must already be fitted before it is exported.
dot_data = tree.export_graphviz(
    DT,
    out_file=None,
    feature_names=feature_name,
    class_names=class_names,
)
graph = graphviz.Source(dot_data)
# Writes the DOT source to "tree2" and the rendered tree to "tree2.pdf" in
# the current working directory.
graph.render("tree2")
随机森林
# -*- coding: utf-8 -*-
# Train and evaluate a random forest on the Pima Indians diabetes data.
import matplotlib
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Select the SimHei font for sans-serif text and turn off the Unicode minus
# sign so that negative tick labels stay renderable with that font.
matplotlib.rcParams['font.sans-serif'] = ['simHei']
matplotlib.rcParams['axes.unicode_minus'] = False

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values

# Columns 0-7 are used as the feature matrix, column 8 as the target vector.
X = data[:, :8]
y = data[:, 8]

# The forest below is seeded, so the split is seeded with the same value;
# otherwise the printed report would still change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

RF = RandomForestClassifier(n_estimators=10, random_state=11)
RF.fit(X_train, y_train)

# Predict on the held-out test set and print the per-class metrics.
predictions = RF.predict(X_test)
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

         0.0       0.82      0.91      0.86       126
         1.0       0.78      0.61      0.68        66

   micro avg       0.81      0.81      0.81       192
   macro avg       0.80      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192