决策树算法小结

sklearn实现

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np 
import pandas as pd

# Load the iris dataset and assemble features plus label into one DataFrame.

data_sets = load_iris()
x, y = data_sets.data, data_sets.target
# Append the target vector as the last column next to the feature matrix.
data = np.column_stack((x, y))
cols = [*data_sets.feature_names, 'class']
df = pd.DataFrame(data=data, columns=cols)
print(df.shape)
df.head()

(150, 5)

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	class
0	5.1	3.5	1.4	0.2	0.0
1	4.9	3.0	1.4	0.2	0.0
2	4.7	3.2	1.3	0.2	0.0
3	4.6	3.1	1.5	0.2	0.0
4	5.0	3.6	1.4	0.2	0.0

# Prepare the data: separate the label column from the features, then hold
# out 30% of the rows as a test set (shuffled with a fixed seed).

y = df['class']
X = df.drop(columns='class')
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

# Build the model.
#
# Every hyper-parameter is spelled out with its default value so that the
# comments can explain what each one controls.

dtree = DecisionTreeClassifier(
    # Split quality measure: "gini" (default) or "entropy" (information gain).
    criterion="gini",
    # Split strategy at each node: "best" (default) searches all candidate
    # thresholds of the considered features for the optimal split, which suits
    # small data sets; "random" picks the best of randomly drawn thresholds,
    # which is cheaper on large data sets.
    splitter="best",
    # Maximum tree depth: None (default) grows until leaves are pure or too
    # small to split; set a limit on large data sets to curb over-fitting.
    max_depth=None,
    # Minimum number of samples a node must hold to be split further
    # (default 2); usually only raised for large data sets.
    min_samples_split=2,
    # Minimum number of samples required in a leaf (default 1): a split is
    # only considered if both children keep at least this many samples.
    min_samples_leaf=1,
    # Minimum weighted fraction of the total sample weight required in a leaf
    # (default 0.0); relevant when samples carry weights.
    min_weight_fraction_leaf=0.0,
    # Number of features examined when searching for a split: None (default)
    # means all features; restrict it when there are very many features.
    max_features=None,
    # Upper bound on the number of leaves, to limit over-fitting:
    # None (default) means unlimited.
    max_leaf_nodes=None,
    # Pre-pruning threshold (default 0.0): a node is split only if the split
    # reduces the impurity by at least this amount.
    min_impurity_decrease=0.0,
    # Class weights: None (default) weights every class equally; use
    # "balanced" when the class distribution is strongly skewed.
    class_weight=None,
    # Features are randomly permuted at every split even with
    # splitter="best", so ties between equally good splits are broken at
    # random. Fixing the seed makes the fitted tree and the reported feature
    # importances reproducible from run to run.
    random_state=1,
)

dtree.fit(x_train, y_train)  # train the model

# Feature importances, keyed by column name.
features_im = dtree.feature_importances_.tolist()
print('特征重要性：', dict(zip(X.columns, features_im)))

y_pred = dtree.predict(x_test)  # predicted labels for the held-out samples
score = dtree.score(x_test, y_test)  # mean accuracy on the test set
print("准确率：", score)

特征重要性： {'sepal length (cm)': 0.02146946564885496, 'sepal width (cm)': 0.02146946564885496, 'petal length (cm)': 0.5719647633364664, 'petal width (cm)': 0.3850963053658237}
准确率： 0.9555555555555556

# Visualization: export the fitted tree to Graphviz DOT and render it as PNG.
from IPython.display import Image
from sklearn import tree
import pydotplus

# Options controlling how the tree is labelled and drawn.
export_options = dict(
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=data_sets.feature_names,  # label split nodes with column names
    class_names=data_sets.target_names,  # label nodes with the species names
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_data = tree.export_graphviz(dtree, **export_options)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

决策树算法小结

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2