Python_sklearn机器学习库学习笔记（四）decision_tree（决策树）

# 决策树

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

import zipfile
#压缩节省空间
# Load the advertisement dataset from the extracted file. The data also ships
# compressed as ad-dataset.zip; the archive is deliberately not opened here
# because nothing below reads from it (an unused open handle would just leak).
# NOTE(review): to read straight from the archive, something like
#   with zipfile.ZipFile('ad-dataset.zip') as z:
#       df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)
# should work -- confirm the archive's first member is the same ad.data file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk (same setting as the random-forest script below).
df = pd.read_csv('.\\tree_data\\ad.data', header=None, low_memory=False)
explanatory_variable_columns = set(df.columns.values)
# The last column holds the class label, so it is not an explanatory variable.
response_variable_column = df[len(df.columns.values) - 1]
explanatory_variable_columns.remove(len(df.columns) - 1)

# Encode the label as 1 for 'ad.' and 0 for anything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]
# A set has no guaranteed iteration order; sort the labels so the feature
# columns always come out in a stable, ascending order.
X = df.loc[:, sorted(explanatory_variable_columns)]

# Cells matching optional spaces followed by '?' are replaced with -1.
# The pattern is a raw string so the backslash reaches the regex engine as
# written, and replace() returns a new frame instead of editing a selection
# of df in place.
X = X.replace(to_replace=r' *\?', value=-1, regex=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)
# Build the decision tree with the entropy (information gain) criterion.
pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))])
# NOTE(review): scikit-learn >= 0.18 rejects min_samples_split=1 (an integer
# value must be at least 2). The grid is kept as written so it matches the
# recorded output; confirm against the installed version.
parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}
# F1 is the harmonic mean of precision and recall.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,
                           verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
print('最佳效果：%0.3f' % grid_search.best_score_)
print('最优参数')
best_parameters = grid_search.best_estimator_.get_params()
# Bare expression: shows the parameter dict when run as a notebook cell.
best_parameters

输出结果：

Fitting 3 folds for each of 27 candidates, totalling 81 fits

[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   34.7s finished

最佳效果：0.888
最优参数

Out[123]:

{'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=3, min_weight_fraction_leaf=0.0,
             presort=False, random_state=None, splitter='best'),
 'clf__class_weight': None,
 'clf__criterion': 'entropy',
 'clf__max_depth': 160,
 'clf__max_features': None,
 'clf__max_leaf_nodes': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 3,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__presort': False,
 'clf__random_state': None,
 'clf__splitter': 'best',
 'steps': [('clf',
   DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
               max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
               min_samples_split=3, min_weight_fraction_leaf=0.0,
               presort=False, random_state=None, splitter='best'))]}

# Print the selected value of every hyper-parameter that was searched.
for param_name in sorted(parameters):
    print('\t%s:%r' % (param_name, best_parameters[param_name]))
# Predict on the held-out split and print per-class precision/recall/F1.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))

输出结果：

clf__max_depth:150
clf__min_samples_leaf:1
clf__min_samples_split:1
             precision    recall f1-score   support

          0       0.97      0.99      0.98       703
          1       0.91      0.84      0.87       117

avg / total       0.96      0.96      0.96       820

# Preview the first rows of the raw DataFrame (displayed as notebook output).
df.head()

输出结果：

	0	1	2	3	...	1558
0	125	125	1.0	1	...	ad.
1	57	468	8.2105	1	...	ad.
2	33	230	6.9696	1	...	ad.
3	60	468	7.8	1	...	ad.
4	60	468	7.8	1	...	ad.

# 决策树集成

#coding:utf-8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

# Load the advertisement dataset; low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk.
df = pd.read_csv('.\\tree_data\\ad.data', header=None, low_memory=False)
explanatory_variable_columns = set(df.columns.values)
# The last column is taken as the response variable.
response_variable_column = df[len(df.columns.values) - 1]

# Preview the first rows of the raw DataFrame (displayed as notebook output).
df.head()

	0	1	2	3	...	1558
0	125	125	1.0	1	...	ad.
1	57	468	8.2105	1	...	ad.
2	33	230	6.9696	1	...	ad.
3	60	468	7.8	1	...	ad.
4	60	468	7.8	1	...	ad.

# The last column describes the targets, so drop it from the feature columns.
explanatory_variable_columns.remove(len(df.columns.values) - 1)
# Encode the label as 1 for 'ad.' and 0 for anything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]
# A set has no guaranteed iteration order; sort the labels so the feature
# columns always come out in a stable, ascending order.
X = df.loc[:, sorted(explanatory_variable_columns)]
# Cells matching optional spaces followed by '?' are replaced with -1.
# The pattern is a raw string so the backslash reaches the regex engine as
# written, and replace() returns a new frame instead of editing a selection
# of df in place.
X = X.replace(to_replace=r' *\?', value=-1, regex=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline = Pipeline([('clf', RandomForestClassifier(criterion='entropy'))])
# NOTE(review): scikit-learn >= 0.18 rejects min_samples_split=1 (an integer
# value must be at least 2). The grid is kept as written so it matches the
# recorded output; confirm against the installed version.
parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3),
}
# Candidates are ranked by F1 score.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                           scoring='f1')
grid_search.fit(X_train, y_train)

print(u'最佳效果：%0.3f' % grid_search.best_score_)
print(u'最优的参数：')
best_parameters = grid_search.best_estimator_.get_params()
# Print the selected value of every hyper-parameter that was searched.
for param_name in sorted(parameters):
    print('\t%s:%r' % (param_name, best_parameters[param_name]))

输出结果：

最佳效果：0.929
最优的参数：
	clf__max_depth:250
	clf__min_samples_leaf:1
	clf__min_samples_split:3
	clf__n_estimators:50

# Predict on the held-out split and print per-class precision/recall/F1.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))

输出结果：

             precision    recall f1-score   support

          0       0.98      1.00      0.99       705
          1       0.97      0.90      0.93       115

avg / total       0.98      0.98      0.98       820