import warnings
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier,RandomForestClassifier,ExtraTreesClassifier
# Suppress all warnings for the remainder of the script.
warnings.filterwarnings('ignore')

# Synthetic two-class dataset: 300 noisy points, generated with a fixed seed.
X, y = make_circles(n_samples=300, noise=0.15, factor=0.5, random_state=233)

# One scatter series per class label; showing the figure is left disabled.
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
# plt.show()

# Seed the split as well: the dataset is already generated deterministically,
# but an unseeded split makes every test-set accuracy printed below change
# from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=233)
print('X_train.shape=', X_train.shape)
print('X_test.shape=', X_test.shape)
# k-nearest-neighbours baseline with default hyper-parameters, scored on the
# held-out test split.
print('===========knn==============')
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_accuracy = knn_clf.score(X_test, y_test)
print('knn accuracy={}'.format(knn_accuracy))
print('\n')
# Logistic-regression baseline with default hyper-parameters, scored on the
# held-out test split.
print('===========logistic regression==============')
log_clf = LogisticRegression().fit(X_train, y_train)
log_accuracy = log_clf.score(X_test, y_test)
print('logistic regression accuracy={}'.format(log_accuracy))
print('\n')
# Support-vector classifier with default hyper-parameters, scored on the
# held-out test split.
print('===========SVM==============')
svm_clf = SVC().fit(X_train, y_train)
svm_accuracy = svm_clf.score(X_test, y_test)
print('SVM accuracy={}'.format(svm_accuracy))
print('\n')
# Single decision tree with default hyper-parameters, scored on the held-out
# test split.
print('===========Decison tree==============')
dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
dt_accuracy = dt_clf.score(X_test, y_test)
print('Decison tree accuracy={}'.format(dt_accuracy))
print('\n')
# Hard-voting ensemble over fresh copies of the four base models above: the
# predicted label is decided strictly by majority vote.
print('===========ensemble classfier==============')
base_estimators = [
    ('knn', KNeighborsClassifier()),
    ('logistic', LogisticRegression()),
    ('SVM', SVC()),
    ('decision tree', DecisionTreeClassifier()),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')
voting_clf.fit(X_train, y_train)
voting_accuracy = voting_clf.score(X_test, y_test)
print('voting classfier accuracy={}'.format(voting_accuracy))
print('\n')
# Random forest evaluated through its out-of-bag (OOB) score.
print('===========random forest==============')
rf_params = dict(
    n_estimators=500,  # 500 trees
    max_depth=6,  # depth limit of each tree
    bootstrap=True,  # draw each tree's samples with replacement
    oob_score=True,  # validate on the samples a tree did not draw
)
rf_clf = RandomForestClassifier(**rf_params)
# Because oob_score=True supplies the validation estimate, the forest is
# fitted on all of X, y rather than on the train split.
rf_clf.fit(X, y)
print('rf accuracy={}'.format(rf_clf.oob_score_))
print('\n')
# Extremely randomized trees, configured like the random forest above and
# likewise evaluated through the out-of-bag score on all of X, y.
print('===========extreme random tree==============')
et_params = dict(
    n_estimators=500,
    max_depth=6,
    bootstrap=True,
    oob_score=True,
)
ex_clf = ExtraTreesClassifier(**et_params)
ex_clf.fit(X, y)
print('extreme random treeaccuracy={}'.format(ex_clf.oob_score_))
print('\n')
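# In the results above, the Extremely Randomized Trees (Extra Trees) model
# reaches the highest accuracy. Like a random forest, it trains each tree on a
# random sample of the rows (bootstrap=True here) and considers only a random
# subset of the features at each split, rather than all of them. What sets it
# apart from a random forest is that the split threshold for each candidate
# feature is also drawn at random instead of being searched for, which adds
# further randomness on top of the row and feature sampling.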