#coding=utf-8
import matplotlib.colors as mc
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import calinski_harabaz_score
# Use a CJK-capable font so the Chinese plot titles render correctly,
# and keep the minus sign renderable with that font.
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
# Palette indexed by cluster label (supports up to 15 clusters).
colors = [
    'black', 'lightcoral', 'orange', 'tan', 'lightgreen',
    'cornflowerblue', 'lime', 'cyan', 'purple', 'yellow',
    'fuchsia', 'darkblue', 'plum', 'palegreen', 'pink',
]
# 400 two-dimensional points drawn from 4 Gaussian blobs with mixed
# spreads; random_state fixed for reproducibility.
data, y = ds.make_blobs(400, n_features=2, centers=4,
                        cluster_std=[1, 1, 2, 0.5], random_state=3)
# Top panel: scatter the ground-truth clusters, one color per blob.
plt.subplot(211)
plt.title(u"原始数据")
for label in range(4):
    members = data[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=colors[label])
# Run DBSCAN with the parameters found best by the grid search below.
model = DBSCAN(eps=1.4, min_samples=6)
model.fit(data)
# Count clusters: DBSCAN marks noise points with label -1, which must
# not be counted as a cluster.
labels = model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Bottom panel: scatter the discovered clusters, one color per label.
plt.subplot(212)
plt.title(u"聚类结果")
for i in range(n_clusters_):
    plt.scatter(data[labels == i][:, 0], data[labels == i][:, 1], color=colors[i])
# Noise points (label -1) are drawn as stars.
plt.scatter(data[labels == -1][:, 0], data[labels == -1][:, 1], marker='*', label=u'离散点')
plt.legend(loc='upper left')
plt.show()
# Single-argument print() behaves identically under Python 2 and 3
# (the original `print a, b` statement is a syntax error on Python 3).
print("聚类数目: %d" % n_clusters_)
# Grid-search eps and min_samples, scoring each run with the
# Calinski-Harabasz index on the non-noise points.
# NOTE(review): sklearn renamed calinski_harabaz_score to
# calinski_harabasz_score (old alias removed in sklearn 0.23); update the
# import at the top of the file when upgrading scikit-learn.
eps = [0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4]
min_samples = [4, 6, 8, 10, 12]
for i in eps:
    for j in min_samples:
        model = DBSCAN(eps=i, min_samples=j)
        model.fit(data)
        # Drop noise points (label -1) before scoring.
        labels = model.labels_[model.labels_ != -1]
        # The CH score is only defined for >= 2 clusters; skip runs where
        # every non-noise point landed in cluster 0 (or none remain).
        if np.sum(labels == 0) != len(labels):
            n_noise = len(model.labels_[model.labels_ == -1])
            score = calinski_harabaz_score(data[model.labels_ != -1], labels)
            # Single formatted string keeps the print Python-2/3 compatible.
            print("eps: %s min_samples: %s 离散点个数: %s score: %s"
                  % (i, j, n_noise, score))
# Among the printed results, pick the combination with few noise points and
# a high score; eps=1.4, min_samples=6 turns out to be the best pair.
# NOTE(review): the triple-quoted block below is dead code (a no-op
# module-level string) kept by the author as an alternative approach:
# tuning eps/min_samples via the mean k-nearest-neighbor distance.
# It is never executed; delete it or move it to version control history.
'''
#无效代码
#利用k-距离调参eps,和min_samples
def k_dis(data,y,k):
    neigbors = KNeighborsClassifier(n_neighbors=k)
    neigbors.fit(data,y)
    distance,index=neigbors.kneighbors(data)
    #计算k-距离的平均值 eps的值要选平均值之上
    #print np.max(distance[:,-1])
    mean_dis = np.mean(distance[:,-1])
    return mean_dis
a=[]
for j in range(2,10):
    dis = k_dis(data,y,j)
    a.append(dis)
plt.plot(range(2,10),a)
plt.show()
'''

