在构成圆形的30000个随机样本点上,设置7个簇,使用K-Means算法聚类
from math import pi, sin, cos
from collections import namedtuple
from random import random, choice
from copy import copy
import matplotlib.pyplot as plt
import numpy as np
import sys
try:
import psyco
psyco.full()
except ImportError:
pass
FLOAT_MAX = 1e100
class Point:
__slots__ = ["x", "y", "group"]
def __init__(self, x=0.0, y=0.0, group=1):
self.x, self.y, self.group = x, y, group
def generate_points(npoints, radius):
points = [Point() for _ in range(npoints)]
# note: this is not a uniform 2-d distribution
for p in points:
r = random() * radius
ang = random() * 2 * pi
p.x = r * cos(ang)
p.y = r * sin(ang)
return points
def nearest_cluster_center(point, cluster_centers):
"""Distance and index of the closest cluster center"""
def sqr_distance_2D(a, b):
return (a.x - b.x) ** 2 + (a.y - b.y) ** 2
min_index = point.group
min_dist = FLOAT_MAX
for i, cc in enumerate(cluster_centers):
d = sqr_distance_2D(cc, point)
if min_dist > d:
min_dist = d
min_index = i
return (min_index, min_dist)
def kpp(points, cluster_centers):
cluster_centers[0] = copy(choice(points))
d = [0.0 for _ in range(len(points))]
for i in range(1, len(cluster_centers)):
sum = 0
for j, p in enumerate(points):
d[j] = nearest_cluster_center(p, cluster_centers[:i])[1]
sum += d[j]
sum *= random()
for j, di in enumerate(d):
sum -= di
if sum > 0:
continue
cluster_centers[i] = copy(points[j])
break
for p in points:
p.group = nearest_cluster_center(p, cluster_centers)[0]
def lloyd(points, nclusters):
cluster_centers = [Point() for _ in range(nclusters)]
# call k++ init
kpp(points, cluster_centers)
lenpts10 = len(points) >> 10
changed = 0
while True:
# group element for centroids are used as counters
for cc in cluster_centers:
cc.x = 0
cc.y = 0
cc.group = 0
for p in points:
cluster_centers[p.group].group += 1
cluster_centers[p.group].x += p.x
cluster_centers[p.group].y += p.y
for cc in cluster_centers:
cc.x /= cc.group
cc.y /= cc.group
# find closest centroid of each PointPtr
changed = 0
for p in points:
min_i = nearest_cluster_center(p, cluster_centers)[0]
if min_i != p.group:
changed += 1
p.group = min_i
# stop when 99.9% of points are good
if changed <= lenpts10:
break
for i, cc in enumerate(cluster_centers):
cc.group = i
return cluster_centers
def kmeans(points, nclusters):
cluster_centers = [copy(choice(points)) for _ in xrange(nclusters)]
lenpts10 = len(points) >> 10
changed = 0
while True:
for p in points:
cluster_centers[p.group].group += 1
cluster_centers[p.group].x += p.x
cluster_centers[p.group].y += p.y
for cc in cluster_centers:
#print cc.x,cc.y,cc.group
cc.x /= cc.group
cc.y /= cc.group
changed = 0
for p in points:
min_i = nearest_cluster_center(p, cluster_centers)[0]
if min_i != p.group:
changed += 1
p.group = min_i
# stop when 99.9% of points are good
if changed <= lenpts10:
break
for i, cc in enumerate(cluster_centers):
cc.group = i
return cluster_centers
def print_eps(points, cluster_centers):
c = np.random.rand(len(cluster_centers)+1,3)
for i, cc in enumerate(cluster_centers):
plot_x = []
plot_y = []
print (cc.x,cc.y,cc.group)
for p in points:
if p.group != i:
continue
plot_x.append(p.x)
plot_y.append(p.y)
plt.scatter(plot_x,plot_y,color=c[i],s=1)
plt.scatter(cc.x,cc.y, color=c[len(c)-i-1], marker='^',s=50)
plt.show()
def main():
npoints = 30000
k = 7 # # clusters
points = generate_points(npoints, 10)
cluster_centers = lloyd(points, k)
#cluster_centers = kmeans(points,k)
print_eps(points, cluster_centers)
main()
4.752567342690781 -4.334890736846155 0
-1.3573754262792683 6.986665631867061 1
-0.15969173199245487 2.506935816564994 2
-6.2690747305026315 0.7728991240509486 3
-2.713392739613833 -5.828585413438269 4
5.771515155816214 2.91539616422218 5
0.07082712668661448 -0.7504142150100154 6