主函數(shù)
# -*- coding: utf-8 -*-
"""
參考: https://gist.github.com/iandanforth/5862470
"""
import random
from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
def kmeans(samples, k, cutoff):
"""
kmeans函數(shù)
"""
# 隨機選k個樣本點作為初始聚類中心
init_samples = random.sample(samples, k)
# 創(chuàng)建k個聚類衰齐,聚類的中心分別為隨機初始的樣本點
clusters = [Cluster([sample]) for sample in init_samples]
# 迭代循環(huán)直到聚類劃分穩(wěn)定
n_loop = 0
while True:
# 初始化一組空列表用于存儲每個聚類內(nèi)的樣本點
lists = [[] for _ in clusters]
# 開始迭代
n_loop += 1
# 遍歷樣本集中的每個樣本
for sample in samples:
# 計算樣本點sample和第一個聚類中心的距離
smallest_distance = get_distance(sample, clusters[0].centroid)
# 初始化屬于聚類 0
cluster_index = 0
# 計算和其他聚類中心的距離
for i in range(k - 1):
# 計算樣本點sample和聚類中心的距離
distance = get_distance(sample, clusters[i+1].centroid)
# 如果存在更小的距離获列,更新距離
if distance < smallest_distance:
smallest_distance = distance
cluster_index = i + 1
# 找到最近的聚類中心丰泊,更新所屬聚類
lists[cluster_index].append(sample)
# 初始化最大移動距離
biggest_shift = 0.0
# 計算本次迭代中,聚類中心移動的距離
for i in range(k):
shift = clusters[i].update(lists[i])
# 記錄最大移動距離
biggest_shift = max(biggest_shift, shift)
# 如果聚類中心移動的距離小于收斂閾值循帐,即:聚類穩(wěn)定
if biggest_shift < cutoff:
print("第{}次迭代后,聚類穩(wěn)定庸娱。".format(n_loop))
break
# 返回聚類結(jié)果
return clusters
def run_main():
"""
主函數(shù)
"""
# 樣本個數(shù)
n_samples = 1000
# 特征個數(shù) (特征維度)
n_feat = 2
# 特征數(shù)值范圍
lower = 0
upper = 200
# 聚類個數(shù)
n_cluster = 3
# 生成隨機樣本
samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]
# 收斂閾值
cutoff = 0.2
clusters = kmeans(samples, n_cluster, cutoff)
# 輸出結(jié)果
for i, c in enumerate(clusters):
for sample in c.samples:
print('聚類--{}啃炸,樣本點--{}'.format(i, sample))
# 可視化結(jié)果
plt.subplot()
color_names = list(mcolors.cnames)
for i, c in enumerate(clusters):
x = []
y = []
random.choice
color = [color_names[i]] * len(c.samples)
for sample in c.samples:
x.append(sample.coords[0])
y.append(sample.coords[1])
plt.scatter(x, y, c=color)
plt.show()
if __name__ == '__main__':
run_main()
k-means_tools
# -*- coding: utf-8 -*-
"""
參考: https://gist.github.com/iandanforth/5862470
"""
import math
import random
class Cluster(object):
"""
聚類
"""
def __init__(self, samples):
if len(samples) == 0:
# 如果聚類中無樣本點
raise Exception("錯誤:一個空的聚類赃额!")
# 屬于該聚類的樣本點
self.samples = samples
# 該聚類中樣本點的維度
self.n_dim = samples[0].n_dim
# 判斷該聚類中所有樣本點的維度是否相同
for sample in samples:
if sample.n_dim != self.n_dim:
raise Exception("錯誤: 聚類中樣本點的維度不一致加派!")
# 設(shè)置初始化的聚類中心
self.centroid = self.cal_centroid()
def __repr__(self):
"""
輸出對象信息
"""
return str(self.samples)
def update(self, samples):
"""
計算之前的聚類中心和更新后聚類中心的距離
"""
old_centroid = self.centroid
self.samples = samples
self.centroid = self.cal_centroid()
shift = get_distance(old_centroid, self.centroid)
return shift
def cal_centroid(self):
"""
對于一組樣本點計算其中心點
"""
n_samples = len(self.samples)
# 獲取所有樣本點的坐標(biāo)(特征)
coords = [sample.coords for sample in self.samples]
unzipped = zip(*coords)
# 計算每個維度的均值
centroid_coords = [math.fsum(d_list)/n_samples for d_list in unzipped]
return Sample(centroid_coords)
class Sample(object):
"""
樣本點類
"""
def __init__(self, coords):
self.coords = coords # 樣本點包含的坐標(biāo)
self.n_dim = len(coords) # 樣本點維度
def __repr__(self):
"""
輸出對象信息
"""
return str(self.coords)
def get_distance(a, b):
"""
返回樣本點a, b的歐式距離
參考:https://en.wikipedia.org/wiki/Euclidean_distance#n_dimensions
"""
if a.n_dim != b.n_dim:
# 如果樣本點維度不同
raise Exception("錯誤: 樣本點維度不同阁簸,無法計算距離!")
acc_diff = 0.0
for i in range(a.n_dim):
square_diff = pow((a.coords[i]-b.coords[i]), 2)
acc_diff += square_diff
distance = math.sqrt(acc_diff)
return distance
def gen_random_sample(n_dim, lower, upper):
"""
生成隨機樣本
"""
sample = Sample([random.uniform(lower, upper) for _ in range(n_dim)])
return sample