聚類與決策樹不同,屬於無監督學習。也就是說,數據樣本只有特徵x,沒有給定的標籤y。聚類的目的是找出樣本特徵中潛在的類別,將同類別的樣本放在一起。
kmeans的具體邏輯如下:
1. 隨機選取k個簇心;
2. 對於每一個樣例,計算其所屬的簇;
3. 遍歷完所有樣例後,重新計算每個簇的簇心;
4. 重複第二步和第三步,直到簇心不再變化或達到最大迭代次數。
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Read the data set
def loaddate(filename):
    """Load a tab-separated numeric text file into a DataFrame.

    Every non-blank line becomes one row; each tab-separated field on the
    line is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A pandas DataFrame with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    datamat = []
    # The context manager closes the file even if parsing raises.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on float('').
                continue
            # Build a real list: in Python 3 ``map`` is lazy, so appending
            # the map object would store iterators rather than numbers.
            datamat.append([float(field) for field in line.split('\t')])
    return pd.DataFrame(datamat)
# Euclidean distance
def distance(vecA, vecB):
    """Return the Euclidean (L2) distance between ``vecA`` and ``vecB``.

    The element-wise difference is squared, summed, and the square root of
    that sum is returned.
    """
    delta = vecA - vecB
    squared_sum = np.sum(np.square(delta))
    return np.sqrt(squared_sum)
# Generate random initial cluster centers
def getcenter(data, k):
    """Draw ``k`` random initial centers inside the bounding box of ``data``.

    For every column of ``data`` the coordinate of each center is
    ``min + (max - min) * u`` with ``u`` drawn by ``np.random.rand``, so it
    lies between that column's minimum and maximum.

    Args:
        data: DataFrame of samples, one row per sample and one column per
            feature.
        k: Number of centers to generate.

    Returns:
        A DataFrame with ``k`` rows and the same columns as ``data``.
    """
    _, n = data.shape
    # Size the table from the data instead of assuming two features.
    center = np.zeros((k, n))
    for i in range(n):
        column = data.iloc[:, i]
        minJ = column.min()
        rangJ = float(column.max() - minJ)
        # A 1-D draw of length k fills the column directly (no (k, 1)
        # array that has to be broadcast into a single column).
        center[:, i] = minJ + rangJ * np.random.rand(k)
    # Reuse the data's column labels so label-aligned arithmetic between a
    # sample row and a center row matches feature to feature.
    return pd.DataFrame(center, columns=data.columns)
# Assign every sample to its nearest center, then recompute the centers
def kmeans(data, k, maxiter):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Starting from the random centers produced by ``getcenter``, each pass
    assigns every sample to its nearest center (by ``distance``) and then
    moves every center to the mean of the samples assigned to it. The loop
    stops when a pass changes no assignment or after ``maxiter`` passes.

    Args:
        data: DataFrame of samples, one row per sample and one column per
            feature.
        k: Number of clusters.
        maxiter: Maximum number of assignment/update passes.

    Returns:
        A DataFrame holding the final centers, one row per cluster, with the
        same row and column labels as the table returned by ``getcenter``.
    """
    ceter = getcenter(data, k)
    # Work on plain float arrays: this avoids per-cell DataFrame indexing in
    # the inner loops and keeps pandas label alignment out of the arithmetic.
    points = np.asarray(data, dtype=float)
    centers = np.array(ceter, dtype=float)
    m = points.shape[0]
    # Cluster index of every sample. -1 means "not assigned yet", so the
    # first pass always counts as a change, even for samples that land in
    # cluster 0. Starting at 0 could end the loop after a single pass.
    cindex = np.full(m, -1)
    # Number of passes done so far; the loop stops at maxiter.
    itercount = 0
    # Whether the last pass changed any assignment.
    clusterchange = True
    while itercount < maxiter and clusterchange:
        itercount += 1
        clusterchange = False
        for i, point in enumerate(points):
            minindex = 0
            mindist = np.inf
            for j in range(k):
                dis = distance(point, centers[j])
                # Strict comparison: on ties the lowest cluster index wins.
                if dis < mindist:
                    minindex = j
                    mindist = dis
            if cindex[i] != minindex:
                clusterchange = True
                cindex[i] = minindex
        for cent in range(k):
            ptscluster = points[cindex == cent]
            # A cluster without samples keeps its previous center.
            if ptscluster.shape[0] > 0:
                centers[cent] = ptscluster.mean(axis=0)
    return pd.DataFrame(centers, index=ceter.index, columns=ceter.columns)
if __name__ == '__main__':
    # Cluster the demo data set into two groups (at most five passes), then
    # plot the samples as blue stars and the resulting centers as red dots.
    samples = loaddate('testSet.txt')
    centroids = kmeans(samples, 2, 5)
    sample_x, sample_y = samples.iloc[:, 0], samples.iloc[:, 1]
    center_x, center_y = centroids.iloc[:, 0], centroids.iloc[:, 1]
    plt.scatter(sample_x, sample_y, marker='*', c='b')
    plt.scatter(center_x, center_y, marker='o', c='r')
    plt.show()