- 閔可夫斯基距離(Minkowski Distance)
- 歐式距離(Euclidean Distance)
- 標(biāo)準(zhǔn)歐式距離(Standardized Euclidean Distance)
- 曼哈頓距離(Manhattan Distance)
- 切比雪夫距離(Chebyshev Distance)
- 馬氏距離(Mahalanobis Distance)
- 巴氏距離(Bhattacharyya Distance)
- 漢明距離(Hamming Distance)
- 皮爾遜系數(shù)(Pearson Correlation Coefficient)
- 信息熵(Informationentropy)
- 夾角余弦(Cosine)
- 杰卡德相似系數(shù)(Jaccard similarity coefficient)
- 經(jīng)典貝葉斯公式
- 堪培拉距離(Canberra Distance)
import numpy as np
import operator
import scipy.spatial.distance as dist
def pp_ps(inX, dataSet, function):
    """Lift a point-to-point distance `function` to point-vs-dataset.

    Parameters
    ----------
    inX : array-like
        A single sample point.
    dataSet : ndarray
        2-D array, one sample per row.
    function : callable
        ``function(point_a, point_b) -> scalar`` distance.

    Returns
    -------
    1-D ndarray with one distance per row of `dataSet`.
    """
    # BUG FIX: the original iterated over the undefined global `group`;
    # it must iterate over the rows of `dataSet` itself.
    distances = np.array([function(inX, dataSet[i]) for i in range(dataSet.shape[0])])
    return distances
閔可夫斯基距離(Minkowski Distance)
- p=1 時:曼哈頓距離
- p=2 時:歐式距離
- p→∞ 時:切比雪夫距離
(np.linalg.norm 的 ord 參數即對應這裡的 p 值)
def Minkowski_distance_1(vector1, vector2, p):
    """Minkowski distance of order `p` between two points.

    p=1 gives Manhattan, p=2 Euclidean, p→inf Chebyshev.
    Returns a length-1 ndarray (SciPy pdist convention).
    """
    # BUG FIX: bare `pdist` was never imported (the file imports
    # scipy.spatial.distance as `dist`), and modern SciPy requires the
    # Minkowski order to be passed as the keyword argument `p`.
    return dist.pdist(np.vstack((vector1, vector2)), 'minkowski', p=p)
![閔可夫斯基距離公式](20190503191108.png)
歐式距離(Euclidean Distance)
def euclidean_distance_1(inX, dataSet):
    """Euclidean (L2) distance between point `inX` and every row of `dataSet`.

    Returns a 1-D ndarray, one distance per sample row.
    """
    diff = inX - dataSet
    # sum of squared component differences per row, then square root
    return np.sqrt(np.sum(diff * diff, axis=1))
def euclidean_distance_2(inX, dataSet):
    """Euclidean distance from `inX` to each row of `dataSet`, via the
    row-wise 2-norm of the difference matrix."""
    difference = inX - dataSet
    return np.linalg.norm(difference, axis=1)
def euclidean_distance_3(vector1, vector2):
    """Euclidean distance between two points (length-1 ndarray, SciPy style)."""
    # BUG FIX: bare `pdist` was never imported; use the module alias `dist`
    # (scipy.spatial.distance) that the file imports at the top.
    return dist.pdist(np.vstack((vector1, vector2)), 'euclidean')
標(biāo)準(zhǔn)歐式距離(Standardized Euclidean Distance)
def euclidean_distance_1(vector1, vector2):
    """Standardized Euclidean distance between two points.

    Each component is divided by its variance; with ``V=None`` SciPy
    estimates the per-component variance from the stacked points.
    NOTE(review): the article reuses the name `euclidean_distance_1` from the
    plain-Euclidean section above — kept unchanged for compatibility.
    """
    # BUG FIX: the original referenced an undefined `X` (ignoring both
    # parameters) and called un-imported `pdist`; stack the two input
    # vectors and use the `dist` module alias instead.
    return dist.pdist(np.vstack((vector1, vector2)), 'seuclidean', V=None)
曼哈頓距離(Manhattan Distance)
def manhattan_distance_1(inX, dataSet):
    """Manhattan (L1) distance between point `inX` and each row of `dataSet`:
    the row-wise sum of absolute component differences."""
    return np.sum(np.abs(inX - dataSet), axis=1)
def manhattan_distance_2(inX, dataSet):
    """Manhattan distance from `inX` to each row of `dataSet`, expressed as
    the row-wise L1 vector norm of the difference."""
    delta = inX - dataSet
    return np.linalg.norm(delta, ord=1, axis=1)
def manhattan_distance_3(vector1, vector2):
    """Manhattan distance between two points (SciPy 'cityblock' metric)."""
    # BUG FIX: bare `pdist` was never imported; use the `dist` module alias.
    return dist.pdist(np.vstack((vector1, vector2)), 'cityblock')
切比雪夫距離(Chebyshev Distance)
def chebyshev_distance_1(inX, dataSet):
    """Chebyshev (L-infinity) distance between `inX` and each row of
    `dataSet`: the largest absolute component difference per row."""
    return np.abs(inX - dataSet).max(axis=1)
def chebyshev_distance_2(inX, dataSet):
    """Chebyshev distance from `inX` to each row of `dataSet`, via the
    row-wise infinity norm."""
    # BUG FIX: `np.inff` does not exist (AttributeError at call time);
    # the infinity constant is `np.inf`.
    return np.linalg.norm(inX - dataSet, axis=1, ord=np.inf)
def chebyshev_distance_3(vector1, vector2):
    """Chebyshev distance between two points (SciPy 'chebyshev' metric)."""
    # BUG FIX: bare `pdist` was never imported; use the `dist` module alias.
    return dist.pdist(np.vstack((vector1, vector2)), 'chebyshev')
馬氏距離(Mahalanobis Distance)
- 要求樣本數(shù)要大于維數(shù)笋熬,否則無法求協(xié)方差矩陣
有M個(gè)樣本向量X1~Xm,協(xié)方差矩陣記為S菩鲜,均值記為向量μ园细,則其中樣本向量X到u的馬氏距離表示為:
其中向量之間的馬氏距離定義為:
若協(xié)方差矩陣是單位矩陣(各個(gè)樣本向量之間獨(dú)立同分布),則公式就成了歐式距離:
#方法一:根據(jù)公式求解
def Mahalanobis_distance_1(x, y):
    """Pairwise Mahalanobis distances between the observations of x and y.

    `x` and `y` are treated as two variables observed n times; observation k
    is the 2-D point (x[k], y[k]).  The 2x2 covariance matrix of the two
    variables is estimated from the data, so it must be invertible (requires
    more observations than dimensions and non-degenerate data).

    Returns a list of the C(n, 2) pairwise distances, in (i, j) order with i < j.
    """
    stacked = np.vstack([x, y])
    samples = stacked.T                        # one observation per row
    cov_inv = np.linalg.inv(np.cov(stacked))   # inverse of the 2x2 covariance
    num = samples.shape[0]
    result = []
    for a in range(num):
        for b in range(a + 1, num):
            diff = samples[a] - samples[b]
            # d = sqrt(diff^T * S^-1 * diff)
            result.append(np.sqrt(diff @ cov_inv @ diff))
    return result
#方法二:根據(jù)scipy庫求解
def Mahalanobis_distance_2(x, y):
    """Pairwise Mahalanobis distances via SciPy (same result as method 1).

    Stacks x and y as two variables, transposes so each row is one 2-D
    observation, and lets SciPy estimate the inverse covariance matrix.
    Requires more observations than dimensions.
    """
    X = np.vstack([x, y])
    XT = X.T
    # BUG FIX: bare `pdist` was never imported; use the `dist` module alias.
    d2 = dist.pdist(XT, 'mahalanobis')
    return d2
巴氏距離(Bhattacharyya Distance)
其中 BC = Σ sqrt(p_i * q_i) 為巴氏系數(Bhattacharyya coefficient),p、q 為兩個離散概率分布
def bhattacharyya_distance_1(vector1, vector2):
# 點(diǎn)與樣本集的巴氏距離:
BC = np.sum(np.sqrt(vector1 * vector2))
return -np.log(BC)
漢明距離(Hamming Distance)
兩個(gè)等長字符串s1與s2之間的漢明距離定義為將其中一個(gè)變?yōu)榱硗庖粋€(gè)所需要作的最小替換次數(shù)蛛勉。例如字符串“1111”與“1001”之間的漢明距離為2鹿寻。
def hamming_distance_1(vector1, vector2):
    """Hamming distance: the number of positions where the two equal-length
    vectors differ (counts nonzero entries of their difference)."""
    differing = np.nonzero(vector1 - vector2)[0]
    return len(differing)
def hamming_distance_2(vector1, vector2):
    """Normalized Hamming distance via SciPy: the *fraction* of differing
    positions (not the raw count that hamming_distance_1 returns)."""
    # BUG FIX: bare `pdist` was never imported; use the `dist` module alias.
    return dist.pdist(np.vstack((vector1, vector2)), 'hamming')
皮爾遜系數(shù)(Pearson Correlation Coefficient)
# Example: Pearson correlation between two paired samples.
from scipy.stats import pearsonr
x = [0.5, 0.4, 0.6, 0.3, 0.6, 0.2, 0.7, 0.5]
y = [0.6, 0.4, 0.4, 0.3, 0.7, 0.2, 0.5, 0.6]
pearsonr(x, y)
# Output: (r, p)
# r: correlation coefficient, in [-1, 1]
# p: p-value — the smaller it is, the more significant the correlation
信息熵(Informationentropy)
data1=np.array(['a','b','c','a','a','b'])
#計(jì)算信息熵的方法
def entropy(x):
    """Shannon entropy (base 2) of the empirical distribution of `x`.

    Parameters
    ----------
    x : 1-D ndarray of hashable values (e.g. class labels).

    Returns
    -------
    float: H(x) = -sum(p_i * log2(p_i)) over the distinct values of x.
    """
    value_set = set(x)  # distinct values present in x
    ent = 0.0
    for value in value_set:
        # empirical probability of this value
        p = float(x[x == value].shape[0]) / x.shape[0]
        ent -= p * np.log2(p)
    # BUG FIX: the original printed the entropy and implicitly returned None,
    # so callers could never use the computed value; return it instead.
    return ent
calc_ent(data1),data1
是類別數(shù)诽凌,
是第i類的概率
夾角余弦(Cosine)
def Cosine_distance_1(vector1, vector2):
    """Cosine of the angle between the two vectors.

    NOTE: this is a *similarity* in [-1, 1] (parallel -> 1, orthogonal -> 0),
    not a true distance, despite the function's name.
    """
    numerator = np.dot(vector1, vector2)
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return numerator / denominator
def Cosine_distance_2(vector1, vector2):
    """Cosine *distance* between two points via SciPy: 1 - cos(angle),
    so 0 for parallel vectors and 1 for orthogonal ones (unlike the cosine
    similarity returned by Cosine_distance_1)."""
    # BUG FIX: bare `pdist` was never imported; use the `dist` module alias.
    return dist.pdist(np.vstack((vector1, vector2)), 'cosine')
杰卡德相似系數(shù)(Jaccard similarity coefficient)
- 相似系數(shù)
- 杰卡德距離
def jaccard_similarity_coefficient(vector1, vector2):
    """Jaccard *distance* between two vectors (SciPy 'jaccard' metric):
    the proportion of disagreeing components among those where at least
    one of the two vectors is nonzero."""
    stacked = np.array([vector1, vector2])
    return dist.pdist(stacked, 'jaccard')
經(jīng)典貝葉斯公式
堪培拉距離(Canberra Distance)
def canberra_distance_1(vector1, vector2):
    """Canberra distance between two points:
    sum over i of |u_i - v_i| / (|u_i| + |v_i|)."""
    pair = np.array([vector1, vector2])
    return dist.pdist(pair, 'canberra')