1. The Principle of PCA
import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
X, y = datasets.load_iris(return_X_y=True)
pca = PCA(n_components=0.95, whiten=True)  # whiten: whitening, i.e. standardize each
# output component (variance 1, mean 0), like a standard normal distribution
# n_components=0.95 -- keep enough components to explain 95% of the variance
X_pca = pca.fit_transform(X)
X_pca.shape
display(X_pca.std(axis = 0))
display(X_pca.mean(axis = 0))
array([0.99666109, 0.99666109])
array([-1.42108547e-15, -1.81188398e-15])
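The standard deviations come out as 0.9967 rather than exactly 1. Presumably sklearn whitens using the unbiased (ddof=1) variance, so the ddof=0 std of the output is sqrt((n-1)/n); a quick check of that assumption:
np.sqrt(149 / 150)  # ~0.99666, matching the std printed above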
Now let's implement PCA by hand, using the eigenvalues and eigenvectors of a matrix, a concept from linear algebra.
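As a tiny refresher, here is a sketch of the eigendecomposition we will rely on: for a symmetric matrix M, np.linalg.eigh returns (values, vectors) such that M @ v equals lambda * v for each eigenvalue/eigenvector pair (the toy matrix M below is just an illustration):
M = np.array([[2.0, 1.0],
              [1.0, 2.0]])          # a toy symmetric matrix
values, vectors = np.linalg.eigh(M)  # eigenvalues in ascending order
for lam, v in zip(values, vectors.T):
    print(np.allclose(M @ v, lam * v))  # True, True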
Step 1: center the data
# take the mean of each feature;
# the data has four features, so this returns four numbers
A = X - X.mean(axis = 0)
Step 2: compute the covariance matrix
# covariance vs. variance:
# variance is a special case of covariance,
# just as a circle is a special case of an ellipse
# variance describes a single attribute on its own;
# covariance compares two attributes against each other
# (e.g. how popular a student is with classmates, one attribute,
#  versus how awkward that same student is, another attribute --> covariance)
# rowvar=True (the default) computes covariance between rows
B = np.cov(A, rowvar=False)  # rowvar=False uses the columns: in a 2-D array, rows are samples and columns are attributes (as in Excel, pandas, MySQL)
# the result is a 4 x 4 matrix (16 entries)
B
array([[ 0.68569351, -0.042434 , 1.27431544, 0.51627069],
[-0.042434 , 0.18997942, -0.32965638, -0.12163937],
[ 1.27431544, -0.32965638, 3.11627785, 1.2956094 ],
[ 0.51627069, -0.12163937, 1.2956094 , 0.58100626]])
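Sanity check: since A is already centered, the covariance matrix is simply A^T A / (n - 1), which should reproduce np.cov:
np.allclose(A.T.dot(A) / (len(A) - 1), B)  # True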
# the variance formula: np.var divides by n (ddof=0),
# so rescale by n/(n-1) to match the unbiased estimate np.cov uses
np.var(X[:,0])/149*150
0.6856935123042507
((X[:,0] - X[:,0].mean())**2).sum()/149
0.6856935123042507
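The same value drops out of np.cov directly, because np.cov defaults to the unbiased ddof=1 estimate while np.var defaults to ddof=0; that is the whole reason for the 150/149 correction above:
np.allclose(np.cov(X[:, 0]), X[:, 0].var(ddof=1))  # True
np.allclose(X[:, 0].var(ddof=1), B[0, 0])          # True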
Step 3: compute the eigenvalues and eigenvectors
value,vector = np.linalg.eigh(B)  # eigh is for symmetric matrices; eigenvalues come back in ascending order
display(value,vector)
array([0.02383509, 0.0782095 , 0.24267075, 4.22824171])
array([[ 0.31548719, 0.58202985, 0.65658877, -0.36138659],
[-0.3197231 , -0.59791083, 0.73016143, 0.08452251],
[-0.47983899, -0.07623608, -0.17337266, -0.85667061],
[ 0.75365743, -0.54583143, -0.07548102, -0.3582892 ]])
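A quick verification that this really is an eigendecomposition of B (the eigenvectors are the columns of vector):
np.allclose(B.dot(vector), vector * value)  # broadcasting scales column j by value[j]; True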
Step 4: select eigenvectors according to the weight of their eigenvalues
C = vector[:,2:]  # keep the last two columns: the eigenvectors of the two largest eigenvalues
v = value[::-1]   # reverse the eigenvalues into descending order
v
array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])
# cumulative explained-variance ratio
v.cumsum()/v.sum()
array([0.92461872, 0.97768521, 0.99478782, 1. ])
pca = PCA(n_components=0.93)  # a lower bound: components are kept until the explained variance first exceeds it
pca.fit_transform(X).shape
(150, 2)
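Cross-check: sklearn's explained_variance_ratio_ should match the leading normalized eigenvalues we just computed by hand, an assumption worth verifying:
np.allclose(pca.explained_variance_ratio_, (v / v.sum())[:2])  # True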
Step 5: project the data with a matrix product
D = A.dot(C)
r,c = D.shape
coef = [-1 if (i+1)%2 == 0 else 1 for i in range(c)]  # flip the sign of the even-numbered columns (eigenvector signs are arbitrary)
E = (D*coef)[:,[1,0]]  # reorder so the component with the largest eigenvalue comes first
E[:5]
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943],
[-2.74534286, -0.31829898],
[-2.72871654, 0.32675451]])
X_pca[:5]
array([[-1.30533786, 0.64836932],
[-1.31993521, -0.35930856],
[-1.40496732, -0.29424412],
[-1.33510889, -0.64613986],
[-1.32702321, 0.6633044 ]])
Step 6: standardize. The values above differ from X_pca only because whiten=True has not yet been applied to E.
F = (E - E.mean(axis = 0))/E.std(axis = 0)
F[:5]
array([[-1.30971087, 0.65054141],
[-1.32435711, -0.36051227],
[-1.40967409, -0.29522986],
[-1.33958163, -0.64830449],
[-1.33146886, 0.66552653]])
X_pca[:5]
array([[-1.30533786, 0.64836932],
[-1.31993521, -0.35930856],
[-1.40496732, -0.29424412],
[-1.33510889, -0.64613986],
[-1.32702321, 0.6633044 ]])
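F and X_pca now agree to about the third decimal. The residual gap is presumably the ddof mismatch again: our std uses ddof=0 while sklearn whitens with the ddof=1 variance. If that assumption is right, rescaling closes the gap:
np.allclose(F * np.sqrt(149 / 150), X_pca)  # True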
二兜辞、代碼實(shí)例
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
# dimensionality reduction: from many features to few, from complex to simple
# PCA: principal component analysis
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
X, y = datasets.load_iris(return_X_y=True)
# like moving house: you throw away a lot of things you no longer need!
X.shape  # 150 samples, 4 attributes; inevitably some attributes matter more than others, and we want to drop the unimportant ones
(150, 4)
# n_components: how many of the most important components to keep
# whiten: whitening, i.e. standardization
# n_components=0.9 keeps components until 90% of the variance is explained
# (e.g. one component contributing 50% plus another contributing 41%)
pca = PCA(n_components=0.98, whiten=False)
'''y : None
    Ignored variable.'''
# y is accepted but ignored, reserved so that later versions can make use of it;
# the sklearn API deliberately keeps the y slot for forward compatibility, unused for now
pca.fit(X)
X_pca = pca.transform(X)  # apply the transformation
# does reducing the dimensionality change the number of samples (150)? No:
X_pca.shape  # PCA reduces the data from 150 x 4 down to 150 x 3
(150, 3)
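Why three components instead of two? Two components explain only about 97.77% of the variance, below the 0.98 threshold, so a third is pulled in; the fitted model confirms this:
pca.explained_variance_ratio_.cumsum()  # the third entry is the first to reach 0.98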
Train and predict on the data without dimensionality reduction
knn = KNeighborsClassifier(n_neighbors=5)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 1024)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.9666666666666667
Performance with the dimensionality-reduced data
knn = KNeighborsClassifier(n_neighbors=5)
X_train,X_test,y_train,y_test = train_test_split(X_pca,y,test_size = 0.2,random_state = 1024)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.9666666666666667
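The reduced data scores just as well here. As a hypothetical extension (not part of the original experiment), one could sweep the number of retained components and watch how the KNN score responds; a sketch under the same split settings:
for n in [1, 2, 3, 4]:
    X_n = PCA(n_components=n, whiten=False).fit_transform(X)
    Xtr, Xte, ytr, yte = train_test_split(X_n, y, test_size=0.2, random_state=1024)
    print(n, KNeighborsClassifier(n_neighbors=5).fit(Xtr, ytr).score(Xte, yte))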