# K-means clustering of the mlbench Vehicle data, letting fpc::kmeansruns
# pick the number of clusters.
library(fpc)      # provides kmeansruns()
library(mlbench)  # provides the Vehicle data set
library(ggplot2)

data("Vehicle", package = "mlbench")

# Column 19 is the class label; every other column is used as a feature.
data <- Vehicle[, -19]
set.seed(1111)

# Merge the "saab" and "opel" levels into a single "car" level.
class <- Vehicle[, 19]
levels(class)[levels(class) %in% c("saab", "opel")] <- "car"

# Bar chart of the merged class counts. The merged factor is passed as the
# plot data explicitly: Vehicle itself has no `class` column, so mapping
# x = class against Vehicle only worked through a global-variable lookup.
ggplot(data = data.frame(class = class), aes(x = class)) +
  geom_bar()

# Try 1 to 8 clusters and keep the solution that is best under the "ch"
# criterion. (The stray trailing comma that passed an empty argument has been
# removed.)
fit <- kmeansruns(
  data,
  krange = 1:8,
  criterion = "ch",
  runs = 100,
  scaledata = TRUE,
  critout = TRUE
)
attributes(fit)

# Cross-tabulate the cluster assignment against the merged class labels.
table(fit$cluster, class)
# Recorded output:
#      class
#       bus car van
#   1   161 195 195
#   2    57 234   4
# Cluster 2 may represent "car"?
# Fuzzy k-means: allows a sample to belong to more than one cluster.
library(fclust)

# Fuzzy k-means on the feature table `data`, compared against the merged
# `class` labels; both objects are created earlier in this script.
fit2 <- FKM(
  data,
  k = 2,
  m = 2,
  RS = 10,
  stand = 1
)
attributes(fit2)

# Inspect the clustering membership probabilities.
head(fit2$clus)

# Cross-tabulate the first column of `clus` against the class labels.
table(fit2$clus[, 1], class)
# Recorded output:
#      class
#       bus car van
#   1   158 186 195
#   2    60 243   4

# Cluster validity indices, such as the silhouette value, can be used to
# assess the best value of K.
Fclust.index(fit2, index = "SIL.F")
# Recorded output:
# The default value alpha=1 has been set for computing SIL.F
# [1] 0.6353147
# Systematic cluster analysis, i.e. hierarchical clustering.
library(pvclust)

# Standardise `data` (created earlier in this script), then run hierarchical
# clustering with bootstrap resampling.
# NOTE(review): per the pvclust documentation the columns of the input are
# what gets clustered (here the features, not the vehicles) - confirm this is
# the intended analysis.
data <- scale(data)
set.seed(2021)
fit3 <- pvclust(
  data,
  method.dist = "euclidean",
  method.hclust = "ward.D",
  nboot = 5000
)
# The call above performs the cluster analysis through hclust().
print(fit3)
# In the printed result, `au` is the approximately unbiased p-value, `bp` is
# the bootstrap probability p-value, and `se.au` is the estimate reported for
# the AU p-value itself.
plot(fit3)
pvrect(fit3, alpha = 0.95)
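# Subsets are drawn at random from the data set and a cluster analysis is run on each of them. This is repeated a large number of times, and the proportion of runs in which each cluster appears is computed (the bootstrap probability, BP).
# Resampling at several different sample sizes is then used to estimate a p-value for each cluster, which gives the AU p-value (approximately unbiased p-value).
# A cluster with a high AU value is strongly supported by the data.
# 95% level for the AU p-value.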
# Model-based clustering, which assumes the data follow a Gaussian distribution.
library(mclust)
library(dplyr)

# Model-based clustering of the Vehicle features: fit Gaussian mixtures with
# 1 to 8 components for several covariance models and compare them by BIC.
set.seed(1111)
data <- Vehicle[, -19]

# Candidate component counts and covariance model names, defined once so the
# fit and the BIC table below cannot drift apart.
component_counts <- 1:8
model_names <- c("EII", "VII", "EEI", "EVI", "VEI", "VVI")

# Fit mixtures with 1-8 Gaussian components; the best model is chosen by the
# Bayesian information criterion.
fit4 <- Mclust(
  as.matrix(data),
  G = component_counts,
  modelNames = model_names
)
# Run `?Mclust` interactively for the documentation; it is not executed here
# so that the script does not open a help page on every run.

attributes(fit4)
fit4$modelName  # best model
fit4$BIC        # BIC values used to choose the number of components
head(fit4$BIC)

bic <- as.matrix(fit4$BIC)
attributes(bic)

# Rebuild the BIC table as a plain numeric matrix, dropping the extra
# attributes carried by the BIC object. The shape and names are taken from
# the object itself instead of a hard-coded 8 x 6 layout.
bic <- matrix(
  data = as.vector(bic),
  nrow = nrow(bic),
  ncol = ncol(bic),
  byrow = FALSE,
  dimnames = dimnames(bic)
)
bic <- as.data.frame(bic)
bic <- mutate(bic, num = component_counts)

# Long format (one row per component count and model) so that a single pair
# of layers draws every model, instead of one line/point pair per model.
bic_long <- data.frame(
  num = rep(bic$num, times = length(model_names)),
  model = rep(model_names, each = nrow(bic)),
  BIC = unlist(bic[model_names], use.names = FALSE)
)

ggplot(bic_long, aes(x = num, y = BIC, colour = model)) +
  geom_line() +
  geom_point() +
  ylab("BIC")  # the plotted values are BIC; the label previously read "GIC"
# The curves level off from 2 components onward, so 2 or 3 is chosen as the
# number of clusters.