# Load data ----
library(readr)
sampleTraits <- read_csv("2-datTraits_95.csv", col_names = TRUE)
# Alternative expression matrices kept for reference:
#datExpr1 <- read_csv("6-DEGset-95_DEseq_miRBAse_batch_scale.csv", col_names = TRUE)       # for RFE ("box")
#datExpr2 <- read_csv("5-DEGset_95_DEseq_miRBAse_batch.csv", col_names = TRUE)             # for LASSO
#datExpr3 <- read_csv("6-DEGset-95_DEseq_miRBAse_batch_scale_final.csv", col_names = TRUE) # for RFE
#datExpr4 <- read_csv("6-DEGset_95_DEseq_miRBAse_batch_final.csv", col_names = TRUE)       # for LASSO
datExpr5 <- read_csv("6-DEGset_95_DEseq_miRBAse_batch_final_log2fc.csv", col_names = TRUE) # for LASSO
datExpr <- datExpr5 #**************************  active dataset for this run
dim(datExpr)

# Transpose the matrix ----
# First column holds the miRNA identifiers; remaining columns are samples.
miR <- as.vector(unlist(datExpr[, 1]))
datExpr_t <- t(datExpr[, -1])
#datExpr_t[c(1:3), c(1:3)]
colnames(datExpr_t) <- miR
datExpr_t[c(1:3), c(1:3)] # after transpose: rows = samples, columns = miRNA variables
# 1. Wrapper method ----
# Recursive Feature Elimination (RFE)
library(caret)
#library(gam)
data.x <- datExpr_t[, seq_len(ncol(datExpr_t))] # predictor matrix (all miRNA columns)
# Outcome as a factor; "ASD" is the first level (spell out `levels` — the
# original `level =` relied on partial argument matching).
data.outcome <- factor(sampleTraits$Diagnosis, levels = c('ASD', 'CTL'))
set.seed(123)
filter1 <- rfe(x = data.x,
               y = data.outcome,
               # sizes: integer vector of candidate feature-subset sizes to evaluate
               sizes = seq(5, 16, 1),
               # functions options: rfFuncs (random forest), lmFuncs (linear regression),
               # nbFuncs (naive Bayes), treebagFuncs (bagged trees), caretFuncs (custom model)
               rfeControl = rfeControl(functions = rfFuncs,
                                       method = 'cv', # cross-validated (10 fold by default)
                                       # NOTE(review): `repeats` only takes effect with
                                       # method = 'repeatedcv'; it is ignored for 'cv'.
                                       repeats = 5))
plot(filter1, type = c("g", "o"))
print(filter1)
predictors(filter1) # names of the selected features
# 2. LASSO regression ----
# LASSO: The Least Absolute Shrinkage and Selection Operator
library(glmnet)
set.seed(123)
filter2 <- cv.glmnet(x = data.x,
                     y = data.outcome,
                     family = 'binomial',   # gaussian = continuous outcome; binomial = binary outcome
                     nfolds = 5,            # default is 10
                     type.measure = "auc")  # alternatives: deviance, mse, mae, class
filter3 <- glmnet(x = data.x,
                  y = data.outcome,
                  family = 'binomial')
plot(filter2); plot(filter3, xvar = 'lambda', label = TRUE)
filter2$lambda.min # lambda giving minimum CV error
# lambda.1se: the lambda of the simplest model within one standard error of lambda.min
filter2$lambda.1se
filter2.coef.lambda.1se <- coef(filter2, s = filter2$lambda.1se)
filter2.coef.lambda.1se # coefficients after shrinkage
# Keep only features with non-zero coefficients (intercept still included here)
filter2.1se.out <- filter2.coef.lambda.1se[which(filter2.coef.lambda.1se != 0), ]
filter2.1se.out <- round(filter2.1se.out, 4) # limit decimal places
filter2.1se.out; length(filter2.1se.out)
# 3. Random forest method (out-of-bag (OOB) error) ----
## https://blog.csdn.net/wishchin/article/details/52515516
# Install varSelRF on first use; requireNamespace avoids require()'s
# silent-FALSE-on-failure footgun.
if (!requireNamespace('varSelRF', quietly = TRUE)) {
  install.packages('varSelRF')
}
# NOTE(review): setwd() in a script is fragile (machine-specific path) and
# changes where all subsequent output files land — kept for compatibility.
setwd('C:/Users/xllix/Documents/WORK/2019論文準備/1-論文初稿/2-results')
library(varSelRF)
set.seed(123)
rf.vs1 <- varSelRF(data.x,
                   data.outcome,
                   c.sd = 1, mtryFactor = 1,
                   ntree = 5000, ntreeIterat = 2000, # default is 500
                   vars.drop.num = NULL, vars.drop.frac = 0.1,
                   whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE,
                   returnFirstForest = TRUE, fitted.rf = NULL, keep.forest = FALSE)
rf.vs1
# "selec.history" is the actual (misspelled) component name in varSelRF.
select.history <- rf.vs1$selec.history; names(select.history)
select.history[select.history$Number.Variables == rf.vs1$best.model.nvars, ]
selected.vars <- rf.vs1$selected.vars
plot(rf.vs1)
dev.off()
write.table(select.history, "7.4-for_svm_oob_log2fc2.txt", row.names = FALSE, quote = FALSE)
# 4. Feature summary ----
box <- predictors(filter1)                             # RFE-selected features
lasso <- row.names(as.data.frame(filter2.1se.out))[-1] # drop "(Intercept)" row
oob <- selected.vars                                   # varSelRF-selected features
sect <- Reduce(intersect, list(box, oob, lasso)); length(sect) # 7
print('Wrapper method / recursive feature elimination'); box
print('LASSO regression'); lasso
print('Random forest OOB'); oob
print('Intersection'); sect
# NOTE(review): write.table defaults to space-separated output even for .csv
# names; kept as-is so downstream readers of these files are unaffected.
write.table(box, "7.4-for_svm_box_log2fc.csv", row.names = FALSE) # no scaling needed for any of these
write.table(lasso, "7.4-for_svm_lasso_log2fc_5fc.csv", row.names = FALSE)
write.table(oob, "7.4-for_svm_oob_log2fc.csv", row.names = FALSE)
write.csv(sect, "7.5-for_svm_sec_log2fc.csv", row.names = FALSE)