使用pbmc數(shù)據(jù)集做演示,使用矩陣是@data矩陣硕盹,基因是2000高變基因褒纲,細(xì)胞是所有細(xì)胞涩馆。(pbmc的數(shù)據(jù)不適合拿來跑WGCNA框都,這里只做為演示确丢。)
1. 導(dǎo)入數(shù)據(jù)
library(WGCNA)
pbmc <- readRDS("pbmc.rds")
datadf <- as.matrix(pbmc@assays$RNA@data )
dim(datadf)
df <- datadf[intersect(Seurat::VariableFeatures(pbmc),rownames(datadf)),]
dim(df)
df[1:4,1:4]
class(df)
# [1] "matrix" "array"
# 轉(zhuǎn)換為樣品在行茫多,基因在列的矩陣
dataExpr <- as.data.frame(t(df))
dim(dataExpr)
head(dataExpr)[,1:8]
nGenes = ncol(dataExpr)
nSamples = nrow(dataExpr)
WGCNA基本參數(shù)設(shè)置
type = "unsigned" # 官方推薦 "signed" 或 "signed hybrid"
corType = "pearson" # 相關(guān)性計算 官方推薦 biweight mid-correlation & bicor corType: pearson or bicor
corFnc = ifelse(corType=="pearson", cor, bicor)
corFnc
maxPOutliers = ifelse(corType=="pearson",1,0.05) # 對二元變量祈匙,如樣本性狀信息計算相關(guān)性時, # 或基因表達(dá)嚴(yán)重依賴于疾病狀態(tài)時天揖,需設(shè)置下面參數(shù)
# 關(guān)聯(lián)樣品性狀的二元變量時夺欲,設(shè)置
robustY = ifelse(corType=="pearson",T,F)
2. 檢測缺失值和離群樣本
#檢測缺失值
gsg = goodSamplesGenes(dataExpr, verbose = 3)
gsg$allOK
gsg$goodSamples
# 查看是否有離群樣品
sampleTree = hclust(dist(dataExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
3. 挑選軟閾值
powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(dataExpr, powerVector=powers,
networkType="signed", verbose=5)
par(mfrow = c(1,2))
cex1 = 0.9
# 橫軸是Soft threshold (power),縱軸是無標(biāo)度網(wǎng)絡(luò)的評估參數(shù)今膊,數(shù)值越高些阅,
# 網(wǎng)絡(luò)越符合無標(biāo)度特征 (non-scale)
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# 篩選標(biāo)準(zhǔn)。R-square=0.85
abline(h=0.85,col="red")
# Soft threshold與平均連通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
其他方法挑選power值
# 自動挑選軟閾值
power = sft$powerEstimate
softPower = power
softPower
# [1] 6
# 經(jīng)驗power值
if (is.na(power)){
power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
ifelse(type == "unsigned", 6, 12))
)
)
}
power
# [1] 6
4. 構(gòu)建共表達(dá)網(wǎng)絡(luò)
#一步法網(wǎng)絡(luò)構(gòu)建:One-step network construction and module detection##
cor <- WGCNA::cor
net = blockwiseModules(dataExpr, power = power, maxBlockSize = nGenes,#nGenes
TOMType = "unsigned", minModuleSize = 10,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs=TRUE, corType = corType,
maxPOutliers=maxPOutliers, loadTOMs=TRUE,
saveTOMFileBase = paste0("dataExpr", ".tom"),
verbose = 3)
# power: 上一步計算的軟閾值
# maxBlockSize: 計算機(jī)能處理的最大模塊的基因數(shù)量 (默認(rèn)5000)项郊;
# 4G內(nèi)存電腦可處理8000-10000個,16G內(nèi)存電腦可以處理2萬個姻政,32G內(nèi)存電腦可
# 以處理3萬個
# 計算資源允許的情況下最好放在一個block里面呆抑。
# corType: pearson or bicor
# numericLabels: 返回數(shù)字而不是顏色作為模塊的名字,后面可以再轉(zhuǎn)換為顏色
# saveTOMs:最耗費時間的計算汁展,存儲起來鹊碍,供后續(xù)使用
# mergeCutHeight: 合并模塊的閾值,越大模塊越少
# type = unsigned
計算過程
Calculating module eigengenes block-wise from all genes
Flagging genes and samples with too many missing values...
..step 1
..Working on block 1 .
TOM calculation: adjacency..
..will not use multithreading.
Fraction of slow calculations: 0.000000
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
..saving TOM for block 1 into file dataExpr.tom-block.1.RData
....clustering..
....detecting modules..
....calculating module eigengenes..
....checking kME in modules..
..removing 1 genes from module 2 because their KME is too low.
..merging modules that are too close..
mergeCloseModules: Merging modules whose distance is less than 0.25
Calculating new MEs...
查看一下返回的net結(jié)果
table(net$colors)
# 0 1 2 3 4
# 1855 59 52 17 17
net$unmergedColors
head(net$MEs)
# ME4 ME2 ME1 ME3 ME0
# AAACATACAACCAC-1 0.004299074 -0.0024229364 -0.012777424 -0.012259802 -0.011847288
# AAACATTGAGCTAC-1 -0.007987395 -0.0024229364 -0.007647816 0.036779648 0.003124504
# AAACATTGATCAGC-1 -0.005377686 -0.0024229364 -0.010071728 -0.016748944 -0.001960111
# AAACCGTGCTTCCG-1 -0.005498834 -0.0002653717 0.037286420 0.018902740 0.027428832
# AAACCGTGTATGCG-1 0.053098755 -0.0024229364 -0.009787707 -0.007069281 -0.002900237
# AAACGCACTGGTAC-1 -0.010385161 -0.0024229364 -0.008945768 -0.005881210 -0.008029291
net$goodSamples
net$goodGenes
net$TOMFiles
# [1] "dataExpr.tom-block.1.RData" "dataExpr.tom-block.2.RData" "dataExpr.tom-block.3.RData"
net$blockGenes
net$blocks
net$MEsOK
net$MEs是所有細(xì)胞*5個模塊的矩陣(可以類比PCA的PC來理解)食绿。??這個矩陣是非常重要的侈咕,后面做隨機(jī)森林也是用的這個矩陣。
這里用不同的顏色來代表那些所有的模塊器紧,其中灰色默認(rèn)是無法歸類于任何模塊的那些基因耀销,如果灰色模塊里面的基因太多,那么前期對表達(dá)矩陣挑選基因的步驟可能就不太合適铲汪。
## 灰色的為**未分類**到模塊的基因熊尉。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
moduleColors
# Plot the dendrogram and the module colors underneath
# 如果對結(jié)果不滿意,還可以recutBlockwiseTrees掌腰,節(jié)省計算時間
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
灰色模塊一般如果基因不多的話,在后續(xù)分析的時候去掉就可以了齿梁。但是這個里面太多了催植,應(yīng)該是前面基因的選擇有點問題,還有就是用所有細(xì)胞來做的WGCNA勺择,可能矩陣還是太稀疏了创南。
5. 網(wǎng)絡(luò)可視化 (TOM plot)
# module eigengene, 可以繪制線圖,作為每個模塊的基因表達(dá)趨勢的展示
MEs = net$MEs
### 不需要重新計算省核,改下列名字就好
### 官方教程是重新計算的稿辙,起始可以不用這么麻煩
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(
as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
# 根據(jù)基因間表達(dá)量進(jìn)行聚類所得到的各模塊間的相關(guān)性圖
# marDendro/marHeatmap 設(shè)置下、左气忠、上邓深、右的邊距
head(MEs_col)
?plotEigengeneNetworks
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2),
plotDendrograms = T,
xLabelsAngle = 90)
6. 模塊-表型數(shù)據(jù)關(guān)聯(lián)
datTraits=pbmc@meta.data
MEs0 = moduleEigengenes(dataExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p"); #核心代碼,計算了每個模塊和每個基因的相關(guān)性
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
# 通過相關(guān)值對每個關(guān)聯(lián)進(jìn)行顏色編碼
sizeGrWindow(10,6)
# 展示模塊與表型數(shù)據(jù)的相關(guān)系數(shù)和 P值
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# 用熱圖的形式展示相關(guān)系數(shù)
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
7. 隨機(jī)森林篩選重要的模塊
identical(rownames(MEs_col),rownames(datTraits))
exp <- cbind(MEs_col,datTraits)
View(exp)
pbmc.rf <- randomForest(cell_type ~ ., data=exp, importance=TRUE,
proximity=TRUE)
library(randomForest)
pbmc.rf <- randomForest(cell_type ~ ., data=exp, importance=TRUE,
proximity=TRUE)
print(pbmc.rf)
pbmc.rf$importance
varImpPlot(pbmc.rf, main = "Top 30 - variable importance")
思路
1. 需要注意的是,整合樣本做WGCNA分析分析得到的模塊主要和細(xì)胞類型有關(guān)聯(lián),因為此時的cluster代表不同的細(xì)胞類型袱瓮,此時做WGCNA感覺作用不大缤骨,因為不同的細(xì)胞類型本身就應(yīng)該有獨特的一套基因列表,但是我們在做同一細(xì)胞類型再分群的時候尺借,這個分析的用處就會很大绊起,因為得到的模塊與同一細(xì)胞類型不同的subcluster相關(guān)聯(lián),得到的就是一些在各個亞群高度協(xié)調(diào)的模塊基因燎斩,這個時候把得到的模塊基因全部做下游分析虱歪,意義非常大,從另外一個側(cè)面表現(xiàn)了細(xì)胞類型內(nèi)部的異質(zhì)性栅表,如果是多樣本整合的關(guān)系(比如正常和疾菜癖伞),同一細(xì)胞類型的不同樣本關(guān)聯(lián)得到的gene模塊列表怪瓶,就可以得到處理組和對照組關(guān)聯(lián)度高的模塊列表萧落,新的角度來看待疾病的發(fā)生。
2. 此外洗贰,我們在做分析的時候找岖,同樣是多個樣本(正常和疾病兩組),同一細(xì)胞類型敛滋,我們先做正常組(或者疾病組)的WGCNA宣增,得到的基因模塊反映到疾病組(正常組),這個時候就會看出來明顯的差異矛缨,本應(yīng)該高度關(guān)聯(lián)的模塊可能就消失了,這個高度關(guān)聯(lián)的模塊的功能也隨之產(chǎn)生了變化帖旨;還有就是同一細(xì)胞類型箕昭,兩組樣本先分別做WGCNA,得到的基因模塊進(jìn)行平行對比解阅,那么得到的就是對疾病發(fā)生新的認(rèn)識落竹。