0.數(shù)據(jù)預整理
這是從GEO數(shù)據(jù)庫下載的數(shù)據(jù)坟漱,GSE199335。
rm(list = ls())
library(tinyarray)
gse = "GSE199335"
geo = geo_download(gse,destdir=".",colon_remove = T)
geo$gpl
## [1] "GPL17400"
#View(geo$pd)
library(stringr)
Group = paste(geo$pd$genotype,geo$pd$age,sep="_") %>%
str_remove(" months of age| weeks of age") %>%
str_remove(" type") %>%
str_replace("/",".")
table(Group)
## Group
## R6.1_6 R6.2_9 wild_6 wild_9
## 3 4 4 4
Group = factor(Group,levels = c("wild_6", "wild_9", "R6.1_6", "R6.2_9"))
ids <- AnnoProbe::idmap(geo$gpl,destdir = tempdir(),type = "soft")
ids = na.omit(ids)
exp = trans_array(geo$exp,ids,from = "ID")
pd = geo$pd
Group
## [1] wild_6 wild_6 wild_6 R6.1_6 R6.1_6 wild_6 R6.1_6 wild_9 wild_9 wild_9
## [11] wild_9 R6.2_9 R6.2_9 R6.2_9 R6.2_9
## Levels: wild_6 wild_9 R6.1_6 R6.2_9
save(exp,Group,pd,file = "Dat.Rdata")
整理數(shù)據(jù)的代碼更哄,尤其是分組信息和探針注釋芋齿,因數(shù)據(jù)而異。 下面的代碼主要自WGCNA的官方文檔成翩,本強迫癥進行了一些改動觅捆。
1.表達矩陣數(shù)據(jù)整理
首先需要有至少15個樣本。 行是樣本麻敌,列是基因栅炒。 不推薦使用全部基因,計算量太大 也不推薦使用差異分析所得的差異基因术羔,包作者說的赢赊。 比較推薦的方法是按照方差或者mad去取前多少個基因,例如3500级历,5000释移,8000個,或者保留基因總數(shù)的前1/4鱼喉。無標準答案秀鞭。
rm(list = ls())
library(WGCNA)
library(tinyarray)
load("Dat.Rdata")
#exp = log2(geo$exp+1)
png("1.exp.png", width = 2000, height = 1200,res = 300)
boxplot(exp)
dev.off()
從這張圖可以看出數(shù)據(jù)是取過log的,且沒有異常樣本扛禽,可以用锋边。
datExpr0 = t(exp[order(apply(exp, 1, mad), decreasing = T)[1:5000],])
#datExpr0 = t(exp[order(apply(exp, 1, var), decreasing = T)[1:round(0.25*nrow(exp))],])
#datExpr0 = as.data.frame(t(exp))
datExpr0[1:4,1:4]
## Bpifa6 Scd3 Myh4 Opn1sw
## GSM5970616 5.808989 5.964354 8.777054 7.488333
## GSM5970617 4.272349 4.216835 9.052218 7.843838
## GSM5970618 2.558643 4.095240 8.907886 7.464708
## GSM5970619 2.081443 4.061555 7.401055 2.912478
## Bpifa6 Scd3 Myh4 Opn1sw
## GSM5970616 5.808989 5.964354 8.777054 7.488333
## GSM5970617 4.272349 4.216835 9.052218 7.843838
## GSM5970618 2.558643 4.095240 8.907886 7.464708
## GSM5970619 2.081443 4.061555 7.401055 2.912478
#rownames(datExpr0) = names(exp)[-c(1:8)]
1.1.基因過濾
主要看缺失值。GEO的芯片數(shù)據(jù)大多數(shù)沒缺失值编曼。
gsg = goodSamplesGenes(datExpr0, verbose = 3)
## Flagging genes and samples with too many missing values...
## ..step 1
## Flagging genes and samples with too many missing values...
## ..step 1
gsg$allOK # 返回TRUE則繼續(xù)
## [1] TRUE
## [1] TRUE
if (!gsg$allOK){
# 把含有缺失值的基因或樣本打印出來
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
# 去掉那些缺失值
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
1.2.樣本過濾
sampleTree = hclust(dist(datExpr0), method = "average")
png(file = "2.sampleClustering.png", width = 2000, height = 2000,res = 300)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
dev.off()
看有沒有自己單獨一個分支的樣本豆巨, 如果有異常值就需要去掉,根據(jù)聚類圖自己設置cutHeight 參數(shù)的值掐场。
if(F){
clust = cutreeStatic(sampleTree, cutHeight = 170, minSize = 10)
table(clust) # 0代表切除的往扔,1代表保留的
keepSamples = (clust!=0)
datExpr = datExpr0[keepSamples, ]
}
#沒有異常樣本就不需要去除
datExpr = datExpr0
2.表型信息整理
這個信息來自芯片數(shù)據(jù)的pData贩猎,也就是上面的pd。要求必須是數(shù)值型萍膛,要么像年齡那樣的數(shù)字觉增,要么搞成0租冠,1,或者是1,2幼苛,3等卵史。 這里利用了因子轉(zhuǎn)數(shù)字的方法舰讹,轉(zhuǎn)換成了數(shù)值娃承。
library(stringr)
traitData = data.frame(genotype = as.numeric(as.factor(pd$genotype)),
age = as.numeric(as.factor(pd$age)))
str(traitData)
## 'data.frame': 15 obs. of 2 variables:
## $ genotype: num 3 3 3 1 1 3 1 3 3 3 ...
## $ age : num 1 1 1 1 1 1 1 2 2 2 ...
## 'data.frame': 15 obs. of 2 variables:
## $ genotype: num 3 3 3 1 1 3 1 3 3 3 ...
## $ age : num 1 1 1 1 1 1 1 2 2 2 ...
table(traitData[,1])
##
## 1 2 3
## 3 4 8
##
## 1 2 3
## 3 4 8
#pd
#dim(traitData)
names(traitData)
## [1] "genotype" "age"
這個數(shù)據(jù)有用的表型只有兩列。
datTraits = traitData
sampleTree2 = hclust(dist(datExpr), method = "average")
# 用顏色表示表型在各個樣本的表現(xiàn): 白色表示低桩匪,紅色為高打瘪,灰色為缺失
traitColors = numbers2colors(datTraits, signed = FALSE)
# 把樣本聚類和表型繪制在一起
png(file = "3.Sample dendrogram and trait heatmap.png", width = 2000, height = 2000,res = 300)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
dev.off()
3.WGCNA正式開始
3.1 軟閾值的篩選
設置一系列軟閾值,范圍是1-30之間傻昙,后面的數(shù)沒必要全部畫闺骚,就seq一下。
powers = c(1:10, seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 5000.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 5000 of 5000
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.4030 2.440 0.876 1590.00 1630.000 2170.0
## 2 2 0.0902 0.561 0.765 727.00 751.000 1230.0
## 3 3 0.0154 -0.169 0.673 396.00 402.000 785.0
## 4 4 0.1470 -0.444 0.720 240.00 236.000 538.0
## 5 5 0.3300 -0.651 0.777 156.00 147.000 394.0
## 6 6 0.4950 -0.813 0.850 107.00 96.200 304.0
## 7 7 0.6600 -0.916 0.952 76.90 65.000 245.0
## 8 8 0.7470 -1.100 0.980 56.90 45.100 209.0
## 9 9 0.8020 -1.270 0.993 43.30 32.200 186.0
## 10 10 0.8400 -1.420 0.992 33.70 23.600 168.0
## 11 12 0.8970 -1.600 0.975 21.60 13.400 141.0
## 12 14 0.9230 -1.660 0.954 14.70 8.010 121.0
## 13 16 0.9450 -1.670 0.952 10.40 5.000 106.0
## 14 18 0.9630 -1.660 0.959 7.70 3.210 94.5
## 15 20 0.9610 -1.630 0.951 5.86 2.130 84.7
## 16 22 0.9700 -1.590 0.961 4.58 1.450 76.5
## 17 24 0.9660 -1.560 0.958 3.65 1.020 69.5
## 18 26 0.9650 -1.520 0.958 2.97 0.724 63.5
## 19 28 0.9690 -1.490 0.966 2.45 0.513 58.2
## 20 30 0.9670 -1.450 0.967 2.05 0.378 53.6
sft$powerEstimate
## [1] 12
這個結(jié)果就是推薦的軟閾值屋匕,拿到了可以直接用葛碧,無視下面的圖借杰。有的數(shù)據(jù)走到這一步會得到NA过吻,也就是沒得推薦。蔗衡。纤虽。那就要看下面的圖,選拐點绞惦。
根據(jù)我的經(jīng)驗逼纸,沒有推薦軟閾值,或者數(shù)字太大济蝉,后面跌跌撞撞走起來有些艱難哦杰刽,就得跑到前面重新調(diào)整表達矩陣里納入的基因了。 cex1一般設置為0.9王滤,不太合適(就是大多數(shù)軟閾值對應的縱坐標都達不到0.9)時贺嫂,可以設置為0.8或者0.85。一般不能再低了雁乡。
cex1 = 0.9
png(file = "4.Soft threshold.png", width = 2000, height = 1500,res = 300)
par(mfrow = c(1,2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=cex1,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
3.2 一步構建網(wǎng)絡
如果前面沒有得到推薦的軟閾值第喳,這里就要自己指定一個。根據(jù)上面的圖踱稍,選擇左圖縱坐標第一個達到上面設置的cex1值的軟閾值曲饱。
power = sft$powerEstimate
power
## [1] 12
net = blockwiseModules(datExpr, power = power,
TOMType = "unsigned",
minModuleSize = 30,
reassignThreshold = 0,
mergeCutHeight = 0.25,
deepSplit = 2 ,
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "testTOM",
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file testTOM-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1 genes from module 1 because their KME is too low.
## ..removing 1 genes from module 2 because their KME is too low.
## ..removing 1 genes from module 4 because their KME is too low.
## ..removing 1 genes from module 6 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
后面的結(jié)果如果不理想悠抹,比如劃分的模塊太少,或者青色扩淀、灰色的模塊占到大多數(shù)楔敌,其他顏色都很少,或者顏色模塊聚類是凌亂的驻谆,可以倒回來調(diào)整幾個參數(shù)梁丘。
deepSplit 默認2,調(diào)整劃分模塊的敏感度旺韭,值越大氛谜,越敏感,得到的模塊就越多区端; minModuleSize 默認30值漫,參數(shù)設置最小模塊的基因數(shù),值越小织盼,小的模塊就會被保留下來杨何; mergeCutHeight 默認0.25,設置合并相似性模塊的距離沥邻,值越小危虱,就越不容易被合并,保留下來的模塊就越多唐全。 https://zhuanlan.zhihu.com/p/34697561
可以試試調(diào)整埃跷,但是。邮利。怎么說呢弥雹,主要看選擇的基因是否給力,不給力的話調(diào)整了也就稍微好一點點延届。
不是很有必要去嘗試分步法構建網(wǎng)絡剪勿,得到的結(jié)果一樣,可以調(diào)整的參數(shù)上面也都有方庭。
此處展示得到了多少模塊厕吉,每個模塊里面有多少基因。
table(net$colors)
##
## 0 1 2 3 4 5 6 7 8 9
## 200 785 685 654 628 626 527 447 253 195
mergedColors = labels2colors(net$colors)
png(file = "5.DendroAndColors.png", width = 2000, height = 1200,res = 300)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
好的結(jié)果就是每個顏色都差不多在一起械念,(青色和灰色不在考慮范圍內(nèi))头朱。然后青色和灰色的基因不要太多。
因為灰色代表沒有合適的聚類订讼,青色是基因數(shù)量的模塊髓窜,比如你輸入5000個基因,其中3000個都屬于青色,剩下的模塊基因數(shù)量太少寄纵,就很難受了鳖敷。
3.3 保存每個模塊對應的基因
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs
geneTree = net$dendrograms[[1]]
gm = data.frame(net$colors)
gm$color = moduleColors
head(gm)
## net.colors color
## Bpifa6 7 black
## Scd3 7 black
## Myh4 9 magenta
## Opn1sw 8 pink
## Mb 9 magenta
## Myh13 9 magenta
genes = split(rownames(gm),gm$color)
save(genes,file = "genes.Rdata")
我這里把每個模塊對應的基因存為了Rdata,用于數(shù)據(jù)挖掘下一步需求程拭,提取基因定踱。比如你需要某模塊的基因與差異基因取交集等。
3.4 模塊與表型的相關性
計算基因與表型的相關性矩陣
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#熱圖
png(file = "6.labeledHeatmap.png", width = 2000, height = 2000,res = 300)
# 設置熱圖上的文字(兩行數(shù)字:第一行是模塊與各種表型的相關系數(shù)恃鞋;
# 第二行是p值)
# signif 取有效數(shù)字
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# 然后對moduleTraitCor畫熱圖
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed (50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
我們希望找到和每個表型相關性較強的模塊崖媚,正負相關都可。相關系數(shù)越大越好恤浪,如果能有個0.8畅哑,那結(jié)論就比較穩(wěn)啦!沒有的話0.6或0.7幾也行水由,再小就不要拿來糊弄人了荠呐。
3.5. GS與MM
GS代表模塊里的每個基因與形狀的相關性
MM代表每個基因和所在模塊之間的相關性,表示是否與模塊的趨勢一致砂客。
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
第幾列的表型是最關心的泥张,下面的i就設置為幾。
與關心的表型相關性最高的模塊賦值給下面的module鞠值。
i = 2
#module = "pink"
module = "turquoise"
assign(colnames(traitData)[i],traitData[i])
instrait = eval(parse(text = colnames(traitData)[i]))
geneTraitSignificance = as.data.frame(cor(datExpr, instrait, use = "p"))
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(instrait), sep="")
names(GSPvalue) = paste("p.GS.", names(instrait), sep="")
png(file = paste0("7.MM-GS-scatterplot.png"), width = 2000, height = 2000,res = 300)
column = match(module, modNames) #找到目標模塊所在列
moduleGenes = moduleColors==module #找到模塊基因所在行
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
我們希望看到強悍的正相關媚创。數(shù)據(jù)挖掘里面,可以把整個模塊的基因拿出來和別的步驟結(jié)果取交集了彤恶。
也可以通過這里找到GS和MM都大的基因钞钙。
f = data.frame(GS = abs(geneModuleMembership[moduleGenes, column]),
MM = abs(geneTraitSignificance[moduleGenes, 1]))
rownames(f) = rownames(gm[moduleGenes,])
head(f)
## GS MM
## Rnu1b1 0.8667593 0.9482171
## ND6 0.8599513 0.9310548
## Vmn1r127 0.8839290 0.8336949
## Snora62 0.8862433 0.9287403
## Crygf 0.8759284 0.9884732
## Snord68 0.8733251 0.9027953
3.6.TOM
用基因相關性熱圖的方式展示加權網(wǎng)絡,每行每列代表一個基因粤剧。 一般取400個基因畫就夠啦歇竟,拿全部基因去做電腦要燒起來了挥唠。
就想看看對角線附近紅彤彤的小方塊抵恋,以及同一個顏色基本都在一起,快樂宝磨。
nSelect = 400
set.seed(10)
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6)
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
# 再計算基因之間的距離樹(對于基因的子集弧关,需要重新聚類)
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
library(gplots)
myheatcol = colorpanel(250,'red',"orange",'lemonchiffon')
png(file = "8.Sub400-netheatmap.png", width = 2000, height = 2000,res = 300)
plotDiss = selectTOM^7
diag(plotDiss) = NA #將對角線設成NA,在圖形中顯示為白色的點唤锉,更清晰顯示趨勢
TOMplot(plotDiss, selectTree, selectColors, col=myheatcol,main = "Network heatmap plot, selected genes")
dev.off()
3.7 模塊與表型的相關性
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
MET = orderMEs(cbind(MEs, instrait))
png(file = "9.Eigengene-dengro-heatmap.png", width = 2000, height = 2000,res = 300)
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(4,4,1,2), cex.lab = 0.8, xLabelsAngle
= 90)
dev.off()
也可以把上面的圖分開來畫
png(file = "10.Eigengene-dendrogram.png", width = 2000, height = 2000,res = 300)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
plotHeatmaps = FALSE)
dev.off()
png(file = "11.Eigengene-heatmap.png", width = 2000, height = 2000,res = 300)
par(cex = 1.0)
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(4,5,2,2),
plotDendrograms = FALSE, xLabelsAngle = 90)
dev.off()