前言:加權(quán)基因共表達(dá)網(wǎng)絡(luò)分析(weighted gene co-expression network analysis锄禽,WGCNA)是研究相似基因表達(dá)模式的一種方法不撑,通過(guò)尋找協(xié)同表達(dá)的基因模塊,研究這些模塊與臨床表型之間的關(guān)系枝哄,并識(shí)別網(wǎng)絡(luò)中的關(guān)鍵基因(Langfelder & Horvath, 2008)展辞。普通的共表達(dá)網(wǎng)絡(luò)分析,通過(guò)絕對(duì)的閾值對(duì)基因相關(guān)性進(jìn)行篩選腥光,將會(huì)導(dǎo)致一定的信息丟失,WGCNA通過(guò)引入加權(quán)的相關(guān)系數(shù)來(lái)更加全面的體現(xiàn)基因表達(dá)之間的相關(guān)性糊秆,發(fā)掘其中的生物學(xué)意義武福。我們擬對(duì)AGING上發(fā)表的“Identification of biomarkers related to CD8+ T cell infiltration with gene co-expression network in clear cell renal cell carcinoma”(Lin et al., 2020)文章進(jìn)行結(jié)果重現(xiàn)。
具體實(shí)現(xiàn)過(guò)程如下:
1.1 數(shù)據(jù)獲取
從GEO網(wǎng)站下載GSE73731數(shù)據(jù)痘番,該數(shù)據(jù)中包括了265個(gè)ccRCC樣本的測(cè)序數(shù)據(jù)捉片。
1.2 數(shù)據(jù)預(yù)處理
###### 讀入數(shù)據(jù)并進(jìn)行矯正
### 加載R包,讀入數(shù)據(jù)
library(GEOquery)
library(ggplot2)
library(reshape2)
library(limma)
options(stringsAsFactors=F)
gset <- getGEO(filename='GSE73731_series_matrix.txt',AnnotGPL=TRUE,destdir='./')
exp.mat <- exprs(gset)
sample.info.dat <- pData(gset)
gene.info.dat <- fData(gset)
# 查看樣本間表達(dá)值分布汞舱,如圖1所示
boxplot(exp.mat)
# 芯片間表達(dá)數(shù)據(jù)矯正
exp.norm.mat <- normalizeBetweenArrays(exp.mat)
### 去除沒(méi)有基因symbol的探針
gene.filtered <- (!grepl("http:///", gene.info.dat[,"Gene symbol"])) & (gene.info.dat[,"Gene symbol"]!="")
gene.symbol.dat <- data.frame(GeneSymbol=gene.info.dat[gene.filtered,"Gene symbol"])
rownames(gene.symbol.dat) <- rownames(gene.info.dat)[gene.filtered]
exp.norm.mat <- exp.norm.mat[gene.filtered,]
### 對(duì)于對(duì)應(yīng)相同基因symbol的多個(gè)探針伍纫,取各個(gè)探針的平均表達(dá)值
symbol.mean.list <- by(exp.norm.mat,gene.symbol.dat$GeneSymbol,colMeans)
symbol.mean.mat <- matrix(unlist(symbol.mean.list), byrow=T, ncol=ncol(exp.norm.mat))
rownames(symbol.mean.mat) <- names(symbol.mean.list)
colnames(symbol.mean.mat) <- names(symbol.mean.list[[1]])
# 查看矯正之后各個(gè)樣本的表達(dá)分布,如圖2所示昂芜,與圖1相比莹规,可看出芯片的批次效應(yīng)得到矯正
exp.melt.dat <- melt(symbol.mean.mat,value.name="Expression")
colnames(exp.melt.dat)[1:2] <- c('Gene','Sample')
pdf('expression.mean.norm.pdf',width=60,height=5)
ggplot(exp.melt.dat,aes(x=Sample,y=Expression,fill=Sample)) +
geom_boxplot() +
theme(legend.position='none',axis.text.x = element_text(angle=45, hjust=1, vjust=0.5))
dev.off()
# 計(jì)算基因變異系數(shù)(CV),根據(jù)CV>0.1取高變異基因進(jìn)行下游分析
symbol.cv.vec <- apply(symbol.mean.mat,1,function(x){ sd(x)/mean(x) })
symbol.mean.filter.mat <- symbol.mean.mat[symbol.cv.vec > 0.1,]
symbol.mean.dat <- data.frame(symbol.mean.filter.mat)
1.3 WGCNA網(wǎng)絡(luò)構(gòu)建
###### 參照WGCNA教程泌神,對(duì)265例ccRCC表達(dá)數(shù)據(jù)構(gòu)建加權(quán)共表達(dá)網(wǎng)絡(luò)
### 加載R包
library(WGCNA)
library(ggplot2)
library(reshape2)
library(limma)
library(plyr)
# 檢查數(shù)據(jù)中是否有缺失值
datExpr <- t(symbol.mean.dat)
gsg <- goodSamplesGenes(datExpr, verbose = 3)
gsg$allOK
### 計(jì)算合適的相關(guān)系數(shù)軟閾值良漱,如圖3所示,據(jù)此可選擇3或4為相關(guān)性軟閾值power
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
pdf(file = "Plots/SoftThreshold.pdf", width = 8, height = 4)
par(mfrow = c(1,2))
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=0.9,col="red")
abline(h=0.85,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=0.9,col="red")
dev.off()
###### Step-by-step network construction
### 計(jì)算鄰接矩陣與相異矩陣
adjacency <- adjacency(datExpr, power = sft$powerEstimate)
TOM <- TOMsimilarity(adjacency)
dissTOM <- 1-TOM
geneTree <- hclust(as.dist(dissTOM), method = "average")
### 使用動(dòng)態(tài)剪切樹(shù)算法鑒定相關(guān)模塊
minModuleSize = 30
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize,cutHeight=0.99)
table(dynamicMods)
dynamicColors <- labels2colors(dynamicMods)
### 將相似的模塊聚類欢际,如圖4所示
MEList <- moduleEigengenes(datExpr, colors = dynamicColors)
MEs <- MEList$eigengenes
MEDiss <- 1-cor(MEs)
METree <- hclust(as.dist(MEDiss), method = "average")
pdf(file = "Plots/cluster.module.step2.pdf", width = 8, height = 6)
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")
MEDissThres = 0.25
abline(h=MEDissThres, col = "red")
dev.off()
### 將相似模塊聚為一個(gè)模塊母市,如圖5所示,除灰色模塊外损趋,共鑒定到9個(gè)信息模塊
merge <- mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergedColors <- merge$colors
mergedMEs <- merge$newMEs
moduleColors <- mergedColors
pdf(file = "Plots/dendrogram.module.merged.step2.pdf", width = 8, height = 7)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
dev.off()
1.4 細(xì)胞成分計(jì)算
通過(guò)CIBERSORTx(Newman et al., 2019)對(duì)bulk測(cè)序樣本中不同類型免疫細(xì)胞的成分進(jìn)行推算患久,將265個(gè)ccRCC樣本的表達(dá)矩陣上傳至CIBERSORTx網(wǎng)站(https://cibersortx.stanford.edu/)進(jìn)行在線計(jì)算。
1.5 T細(xì)胞浸潤(rùn)相關(guān)模塊鑒定
### 數(shù)據(jù)讀入與預(yù)處理
infil.t <- read.csv('CIBERSORTx_Job2_Results.csv',head=T,row.names=1)
infil.t <- infil.t[,grep('T.cells',colnames(infil.t))]
infil.t <- infil.t[colnames(symbol.mean.dat),]
datExpr = as.data.frame(t(symbol.mean.dat))
datTraits = infil.t
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
### 計(jì)算模塊eigengene與免疫細(xì)胞浸潤(rùn)相關(guān)性
MEs0 <- moduleEigengenes(datExpr, mergedColors)$eigengenes
MEs <- orderMEs(MEs0)
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
### 相關(guān)圖展示浑槽,如圖6所示墙杯,可以看出綠色模塊與CD8T細(xì)胞浸潤(rùn)之間存在顯著相關(guān),下面的分析中針對(duì)這一模塊進(jìn)行重點(diǎn)研究
textMatrix <- paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
pdf('Plots/Module-trait-relationships.pdf',width=10,height=8)
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
1.6 鑒定hub gene
###### 根據(jù)gene significance(GS)和module membership(MM)鑒定具有重要作用的hub gene
### 計(jì)算MM
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
modNames <- substring(names(MEs), 3)
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
### 計(jì)算GS
CD8T <- as.data.frame(datTraits$T.cells.CD8)
names(CD8T) <- "CD8T"
geneTraitSignificance <- as.data.frame(cor(datExpr, CD8T, use = "p"))
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) <- paste("GS.", names(CD8T), sep="")
names(GSPvalue) <- paste("p.GS.", names(CD8T), sep="")
### 查看綠色模塊內(nèi)部gene與CD8T細(xì)胞浸潤(rùn)的相關(guān)性(即GS值)括荡,以及與綠色模塊eigengene相關(guān)性(即MM值)
### 如圖7所示高镐,這里我們根據(jù)GS>0.5且MM>0.8進(jìn)行篩選,可以得到66個(gè)hub gene
module = "green"
column <- match(module, modNames)
moduleGenes <- moduleColors==module
pdf('Plots/MM-GS.green.pdf',width=7,height=7)
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab = paste("Module Membership in", module, "module"),
ylab = "Gene significance for body weight",
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
1.7 結(jié)果匯總
# Summary output of network analysis results
geneInfo0 <- data.frame(GeneSymbol= names(datExpr),
ModuleColor = moduleColors,
geneTraitSignificance,
GSPvalue)
# Order modules by their significance for CD8T
modOrder <- order(-abs(cor(MEs, CD8T, use = "p")));
# Add module membership information in the chosen order
for (mod in 1:ncol(geneModuleMembership))
{
oldNames <- names(geneInfo0)
geneInfo0 <- data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]],
MMPvalue[, modOrder[mod]])
names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
# Order the genes in the geneInfo variable first by module color, then by geneTraitSignificance
geneOrder <- order(geneInfo0$ModuleColor, -abs(geneInfo0$GS.CD8T));
geneInfo <- geneInfo0[geneOrder, ]
write.csv(geneInfo, file = "Plots/geneInfo.csv")
write.csv(rownames(geneInfo)[geneInfo$ModuleColor=='green'], file = "Plots/geneInfo.green.csv",row.names=F)
# 自定義標(biāo)準(zhǔn)選擇hub gene
############################################
geneInfo <- read.csv("Plots/geneInfo.csv",header=T,row.names=1)
gene.greenInfo <- geneInfo[geneInfo$ModuleColor=='green',1:6]
hub.gene.green <- rownames(gene.greenInfo)[(gene.greenInfo$GS.CD8T > 0.5) & (gene.greenInfo$MM.green > 0.8)]
write.csv(hub.gene.green,'hub.gene.green.csv')
主要參考文獻(xiàn):
Langfelder, P., & Horvath, S. (2008). WGCNA: an R package for weighted correlation network analysis. BMC bioinformatics, 9(1), 559.
Lin, J., Yu, M., Xu, X., Wang, Y., Xing, H., An, J., ... & Zhu, Y. (2020). Identification of biomarkers related to CD8+ T cell infiltration with gene co-expression network in clear cell renal cell carcinoma. Aging (Albany NY), 12(4), 3694.
Newman, A. M., Steen, C. B., Liu, C. L., Gentles, A. J., Chaudhuri, A. A., Scherer, F., ... & Diehn, M. (2019). Determining cell type abundance and expression from bulk tissues with digital cytometry. Nature biotechnology, 37(7), 773-782.