對2016年的WGCNA文章嘗試復(fù)現(xiàn)原朝,效果不太好驯嘱,但是過了一遍流程還不錯!
文章名稱:伴 HBV 感染性肝癌調(diào)控樞紐基因篩選與初步鑒定
# title:伴 HBV 感染性肝癌調(diào)控樞紐基因篩選與初步鑒定
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse47197
#加載包
rm(list = ls())
options(stringsAsFactors = F)
if (F) {
library(GEOmirror)
library(GEOquery)
library(stringr)
suppressMessages(library(WGCNA))
suppressMessages(library(limma))
}
f <- 'gse47197_eSet.Rdata'
#數(shù)據(jù)導(dǎo)入
if(!file.exists(f)){
gse <- geoChina("GSE47197")
save(gse,file=f)
}
load(f)
gset<- gse[[1]]
#GPL16699
#表達(dá)矩陣
dat<-exprs(gset) #表達(dá)矩陣
dim(dat) #28782 124
dat[1:4,1:4]
boxplot(dat,las=2) #數(shù)據(jù)已經(jīng)過標(biāo)準(zhǔn)化處理
#樣本信息
pdat <- pData(gset)
#構(gòu)建分組信息
group_list=tolower(str_split(pdat$title,' ',simplify = T)[,1])
table(group_list)
#下載注釋文件#GPL16699
#Download GPL file, put it in the current directory, and load it:
gpl <- getGEO('GPL16699', destdir=".")
colnames(Table(gpl))
head(Table(gpl)[,c(1,10)]) ## you need to check this , which column do you need
probe2gene=Table(gpl)[,c(1,10)] #提取探針基因?qū)?yīng)關(guān)系
head(probe2gene) #查看數(shù)據(jù)
library(stringr)
save(probe2gene,file='probe2gene.Rdata')
load('probe2gene.Rdata')
##檢查有多少個空白symbol
nrow(probe2gene[probe2gene$GENE_SYMBOL=='',]) #[1] 8681
ids<- probe2gene
colnames(ids)=c('probe_id','symbol')
ids=ids[ids$symbol != '',] #去除空白的基因
#nrow(ids[ids$symbol == '',]) ,再次檢查喳坠,0個
dat <- dat[rownames(dat)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(dat),]
anno <- function(dat,ids){
tmp <- by(dat,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))])
probes = as.character(tmp)
dat=dat[rownames(dat) %in% probes ,]
rownames(dat)=ids[ids$probe_id%in%rownames(dat),2]
return(dat)
}
exprSet<- anno(dat = dat,ids = ids)
save(exprSet,group_list,file='gse47197_new_exprSet.Rdata')
###到此為止已經(jīng)完成了數(shù)據(jù)輸入以及基因注釋鞠评。
load('gse47197_new_exprSet.Rdata') #加載數(shù)據(jù)
dim(exprSet) #[1] 16390 124
exprSet[1:5,1:5]
group_list[1:6]
colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
#構(gòu)建癌旁以及癌表達(dá)矩陣
non <-exprSet[,group_list=="non"]
tumor <- exprSet[,group_list=="tumoral"]
###limma進(jìn)行差異分析
# 表達(dá)矩陣 exprSet
# design矩陣
design <- model.matrix(~0+factor(group_list,levels = c("non","tumoral"),ordered = T))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
# 比較矩陣
contrast.matrix<-makeContrasts("tumoral-non",
levels = c("non","tumoral"))
#差異分析
fit <- lmFit(exprSet,design)
fit1 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit1)
de <- topTable(fit2,coef = 1,n=Inf,p.value = 0.05)
dim(de)
nrDEG <- na.omit(de)
head(nrDEG)
DEG <- nrDEG[abs(nrDEG$logFC)>1,] #此處篩出583個DEG,與原文不一致壕鹉,求解剃幌? 原文5986個
a<- rownames(DEG)
non <-exprSet[a,group_list=="non"]
tumor <- exprSet[a,group_list=="tumoral"]
save(exprSet,group_list,nrDEG,DEG, non,tumor,
file='gse47197_DEG.Rdata')
load('gse47197_DEG.Rdata')
###WGCNA
tnon <- t(non)
ttumor <- t(tumor)
sampleTree = function(datExpr) {
nGenes = ncol(datExpr) #583
nSamples = nrow(datExpr) #63個樣本
datExpr_tree <- hclust(dist(datExpr), method = "average") #聚類分析
par(mar = c(0, 5, 2, 0))
plot(
datExpr_tree,
main = "Sample clustering",
sub = "",
xlab = "",
cex.axis = 1, #坐標(biāo)軸字體大小
cex.main = 1, #title 字體大小 main = "Sample clustering"
cex.lab = 1 #height字體大小
)}
sampleTree(tnon)
sampleTree(ttumor)
nrow(ttumor)
table(group_list)
if (F) {
#篩選一個cutoff把圖中的離群樣本刪除
datExpr_tree <- hclust(dist(tnon), method = "average")
height_cut =50 #篩選離群樣本的閾值需要結(jié)合圖以及分析的結(jié)果定
clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10)
table(clust)
keepSamples <- (clust=1)
dim(tnon)
tnon = tnon[keepSamples, ]
dim(tnon)
# 觀察結(jié)果
sampleTree(tnon)
}
datExpr_tree <- hclust(dist(ttumor), method = "average")
height_cut =50 #篩選離群樣本的閾值需要結(jié)合圖以及分析的結(jié)果定
clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10)
table(clust)
keepSamples <- (clust!=0)
dim(ttumor)
ttumor = ttumor[keepSamples, ]
dim(ttumor)
# 觀察結(jié)果
sampleTree(ttumor)
###選擇軟閾值
datExpr<- tnon
datExpr<- ttumor
datExpr[1:4,1:4] #行 樣本,列 基因
powers = c(c(1:10), seq(from = 12, to = 30, by = 2)) #構(gòu)建軟閾值權(quán)重范圍
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#設(shè)置網(wǎng)絡(luò)構(gòu)建參數(shù)選擇范圍晾浴,計算無尺度分布拓?fù)渚仃?TOM)
png("step2-beta-value1.png",width = 800,height = 600)
par(mfrow = c(1, 2)) #
cex1 = 0.9 #字體大小
# 無尺度拓?fù)鋽M合與軟閾值圖负乡,縱坐標(biāo)(無尺度拓?fù)鋽M合)越接近于1,代表擬合越接近于無尺度分布
plot(
sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit,signed R^2",
type = "n",
main = paste("Scale independence")
)
text(
sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers,
cex = cex1,
col = "red"
)
##畫紅線截取height
abline(h = 0.90, col = "red")
#平均連接度與軟閾值圖
plot(
sft$fitIndices[, 1],
sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity",
type = "n",
main = paste("Mean connectivity")
)
text(
sft$fitIndices[, 1],
sft$fitIndices[, 5],
labels = powers,
cex = cex1,
col = "red"
)
dev.off() #根據(jù)原文脊凰,非腫瘤power=10抖棘,腫瘤9,話說狸涌,我畫的貌似不太一樣切省,可能跟前面進(jìn)行了差異基因有關(guān)
datExpr <- ttumor;powert=9
#4. 構(gòu)建共表達(dá)網(wǎng)絡(luò)
net = blockwiseModules(
datExpr, #輸入數(shù)據(jù)為轉(zhuǎn)置后的表達(dá)矩陣
power = powert, #估計的權(quán)重系數(shù)
maxBlockSize = 6000,
TOMType = "unsigned", #拓樸重疊矩陣類型為無向型
minModuleSize = 50, #最少模塊數(shù)量 50
detectCutHeight = 0.995, #設(shè)定高度上限0.995
reassignThreshold = 0,
mergeCutHeight = 0.25, #75%相關(guān)度為融合閾值
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = T,saveTOMFileBase = "HCCtumor",
verbose = 3
)
table(net$colors) #看一下module數(shù)量以及對應(yīng)基因數(shù)量。0屬于沒有聚類到的基因數(shù)
##純純大怨種帕胆,只聚了3類朝捆,越來越疑惑作者為什么前面會做個差異分析?而我的結(jié)果也跟作者差異那么大
mergedColors = labels2colors(net$colors) # 數(shù)字標(biāo)簽與顏色相對應(yīng)
png("step4-genes-modules.png",width = 800,height = 600)
plotDendroAndColors(
net$dendrograms[[1]], #拓?fù)湎嗨凭仃嚲垲惤Y(jié)果
mergedColors, #顏色對應(yīng)關(guān)系
c("Module colors","morged colors"),
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
)
# plotDendroAndColors接受一個聚類的對象懒豹,以及該對象里面包含的所有個體所對應(yīng)的顏色芙盘。
dev.off()
##聚類結(jié)果相似模塊的融合,Merging of modules whose expression profiles are very similar
#在聚類樹中每一leaf是一個短線脸秽,代表一個基因儒老,
#不同分之間靠的越近表示有高的共表達(dá)基因,將共表達(dá)極其相似的modules進(jìn)行融合
# Calculate eigengenes
if(T){
MEList = moduleEigengenes(datExpr, colors = mergedColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);#計算模塊和模塊之間的相關(guān)性和相異性
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
#sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#建立基因模塊后豹储,可以將模塊用顏色來區(qū)分贷盲,有些模塊相似性高,就需要將模塊合并。將模塊特征基因進(jìn)行聚類巩剖,在完成聚類后合并铝穷,0.4高度對應(yīng)的相似度閾值就是0.6。具體的相似性閾值可以自行設(shè)置佳魔,進(jìn)行聚類剪切后曙聂,就可以區(qū)分哪些模塊相似性高,哪些模塊相似性低鞠鲜,如下圖宁脊。
#選擇有60%相關(guān)性的進(jìn)行融合
MEDissThres = 0.4#0.4剪切高度可修改 ####可以完成相似模塊的合并,剪切高度是0.4贤姆,也就是將相似性高于0.6的模塊進(jìn)行了合并
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(datExpr, mergedColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors1 = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
png("dynamicColors_mergedColors.png",width = 800,height = 600)
plotDendroAndColors(net$dendrograms[[1]], cbind(mergedColors, mergedColors1),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
}
#模塊模塊相關(guān)性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#模塊顏色信息(模塊標(biāo)簽轉(zhuǎn)化為顏色信息)
moduleColors <- labels2colors(net$colors)
# Recalculate KMEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0); ##不同顏色的模塊的ME值矩 (樣本vs模塊)
moduleTraitCor = cor(MEs, MEs , use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
png("step5-Module-trait-relationships.png",width = 800,height = 1200,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(MEs),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-Module relationships"))
dev.off()
#TOM圖 基因相似性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
# GO注釋
table(moduleColors)
group_g=data.frame(gene=colnames(datExpr),
group=moduleColors)
save(group_g,file='wgcna_group_g.Rdata')
rm(list = ls())
load(file='wgcna_group_g.Rdata')
library(clusterProfiler)
# Convert gene ID into entrez genes
head(group_g)
tmp <- bitr(group_g$gene, fromType="SYMBOL",
toType="ENTREZID",
OrgDb="org.Hs.eg.db")
head(tmp)
de_gene_clusters=merge(tmp,group_g,by.x='SYMBOL',by.y='gene')
table(de_gene_clusters$group)
head(de_gene_clusters)
list_de_gene_clusters <- split(de_gene_clusters$ENTREZID,
de_gene_clusters$group) #按照顏色group拆分entrezid
# library(ggplot2)
# gcSample= list_de_gene_clusters
# source('function.R') # 這個代碼非常復(fù)雜榆苞,就不給大家了# 就是包裝了一個com_kegg_go函數(shù),里面會對分組好的基因集進(jìn)行批量注釋
#
# # 下一步非常耗時霞捡,保守估計半個小時
# 主要是對我們的模塊進(jìn)行功能注釋坐漏,就是GO/KEGG數(shù)據(jù)庫
# com_kegg_go(gcSample,'top5000')
# Run full GO enrichment test
formula_res <- compareCluster(
ENTREZID~group,
data=de_gene_clusters,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05
)
# Run GO enrichment test and merge terms
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
formula_res,
cutoff=0.5,
by="p.adjust",
select_fun=min
)
# Plot both analysis results
#pdf('female_compared_GO_term_DE_cluster.pdf',width = 11,height = 6)
#dotplot(formula_res, showCategory=5)
#dev.off()
pdf('Go.pdf',width = 13,height = 8)
dotplot(lineage1_ego, showCategory=5)
dev.off()
# Save results
#write.csv(formula_res@compareClusterResult,
# file="Microglia_compared_GO_term_DE_cluster.csv")
write.csv(lineage1_ego@compareClusterResult,
file="Microglia_compared_GO_term_DE_cluster_simplified.csv")
figure:
參考:學(xué)徒復(fù)現(xiàn)WGCNA文章圖表 - 知乎 (zhihu.com)
WGCNA學(xué)習(xí)筆記 - 簡書 (jianshu.com)
r - Cut dendrogram / cluster: Error in function 'cutree': tree incorrect (composante 'merge') - Stack Overflow
這個WGCNA作業(yè)終于有學(xué)徒完成了! - 云+社區(qū) - 騰訊云 (tencent.com)
limma: Linear Models for Microarray Data (bioconductor.org)
(6條消息) 用limma包進(jìn)行多組差異表達(dá)分析_今天也是個妖精頭子呀的博客-CSDN博客_limma 差異表達(dá)分析