- ORA富集分析只需要提供差異基因列表绊汹,GSEA富集分析還需要提供對(duì)應(yīng)差異倍數(shù)信息芯勘。但二者都是基于差異分析得到的結(jié)果空执。
- Gene set variation analysis (GSVA)與Single sample GSEA (ssGSEA)這兩種方法是都基于單樣本的基因表達(dá)信息計(jì)算每個(gè)通路的相對(duì)表達(dá)活性代咸,然后基于此可計(jì)算樣本間的通路表達(dá)活性的差異分析瞪醋。由于這兩種算法相似,就不做過多的區(qū)分休弃。
- 相當(dāng)于把基因表達(dá)矩陣(列:樣本吞歼,行:基因 )變?yōu)橐粋€(gè)通路/基因集活性表達(dá)矩陣(列:樣本,行:通路/基因集)
- 可使用
GSVA
包進(jìn)行分析塔猾,以下的學(xué)習(xí)代碼最主要來(lái)自該包的官方文檔:https://www.bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html
1篙骡、gsva()
的兩個(gè)核心參數(shù)
#安裝加載包
BiocManager::install("GSVA")
library(GSVA)
#示例數(shù)據(jù)
BiocManager::install("GSVAdata")
library(GSVAdata)
形如gsva(expr, gset.idx.list)
,進(jìn)行GSVA分析需要兩個(gè)必要參數(shù)
(1)expr表達(dá)矩陣
- 表達(dá)矩陣可以是單純的
matrix
格式丈甸,也可以是ExpressionSet/SummarizedExperiment對(duì)象糯俗; - 對(duì)于microarray或者經(jīng)過log-CPMs, log-RPKMs or log-TPMs之類標(biāo)準(zhǔn)化處理的RNA-seq數(shù)據(jù),使用默認(rèn)的參數(shù)即可睦擂;如果是原始count的RNA-seq表達(dá)水平得湘,需要添加
kcdf="Poisson"
參數(shù)。
(2)gset.idx.list通路基因集
- 通路基因集的最直接顿仇、簡(jiǎn)單的格式就是list對(duì)象淘正,每個(gè)元素包含特定基因集的所有基因,元素名為通路基因名臼闻;
- The Molecular Signatures Database (MSigDB)數(shù)據(jù)庫(kù)包含了多種主流的注釋基因集供下載使用
https://www.gsea-msigdb.org/gsea/msigdb/
- 值得注意的是文件為
gmt
格式鸿吆,可使用clusterProfiler::read.gmt()
函數(shù)讀為數(shù)據(jù)框格式。建議下載Entrez
基因id格式
c2.cp.kegg = clusterProfiler::read.gmt("c2.cp.kegg.v7.4.entrez.gmt")
c2.cp.reactome = clusterProfiler::read.gmt("c2.cp.reactome.v7.4.entrez.gmt")
genesets = rbind(c2.cp.kegg,c2.cp.reactome)
head(genesets)
# term gene
# 1 KEGG_N_GLYCAN_BIOSYNTHESIS 79868
# 2 KEGG_N_GLYCAN_BIOSYNTHESIS 57171
# 3 KEGG_N_GLYCAN_BIOSYNTHESIS 6184
# 4 KEGG_N_GLYCAN_BIOSYNTHESIS 199857
# 5 KEGG_N_GLYCAN_BIOSYNTHESIS 11253
# 6 KEGG_N_GLYCAN_BIOSYNTHESIS 10195
genesets = split(genesets$gene, genesets$term)
str(head(genesets))
2述呐、gsva()
分析示例
2.1 例1:膠質(zhì)母細(xì)胞瘤亞型GSVA分析
- 目的:對(duì)于膠質(zhì)母細(xì)胞瘤GBM的四種亞型(proneural, classical, neural and mesenchymal)惩淳,給定的4個(gè)不同大腦細(xì)胞類型基因集的富集程度分布情況。
表達(dá)矩陣數(shù)據(jù)
data(gbm_VerhaakEtAl)
exprs(gbm_eset)[1:4,1:4]
# TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01 TCGA.02.0014.01A.01
# AACS 0.51451 -0.36739 0.27668 0.71333
# FSTL1 -0.28438 -1.52133 -1.02120 -2.06326
# ELMO2 0.09697 0.32328 0.59386 0.41703
# CREB3L1 0.27977 -0.64587 0.37805 -0.60164
基因集數(shù)據(jù)
data(brainTxDbSets)
str(brainTxDbSets)
# List of 4
# $ astrocytic_up : chr [1:85] "GRHL1" "GPAM" "PAPSS2" "MERTK" ...
# $ astroglia_up : chr [1:88] "BST2" "SERPING1" "ACTA2" "C9orf167" ...
# $ neuronal_up : chr [1:98] "STXBP1" "JPH4" "CACNG3" "BRUNOL6" ...
# $ oligodendrocytic_up: chr [1:70] "DCT" "ZNF536" "GNG8" "ELOVL6" ...
gsva()分析
gbm_es <- gsva(gbm_eset, brainTxDbSets)
# Estimating GSVA scores for 4 gene sets.
# Estimating ECDFs with Gaussian kernels
# |==========================================================================================================| 100%
class(gbm_es)
# [1] "ExpressionSet"
# attr(,"package")
# [1] "Biobase"
exprs(gbm_es)[1:4,1:3]
# TCGA.02.0003.01A.01 TCGA.02.0010.01A.01 TCGA.02.0011.01B.01
# astrocytic_up -0.2785436 -0.2676394 -0.01994194
# astroglia_up -0.5038491 -0.5199252 -0.44371190
# neuronal_up 0.5406985 0.1856688 -0.12880731
# oligodendrocytic_up 0.3049166 0.2399523 0.26027077
熱圖可視化
subtypeOrder <- c("Proneural", "Neural", "Classical", "Mesenchymal")
sampleOrderBySubtype <- sort(match(gbm_es$subtype, subtypeOrder),
index.return=TRUE)$ix
geneSetOrder <- c("astroglia_up", "astrocytic_up", "neuronal_up",
"oligodendrocytic_up")
library(pheatmap)
pheatmap(exprs(gbm_es)[geneSetOrder, sampleOrderBySubtype],
show_colnames = F, cluster_cols = F,
annotation_col = pData(gbm_es[,sampleOrderBySubtype]))
2.2 例2:白血病亞型GSVA分析+差異分析
- 目的:研究常規(guī)兒童急性淋巴母細(xì)胞白血病(ALL)與MLL(混合系白血病基因)易位的通路富集差異乓搬,并進(jìn)行差異分析思犁,得到在兩種亞型中明顯差異表達(dá)的通路基因集代虾。
表達(dá)矩陣數(shù)據(jù)
data(leukemia)
exprs(leukemia_eset)[1:4,1:4]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 1000_at 11.354426 10.932543 11.185906 11.251631
# 1001_at 9.185470 8.823661 8.687186 8.958305
# 1002_f_at 7.806993 8.127591 7.842353 8.319227
# 1003_s_at 10.164370 10.048514 10.006014 10.474046
#探針的基因名注釋
library(hgu95a.db)
ids = as.data.frame(hgu95aENTREZID)
fData(leukemia_eset)$ENTREZID = ids$gene_id[match(rownames(leukemia_eset), ids$probe_id)]
leukemia_eset = leukemia_eset[!is.na(fData(leukemia_eset)$ENTREZID),]
leukemia_eset = leukemia_eset[!duplicated(fData(leukemia_eset)$ENTREZID),]
rownames(leukemia_eset) = fData(leukemia_eset)$ENTREZID
exprs(leukemia_eset)[1:4,1:4]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL CL2001011105AA.CEL
# 5595 11.354426 10.932543 11.185906 11.251631
# 7075 9.185470 8.823661 8.687186 8.958305
# 1557 7.806993 8.127591 7.842353 8.319227
# 643 10.164370 10.048514 10.006014 10.474046
#樣本分組
table(leukemia_eset$subtype)
# ALL MLL
# 20 17
通路基因集
- 為1.2點(diǎn)整理的kegg、reactome基因集
length(genesets)
#[1] 1790
str(head(genesets))
# List of 6
# $ KEGG_N_GLYCAN_BIOSYNTHESIS : chr [1:46] "79868" "57171" "6184" "199857" ...
# $ KEGG_OTHER_GLYCAN_DEGRADATION : chr [1:16] "64772" "2720" "4126" "4125" ...
# $ KEGG_O_GLYCAN_BIOSYNTHESIS : chr [1:30] "8693" "117248" "168391" "11226" ...
# $ KEGG_GLYCOSAMINOGLYCAN_DEGRADATION : chr [1:21] "9955" "10855" "60495" "2720" ...
# $ KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE: chr [1:15] "9435" "11041" "10678" "8534" ...
# $ KEGG_GLYCEROLIPID_METABOLISM : chr [1:49] "129642" "57678" "9388" "8525" ...
gsva()分析
leukemia_es <- gsva(leukemia_eset, genesets, min.sz=10, max.sz=500)
exprs(leukemia_es)[1:4,1:3]
# CL2001011101AA.CEL CL2001011102AA.CEL CL2001011104AA.CEL
# KEGG_N_GLYCAN_BIOSYNTHESIS 0.056098397 -0.21254643 0.3442295
# KEGG_OTHER_GLYCAN_DEGRADATION -0.371743215 -0.43330368 0.3012659
# KEGG_GLYCOSAMINOGLYCAN_DEGRADATION 0.044683884 -0.04355861 -0.1263370
# KEGG_GLYCOSAMINOGLYCAN_BIOSYNTHESIS_KERATAN_SULFATE -0.002049164 -0.59098416 -0.1913861
limma差異分析
library(limma)
mod <- model.matrix(~ factor(leukemia_es$subtype))
colnames(mod) <- c("ALL", "MLLvsALL")
fit <- lmFit(leukemia_es, mod)
fit <- eBayes(fit)
tt <- topTable(fit, coef=2, n=Inf)
head(tt)
# logFC AveExpr t P.Value adj.P.Val B
# REACTOME_DCC_MEDIATED_ATTRACTIVE_SIGNALING -0.4364816 -0.015338411 -6.356475 1.035005e-07 0.0001322736 7.597326
# REACTOME_GLYCOSPHINGOLIPID_METABOLISM 0.3490613 -0.008042234 5.201199 5.022153e-06 0.0032091556 4.015100
# REACTOME_RESPONSE_TO_METAL_IONS 0.5102309 -0.009041289 4.864142 1.526901e-05 0.0039353010 2.988990
# REACTOME_REGULATION_OF_MECP2_EXPRESSION_AND_ACTIVITY -0.3540451 0.004755112 -4.859177 1.551936e-05 0.0039353010 2.973992
# REACTOME_OTHER_SEMAPHORIN_INTERACTIONS 0.3638330 0.013275406 4.792116 1.932383e-05 0.0039353010 2.771837
# KEGG_PROXIMAL_TUBULE_BICARBONATE_RECLAMATION 0.2701284 -0.003998529 4.782970 1.990926e-05 0.0039353010 2.744324
差異分析可視化:火山圖
library(ggplot2)
library(ggrepel)
#自定義閾值 adj.P.Val < 0.05 激蹲;abs(tt$logFC) > 0.25
tt$change = as.factor(ifelse(tt$adj.P.Val < 0.05 & abs(tt$logFC) > 0.25,
ifelse(tt$logFC > 0.25 ,'UP','DOWN'),'NOT'))
up_gs = rownames(subset(tt,logFC > 0))[1:3]
down_gs = rownames(subset(tt,logFC < 0))[1:3]
label_gs = tt[c(up_gs,down_gs),]
label_gs$name = rownames(label_gs)
ggplot(data=tt,
aes(x=logFC, y=-log10(P.Value),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) +
geom_text_repel(
data = label_gs,
aes(label = name),min.segment.length = 0)
差異分析可視化:分組熱圖
tt_sig = subset(tt, adj.P.Val < 0.1)
pheatmap(leukemia_es[rownames(tt_sig),],
show_colnames = F, cluster_cols = F,
annotation_col = pData(leukemia_es))