GSEA簡(jiǎn)介
數(shù)據(jù)準(zhǔn)備
GSEA只需要gene id和log2FoldChange兩列
library(clusterProfiler)
library(enrichplot)
library(org.Mm.eg.db) # mouse全基因組注釋包邮绿,用于不同版本基因名的轉(zhuǎn)換
# 這個(gè)是屬于注釋包凉泄,每個(gè)基因集可能對(duì)應(yīng)的注釋包不一樣宪萄,要從基因集所在的平臺(tái)找到對(duì)應(yīng)的注釋包柒桑,然后從bioconductor獲取安裝。
library(dplyr)
library(stringr)
library(msigdbr) # 提供多個(gè)物種的MSigDB數(shù)據(jù)
dif_genes <- read.csv("dif_genes.csv")
# 得到差異分析結(jié)果:all_different_genes
geneList <- dif_genes$avg_logFC
# 如果是Ensemble ID窑邦,并且如果還帶著版本號(hào)擅威,需要去除版本號(hào),再進(jìn)行基因ID轉(zhuǎn)換冈钦,得到Entrez ID
names(geneList) <- dif_genes$gene_id
# 最后從大到小排序郊丛,得到一個(gè)字符串
geneList <- sort(geneList, decreasing = T)
# 檢查是否有重復(fù)基因名
genelist <- dif_genes$gene_id
genelist[duplicated(genelist)]
數(shù)據(jù)集準(zhǔn)備
MSigDB數(shù)據(jù)庫(kù)是GSEA的官方數(shù)據(jù)庫(kù)
misgdbr包
msigdbr包可提供多個(gè)物種的MSigDB數(shù)據(jù)
# GSEA #mdigbd dataset
msigdbr_df <- msigdbr(species = "Mus musculus",category = "C2")
# msigdbr_df$gene_symbol:"SYMBOL";$entrez_gene:"ENTREZID"
# msigdbr_list <- split(x = msigdbr_df$entrez_gene, f = msigdbr_df$gs_name)
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, ensembl_gene) %>% as.data.frame()
msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()
用clusterProfiler實(shí)現(xiàn)GSEA
gsea_result <- GSEA(geneList = geneList,
minGSSize = 1,
maxGSSize = 1000,
pvalueCutoff = 1,
TERM2GENE = msigdbr_t2g)
# 將其中的基因名變成symbol ID
#gsea_result <- setReadable(gsea_result, OrgDb = org.Mm.eg.db)
# 還可以直接點(diǎn)擊查看,只需要轉(zhuǎn)成數(shù)據(jù)框
#gsea_result_df <- as.data.frame(gsea_result)
# 導(dǎo)出結(jié)果
write.csv(gsea_result@result, file = "gsea_result.csv")
# 對(duì)感興趣的通路可視化
gseaplot2(gsea_result, geneSetID = c("PATHWAY_NAME"))
或進(jìn)行GO分析
## gseGO
# 想看biological process(BP)瞧筛,可以先不過(guò)濾p值
go_result <- gseGO(geneList = geneList,
ont = "BP",
OrgDb = org.Mm.eg.db,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1)
# 將其中的基因名變成symbol ID
go_result <- setReadable(go_result, OrgDb = org.Mm.eg.db)
# 還可以直接點(diǎn)擊查看厉熟,只需要轉(zhuǎn)成數(shù)據(jù)框
go_result_df <- as.data.frame(go_result)
# 導(dǎo)出結(jié)果
write.csv(go_result@result, file = "gseGO_result.csv")
# 對(duì)感興趣的通路可視化
gseaplot2(go_result, geneSetID = c("GO:0032981"))
KEGG分析
##gseKEGG
kk <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
verbose = FALSE)
gseaplot(kk, geneSetID = "hsa04110")
結(jié)果注釋?zhuān)梢暬庾x
enrichmentscore 或 NES 大于0表示上調(diào),小于0表示下調(diào)
GSEA詳細(xì)解釋及結(jié)果解讀
===================================================
完整代碼
rm(list=ls())
setwd("C:\\Users\\DELL\\Desktop\\gsea")
memory.limit(102400)
library(clusterProfiler)
library(enrichplot)
library(org.Mm.eg.db) # mouse全基因組注釋包较幌,用于不同版本基因名的轉(zhuǎn)換
library(dplyr)
library(stringr)
library(msigdbr)
# GSEA #mdigbd dataset
msigdbr_df <- msigdbr(species = "Mus musculus", category = "C2")
# msigdbr_df$gene_symbol:"SYMBOL";$entrez_gene:"ENTREZID"
# msigdbr_list <- split(x = msigdbr_df$entrez_gene, f = msigdbr_df$gs_name)
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()
# msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, ensembl_gene) %>% as.data.frame()
msigdbr_t2g <- msigdbr_df %>% dplyr::distinct(gs_name, gene_symbol) %>% as.data.frame()
files <- list.files(getwd(), pattern = "*.csv")
# 多組數(shù)據(jù)揍瑟,用for循環(huán)
for (i in 1:length(files)){
file <- files[i]
print(file)
dif_genes <- read.csv(file)
# 得到差異分析結(jié)果:all_different_genes
geneList <- dif_genes$avg_logFC
# 如果是Ensemble ID,并且如果還帶著版本號(hào)乍炉,需要去除版本號(hào)绢片,再進(jìn)行基因ID轉(zhuǎn)換滤馍,得到Entrez ID
names(geneList) <- dif_genes$gene_id
# 最后從大到小排序,得到一個(gè)字符串
geneList <- sort(geneList, decreasing = T)
# 檢查是否有重復(fù)基因名
genelist <- dif_genes$gene_id
genelist <- genelist[duplicated(genelist)]
gsea_result <- GSEA(geneList = geneList,
minGSSize = 1,
maxGSSize = 1000,
pvalueCutoff = 1,
TERM2GENE = msigdbr_t2g)
# 將其中的基因名變成symbol ID
#gsea_result <- setReadable(gsea_result, OrgDb = org.Mm.eg.db)
# 還可以直接點(diǎn)擊查看底循,只需要轉(zhuǎn)成數(shù)據(jù)框
#gsea_result_df <- as.data.frame(gsea_result)
# 導(dǎo)出結(jié)果
write.csv(gsea_result@result, file = "gsea_result.csv")
}