1.AUCell package
假設存在一通路A有20個相關基因功舀,要評價通路的表達情況纽甘,可以用AUCell對這20個相關基因進行打分鱼鼓,得分高代表通路表達較高钉跷。
①加載package與數(shù)據(jù)
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
###加載所需要的包
library(Seurat)
library(tidyverse)
library(dplyr)
library(patchwork)
library(harmony)
library(cowplot)
library(ggplot2)
load("scRNA1.rdata")
library(AUCell)
library(ggplot2)
library(Seurat)
library(clusterProfiler)
②對細胞進行排序
cells_rankings <- AUCell_buildRankings(scRNA1@assays$RNA@data,nCores=1, plotStats=TRUE)
這一步是對每單個細胞中每個基因的表達量進行排序,每個細胞的每個基因都得到一個排序序號
③下載用于參考的細胞通路對應的基因
在GSEA broad上進行下載
通路數(shù)據(jù)庫簡介
1.GO(gene ontology疆前,即基因本體論):基因本體論是對基因在不同維度和不同層次上的描述
a. Cellular Component(CC):解釋基因在細胞中的位置
b. Biological Process(BP):說明該基因參與了哪些生物學過程
c. Molecular Function(MF):說明該基因在分子層面的功能
2.KEGG:對細胞所有pathway進行富集分析,KEGG數(shù)據(jù)庫共有186條pathway
二者的區(qū)別在于數(shù)據(jù)庫不同
下載KEGG pathway數(shù)據(jù)后進行讀取
c2 <- read.gmt("c2.cp.kegg.v7.5.1.symbols.gmt")
提取每個通路對應的基因
geneSets <- lapply(unique(c2$term), function(x){print(x);c2$gene[c2$term == x]})
names(geneSets) <- unique(c2$term)
④基因轉換
cell_rankings 是鼠的基因寒跳,而geneSets是人的pathway數(shù)據(jù),需要用msigdbr包將geneSets轉換成對應的鼠的通路數(shù)據(jù)
library(msigdbr)
m_df<- msigdbr(species = "Mus musculus", category = "C2", subcategory = "KEGG")
fgsea_sets<- m_df %>% split(x = .$gene_symbol, f = .$gs_name)
⑤基因富集計算
cells_AUC <- AUCell_calcAUC(fgsea_sets, cells_rankings,nCores =1, aucMaxRank=nrow(cells_rankings)*0.1)
#提取氧化磷酸化通路相關的基因
geneSet <- "KEGG_OXIDATIVE_PHOSPHORYLATION"
aucs <- as.numeric(getAUC(cells_AUC)[geneSet, ])
#合并metadata信息與降維結果
df<- data.frame(scRNA1@meta.data, scRNA1@reductions$umap@cell.embeddings)
#將seurat cluster信息注釋到圖上
class_avg <- df %>%
group_by(seurat_clusters) %>%
summarise(
UMAP_1 = median(UMAP_1),
UMAP_2 = median(UMAP_2)
)
⑥可視化
p <-ggplot(df, aes(UMAP_1, UMAP_2)) +
geom_point(aes(colour = AUC)) + viridis::scale_color_viridis(option="E") +
ggrepel::geom_label_repel(aes(label = seurat_clusters),
data = class_avg,
size = 5,
label.size = 1,
segment.color = NA
)+ theme(legend.position = "none") + theme_bw()
分組展示
p<- p+facet_grid(.~orig.ident)
2.過表征分析
①基于GO數(shù)據(jù)庫的分析
接下來將對DepiNeg1峡继、DepiNeg2兩個樣本的差異基因進行分析冯袍,探究它們的差異基因主要與哪些功能有關
找到差異基因
degdf <- FindMarkers(scRNA1,ident.1 = "DapiNeg1",ident.2 = "DapiNeg2",
logfc.threshold = 0.3,group.by = "orig.ident",subset.ident=1)
分析BP類通路
library(org.Mm.eg.db)
degs.list=rownames(degdf)
erich.go.BP = enrichGO(gene =degs.list,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
dotplot(erich.go.BP,showCategory = 15 )
barplot(erich.go.BP,showCategory = 8)
erich.go.BP=erich.go.BP@result
分析CC類通路
erich.go.CC = enrichGO(gene =degs.list,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "CC",
pvalueCutoff = 0.5,
qvalueCutoff = 0.5)
dotplot(erich.go.CC,showCategory = 8)
barplot(erich.go.CC,showCategory = 8)
分析MF類通路
erich.go.MF = enrichGO(gene =degs.list,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "MF",
pvalueCutoff = 0.5,
qvalueCutoff = 0.5)
dotplot(erich.go.MF,showCategory = 8)
barplot(erich.go.MF,showCategory = 8)
②基于KEGG數(shù)據(jù)庫分析
轉換基因名
將差異基因的基因名從symble名轉化為entrezID
DEG.entrez_id = mapIds(x = org.Mm.eg.db,
keys = degs.list,
keytype = "SYMBOL",
column = "ENTREZID")
進行基于KEGG的富集
enrich.kegg.res <- enrichKEGG(gene = DEG.entrez_id,
organism = "mmu",
keyType = "kegg")
dotplot(enrich.kegg.res)
如果出現(xiàn)報錯則運行以下代碼
library(R.utils)
R.utils::setOption('clusterProfiler.download.method','auto')
將結果進行保存
result <- enrich.kegg.res@result
write.table(result,"enrich.kegg.res.txt",sep = "\t",col.names = NA)
挑選其中感興趣的通路進行作圖
以第12-16,21-23碾牌,27-29個通路為例,將這些通路制成一個txt文件
儡循,并讀取該文件進行作圖
kegg=read.table("KEGG.8.20.txt",header = T,sep = "\t")
可視化
k=data.frame(kegg)
library(ggplot2)
library(dplyr)
#提取GeneRatio的前與后的值
before <- as.numeric(sub("/\\d+$", "", k$GeneRatio))
after <- as.numeric(sub("^\\d+/", "", k$GeneRatio))
k$GeneRatio = before /after
font.size =10
k %>%
## 按p值排序
arrange(p.adjust) %>%
## 開始ggplot2 作圖舶吗,其中fct_reorder調(diào)整因子level的順序
ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+
## 畫出點圖
geom_point(aes(color=p.adjust, size = Count)) +
## 調(diào)整顏色,guide_colorbar調(diào)整色圖的方向
scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
## 調(diào)整泡泡的大小
scale_size_continuous(range=c(3, 8))+
## 如果用ylab("")或出現(xiàn)左側空白
labs(y=NULL) +
## 如果沒有這一句择膝,上方會到頂
ggtitle("")+
## 設定主題
theme_bw() +
theme(axis.text.x = element_text(colour = "black",
size = font.size, vjust =1 ),
axis.text.y = element_text(colour = "black",
size = font.size, hjust =1 ),
axis.title = element_text(margin=margin(10, 5, 0, 0),
color = "black",size = font.size),
axis.title.y = element_text(angle=90))
③可視化展示富集結果
讀取數(shù)據(jù)誓琼,完成降維聚類、注釋
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
load("scRNA_harmony.Rdata")
library(dplyr)
library(Seurat)
library(tidyverse)
library(patchwork)
library(SingleR)
load("ref_Human_all.RData")
refdata <- ref_Human_all
testdata <- GetAssayData(scRNA_harmony, slot="data")
clusters <- scRNA_harmony@meta.data$seurat_clusters
###開始用singler分析
cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main,
method = "cluster", clusters = clusters,
assay.type.test = "logcounts", assay.type.ref = "logcounts")
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = FALSE)
scRNA_harmony@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
scRNA_harmony@meta.data[which(scRNA_harmony@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
以cluster0為例,找到cluster0的marker基因并進行GO富集
#對cluster0尋找marker基因
deg =FindMarkers(scRNA_harmony,ident.1 = "sample_11",ident.2 = "sample_3",group.by = "orig.ident",
subset.ident = "0")
degs.list=rownames(deg)
#GO BP富集
library(org.Hs.eg.db)
library(clusterProfiler)
erich.go.BP = enrichGO(gene =degs.list,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pvalueCutoff = 0.5,
qvalueCutoff = 0.5)
dotplot(erich.go.BP,showCategory = 30)
挑選感興趣的通路進行作圖
erich.go.BP=erich.go.BP@result
挑選3個感興趣的通路信息制成txt文件
#提取用于作圖的信息
library(GOplot)
go1= read.table("goplot.txt",sep = "\t",header = T)
go1=erich.go.BP[1:3,c(1,2,8,6)]
rownames(go1)=go1$ID
go=go1
#調(diào)整go數(shù)據(jù)框
library(stringr)
go$geneID=str_replace_all(go$geneID,"/",",")
names(go)=c('ID','Term','Genes','adj_pval')
go$Category="BP"
#提取每個通路對應的基因
x1=strsplit(go$Genes[1],split=",",fixed=T)
x2=strsplit(go$Genes[2],split=",",fixed=T)
x3=strsplit(go$Genes[3],split=",",fixed=T)
g1=c(x1[[1]],x2[[1]],x3[[1]])
#從差異分析結果deg中提取這些基因的信息并整合
genedata1=deg[g1,]
genedata1$ID=rownames(genedata1)
genedata2=data.frame(ID=genedata1$ID,logFC=genedata1$avg_log2FC)
各種形式可視化
circ <- circle_dat(go,genedata2)
##條形圖
GOBar(subset(circ, category == 'BP'))
#氣泡圖
GOBubble(circ, labels = 3)
#圈圖
GOCircle(circ, nsub = 3)
#和弦圖
chord<-chord_dat(circ, genedata2)
GOChord(chord, gene.order = 'logFC')
#基因與GO Term的熱圖
GOHeat(chord, nlfc = 1, fill.col = c('red', 'yellow', 'green'))
3.GSVA富集
GSVA富集不僅可以得出差異基因富集于哪個通路腹侣,還能分析出該通路相關的基因綜合表達是上調(diào)還是下調(diào)叔收。GSVA先將基因富集到各通路上,再通過差異分析找出差異表達的通路傲隶。
①pipeline
輸入表達矩陣饺律,得到GSVA評分矩陣
準備分析數(shù)據(jù)與package
library(GSEABase)
library(GSVA)
library(msigdbr)
###準備參考數(shù)據(jù)集###
m_df<- msigdbr(species = "human", category = "H" )
#將gene_symbol按gs_name分組
geneSets <- m_df %>% split(x = .$gene_symbol, f = .$gs_name)
###準備分析數(shù)據(jù)集###
exp=AverageExpression(scRNA_harmony,add.ident = "orig.ident")
exp=exp[["RNA"]]
#挑選sample11與sample3的cluster0、1跺株、2進行分析
counts2=exp[,c(1:6)]
進行GSVA富集
GSVA_hall <- gsva(expr=as.matrix(counts2),
gset.idx.list=geneSets,
mx.diff=T, # 數(shù)據(jù)為正態(tài)分布則T复濒,雙峰則F
kcdf="Poisson", #CPM, RPKM, TPM數(shù)據(jù)就用默認值"Gaussian", read count數(shù)據(jù)則為"Poisson"
)
limma進行差異通路分析
# 設置分組矩陣
group <- factor(c(rep("s.11",3), rep("s.3", 3)), levels = c( 's.11','s.3'))
design <- model.matrix(~0+group)
colnames(design) = levels(factor(group))
rownames(design) = colnames(GSVA_hall)
###進行差異分析###
#創(chuàng)建比較對象乒省,此處是s.3與s.11比較
compare <- makeContrasts(s.3 - s.11, levels=design)
#線性擬合
fit <- lmFit(GSVA_hall, design)
fit2 <- contrasts.fit(fit, compare)
#貝葉斯
fit3 <- eBayes(fit2)
#提取差異分析結果
Diff <- topTable(fit3, coef=1, number=200)
可視化
dat_plot <- data.frame(id = row.names(Diff),
t = Diff$t)
# 去掉"HALLMARK_"
library(stringr)
dat_plot$id <- str_replace(dat_plot$id , "HALLMARK_","")
# 變成因子類型
dat_plot$id <- factor(dat_plot$id,levels = dat_plot$id)
# 新增一列 根據(jù)t閾值分類
dat_plot$threshold = factor(ifelse(dat_plot$t >-1, ifelse(dat_plot$t >= 1 ,'Up','NoSignifi'),'Down'),levels=c('Up','Down','NoSignifi'))
# 按t值排序
dat_plot <- dat_plot %>% arrange(t)
library(ggplot2)
library(ggthemes)
library(ggprism)
p <- ggplot(data = dat_plot,aes(x = id,y = t,fill = threshold)) +
geom_col()+
coord_flip() +#橫縱坐標對調(diào)
scale_fill_manual(values = c('Up'= '#36638a','NoSignifi'='#cccccc','Down'='#7bcd7b')) +
geom_hline(yintercept = c(-2,2),color = 'white',size = 0.5,lty='dashed') +
xlab('') +
ylab('t value of GSVA score, S.11 VS S.3') +
guides(fill=F)+ # 不顯示圖例
theme_prism(border = T) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
###添加標簽###
# 小于-1的數(shù)量
low1 <- dat_plot %>% filter(t < -1) %>% nrow()
# 小于0總數(shù)量
low0 <- dat_plot %>% filter( t < 0) %>% nrow()
# 小于1總數(shù)量
high0 <- dat_plot %>% filter(t < 1) %>% nrow()
# 總的柱子數(shù)量
high1 <- nrow(dat_plot)
# 依次從下到上添加標簽
p <- p + geom_text(data = dat_plot[1:low1,],aes(x = id,y = 0.1,label = id),
hjust = 0,color = 'black',size=2) + # 小于-1的為黑色標簽
geom_text(data = dat_plot[(low1 +1):low0,],aes(x = id,y = 0.1,label = id),
hjust = 0,color = 'grey',size=2) + # 灰色標簽
geom_text(data = dat_plot[(low0 + 1):high0,],aes(x = id,y = -0.1,label = id),
hjust = 1,color = 'grey',size=2) + # 灰色標簽
geom_text(data = dat_plot[(high0 +1):high1,],aes(x = id,y = -0.1,label = id),
hjust = 1,color = 'black',size=2) # 大于1的為黑色標簽
p
pheatmap::pheatmap(GSVA_hall, #熱圖的數(shù)據(jù)
cluster_rows = T,#行聚類
cluster_cols =T,#列聚類巧颈,可以看出樣本之間的區(qū)分度
fontsize = 5,
show_colnames=T,
scale = "row", #以行來標準化
color =colorRampPalette(c("#FF7744", "white","#AAAAAA","#0044BB"))(100))
4.自定義通路
某些情況下直接用GO、KEGG袖扛、hallmark等數(shù)據(jù)庫已有的基因集進行富集分析得不到理想結果砸泛,就可以自定義通路與通路對應的基因進行GSVA分析(通過文獻或已有資料刪除對通路起負反饋效果的基因等)。
①GSVA自定義分析
準備自定義的通路數(shù)據(jù)集
需要準備1.基因名稱的txt文件,2.通路名稱文件
先在excel表中進行操作蛆封,再復制粘貼到txt中
在excel表中準備數(shù)據(jù)時需要多加一個自定義的xyz通路晾嘶,并保證該通路基因數(shù)最多,因為在后續(xù)去掉表中的空值時最長的一列(即沒有空值的一列)會被全部去除
#讀取文件
gene.set=read.table("gene.txt",
header =F,sep = '\t',quote = '',fill=TRUE)
kegg.123=read.table("genesetname.txt",
header =F,sep = '\t',quote = '')
#將行轉為基因娶吞,列轉為通路
gene.set1=as.matrix(gene.set)
gene.set2=t(gene.set1)
#去掉空值
gmt=list()
for (i in 1:11) {
y=as.character(gene.set2[,i])
b=which(y=="")
gmt[[i]]=y[-b]
}
#對通路命名
names(gmt)=kegg.123$V1
#去掉xyz通路
gmt=gmt[-1]
②GO富集自定義分析
找出差異基因
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
load("scRNA_harmony.Rdata")
library(dplyr)
library(Seurat)
library(tidyverse)
library(patchwork)
library(SingleR)
load("ref_Human_all.RData")
refdata <- ref_Human_all
testdata <- GetAssayData(scRNA_harmony, slot="data")
###把scRNA數(shù)據(jù)中的seurat_clusters提取出來垒迂,注意這里是因子類型的
clusters <- scRNA_harmony@meta.data$seurat_clusters
###開始用singler分析
cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main,
method = "cluster", clusters = clusters,
assay.type.test = "logcounts", assay.type.ref = "logcounts")
celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = FALSE)
scRNA_harmony@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
scRNA_harmony@meta.data[which(scRNA_harmony@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
sc.t=scRNA_harmony[,rownames(subset(scRNA_harmony@meta.data,celltype=="T_cells"))]
deg =FindMarkers(scRNA_harmony,ident.1 = "sample_11",ident.2 = "sample_3",group.by = "orig.ident",
subset.ident = "0")
進行自定義的GO分析
用enricher()進行自定義分析,enrichGO()的分析數(shù)據(jù)都來源于網(wǎng)絡上的數(shù)據(jù)庫妒蛇,無法修改机断,但enricher()依靠自定義的通路數(shù)據(jù),方便修改绣夺。
enricher的通路數(shù)據(jù)TERM2GENE是包含兩列的數(shù)據(jù)框吏奸,一列為通路,一列為基因
#準備通路數(shù)據(jù)
m_df <-msigdbr(species = "human",category = "H")
m_df <- m_df[,c(3,4)]
#進行富集分析
res<-enricher(gene = rownames(deg),TERM2GENE = m_df)
5.代謝活性通路富集
僅關注與代謝有關的通路
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
install.packages("wesanderson")
install.packages("C:/Users/86269/Desktop/shun.C/single_cell/phangorn_2.8.1.zip", repos = NULL, type = "win.binary")
install.packages("C:/Users/86269/Desktop/shun.C/single_cell/quadprog_1.5-8.zip", repos = NULL, type = "win.binary")
devtools::install_github("YosefLab/VISION")
devtools::install_github("wu-yc/scMetabolism")
library(scMetabolism)
library(ggplot2)
library(rsvd)
load(file = "pbmc_demo.rda")
countexp.Seurat<-sc.metabolism.Seurat(obj = countexp.Seurat, method = "VISION", imputation = F, ncores = 2, metabolism.type = "KEGG")
DimPlot.metabolism(obj = countexp.Seurat, pathway = "Glycolysis / Gluconeogenesis", dimention.reduction.type = "umap", dimention.reduction.run = F, size = 1)
##Dot plot
input.pathway<-c("Glycolysis / Gluconeogenesis", "Oxidative phosphorylation", "Citrate cycle (TCA cycle)")
DotPlot.metabolism(obj = countexp.Seurat, pathway = input.pathway, phenotype = "ident", norm = "y")
## Box plot
BoxPlot.metabolism(obj = countexp.Seurat, pathway = input.pathway, phenotype = "ident", ncol = 1)
metabolism.matrix<-sc.metabolism(countexp = countexp, method = "VISION", imputation = F, ncores = 2, metabolism.type = "KEGG")
6.GSEA富集分析
傳統(tǒng)的KEGG或GO富集分析的缺點是規(guī)定閾值進行篩選陶耍,但一條通路上不同的基因表達量可能會有很大差異胶背,上游基因的微小變化可能造成下游基因明顯變化,因此規(guī)定閾值進行篩選會損失很多基因信息歹茶。而GSEA方法不需要規(guī)定閾值進行篩選捎拯,規(guī)避了傳統(tǒng)富集方法的缺點。
①Pipeline
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
###加載所需要的包
library(Seurat)
library(tidyverse)
library(dplyr)
library(patchwork)
library(harmony)
library(cowplot)
library(ggplot2)
load("scRNA1.rdata")
#按細胞類型分組毯欣,尋找差異基因
Idents(scRNA1)="celltype"
deg=FindMarkers(scRNA1,ident.1 = "Adipocytes",ident.2 = "Granulocytes",
min.pct = 0.01,logfc.threshold = 0.01)
#將avg_log2FC按從大到小提嚷(GSEA函數(shù)的要求)
genelist=deg$avg_log2FC
#把鼠的基因名改為人的基因名,二者的差別就在于大小寫不同
names(genelist)=toupper(rownames(deg))
genelist=sort(genelist,decreasing = T)
###進行富集分析與可視化###
library(clusterProfiler)
library(org.Hs.eg.db)
library(GSEABase)
library(enrichplot)
#讀取通路信息
geneset=read.gmt("c2.cp.kegg.v7.5.1.symbols.gmt")
#進行GSEA富集
egmt=GSEA(genelist,TERM2GENE = geneset,
minGSSize = 1,pvalueCutoff = 0.5)#TERM2GENE可以自行篩選
#查看前兩條通路的富集情況
gseaplot2(egmt,geneSetID =c(1,2),pvalue_table = T,base_size = 5)
第一部分為基因Enrichment Score的折線圖酗钞,大概意思是對差異基因排序列表進行掃描腹忽,每遇到一個參與該通路的差異基因来累,得分就會上升,如果遇到一個差異基因不在該通路里窘奏,就罰分嘹锁;score縱軸為對應的Running ES, 在折線圖中有個峰值,該峰值就是這個基因集的Enrichemnt score着裹,峰值之前的基因就是該基因集下的核心基因领猾,也稱為領頭亞集基因,表明對該條通路最終得分貢獻最大的基因求冷。
第二部分用線條標記位于該基因集下的基因瘤运,從左到右按照log2FC從高到低進行排序。
第三部分的縱坐標代表FoldChange值匠题。
從圖中可看出一個通路的基因富集在FoldChange>0的區(qū)域拯坟,說明該通路上調(diào),另一個則富集在FoldChange<0的區(qū)域韭山,說明該通路下調(diào)郁季。
將上調(diào)與下調(diào)的通路分別進行批量可視化并儲存圖片
kegg.res=egmt@result
down.kegg.res<-kegg.res[(kegg.res$p_adjust<0.01 & kegg.res$enrichmentScore < -1.5),]
down.kegg.res$group=1
up.kegg.res<-kegg.res[(kegg.res$p_adjust<0.01 & kegg.res$enrichmentScore > 1.5),]
up.kegg.res$group=1
lapply(1:nrow(down.kegg.res), function(i){
gseaplot2(egmt,down.kegg.res$ID[i],title = down.kegg.res$Description[i],pvalue_table = T)
ggsave(paste0(gsub("/","_",down.kegg.res$Description[i]),".down.pdf"),width = 11,height =5)
}
)
down.kegg.res$ID[1]
lapply(1:nrow(up.kegg.res), function(i){
gseaplot2(egmt,up.kegg.res$ID[i],title = up.kegg.res$Description[i],pvalue_table = T)
ggsave(paste0(gsub("/","-",up.kegg.res$Description[i]),".up.pdf"),width = 11,height =5)
}
)
②自定義通路
fgsea能夠快速對預選基因集進行GSEA富集分析,預選基因集可以是自己設定钱磅,一般使用MSigdb數(shù)據(jù)庫(同樣由提出GSEA方法團隊提供)梦裂。
library(msigdbr)
library(fgsea)
#提取預選基因集
m_df<- msigdbr(species = "Mus musculus", category = "C2", subcategory = "KEGG")
fgsea_sets<- m_df %>% split(x = .$gene_symbol, f = .$gs_name)
#準備用于富集分析的基因數(shù)據(jù)
genelist=deg$avg_log2FC
names(genelist)= rownames(deg)
genelist=sort(genelist,decreasing = T)
#進行富集
fgseaRes<- fgsea(fgsea_sets, stats = genelist )
#進行可視化,篩選p值<0.005的20個通路,NES<1.5的視為下調(diào)基因盖淡,反之則是上調(diào)
ggplot(fgseaRes %>% filter(padj < 0.005) %>% head(n= 20), aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill= NES < 1.5)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal() ####以1.5進行繪圖填色
③自定義畫圖
加載數(shù)據(jù)年柠,尋找差異基因
##系統(tǒng)報錯改為英文
Sys.setenv(LANGUAGE = "en")
##禁止轉化為因子
options(stringsAsFactors = FALSE)
##清空環(huán)境
rm(list=ls())
setwd("C:/Users/86269/Desktop/shun.C/single_cell")
###加載所需要的包
library(Seurat)
library(tidyverse)
library(dplyr)
library(patchwork)
library(harmony)
library(cowplot)
library(ggplot2)
library(clusterProfiler)
library(enrichplot)
library(plyr)
library(ggrepel)
library(ggplot2)
library(RColorBrewer)
library(gridExtra)
load("scRNA1.rdata")
Idents(scRNA1)="celltype"
deg=FindMarkers(scRNA1,ident.1 = "Adipocytes",ident.2 = "Granulocytes",
min.pct = 0.01,logfc.threshold = 0.01)
進行富集分析
#按照FoldChange排序
gsym.fc.id.sorted <- deg[order(deg$avg_log2FC,decreasing = T),]
#將基因ID從symbol轉化為entrez
expm.id <- bitr(rownames(gsym.fc.id.sorted), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Mm.eg.db")
exp.fc.id <- merge(expm.id, gsym.fc.id.sorted,by.x=1, by.y=0 )
#按FoldChange排序
gsym.fc.id.sorted <- exp.fc.id[order(exp.fc.id$avg_log2FC,decreasing = T),]
#獲得ENTREZID、foldchange列表褪迟,做為GSEA的輸入
id.fc <- gsym.fc.id.sorted$avg_log2FC
names(id.fc) <- gsym.fc.id.sorted$ENTREZID
#進行gseKEGG富集分析
library(R.utils)
R.utils::setOption("clusterProfiler.download.method","auto")
#這一條語句就做完了KEGG的GSEA分析
kk <- gseKEGG(id.fc, organism = "mmu")
#選擇兩條通路作圖
gseaplot2(kk, geneSetID = c(1,2),
color = c("darkgreen","chocolate4")) #自定義顏色
自定義作圖
作圖前數(shù)據(jù)準備
#把ENTREZ ID轉為gene symbol冗恨,便于查看通路里的基因
kk.gsym <- setReadable(kk, 'org.Mm.eg.db', #物種
'ENTREZID')
#按照enrichment score從高到低排序,便于查看富集的通路
sortkk <- kk.gsym[order(kk.gsym$enrichmentScore, decreasing = T),]
# 要畫的通路
geneSetID <- c("mmu03010", "mmu00982")
# 突出顯示感興趣的基因
selectedGeneID <- c("Rps28","Rps29","Rps26","Rps2","Rpl22l1")
# 自定義足夠多的顏色
mycol <- c("darkgreen","chocolate4","blueviolet","#223D6C","#D20A13","#088247","#58CDD9","#7A142C","#5D90BA","#431A3D","#91612D","#6E568C","#E0367A","#D8D155","#64495D","#7CC767")
# 字號
base_size = 12
# rank mode
rankmode <- "comb" #合起來畫
#rankmode <- "sep" #如果通路多味赃,分開畫更好
#合并多條通路的數(shù)據(jù)
gsdata <- do.call(rbind, lapply(geneSetID,enrichplot:::gsInfo, object = x))#對兩條通路分別用gsInfo提取富集結果
gsdata$gsym <- rep(gsym.fc.id.sorted$SYMBOL,2)
分別畫出GSEA富集圖的三個部分
# 畫running score
p.res <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) +
geom_line(aes_(y = ~runningScore, color= ~Description), size=1) +
scale_color_manual(values = mycol) +
scale_x_continuous(expand=c(0,0)) + #兩側不留空
geom_hline(yintercept = 0, lty = 2, lwd = 0.2) + #在0的位置畫虛線
ylab("Enrichment\n Score") +
theme_bw(base_size) +
theme(panel.grid = element_blank()) + #不畫網(wǎng)格
theme(legend.position = "top", legend.title = element_blank(),
legend.background = element_rect(fill = "transparent")) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.line.x=element_blank(),
plot.margin=margin(t=.2, r = .2, b=0, l=.2, unit="cm"))
# 畫rank
if (rankmode == "comb") {
rel_heights <- c(1.5, .2, 1.5) # 整個GSEA圖上中下三個部分的比例
p2 <- ggplot(gsdata, aes_(x = ~x)) +
geom_bar(position = "dodge",stat = "identity",
aes(x, y = position*(-0.1), fill=Description))+#圖中所有豎線即y值相等掀抹,都是0
xlab(NULL) + ylab(NULL) +
scale_fill_manual(values = mycol) + #用自定義的顏色
theme_bw(base_size) +
theme(panel.grid = element_blank()) + #不畫網(wǎng)格
theme(legend.position = "none",
plot.margin = margin(t=-.1, b=0,unit="cm"),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.line.x = element_blank()) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))
} else if (rankmode == "sep") {
rel_heights <- c(1.5, .5, 1.5) # 上中下三個部分的比例
i <- 0
for (term in unique(gsdata$Description)) {
idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
gsdata[idx, "ymin"] <- i
gsdata[idx, "ymax"] <- i + 1
}
head(gsdata)
p2 <- ggplot(gsdata, aes_(x = ~x)) +
geom_linerange(aes_(ymin=~ymin, ymax=~ymax, color=~Description)) +
xlab(NULL) + ylab(NULL) +
scale_color_manual(values = mycol) + #用自定義的顏色
theme_bw(base_size) +
theme(panel.grid = element_blank()) + #不畫網(wǎng)格
theme(legend.position = "none",
plot.margin = margin(t=-.1, b=0,unit="cm"),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.line.x = element_blank()) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))
} else {stop("Unsupport mode")}
# 畫變化倍數(shù)
df2 <- p.res$data
df2$y <- p.res$data$geneList[df2$x]
# 提取感興趣的基因的變化倍數(shù)
selectgenes <- data.frame(gsym = selectedGeneID)
selectgenes <- merge(selectgenes, df2, by = "gsym")
selectgenes <- selectgenes[selectgenes$position == 1,]
p.pos <- ggplot(selectgenes, aes(x, y, fill = Description, color = Description, label = gsym)) +
geom_segment(data=df2, aes_(x=~x, xend=~x, y=~y, yend=0),
color = "grey") +
geom_bar(position = "dodge", stat = "identity") +
scale_fill_manual(values = mycol, guide=FALSE) + #用自定義的顏色
scale_color_manual(values = mycol, guide=FALSE) + #用自定義的顏色
scale_x_continuous(expand=c(0,0)) +
geom_hline(yintercept = 0, lty = 2, lwd = 0.2) + #在0的位置畫虛線
ylab("Ranked list\n metric") +
xlab("Rank in ordered dataset") +
theme_bw(base_size) +
theme(panel.grid = element_blank()) +
# 顯示感興趣的基因的基因名
geom_text_repel(data = selectgenes,
show.legend = FALSE, #不顯示圖例
direction = "x", #基因名橫向排列在x軸方向
ylim = c(2, NA), #基因名畫在-2下方
angle = 90, #基因名豎著寫
size = 2.5, box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines")) +
theme(plot.margin=margin(t = -.1, r = .2, b=.2, l=.2, unit="cm"))
# 組圖
plotlist <- list(p.res, p2, p.pos)
n <- length(plotlist)
plotlist[[n]] <- plotlist[[n]] +
theme(axis.line.x = element_line(),
axis.ticks.x = element_line(),
axis.text.x = element_text())
plot_grid(plotlist = plotlist, ncol = 1, align="v", rel_heights = rel_heights)