單細(xì)胞富集分析系列:
1. 數(shù)據(jù)和基因集準(zhǔn)備
library(Seurat)
library(msigdbr)
library(GSVA)
library(tidyverse)
library(clusterProfiler)
library(patchwork)
library(limma)
rm(list=ls())
- 數(shù)據(jù)準(zhǔn)備
pbmc <- readRDS("pbmc.rds")
#DefaultAssay(scRNA) <- "RNA"
#scRNA <- NormalizeData(scRNA)
- 基因集準(zhǔn)備
genesets <- msigdbr(species = "Homo sapiens", category = "H")
genesets <- subset(genesets, select = c("gs_name","gene_symbol")) %>% as.data.frame()
genesets <- split(genesets$gene_symbol, genesets$gs_name)
- 使用
AverageExpression
函數(shù)提取分組平均表達(dá)矩陣
Idents(pbmc) <- "cell_type"
expr <- AverageExpression(pbmc, assays = "RNA", slot = "data")[[1]]
expr <- expr[rowSums(expr)>0,] #選取非零基因
expr <- as.matrix(expr)
head(expr)
# Naive CD4 T Memory CD4 T CD14+ Mono B CD8 T FCGR3A+ Mono NK DC Platelet
# AL627309.1 0.006007987 0.04854338 0.006065400 0.00000000 0.01995673 0.00000000 0.00000000 0.00000000 0
# AP006222.2 0.000000000 0.01088471 0.008397321 0.00000000 0.01157323 0.00000000 0.00000000 0.00000000 0
# RP11-206L10.2 0.007306336 0.00000000 0.000000000 0.02065031 0.00000000 0.00000000 0.00000000 0.08462847 0
# RP11-206L10.9 0.000000000 0.01050116 0.000000000 0.00000000 0.00000000 0.01200008 0.00000000 0.00000000 0
# LINC00115 0.014872162 0.03753737 0.031095957 0.03888541 0.01892413 0.01469374 0.06302423 0.00000000 0
# NOC2L 0.501822970 0.27253750 0.346874253 0.58653489 0.59129394 0.50026775 0.65705381 0.32861136 0
2. 分組GSVA分析
# gsva默認(rèn)開啟全部線程計算
gsva.res <- gsva(expr, genesets, method="gsva")
saveRDS(gsva.res, "gsva.res.rds")
gsva.df <- data.frame(Genesets=rownames(gsva.res), gsva.res, check.names = F)
write.csv(gsva.df, "gsva_res.csv", row.names = F)
pheatmap::pheatmap(gsva.res, show_colnames = T,
scale = "row",angle_col = "45",
color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
3. 計算通路在case和control之間的顯著性并繪圖(limma做的)
pbmc3k的數(shù)據(jù)是有一個樣本装处,在這里我們把9種細(xì)胞當(dāng)成4個con5個病例組來計算case和control之間某通路的顯著性蜜笤。(僅供演示犬性,實際操作時候前面計算每個sample的基因表達(dá)均值幌缝,這里再換成case和control就可以了)
- 分組
group_list <- data.frame(sample = colnames(gsva.df)[-1], group = c(rep("con", 4), rep("case", 5)))
group_list
# sample group
# 1 Naive CD4 T con
# 2 Memory CD4 T con
# 3 CD14+ Mono con
# 4 B con
# 5 CD8 T case
# 6 FCGR3A+ Mono case
# 7 NK case
# 8 DC case
# 9 Platelet case
- 設(shè)置對比
design <- model.matrix(~ 0 + factor(group_list$group))
colnames(design) <- levels(factor(group_list$group))
rownames(design) <- colnames(gsva.res)
design
# case con
# Naive CD4 T 0 1
# Memory CD4 T 0 1
# CD14+ Mono 0 1
# B 0 1
# CD8 T 1 0
# FCGR3A+ Mono 1 0
# NK 1 0
# DC 1 0
# Platelet 1 0
# attr(,"assign")
# [1] 1 1
# attr(,"contrasts")
# attr(,"contrasts")$`factor(group_list$group)`
# [1] "contr.treatment"
- 差異分析
# 構(gòu)建差異比較矩陣
contrast.matrix <- makeContrasts(con-case, levels = design)
# 差異分析穿扳,case vs. con
fit <- lmFit(gsva.res, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
x <- topTable(fit2, coef = 1, n = Inf, adjust.method = "BH", sort.by = "P")
head(x)
# logFC AveExpr t P.Value adj.P.Val B
# HALLMARK_MTORC1_SIGNALING -0.3291987 -0.08593222 -4.477744 0.000747704 0.0373852 -0.2823204
# HALLMARK_PI3K_AKT_MTOR_SIGNALING -0.2988238 -0.16659423 -3.391304 0.005325363 0.1331341 -2.1754501
# HALLMARK_PEROXISOME -0.1862704 -0.08599750 -2.611784 0.022660475 0.2801731 -3.5499137
# HALLMARK_PROTEIN_SECRETION -0.2411741 -0.06878214 -2.523986 0.026641622 0.2801731 -3.7008368
# HALLMARK_CHOLESTEROL_HOMEOSTASIS -0.2317843 -0.08879774 -2.496598 0.028017314 0.2801731 -3.7476296
# HALLMARK_FATTY_ACID_METABOLISM -0.2112693 -0.10325132 -2.242946 0.044472826 0.3706069 -4.1730598
#把通路的limma分析結(jié)果保存到文件
write.csv(x, "gsva_limma.csv", quote = F)
#輸出t值,用做繪圖的輸入數(shù)據(jù)
pathway <- str_replace(row.names(x), "HALLMARK_", "")
df <- data.frame(ID = pathway, score = x$t)
head(df)
# ID score
# 1 MTORC1_SIGNALING 4.477744
# 2 PI3K_AKT_MTOR_SIGNALING 3.391304
# 3 PEROXISOME 2.611784
# 4 PROTEIN_SECRETION 2.523986
# 5 CHOLESTEROL_HOMEOSTASIS 2.496598
# 6 FATTY_ACID_METABOLISM 2.242946
write.csv(df, "enrich_bar.csv", quote = F, row.names = F)
- 準(zhǔn)備繪圖輸入矩陣
#按照score的值分組
cutoff <- 0
df$group <- cut(df$score, breaks = c(-Inf, cutoff, Inf),labels = c(1,2))
head(df)
# ID score group
# 1 MTORC1_SIGNALING -4.477744 2
# 2 PI3K_AKT_MTOR_SIGNALING -3.391304 2
# 3 PEROXISOME -2.611784 2
# 4 PROTEIN_SECRETION -2.523986 2
# 5 CHOLESTEROL_HOMEOSTASIS -2.496598 2
# 6 FATTY_ACID_METABOLISM -2.242946 2
#按照score排序
sortdf <- df[order(df$score),]
sortdf$ID <- factor(sortdf$ID, levels = sortdf$ID)
head(sortdf)
# ID score group
# 1 MTORC1_SIGNALING -4.477744 1
# 2 PI3K_AKT_MTOR_SIGNALING -3.391304 1
# 3 PEROXISOME -2.611784 1
# 4 PROTEIN_SECRETION -2.523986 1
# 5 CHOLESTEROL_HOMEOSTASIS -2.496598 1
# 6 FATTY_ACID_METABOLISM -2.242946 1
- 繪圖(這里橫軸用的是t值)
ggplot(sortdf, aes(ID, score, fill = group)) + geom_bar(stat = 'identity') +
coord_flip() +
scale_fill_manual(values = c('palegreen3', 'dodgerblue4'), guide = FALSE) +
#畫2條虛線
geom_hline(yintercept = c(-1,1),
color="white",
linetype = 2, #畫虛線
size = 0.3) + #線的粗細(xì)
#寫label
geom_text(data = subset(df, score < 0),
aes(x=ID, y= 0.1, label=ID, color = group),
size = 3, #字的大小
hjust = "outward" ) + #字的對齊方式
geom_text(data = subset(df, score > 0),
aes(x=ID, y= -0.1, label= paste0(" ", ID), color = group),#bar跟坐標(biāo)軸間留出間隙
size = 3, hjust = "inward") +
scale_colour_manual(values = c("black","black"), guide = FALSE) +
xlab("") +ylab("t value of GSVA score")+
theme_bw() + #去除背景色
theme(panel.grid =element_blank()) + #去除網(wǎng)格線
theme(panel.border = element_rect(size = 0.6)) + #邊框粗細(xì)
theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) #去除y軸
ggsave("gsva.pdf", width = 6, height = 8)