單細胞富集分析系列:
- 單細胞之富集分析-1:單細胞GSEA分析流程
- 單細胞之富集分析-2:批量GSEA和GSVA分析
- 單細胞之富集分析-3:GO和KEGG富集分析及繪圖
- 單細胞之富集分析-4:分組水平GSVA
- 單細胞之富集分析-5:一張圖展示多組富集分析結(jié)果
0. 簡介
異常細胞信號會引起癌癥等其他疾病弹渔,并且是常見的治療的靶點牲平。撤沾停可以通過基因的表達來推斷某個信號通路的活性玫荣。然而糕伐,只考慮基因表達對通路的作用往往忽略了翻譯后修飾的作用险掀,并且下游信號代表非常特定的實驗條件徽职。在這里枝缔,作者提出介紹PROGENy,這是一種通過利用大量公開可用的擾動實驗熔吗,來克服了這兩個局限性的方法辆床。與現(xiàn)有方法不同,PROGENy可以(i)恢復(fù)已知驅(qū)動基因突變的作用桅狠,(ii)提供或改善藥物的marker讼载,以及(iii)區(qū)分致癌和腫瘤抑制途徑轿秧,以確保患者生存咨堤。
PROGENy可以從基因表達數(shù)據(jù)中推斷14種信號通路(雄激素菇篡,雌激素,EGFR吱型,低氧逸贾,JAK-STAT,MAPK津滞,NFkB,PI3K灼伤,p53触徐,TGFb,TNFa狐赡,Trail撞鹉,VEGF和WNT)的通路活性。默認情況下颖侄,途徑活動推斷是基于相應(yīng)的途徑擾動后前100個響應(yīng)性最高的基因的基因集鸟雏,我們將其稱為途徑的足跡基因。為每個足跡基因分配一個權(quán)重览祖,該權(quán)重表示對路徑擾動進行調(diào)節(jié)的強度和方向孝鹊。途徑得分是通過表達和足跡基因權(quán)重乘積的加權(quán)總和計算得出的。
例如:在pbmc單細胞數(shù)據(jù)中識別通路活性展蒂。
1. 分析Bulk數(shù)據(jù)
1.1 導(dǎo)入演示數(shù)據(jù)(芯片數(shù)據(jù))
library(airway)
library(DESeq2)
data(airway)
# import data to DESeq2 and variance stabilize
dset = DESeqDataSetFromMatrix(assay(airway),
colData=as.data.frame(colData(airway)), design=~dex)
dset = estimateSizeFactors(dset)
dset = estimateDispersions(dset)
gene_expr = getVarianceStabilizedData(dset)
# annotate matrix with HGNC symbols
library(biomaRt)
mart = useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes = getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
values=rownames(gene_expr), mart=mart)
matched = match(rownames(gene_expr), genes$ensembl_gene_id)
rownames(gene_expr) = genes$hgnc_symbol[matched]
dim(gene_expr)
# [1] 64102 8
gene_expr[1:5,1:5]
# SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
# TSPAN6 9.531699 9.170317 9.674546 9.421197 10.027595
# TNMD 5.302948 5.302948 5.302948 5.302948 5.302948
# DPM1 9.055928 9.347026 9.235703 9.278863 9.166625
# SCYL3 8.358430 8.273162 8.212763 8.316611 8.136760
# C1orf112 6.967075 7.001886 6.596359 6.882393 7.060850
1.2 分析和繪圖
library(progeny)
pathways = progeny(gene_expr, scale=FALSE)
controls = airway$dex == "untrt"
ctl_mean = apply(pathways[controls,], 2, mean)
ctl_sd = apply(pathways[controls,], 2, sd)
pathways = t(apply(pathways, 1, function(x) x - ctl_mean))
pathways = apply(pathways, 1, function(x) x / ctl_sd)
library(pheatmap)
myColor = colorRampPalette(c("Darkblue", "white","red"))(100)
pheatmap(t(pathways),fontsize=14, show_rownames = F,
color=myColor, main = "PROGENy", angle_col = 45, treeheight_col = 0,
border_color = NA)
2. 分析單細胞數(shù)據(jù)
2.1 導(dǎo)入數(shù)據(jù)
library(Seurat)
library(progeny)
library(tidyr)
library(tibble)
pbmc <- readRDS("pbmc.rds") #注釋好的單細胞數(shù)據(jù)集
2.2 提取細胞類型標簽
查看每個細胞類型這14個通路的活性
## We create a data frame with the specification of the cells that belong to
## each cluster to match with the Progeny scores.
CellsClusters <- data.frame(Cell = names(Idents(pbmc)),
CellType = as.character(Idents(pbmc)),
stringsAsFactors = FALSE)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
2.3 計算
## We compute the Progeny activity scores and add them to our Seurat object
## as a new assay called Progeny.
pbmc <- progeny(pbmc, scale=FALSE, organism="Human", top=500, perm=1,
return_assay = TRUE)
pbmc@assays$progeny
# Assay data with 14 features for 2638 cells
# First 10 features:
# Androgen, EGFR, Estrogen, Hypoxia, JAK-STAT, MAPK, NFkB, p53, PI3K, TGFb
pbmc@assays[["progeny"]]@data
下儲存的是14個通路在2638個細胞中的評分
pbmc@assays[["progeny"]]@data[1:6,1:4]
# AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1
# Androgen 86.88406 57.92163 94.287118 11.135108
# EGFR 28.34279 -15.00708 -2.652041 1.738702
# Estrogen -51.13204 -67.55945 -64.452782 -37.103230
# Hypoxia 96.56731 122.76491 135.750523 96.936620
# JAK-STAT 163.11016 269.55272 315.408767 609.408980
# MAPK -10.39436 -59.37868 -17.161151 -61.763826
2.4 整合同一細胞類型的評分
## We can now directly apply Seurat functions in our Progeny scores.
## For instance, we scale the pathway activity scores.
pbmc <- Seurat::ScaleData(pbmc, assay = "progeny")
## We transform Progeny scores into a data frame to better handling the results
progeny_scores_df <-
as.data.frame(t(GetAssayData(pbmc, slot = "scale.data",
assay = "progeny"))) %>%
rownames_to_column("Cell") %>%
gather(Pathway, Activity, -Cell)
dim(progeny_scores_df)
# [1] 36932 3
## We match Progeny scores with the cell clusters.
progeny_scores_df <- inner_join(progeny_scores_df, CellsClusters)
## We summarize the Progeny scores by cellpopulation
summarized_progeny_scores <- progeny_scores_df %>%
group_by(Pathway, CellType) %>%
summarise(avg = mean(Activity), std = sd(Activity))
dim(summarized_progeny_scores)
# [1] 126 4
## We prepare the data for the plot
summarized_progeny_scores_df <- summarized_progeny_scores %>%
dplyr::select(-std) %>%
spread(Pathway, avg) %>%
data.frame(row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
2.5 繪圖
paletteLength = 100
myColor = colorRampPalette(c("Darkblue", "white","red"))(paletteLength)
progenyBreaks = c(seq(min(summarized_progeny_scores_df), 0,
length.out=ceiling(paletteLength/2) + 1),
seq(max(summarized_progeny_scores_df)/paletteLength,
max(summarized_progeny_scores_df),
length.out=floor(paletteLength/2)))
progeny_hmap = pheatmap(t(summarized_progeny_scores_df),fontsize=12,
fontsize_row = 10,
color=myColor, breaks = progenyBreaks,
main = "PROGENy (500)", angle_col = 45,
treeheight_col = 0, border_color = NA)
2.6 繪制單個通路的FeaturePlot
DefaultAssay(pbmc) <- 'progeny'
p1= FeaturePlot(pbmc,features = "NFkB", coord.fixed = T, order = T, cols = viridis(10))
p2=FeaturePlot(pbmc,features = "MAPK", coord.fixed = T, order = T, cols = viridis(10))
p1|p2