摘要:不同的測(cè)序平臺(tái)或同一平臺(tái)不同樣本得到的測(cè)序數(shù)據(jù)需要去除批次效應(yīng)后進(jìn)行整合分析或南,擬采用Seurat2程序包對(duì)CelSeq繁仁,CelSeq2掰曾,F(xiàn)luidigm C1,SMART-Seq2四種測(cè)序技術(shù)產(chǎn)生的胰島單細(xì)胞測(cè)序數(shù)據(jù)進(jìn)行整合分析并去除批次效應(yīng)倘是。
1 數(shù)據(jù)獲取
從https://satijalab.org/seurat/pancreas_integration_label_transfer.html處下載得到四種測(cè)序技術(shù)的胰島單細(xì)胞測(cè)序數(shù)據(jù)進(jìn)行后續(xù)分析亭枷。
2 分析過(guò)程
2.1 數(shù)據(jù)預(yù)處理
library(Seurat)
### 構(gòu)建Seurat對(duì)象
pancreas.data <- readRDS(file = "pancreas_expression_matrix.rds")
metadata <- readRDS(file = "pancreas_metadata.rds")
pancreas <- CreateSeuratObject(raw.data = pancreas.data, meta.data = metadata)
### 分離4種技術(shù)測(cè)序數(shù)據(jù),構(gòu)建獨(dú)立對(duì)象
pancreas.list <- SplitObject(object = pancreas,attribute.1 = "tech")
celseq <- pancreas.list$celseq
celseq2 <- pancreas.list$celseq2
fluidigmc1 <- pancreas.list$fluidigmc1
smartseq2 <- pancreas.list$smartseq2
### QC and normalize, preprocessing
# for celseq data
celseq <- FilterCells(celseq, subset.names = "nGene", low.thresholds = 1750)
celseq <- NormalizeData(celseq)
celseq <- FindVariableGenes(celseq, do.plot = F, display.progress = F)
celseq <- ScaleData(celseq)
# for celseq2 data
celseq2 <- FilterCells(celseq2, subset.names = "nGene", low.thresholds = 2500)
celseq2 <- NormalizeData(celseq2)
celseq2 <- FindVariableGenes(celseq2, do.plot = F, display.progress = F)
celseq2 <- ScaleData(celseq2)
# for fluidigmc1 data, already filtered and no need to FilterCells again
fluidigmc1 <- NormalizeData(fluidigmc1)
fluidigmc1 <- FindVariableGenes(fluidigmc1, do.plot = F, display.progress = F)
fluidigmc1 <- ScaleData(fluidigmc1)
# for smartseq2 data
smartseq2 <- FilterCells(smartseq2, subset.names = "nGene", low.thresholds = 2500)
smartseq2 <- NormalizeData(smartseq2)
smartseq2 <- FindVariableGenes(smartseq2, do.plot = F, display.progress = F)
smartseq2 <- ScaleData(smartseq2)
### 對(duì)數(shù)據(jù)進(jìn)行采樣搀崭,每個(gè)數(shù)據(jù)集隨機(jī)選取500個(gè)細(xì)胞進(jìn)行下游分析叨粘,以減少內(nèi)存消耗,縮短分析時(shí)間
celseq <- SetAllIdent(celseq, id = "tech")
celseq <- SubsetData(celseq, max.cells.per.ident = 500, random.seed = 1)
celseq2 <- SetAllIdent(celseq2, id = "tech")
celseq2 <- SubsetData(celseq2, max.cells.per.ident = 500, random.seed = 1)
fluidigmc1 <- SetAllIdent(fluidigmc1, id = "tech")
fluidigmc1 <- SubsetData(fluidigmc1, max.cells.per.ident = 500, random.seed = 1)
smartseq2 <- SetAllIdent(smartseq2, id = "tech")
smartseq2 <- SubsetData(smartseq2, max.cells.per.ident = 500, random.seed = 1)
# 將預(yù)處理好的數(shù)據(jù)存儲(chǔ)備用
save(celseq, celseq2, fluidigmc1, smartseq2, file='splited.4.seurat.object.RData')
2.2 不進(jìn)行批次效應(yīng)的矯正瘤睹,直接將數(shù)據(jù)匯總之后進(jìn)行分析
# Merge Seurat objects
gcdata <- MergeSeurat(celseq, celseq2, do.normalize=F)
gcdata <- MergeSeurat(gcdata, fluidigmc1, do.normalize=F)
gcdata <- MergeSeurat(gcdata, smartseq2, do.normalize=F)
# 查看不同4種技術(shù)之間在測(cè)得基因數(shù)量上的差異升敲,如圖1所示
VlnPlot(gcdata, "nGene", group.by = "tech")
圖1. 不同數(shù)據(jù)集單個(gè)細(xì)胞測(cè)得基因數(shù)量.png
# 對(duì)匯總后的數(shù)據(jù)進(jìn)行矯正和歸一化
gcdata <- NormalizeData(object = gcdata)
gcdata <- FindVariableGenes(gcdata, do.plot = F, display.progress = F)
gcdata <- ScaleData(gcdata, genes.use = gcdata@var.genes)
# PCA降維
gcdata <- RunPCA(gcdata, pc.genes = gcdata@var.genes,
pcs.compute = 30, do.print = TRUE, pcs.print = 5, genes.print = 5)
# 查看細(xì)胞在PCA前兩個(gè)維度空間中的分布,觀察有無(wú)批次效應(yīng)轰传,如圖2所示
# 可以用看出細(xì)胞分布受批次效應(yīng)影響較大驴党,尤其是Fluidigm C1數(shù)據(jù)集與其他數(shù)據(jù)集存在明顯偏離
DimPlot(gcdata, reduction.use = "pca", dim.1 = 1, dim.2 = 2, group.by = "tech")
圖2. 未去除批次效應(yīng)下細(xì)胞PCA降維分布.png
# 聚類并進(jìn)行tSNE降維
gcdata <- FindClusters(gcdata, reduction.type = "pca", dims.use = 1:20,
print.output = F, save.SNN = T, force.recalc = T, random.seed = 100)
gcdata <- RunTSNE(gcdata, dims.use = 1:20, do.fast = T, reduction.use = "pca", perplexity = 30)
# 在tSNE降維空間查看細(xì)胞聚類結(jié)果與批次分布,如圖3所示获茬。
# 可看出細(xì)胞聚類結(jié)果受到批次效應(yīng)影響較大港庄,同一細(xì)胞群基本來(lái)自同一批次
p1 <- DimPlot(gcdata, reduction.use = "tsne", dim.1 = 1, dim.2 = 2, group.by = "res.0.8")
p2 <- DimPlot(gcdata, reduction.use = "tsne", dim.1 = 1, dim.2 = 2, group.by = "tech")
plot_grid(p1, p2)
圖3. 未去除批次效應(yīng)下細(xì)胞聚類結(jié)果.png
# 根據(jù)Adjusted rand index(ARI)系數(shù)對(duì)批次效應(yīng)進(jìn)行評(píng)價(jià)
# ARI對(duì)當(dāng)前聚類結(jié)果與4種技術(shù)標(biāo)簽進(jìn)行比較倔既,ARI值在-1到1之間
# ARI接近于1說(shuō)明兩種聚類標(biāo)簽具有很高的相似性
# ARI接近于0說(shuō)明一種聚類標(biāo)簽相對(duì)于另一種為隨機(jī)產(chǎn)生
# ARI小于0說(shuō)明兩種聚類標(biāo)簽相似性甚至低于隨機(jī)水平(此處存疑)
# 具體計(jì)算過(guò)程見(jiàn)https://davetang.org/muse/2017/09/21/adjusted-rand-index/
# 自然,我們希望當(dāng)前聚類結(jié)果與4種技術(shù)標(biāo)簽互相獨(dú)立鹏氧,ARI值應(yīng)接近于0
# 結(jié)果結(jié)果顯示ARI為0.3430069渤涌,說(shuō)明當(dāng)前聚類結(jié)果受到批次效應(yīng)影響
ari <- dplyr::select(gcdata@meta.data, tech, res.0.8)
ari$tech <- plyr::mapvalues(ari$tech, from = c("celseq", "celseq2", "fluidigmc1", "smartseq2"), to = c(0, 1, 2, 3))
mclust::adjustedRandIndex(as.numeric(ari$tech), as.numeric(ari$res.0.8))
# 保存當(dāng)前結(jié)果
save(gcdata, file = “gcdata.Rdata”)
2.3 通過(guò)CCA算法進(jìn)行批次效應(yīng)的校正后進(jìn)行下游分析
# 鑒定出至少在兩個(gè)數(shù)據(jù)集中同時(shí)出現(xiàn)的高變異基因,使用這些基因進(jìn)行CCA分析
ob.list <- list(celseq, celseq2, fluidigmc1, smartseq2)
genes.use <- c()
for (i in 1:length(ob.list)) {
genes.use <- c(genes.use, head(rownames(ob.list[[i]]@hvg.info), 1000))
}
genes.use <- names(which(table(genes.use) > 1))
for (i in 1:length(ob.list)) {
genes.use <- genes.use[genes.use %in% rownames(ob.list[[i]]@scale.data)]
}
# 使用上述程序鑒定到的高變異基因把还,對(duì)4個(gè)數(shù)據(jù)集進(jìn)行CCA分析实蓬,并計(jì)算前15個(gè)CC維度
gcdata.CCA <- RunMultiCCA(ob.list, genes.use = genes.use, num.ccs = 15)
# 查看細(xì)胞在CCA前兩個(gè)維度空間中的分布,觀察有無(wú)批次效應(yīng)吊履,如圖4所示
# 可以看出與圖2相比安皱,批次效應(yīng)得到一定程度的緩解
DimPlot(gcdata.CCA, reduction.use = "cca", group.by = "tech", pt.size = 0.5)
圖4. CCA降維后細(xì)胞分布.png
# 查看在前幾個(gè)CC維度上重要性較大的部分基因,這些基因的表達(dá)可能在細(xì)胞類別的區(qū)分上權(quán)重較大艇炎,如圖5所示
DimHeatmap(object = gcdata.CCA, reduction.type = "cca", cells.use = 500, dim.use = 1:9, do.balanced = TRUE)
圖5. 前9個(gè)CC維度上基因得分.png
# 使用MetageneBicorPlot功能练俐,選擇下游分析使用的CC維度,如圖6所示
# 可選擇前10-12個(gè)CC維度進(jìn)行下游分析
MetageneBicorPlot(gcdata.CCA, grouping.var = "tech", dims.eval = 1:15)
圖6. MetageneBicorPlot.png
# 將4個(gè)數(shù)據(jù)集的CCA維度空間進(jìn)行比對(duì)冕臭,產(chǎn)生新的降維空間cca.aligned
# 在新的空間對(duì)細(xì)胞進(jìn)行聚類分析
# 使用AlignSubspace,同時(shí)聲明分組變量與比對(duì)維度
gcdata.CCA <- AlignSubspace(gcdata.CCA, grouping.var = "tech", dims.align = 1:12)
# 查看比對(duì)之后的維度分?jǐn)?shù)分布燕锥,如圖7所示
VlnPlot(gcdata.CCA, features.plot = c("ACC1","ACC2"),, group.by = "tech")
圖7. CCA比對(duì)后維度分?jǐn)?shù)分布.png
# 與未比對(duì)之前的CC維度分?jǐn)?shù)分布進(jìn)行比較辜贵,如圖8所示
# 圖7、圖8比較归形,可見(jiàn)經(jīng)過(guò)CCA比對(duì)之后托慨,細(xì)胞分布收到批次效應(yīng)影響的程度更小
VlnPlot(gcdata.CCA, features.plot = c("CC1","CC2"),, group.by = "tech")
圖8. CCA比對(duì)前維度分?jǐn)?shù)分布.png
# 在比對(duì)后的ACC空間進(jìn)行聚類分析與tSNE降維分析
gcdata.CCA <- FindClusters(gcdata.CCA, reduction.type = "cca.aligned", dims.use = 1:12,
save.SNN = T, random.seed = 100)
gcdata.CCA <- RunTSNE(gcdata.CCA, reduction.use = "cca.aligned", dims.use = 1:12,
do.fast = TRUE, seed.use = 1)
# 在tSNE降維空間查看細(xì)胞聚類結(jié)果與批次分布,如圖9所示
# 與圖3比較暇榴,可見(jiàn)批次效應(yīng)得到很大程度的緩解厚棵,各個(gè)批次的細(xì)胞呈均勻分布
p1 <- DimPlot(gcdata.CCA, reduction.use = "tsne", dim.1 = 1, dim.2 = 2,
group.by = "res.0.8", do.return = T)
p2 <- DimPlot(gcdata.CCA, reduction.use = "tsne", dim.1 = 1, dim.2 = 2,
group.by = "tech", do.return = T)
plot_grid(p1, p2)
圖9. 去除批次效應(yīng)后細(xì)胞聚類結(jié)果.png
# 計(jì)算矯正批次效應(yīng)后的ARI,結(jié)果顯示為0.04356477蔼紧,基本接近與0
# 且與未矯正批次效應(yīng)的ARI(0.3430069)相比婆硬,有明顯降低,說(shuō)明矯正后聚類結(jié)果基本不受批次效應(yīng)影響
ari <- dplyr::select(gcdata.CCA@meta.data, tech, res.0.8)
ari$tech <- plyr::mapvalues(ari$tech,
from = c("celseq", "celseq2", "fluidigmc1", "smartseq2"),
to = c(0, 1, 2, 3))
mclust::adjustedRandIndex(as.numeric(ari$tech), as.numeric(ari$res.0.8))
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.2
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] Seurat_2.3.4 Matrix_1.2-17 cowplot_0.9.4 ggplot2_3.1.1
loaded via a namespace (and not attached):
[1] tsne_0.1-3 segmented_0.5-4.0 nlme_3.1-139
[4] bitops_1.0-6 RColorBrewer_1.1-2 httr_1.4.0
[7] prabclus_2.3-1 tools_3.6.0 backports_1.1.4
[10] irlba_2.3.3 R6_2.4.0 rpart_4.1-15
[13] KernSmooth_2.23-15 Hmisc_4.2-0 lazyeval_0.2.2
[16] colorspace_1.4-1 nnet_7.3-12 npsurv_0.4-0
[19] withr_2.1.2 tidyselect_0.2.5 gridExtra_2.3
[22] compiler_3.6.0 htmlTable_1.13.1 hdf5r_0.0.0-9000
[25] labeling_0.3 diptest_0.75-7 caTools_1.17.1.2
[28] scales_1.1.0 checkmate_1.9.3 lmtest_0.9-37
[31] DEoptimR_1.0-8 mvtnorm_1.0-10 robustbase_0.93-5
[34] ggridges_0.5.1 pbapply_1.4-0 dtw_1.20-1
[37] proxy_0.4-23 stringr_1.4.0 digest_0.6.19
[40] mixtools_1.1.0 foreign_0.8-71 R.utils_2.8.0
[43] base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6
[46] bibtex_0.4.2 htmlwidgets_1.3 rlang_0.4.5
[49] rstudioapi_0.10 farver_2.0.3 jsonlite_1.6
[52] zoo_1.8-6 ica_1.0-2 mclust_5.4.3
[55] gtools_3.8.1 acepack_1.4.1 dplyr_0.8.1
[58] R.oo_1.22.0 magrittr_1.5 modeltools_0.2-22
[61] Formula_1.2-3 lars_1.2 Rcpp_1.0.1
[64] munsell_0.5.0 reticulate_1.12 ape_5.3
[67] lifecycle_0.1.0 R.methodsS3_1.7.1 stringi_1.4.3
[70] gbRd_0.4-11 MASS_7.3-51.4 flexmix_2.3-15
[73] gplots_3.0.1.1 Rtsne_0.15 plyr_1.8.4
[76] grid_3.6.0 parallel_3.6.0 gdata_2.18.0
[79] crayon_1.3.4 doSNOW_1.0.16 lattice_0.20-38
[82] splines_3.6.0 SDMTools_1.1-221.1 knitr_1.23
[85] pillar_1.4.1 igraph_1.2.4.1 fpc_2.2-1
[88] reshape2_1.4.3 codetools_0.2-16 stats4_3.6.0
[91] glue_1.3.1 lsei_1.2-0 metap_1.1
[94] latticeExtra_0.6-28 data.table_1.12.2 png_0.1-7
[97] Rdpack_0.11-0 foreach_1.4.4 tidyr_0.8.3
[100] gtable_0.3.0 RANN_2.6.1 purrr_0.3.2
[103] kernlab_0.9-27 assertthat_0.2.1 xfun_0.7
[106] class_7.3-15 survival_2.44-1.1 tibble_2.1.2
[109] snow_0.4-3 iterators_1.0.10 cluster_2.0.8
[112] fitdistrplus_1.0-14 ROCR_1.0-7
主要參考資料: