8.8 Seurat v3, 3’ 10k PBMC和全血STRT-Seq
盡管我們的/data文件夾中已經(jīng)有了所有必要的文件,我們?nèi)钥梢詮腉EO數(shù)據(jù)庫下載必要的文件:
download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149938/suppl/GSE149938_umi_matrix.csv.gz",
destfile = "GSE149938_umi_matrix.csv.gz")
download.file("https://cf.10xgenomics.com/samples/cell-exp/4.0.0/Parent_NGSC3_DI_PBMC/Parent_NGSC3_DI_PBMC_filtered_feature_bc_matrix.h5",
destfile = "3p_pbmc10k_filt.h5")
> umi_gz <- gzfile("data/update/GSE149938_umi_matrix.csv.gz",'rt')
> umi <- read.csv(umi_gz,check.names = F,quote = "")
> matrix_3p <- Read10X_h5("data/update/3p_pbmc10k_filt.h5",use.names = T)
接下來账锹,讓我們創(chuàng)建Seurat
對象并重新定義一些元數(shù)據(jù)列(GEO數(shù)據(jù)集將細胞類型放入orig.ident
slot掠哥,這會干擾我們接下來要做的事情):
> srat_wb <- CreateSeuratObject(t(umi),project = "whole_blood")
> srat_3p <- CreateSeuratObject(matrix_3p,project = "pbmc10k_3p")
> rm(umi_gz)
> rm(umi)
> rm(matrix_3p)
> colnames(srat_wb@meta.data)[1] <- "cell_type"
> srat_wb@meta.data$orig.ident <- "whole_blood"
> srat_wb@meta.data$orig.ident <- as.factor(srat_wb@meta.data$orig.ident)
> head(srat_wb[[]])
cell_type nCount_RNA nFeature_RNA orig.ident
BNK_spBM1_L1_bar25 BNK 24494 1869 whole_blood
BNK_spBM1_L1_bar26 BNK 61980 2051 whole_blood
BNK_spBM1_L1_bar27 BNK 124382 3872 whole_blood
BNK_spBM1_L1_bar28 BNK 8144 1475 whole_blood
BNK_spBM1_L1_bar29 BNK 53612 2086 whole_blood
BNK_spBM1_L1_bar30 BNK 33582 2038 whole_blood
進行基本的QC。STRT-seq與10x有很大不同趾代,每個細胞檢測到的基因要多得多。此外,出于某種原因番电,全血數(shù)據(jù)集的矩陣中沒有MT基因,但這并不重要辆琅。
> srat_wb <- SetIdent(srat_wb,value = "orig.ident")
> srat_wb[["percent.mt"]] <- PercentageFeatureSet(srat_wb, pattern = "^MT-")
> srat_wb[["percent.rbp"]] <- PercentageFeatureSet(srat_wb, pattern = "^RP[SL]")
> srat_3p[["percent.mt"]] <- PercentageFeatureSet(srat_3p, pattern = "^MT-")
> srat_3p[["percent.rbp"]] <- PercentageFeatureSet(srat_3p, pattern = "^RP[SL]")
> VlnPlot(srat_wb, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)
> VlnPlot(srat_3p, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)
用于處理GEO全血數(shù)據(jù)集的注釋文件與Cell Ranger GRCh38-2020A有很大不同漱办。讓我們看看有多少個共有基因:
> table(rownames(srat_3p) %in% rownames(srat_wb))
FALSE TRUE
18050 18551
> common_genes <- rownames(srat_3p)[rownames(srat_3p) %in% rownames(srat_wb)]
過濾基因數(shù)量過高或過低、MT基因含量過高的細胞婉烟。另外娩井,讓我們將單個矩陣限制為僅包含共有基因:
> srat_3p <- subset(srat_3p, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 15)
> srat_wb <- subset(srat_wb, subset = nFeature_RNA > 1000 & nFeature_RNA < 6000)
> srat_3p <- srat_3p[rownames(srat_3p) %in% common_genes,]
> srat_wb <- srat_wb[rownames(srat_wb) %in% common_genes,]
與之前的Seurat
v3一樣,讓我們列出一個列表并為每個對象標準化/篩選HVG:
> wb_list <- list()
> wb_list[["pbmc10k_3p"]] <- srat_3p
> wb_list[["whole_blood"]] <- srat_wb
> for (i in 1:length(wb_list)) {
wb_list[[i]] <- NormalizeData(wb_list[[i]], verbose = F)
wb_list[[i]] <- FindVariableFeatures(wb_list[[i]], selection.method = "vst", nfeatures = 2000, verbose = F)
}
接下來進行整合似袁,Seurat 3分兩步完成洞辣。
> wb_anchors <- FindIntegrationAnchors(object.list = wb_list, dims = 1:30)
> wb_seurat <- IntegrateData(anchorset = wb_anchors, dims = 1:30)
> rm(wb_list)
> rm(wb_anchors)
對未整合的數(shù)據(jù)集進行基本的處理和可視化:
> DefaultAssay(wb_seurat) <- "RNA"
> wb_seurat <- NormalizeData(wb_seurat, verbose = F)
> wb_seurat <- FindVariableFeatures(wb_seurat, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
> DimPlot(wb_seurat,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")
現(xiàn)在,讓我們看一下整合的數(shù)據(jù)集:
> DefaultAssay(wb_seurat) <- "integrated"
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
> DimPlot(wb_seurat, reduction = "umap") + plot_annotation(title = "10k 3' PBMC and white blood cells, after integration (Seurat 3)")
讓我們看一些marker:
> FeaturePlot(wb_seurat,c("MS4A1","LYZ","NKG7","PPBP","LTF","HBA1","FCER1A","IL7R","FCGR3B")) & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
從圖中我們可以看出昙衅,有一些重要的細胞類型在PBMC數(shù)據(jù)集中不存在扬霜,但存在于全血數(shù)據(jù)集中。LTF基因是中性粒細胞最顯著的marker而涉,HBA1是在紅細胞中表達的血紅蛋白基因著瓶。
現(xiàn)在讓我們對整合矩陣進行聚類,并查看聚類在兩個數(shù)據(jù)集之間的分布情況:
> wb_seurat <- FindNeighbors(wb_seurat, dims = 1:30, k.param = 10, verbose = F)
> wb_seurat <- FindClusters(wb_seurat, verbose = F)
> DimPlot(wb_seurat,label = T) + NoLegend()
聚類組成顯示了全血數(shù)據(jù)集獨有的許多聚類:
> count_table <- table(wb_seurat@meta.data$seurat_clusters, wb_seurat@meta.data$orig.ident)
> count_table
pbmc10k_3p whole_blood
0 1426 237
1 1385 71
2 1264 130
3 1211 112
4 1115 145
5 0 1052
6 355 467
7 377 211
8 386 199
9 0 550
10 343 157
11 390 82
12 0 441
13 283 125
14 7 388
15 3 380
16 19 362
17 4 367
18 2 316
19 297 13
20 0 308
21 0 265
22 0 222
23 0 221
24 15 179
25 106 22
26 93 19
27 0 103
28 77 3
29 0 50
30 32 2
31 0 32
> plot_integrated_clusters(wb_seurat)
我們可以利用GSE149938中存在的元數(shù)據(jù):
> meta <- wb_seurat[[]]
> table(meta[meta$seurat_clusters == '5',]$cell_type) ## erythrocytes
BNK CMP ery immB MEP MLP preB proB toxiNK
1 2 1040 3 1 1 2 1 1
> table(meta[meta$seurat_clusters == '20',]$cell_type) ## neutrophils
BNK CLP CMP HSC immB kineNK matureN metaN MLP myeN
2 2 1 1 1 55 1 7 1 99
plasma proN toxiNK
1 136 1
> table(meta[meta$seurat_clusters == '24',]$cell_type) ## plasma
ery naiB plasma
11 2 166
> table(meta[meta$seurat_clusters == '16',]$cell_type) ## platelets
BNK CLP CMP ery HSC LMPP matureN MEP MPP NKP
1 3 61 4 72 1 1 144 74 1
> rm(wb_seurat)
8.9 Harmony, 3’ 10k PBMC和全血STRT-Seq
首先創(chuàng)建一個合并的Seurat
數(shù)據(jù)集啼县,并對其進行標準化和后續(xù)處理:
> wb_harmony <- merge(srat_3p,srat_wb)
> wb_harmony <- NormalizeData(wb_harmony, verbose = F)
> wb_harmony <- FindVariableFeatures(wb_harmony, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_harmony <- ScaleData(wb_harmony, verbose = F)
> wb_harmony <- RunPCA(wb_harmony, npcs = 30, verbose = F)
> wb_harmony <- RunUMAP(wb_harmony, reduction = "pca", dims = 1:30, verbose = F)
看一下PCA圖的變化材原,以及沿第一個主成分的分布:
> p1 <- DimPlot(object = wb_harmony, reduction = "pca", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "PC_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)
UMAP也顯示了數(shù)據(jù)集之間的明顯差異:
> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")
使用SeuratWrappers
庫中名為RunHarmony
的函數(shù)來運行harmony
:
> wb_harmony <- wb_harmony %>% RunHarmony("orig.ident", plot_convergence = T)
這會生成我們稍后將用于所有下游分析的降維結(jié)果沸久。
> harmony_embeddings <- Embeddings(wb_harmony, 'harmony')
> harmony_embeddings[1:5, 1:5]
harmony_1 harmony_2 harmony_3 harmony_4 harmony_5
AAACCCACATAACTCG-1 2.592137 -1.6045869 3.192993 -0.03594751 3.8447941
AAACCCACATGTAACC-1 4.244764 3.2122684 -9.738222 -5.90380632 0.9607984
AAACCCAGTGAGTCAG-1 3.208084 -2.1054920 2.035577 1.90984384 5.3665634
AAACGAACAGTCAGTT-1 -1.106694 1.8151680 3.092745 -2.34038488 7.6785360
AAACGAACATTCGGGC-1 4.735928 -0.4421468 -2.196355 2.77622970 -3.2385050
校正后的PCA和分布:
> p1 <- DimPlot(object = wb_harmony, reduction = "harmony", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "harmony_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)
進行UMAP降維和Louvain聚類:
> wb_harmony <- wb_harmony %>%
RunUMAP(reduction = "harmony", dims = 1:30, verbose = F) %>%
FindNeighbors(reduction = "harmony", k.param = 10, dims = 1:30) %>%
FindClusters() %>%
identity()
> wb_harmony <- SetIdent(wb_harmony,value = "orig.ident")
> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, after integration (Harmony)")
> DimPlot(wb_harmony, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()
該數(shù)據(jù)集的整合結(jié)果看起來與Seurat v3非常相似:
> wb_harmony <- SetIdent(wb_harmony,value = "seurat_clusters")
> DimPlot(wb_harmony,label = T) + NoLegend()
更詳細的聚類檢查似乎也證實了這一點:
> plot_integrated_clusters(wb_harmony)
> rm(wb_harmony)
8.10 LIGER, 3’ 10k PBMC和全血STRT-Seq
最后,使用LIGER
進行數(shù)據(jù)整合华糖。此步驟需要一些時間來運行:
> wb_liger <- merge(srat_3p,srat_wb)
> wb_liger <- NormalizeData(wb_liger)
> wb_liger <- FindVariableFeatures(wb_liger)
> wb_liger <- ScaleData(wb_liger, split.by = "orig.ident", do.center = F)
> wb_liger <- RunOptimizeALS(wb_liger, k = 30, lambda = 5, split.by = "orig.ident")
> wb_liger <- RunQuantileNorm(wb_liger, split.by = "orig.ident")
然后麦向,使用與之前類似的設置執(zhí)行Louvain聚類(FindNeighbors
和FindClusters
):
> wb_liger <- FindNeighbors(wb_liger,reduction = "iNMF",k.param = 10,dims = 1:30)
> wb_liger <- FindClusters(wb_liger)
讓我們用幾種不同的方式來看一下整合后的UMAP圖:
> wb_liger <- RunUMAP(wb_liger, dims = 1:ncol(wb_liger[["iNMF"]]), reduction = "iNMF",verbose = F)
> wb_liger <- SetIdent(wb_liger,value = "orig.ident")
> DimPlot(wb_liger,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and 10k 5' PBMC cells, after integration (LIGER)")
> DimPlot(wb_liger, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()
最后,看一下每個cluster的分布:
> plot_integrated_clusters(wb_liger)
> rm(wb_liger)
> rm(srat_wb)
> rm(srat_3p)
往期內(nèi)容:
重生之我在劍橋大學學習單細胞RNA-seq分析——8. scRNA-seq數(shù)據(jù)整合(1)
重生之我在劍橋大學學習單細胞RNA-seq分析——8. scRNA-seq數(shù)據(jù)整合(2)
重生之我在劍橋大學學習單細胞RNA-seq分析——8. scRNA-seq數(shù)據(jù)整合(3)