library(Seurat)
library(ggplot2)
library(patchwork)
下載所需要的文件:scATAC-seq, scATAC-seq metadata, scRNA-seq
注釋文件下載如下
#下載注釋文件并解壓
wget ftp://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz
gunzip Homo_sapiens.GRCh37.87.gtf.gz
peaks <- Read10X_h5("data/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5")
# create a gene activity matrix from the peak matrix and GTF, using chromosomes 1:22, X, and Y.
# Peaks that fall within gene bodies, or 2kb upstream of a gene, are considered
activity.matrix <- CreateGeneActivityMatrix(peak.matrix = peaks, annotation.file = "data/Homo_sapiens.GRCh37.87.gtf", seq.levels = c(1:22, "X", "Y"), upstream = 2000, verbose = TRUE)
設(shè)置Seurat中的對象拒秘,原始峰值計數(shù)存儲在“ ATAC”Assay叫确,基因表達矩陣存儲在“ RNA”Assay同规。
對象設(shè)置
pbmc.atac <- CreateSeuratObject(counts = peaks, assay = "ATAC", project = "10x_ATAC")
pbmc.atac[["ACTIVITY"]] <- CreateAssayObject(counts = activity.matrix)
meta <- read.table("data/atac_v1_pbmc_10k_singlecell.csv", sep = ",", header = TRUE, row.names = 1, stringsAsFactors = FALSE)
meta <- meta[colnames(pbmc.atac), ]
pbmc.atac <- AddMetaData(pbmc.atac, metadata = meta)
#質(zhì)控,過濾掉scATAC-seq數(shù)據(jù)中總數(shù)少于5K的細胞。
pbmc.atac <- subset(pbmc.atac, subset = nCount_ATAC > 5000)
pbmc.atac$tech <- "atac"
數(shù)據(jù)預(yù)處理
DefaultAssay(pbmc.atac) <- "ACTIVITY"
pbmc.atac <- FindVariableFeatures(pbmc.atac)
pbmc.atac <- NormalizeData(pbmc.atac)
pbmc.atac <- ScaleData(pbmc.atac)
DefaultAssay(pbmc.atac) <- "ATAC"
VariableFeatures(pbmc.atac) <- names(which(Matrix::rowSums(pbmc.atac) > 100))
pbmc.atac <- RunLSI(pbmc.atac, n = 50, scale.max = NULL)
pbmc.atac <- RunUMAP(pbmc.atac, reduction = "lsi", dims = 1:50)
下載已分析的PBMCs數(shù)據(jù)here.
pbmc.rna <- readRDS("../data/pbmc_10k_v3.rds")
pbmc.rna$tech <-"rna"
p1 <- DimPlot(pbmc.atac, reduction = "umap") + NoLegend() + ggtitle("scATAC-seq")
p2 <- DimPlot(pbmc.rna, group.by = "celltype", label = TRUE, repel = TRUE) + NoLegend() + ggtitle("scRNA-seq")
p1 + p2
我們可以識別scATAC-seq數(shù)據(jù)集和scRNA-seq數(shù)據(jù)集之間的錨點,并使用這些錨點將scRNA-seq數(shù)據(jù)中細胞類型標(biāo)簽轉(zhuǎn)移到scATAC-seq細胞。
transfer.anchors <- FindTransferAnchors(reference = pbmc.rna, query = pbmc.atac, features = VariableFeatures(object = pbmc.rna),
reference.assay = "RNA", query.assay = "ACTIVITY", reduction = "cca")
為了定義群ID混埠,我們?yōu)閞efdata參數(shù)提供了RNA先前注釋的細胞類型標(biāo)簽的向量。輸出將包含一個矩陣诗轻,其中包含每個ATAC細胞的預(yù)測和置信度分數(shù)钳宪。
celltype.predictions <- TransferData(anchorset = transfer.anchors, refdata = pbmc.rna$celltype,
weight.reduction = pbmc.atac[["lsi"]])
pbmc.atac <- AddMetaData(pbmc.atac, metadata = celltype.predictions)
檢查預(yù)測分數(shù)的分布,并可以選擇濾除那些得分較低的細胞
hist(pbmc.atac$prediction.score.max)
abline(v = 0.5, col = "red")
table(pbmc.atac$prediction.score.max > 0.5)
我們可以在scATAC-seq數(shù)據(jù)的UMAP表示形式上查看預(yù)測的細胞類型扳炬,并發(fā)現(xiàn)所轉(zhuǎn)移的標(biāo)簽與UMAP結(jié)構(gòu)高度一致吏颖。
pbmc.atac.filtered <- subset(pbmc.atac, subset = prediction.score.max > 0.5)
pbmc.atac.filtered$predicted.id <- factor(pbmc.atac.filtered$predicted.id, levels = levels(pbmc.rna)) # to make the colors match
p1 <- DimPlot(pbmc.atac.filtered, group.by = "predicted.id", label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") +
NoLegend() + scale_colour_hue(drop = FALSE)
p2 <- DimPlot(pbmc.rna, group.by = "celltype", label = TRUE, repel = TRUE) + ggtitle("scRNA-seq cells") +
NoLegend()
p1 + p2
最后,為了將所有細胞一起可視化恨樟,我們可以將scRNA-seq和scATAC-seq細胞共同嵌入同一低維空間中半醉。
# note that we restrict the imputation to variable genes from scRNA-seq, but could impute the
# full transcriptome if we wanted to
genes.use <- VariableFeatures(pbmc.rna)
refdata <- GetAssayData(pbmc.rna, assay = "RNA", slot = "data")[genes.use, ]
# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation
# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = pbmc.atac[["lsi"]])
# this line adds the imputed data matrix to the pbmc.atac object
pbmc.atac[["RNA"]] <- imputation
coembed <- merge(x = pbmc.rna, y = pbmc.atac)
# Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both
# datasets
coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE)
coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE)
coembed <- RunUMAP(coembed, dims = 1:30)
coembed$celltype <- ifelse(!is.na(coembed$celltype), coembed$celltype, coembed$predicted.id)
p1 <- DimPlot(coembed, group.by = "tech")
p2 <- DimPlot(coembed, group.by = "celltype", label = TRUE, repel = TRUE)
p1 + p2
參考:https://satijalab.org/seurat/v3.2/atacseq_integration_vignette.html