scATAC_Code
對(duì)單細(xì)胞測(cè)序感興趣的可以先參考下面的課程:
Analysis of single cell RNA-seq data:https://scrnaseq-course.cog.sanger.ac.uk/website/index.html
先驗(yàn)知識(shí)包括:
scATAC建庫(kù)
scATAC數(shù)據(jù)分析流程(上)
scATAC數(shù)據(jù)分析流程(下)
下游處理一共有8個(gè)steps褪秀,每個(gè)step我都會(huì)在代碼塊里寫上詳細(xì)的注釋
代碼和測(cè)試數(shù)據(jù)在:https://github.com/greenleaflab/10x-scatac-2019
每個(gè)step的結(jié)構(gòu)是:先把完整的代碼運(yùn)行一遍蓄诽,然后再分步分析每一步做了什么
Seurat的scATAC-seq分析流程:https://blog.csdn.net/u012110870/article/details/100171472
Step1-Filtering Cells based on TSS enrichment and unique fragments
完整代碼
用到的函數(shù)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(magrittr)
library(ggplot2)
library(Rcpp)
library(viridis)
#--------------------------------------------
# Functions
#--------------------------------------------
sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){
if(x1.size() != y1.size()){
stop("width must equal size!");
}
IntegerVector x = clone(x1);
IntegerVector y = clone(y1);
int n = x.size();
IntegerVector rx = seq(xmin,xmax);
IntegerVector ry = seq(ymin,ymax);
IntegerMatrix mat( ry.size() , rx.size() );
int xi,yi;
for(int i = 0; i < n; i++){
xi = (x[i] - xmin);
yi = (y[i] - ymin);
if(yi >= 0 && yi < ry.size()){
if(xi >= 0 && xi < rx.size()){
mat( yi , xi ) = mat( yi , xi ) + 1;
}
}
}
return mat;
}'
)
insertionProfileSingles <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
insertionProfileSingles_helper <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
#Convert To Insertion Sites
if(getInsertions){
insertions <- c(
GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
)
by <- "RG"
}else{
insertions <- fragments
}
remove(fragments)
gc()
#center the feature
center <- unique(resize(feature, width = 1, fix = fix, ignore.strand = FALSE))
#get overlaps between the feature and insertions only up to flank bp,這里maxgap設(shè)置只要對(duì)象間距離小于flank bp即認(rèn)為有overlap,默認(rèn)設(shè)置為2000bp媒吗,也是文中作者采用的參數(shù)
overlap <- DataFrame(findOverlaps(query = center, subject = insertions, maxgap = flank, ignore.strand = TRUE))
overlap$strand <- strand(center)[overlap[,1]]
overlap$name <- mcols(insertions)[overlap[,2],by]
overlap <- transform(overlap, id=match(name, unique(name)))
ids <- length(unique(overlap$name))
#add unique identity (ID) (integer) cell barcode to the overlaps object
#distance
overlap$dist <- NA
minus <- which(overlap$strand == "-")
other <- which(overlap$strand != "-")
overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])
overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])
#Insertion Mat 轉(zhuǎn)換成插入片段矩陣
profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -flank, ymax = flank)
colnames(profile_mat) <- unique(overlap$name)
profile <- rowSums(profile_mat)
#normalize
profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells
profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])
#smooth 滑動(dòng)平均數(shù)
profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))
profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1)
#enrichment
max_finite <- function(x){
suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
}
e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))
names(e_mat) <- colnames(profile_mat_norm_smooth)
e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)])
#Summary
df_mat <- data.frame(
enrichment = e_mat,
insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),
insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
)
df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
rownames(df_sum) <- NULL
return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
}
uniqueTags <- as.character(unique(mcols(fragments)[,by]))
splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)
batchTSS <- lapply(seq_along(splitTags), function(x){
setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))
profilex <- insertionProfileSingles_helper(
feature=feature,
fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])],
by = by,
getInsertions = getInsertions,
fix = fix,
flank = flank,
norm = norm,
smooth = smooth,
range = range
)
return(profilex)
})
df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)
dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)
profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}
輸入fragments文件仑氛、運(yùn)行
#--------------------------------------------
# Input
#--------------------------------------------
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
minFrags <- 100
filterFrags <- 1000
filterTSS <- 8
file_fragments <- "data/PBMC_10x-Sub25M-fragments.tsv.gz"
out_fragments <- "data/PBMC_10x-Sub25M-fragments.gr.rds"
name <- "PBMC"
#-----------------
# Reading Fragment Files
#-----------------
message("Reading in fragment files...")
fragments <- data.frame(readr::read_tsv(file_fragments, col_names=FALSE))
#fragmentSub <- fragments[sample(seq_len(nrow(fragments)),25*10^6),]
#write.table(fragmentSub, "data/PBMC_10x-Sub25M-fragments.tsv.gz", col.names=FALSE, row.names =FALSE, sep = "\t", quote = FALSE)
#因?yàn)閿?shù)據(jù)很大,讀取需要一段時(shí)間
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_double(),
X3 = col_double(),
X4 = col_character(),
X5 = col_double()
)
|==============================================================| 100% 1070 MB
fragments <- GRanges(
seqnames = fragments[,1],
IRanges(fragments[,2]+1, fragments[,3]),
RG = fragments[,4],
N = fragments[,5]
)#先把讀入的片段轉(zhuǎn)成GRanges對(duì)象
message("Filtering Lowly Represented Cells...")
tabRG <- table(fragments$RG)
keep <- names(tabRG)[which(tabRG >= minFrags)]#要求每個(gè)細(xì)胞的片段數(shù)至少100個(gè)闸英,這是降采樣的reads锯岖,所以不要求很多
fragments <- fragments[fragments$RG %in% keep,]#過(guò)濾以后從25M降低至約21M
fragments <- sort(sortSeqlevels(fragments))
#-----------------
# TSS Profile
#-----------------
feature <- txdb %>% transcripts(.) %>% resize(., width = 1, fix = "start") %>% unique#選取的feature為transcripts,并歸一化至1bp的起始位點(diǎn)也就是tss
tssProfile <- insertionProfileSingles(feature = feature, fragments = fragments,
getInsertions = TRUE, batchSize = 1000)
tssSingles <- tssProfile$dfall
tssSingles$uniqueFrags <- 0#創(chuàng)建一欄標(biāo)記每個(gè)細(xì)胞有多少fragments
tssSingles[names(tabRG),"uniqueFrags"] <- tabRG
#統(tǒng)計(jì)每個(gè)細(xì)胞的uniqueFrags數(shù),后續(xù)篩選
head(tssSingles)
enrichment insertions insertionsWindow uniqueFrags
TAGCCGGAGTGAATAC-1 6.196078 3238 2164 1619
TACCTATAGGACTTTC-1 6.470588 2738 1906 1369
TGACAACCATTGTTCT-1 8.274510 3836 3136 1918
ACAAAGAGTAACTCCA-1 6.117647 3840 2411 1920
GAGATTCCACATGATC-1 1.529412 504 408 252
TGATCAGAGGAAGGTA-1 3.058824 2850 1219 1425
tssSingles$cellCall <- 0#We then filtered all scATAC-seq profiles to keep those that had at least 1,000 unique fragments and a TSS enrichment of 8.
tssSingles$cellCall[tssSingles$uniqueFrags >= filterFrags & tssSingles$enrichment >= filterTSS] <- 1
可視化
#-----------------
# Plot Stats
#-----------------
tssSingles <- tssSingles[complete.cases(tssSingles),]#complete.cases返回?cái)?shù)據(jù)是否缺失的信息
nPass <- sum(tssSingles$cellCall==1)#filter后的細(xì)胞數(shù)甫何,3770
nTotal <- sum(tssSingles$uniqueFrags >= filterFrags)#僅考慮片段數(shù)不考慮tss enrich的過(guò)濾后細(xì)胞數(shù)出吹,7751
pdf("results/Filter-Cells.pdf")
ggplot(tssSingles[tssSingles$uniqueFrags > 500,], aes(x = log10(uniqueFrags), y = enrichment)) +
geom_hex(bins = 100) +
theme_bw() + scale_fill_viridis() +
xlab("log10 Unique Fragments") +
ylab("TSS Enrichment") +
geom_hline(yintercept = filterTSS, lty = "dashed") +
geom_vline(xintercept = log10(filterFrags), lty = "dashed") +
ggtitle(sprintf("Pass Rate : %s of %s (%s)", nPass, nTotal, round(100*nPass/nTotal,2)))
dev.off()
write.table(tssSingles, "results/Filter-Cells.txt")
#Filter,僅保留過(guò)濾后細(xì)胞的片段
fragments <- fragments[mcols(fragments)$RG %in% rownames(tssSingles)[tssSingles$cellCall==1]]
fragments$RG <- paste0(name,"#",fragments$RG)
#Save
saveRDS(fragments, out_fragments)
分步解析
insertionProfileSingles做了些什么
乍一看完整的代碼會(huì)比較懵辙喂,我把作者包裝的函數(shù)insertionProfileSingles拆開來(lái)一步步運(yùn)行捶牢,看每一步到底做了些什么
#先看看涉及的一些對(duì)象長(zhǎng)啥樣:
insertions
GRanges object with 43347138 ranges and 1 metadata column:
seqnames ranges strand | RG
<Rle> <IRanges> <Rle> | <character>
[1] chr1 10073 * | TAGCCGGAGTGAATAC-1
[2] chr1 10086 * | TACCTATAGGACTTTC-1
[3] chr1 10227 * | TGACAACCATTGTTCT-1
[4] chr1 10236 * | ACAAAGAGTAACTCCA-1
[5] chr1 10376 * | GAGATTCCACATGATC-1
feature
GRanges object with 51385 ranges and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr1 11874 + | 1 uc001aaa.3
[2] chr1 69091 + | 4 uc001aal.1
[3] chr1 321084 + | 5 uc001aaq.2
[4] chr1 321146 + | 6 uc001aar.2
[5] chr1 322037 + | 7 uc009vjk.2
center
GRanges object with 51385 ranges and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr1 11874 + | 1 uc001aaa.3
[2] chr1 69091 + | 4 uc001aal.1
[3] chr1 321084 + | 5 uc001aaq.2
[4] chr1 321146 + | 6 uc001aar.2
[5] chr1 322037 + | 7 uc009vjk.2
overlap
DataFrame with 27778683 rows and 2 columns
queryHits subjectHits
<integer> <integer>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
... ... ...
27778679 49027 43345682
27778680 49027 43345683
27778681 49027 43345684
27778682 49027 43345685
27778683 49027 43345686
overlap$strand <- strand(center)[overlap[,1]]#overlap[,1]是第一列query,也就是feature的索引巍耗,獲取overlap中這些feature的正負(fù)鏈信息
overlap$name <- mcols(insertions)[overlap[,2],]#overlap[,2]是第二列insertions的RG的索引秋麸,獲取overlap中這些insertions的barcode(name)信息
overlap <- transform(overlap, id=match(name, unique(name)))
#添加id,不同overlap可能對(duì)應(yīng)同一個(gè)cell barcode,因此是同一個(gè)id
> overlap
DataFrame with 27778683 rows and 5 columns
queryHits subjectHits strand name id
<integer> <integer> <Rle> <character> <integer>
1 1 1 + TAGCCGGAGTGAATAC-1 1
2 1 2 + TACCTATAGGACTTTC-1 2
3 1 3 + TGACAACCATTGTTCT-1 3
4 1 4 + ACAAAGAGTAACTCCA-1 4
5 1 5 + GAGATTCCACATGATC-1 5
... ... ... ... ... ...
27778679 49027 43345682 - CAACGTATCGCCCTTA-1 4603
27778680 49027 43345683 - CAGGGCTAGTACCACT-1 1172
27778681 49027 43345684 - CTCCCAACAGGATAGC-1 982
27778682 49027 43345685 - GTGGCGTTCAGTACAC-1 2850
27778683 49027 43345686 - CTCCCAACAGGATAGC-1 982
ids <- length(unique(overlap$name))
ids#總共有多少cell
[1] 21673
#distance 獲取overlap計(jì)數(shù)
overlap$dist <- NA
minus <- which(overlap$strand == "-")
other <- which(overlap$strand != "-")
overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])#如果feature在負(fù)鏈上炬太,就用feature的起點(diǎn)減去insertions的起點(diǎn)作為距離
overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])#反之灸蟆,距離就是insertions減去feature,也就是獲得的片段距離tss的長(zhǎng)度
#Insertion Mat 轉(zhuǎn)換成插入片段矩陣,每一列為barcode ids亲族,每一行為每個(gè)overlap里的的片段數(shù)量炒考,對(duì)列求和應(yīng)該就是每個(gè)細(xì)胞的總的與feature有重疊的counts數(shù)
profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -2000, ymax = 2000)
colnames(profile_mat) <- unique(overlap$name)
profile <- rowSums(profile_mat)
> str(profile_mat)
int [1:4001, 1:21673] 0 0 1 0 0 0 0 0 0 0 ...
> profile_mat[1:4,1:4]
[,1] [,2] [,3] [,4]
[1,] 0 0 0 0
[2,] 0 0 1 0
[3,] 1 0 2 0
[4,] 0 0 0 0
#矩陣的每一列對(duì)應(yīng)一個(gè)細(xì)胞,每一行對(duì)應(yīng)的應(yīng)該是所有tss附近的overlap計(jì)數(shù)霎迫,一共在tss附近統(tǒng)計(jì)了±2,000 bp
#票腰,越遠(yuǎn)離tss應(yīng)該0值越多,這是我自己推測(cè)的女气,感興趣的可以去看看tabulate2dCpp這個(gè)函數(shù)的c代碼
head(profile)#profile就是每一個(gè)tss附近在所有細(xì)胞中有多少overlap
[1] 2190 2060 2256 2247 2275 2127
#normalize
profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells杏慰,意思是對(duì)每一個(gè)細(xì)胞取前100和后100個(gè)tss的counts,如果平均counts數(shù)低于0.5就把每一列除以0.5炼鞠,如果平均counts數(shù)大于0.5就每一列除以平均counts數(shù)缘滥,減少每個(gè)細(xì)胞文庫(kù)大小不均的影響
profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])#把tss的富集豐度變成小數(shù)
head(profile_norm)
[1] 1.137927 1.070379 1.172221 1.167544 1.182093 1.105192
#smooth 求滑動(dòng)平均數(shù),可以使序列變平滑谒主,將N個(gè)連續(xù)的時(shí)間序列成員作為一個(gè)集合朝扼,計(jì)算該集合的平均值,并逐項(xiàng)推移該集合
profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))#以smooth=51為滑動(dòng)窗口長(zhǎng)度霎肯,fill的意思是前25個(gè)滑動(dòng)平均數(shù)都用1填充
profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1) #因此擎颖,profile_mat_norm_smooth反映了每個(gè)細(xì)胞的tss富集情況榛斯,profile_norm_smooth反映了所有細(xì)胞的tss的富集情況
#enrichment
max_finite <- function(x){
suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
}
e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))#取1900-2100范圍的tss富集情況
names(e_mat) <- colnames(profile_mat_norm_smooth)
e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)]) #Summary
df_mat <- data.frame(
enrichment = e_mat,
insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),#這里的insertions是每個(gè)細(xì)胞fragments數(shù)目
insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
)#這個(gè)是每個(gè)細(xì)胞overlap的數(shù)目
df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
rownames(df_sum) <- NULL
return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
}
head(df_sum)
bp profile norm_profile smooth_norm_profile enrichment
1 -2000 2190 1.137927 1 17.91299
2 -1999 2060 1.070379 1 17.91299
3 -1998 2256 1.172221 1 17.91299
4 -1997 2247 1.167544 1 17.91299
5 -1996 2275 1.182093 1 17.91299
6 -1995 2127 1.105192 1 17.91299
#可以看到在遠(yuǎn)離tss的地方,profile值(所有細(xì)胞在該位點(diǎn)總的counts)只有2000左右搂捧,當(dāng)然smooth也低
df_sum[c(seq(1995,2005)),]#但是tss附近profile值很高驮俗,smooth也是。enrichment是smooth_norm_profile的最大值允跑,所有細(xì)胞總的tss的最大富集分?jǐn)?shù)
bp profile norm_profile smooth_norm_profile enrichment
1995 -6 32300 16.78312 16.90412 17.91299
1996 -5 31558 16.39758 16.88616 17.91299
1997 -4 29566 15.36253 16.85958 17.91299
1998 -3 30395 15.79328 16.82941 17.91299
1999 -2 32723 17.00292 16.79287 17.91299
2000 -1 35846 18.62563 16.74796 17.91299
2001 0 36234 18.82724 16.69909 17.91299
2002 1 33384 17.34637 16.65906 17.91299
2003 2 31181 16.20169 16.62594 17.91299
2004 3 30488 15.84161 16.59353 17.91299
2005 4 32274 16.76961 16.56338 17.91299
批量運(yùn)行
測(cè)試數(shù)據(jù)只有2萬(wàn)多個(gè)細(xì)胞王凑,所以我像上面那樣直接運(yùn)行問(wèn)題可能不大,實(shí)際數(shù)據(jù)量可能很大(幾十萬(wàn)個(gè)細(xì)胞)聋丝,所以考慮用batch進(jìn)行分批處理的方法索烹,每次處理100個(gè)細(xì)胞,最后把結(jié)果合并弱睦。注意作者包裝的函數(shù)insertionProfileSingles里面還嵌套了一個(gè)函數(shù)insertionProfileSingles_helper百姓,所以正式運(yùn)行的話,外函數(shù)里運(yùn)行的是下面的部分
uniqueTags <- as.character(unique(mcols(fragments)[,by]))
splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
#ceiling向上取整况木,把所有unique barcode的序號(hào)除以100取整瓣戚,每100個(gè)barcode作為一個(gè)group,一共217個(gè)group
pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)#展示進(jìn)度條
batchTSS <- lapply(seq_along(splitTags), function(x){
setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))#相當(dāng)于每完成一個(gè)循環(huán)就進(jìn)度條加1
profilex <- insertionProfileSingles_helper(#之前的操作都是定義在insertionProfileSingles_helper這個(gè)函數(shù)里面焦读,這里才正式傳參運(yùn)行,每次只提取100個(gè)細(xì)胞的fragments
feature=feature,
fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])],
by = by,
getInsertions = getInsertions,
fix = fix,
flank = flank,
norm = norm,
smooth = smooth,
range = range
)
return(profilex)
})
df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)#把每個(gè)batch運(yùn)行的結(jié)果合并
dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)#dfall就是df_mat
profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}