在單細胞數據分析的過程中握础,Seurat包提供了一個為一個基因集打分的函數AddModuleScore(自定基因集)点楼,為基因集進行打分常見的富集分析軟件GSVA扫尖,今天我們來看看Seurat這個函數的用法和意義。
library(Seurat)
?AddModuleScore
給出的解釋說明是Calculate module scores for feature expression programs in single cells盟步,也就是計算單個細胞在一個基因集的score值藏斩。
描述是這樣的
Calculate the average expression levels of each program (cluster) on
single cell level, subtracted by the aggregated expression of control
feature sets. All analyzed features are binned based on averaged
expression, and the control features are randomly selected from each bin.
看完這個解釋,云里霧里却盘,不知所云狰域。
再來看看用法和參數:
Usage:
AddModuleScore(
object,
features,
pool = NULL,
nbin = 24,
ctrl = 100,
k = FALSE,
assay = NULL,
name = "Cluster",
seed = 1,
search = FALSE,
...
)
Arguments:
object: Seurat object
features: Feature expression programs in list
pool: List of features to check expression levels agains, defaults
to ‘rownames(x = object)’
nbin: Number of bins of aggregate expression levels for all
analyzed features
ctrl: Number of control features selected from the same bin per
analyzed feature
k: Use feature clusters returned from DoKMeans
assay: Name of assay to use
name: Name for the expression programs
seed: Set a random seed. If NULL, seed is not set.
search: Search for symbol synonyms for features in ‘features’ that
don't match features in ‘object’? Searches the HGNC's gene
names database; see ‘UpdateSymbolList’ for more details
...: Extra parameters passed to ‘UpdateSymbolList’
計算結果會返回一個數值,有正有負黄橘,今天我們的任務就是來參透這個函數兆览,既然看解釋不明白,那就只能看原函數了塞关。
function (object, features, pool = NULL, nbin = 24, ctrl = 100,
k = FALSE, assay = NULL, name = "Cluster", seed = 1, search = FALSE,
...)
{
if (!is.null(x = seed)) {
set.seed(seed = seed)
}
assay.old <- DefaultAssay(object = object)
assay <- assay %||% assay.old
DefaultAssay(object = object) <- assay
assay.data <- GetAssayData(object = object)
features.old <- features
if (k) {
.NotYetUsed(arg = "k")
features <- list()
for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster ==
i))
}
cluster.length <- length(x = features)
}
else {
if (is.null(x = features)) {
stop("Missing input feature list")
}
features <- lapply(X = features, FUN = function(x) {
missing.features <- setdiff(x = x, y = rownames(x = object))
if (length(x = missing.features) > 0) {
warning("The following features are not present in the object: ",
paste(missing.features, collapse = ", "), ifelse(test = search,
yes = ", attempting to find updated synonyms",
no = ", not searching for symbol synonyms"),
call. = FALSE, immediate. = TRUE)
if (search) {
tryCatch(expr = {
updated.features <- UpdateSymbolList(symbols = missing.features,
...)
names(x = updated.features) <- missing.features
for (miss in names(x = updated.features)) {
index <- which(x == miss)
x[index] <- updated.features[miss]
}
}, error = function(...) {
warning("Could not reach HGNC's gene names database",
call. = FALSE, immediate. = TRUE)
})
missing.features <- setdiff(x = x, y = rownames(x = object))
if (length(x = missing.features) > 0) {
warning("The following features are still not present in the object: ",
paste(missing.features, collapse = ", "),
call. = FALSE, immediate. = TRUE)
}
}
}
return(intersect(x = x, y = rownames(x = object)))
})
cluster.length <- length(x = features)
}
if (!all(LengthCheck(values = features))) {
warning(paste("Could not find enough features in the object from the following feature lists:",
paste(names(x = which(x = !LengthCheck(values = features)))),
"Attempting to match case..."))
features <- lapply(X = features.old, FUN = CaseMatch,
match = rownames(x = object))
}
if (!all(LengthCheck(values = features))) {
stop(paste("The following feature lists do not have enough features present in the object:",
paste(names(x = which(x = !LengthCheck(values = features)))),
"exiting..."))
}
pool <- pool %||% rownames(x = object)
data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
data.avg <- data.avg[order(data.avg)]
data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30,
n = nbin, labels = FALSE, right = FALSE)
names(x = data.cut) <- names(x = data.avg)
ctrl.use <- vector(mode = "list", length = cluster.length)
for (i in 1:cluster.length) {
features.use <- features[[i]]
for (j in 1:length(x = features.use)) {
ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut ==
data.cut[features.use[j]])], size = ctrl, replace = FALSE)))
}
}
ctrl.use <- lapply(X = ctrl.use, FUN = unique)
ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use),
ncol = ncol(x = object))
for (i in 1:length(ctrl.use)) {
features.use <- ctrl.use[[i]]
ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use,
])
}
features.scores <- matrix(data = numeric(length = 1L), nrow = cluster.length,
ncol = ncol(x = object))
for (i in 1:cluster.length) {
features.use <- features[[i]]
data.use <- assay.data[features.use, , drop = FALSE]
features.scores[i, ] <- Matrix::colMeans(x = data.use)
}
features.scores.use <- features.scores - ctrl.scores
rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
features.scores.use <- as.data.frame(x = t(x = features.scores.use))
rownames(x = features.scores.use) <- colnames(x = object)
object[[colnames(x = features.scores.use)]] <- features.scores.use
CheckGC()
DefaultAssay(object = object) <- assay.old
return(object)
}
我們來一步一步解析這個函數
我們先用它的默認值
library(Seurat)
load(Rdata)
assay.old <- DefaultAssay(object = object) ###結果為RNA
assay <- assay %||% assay.old ### %||%的用法可自行查詢抬探,這里還是顯示RNA
DefaultAssay(object = object) <- assay
assay.data <- GetAssayData(object = object)
features.old <- features ## 運行到這里,得到矩陣信息assay.data和gene list帆赢。
###這里我們慢慢運行
cluster.length <- length(x = features)
missing.features <- setdiff(x = features, y = rownames(x = object)) ##setdiff函數的用法 我們這里顯然是不會有不同的小压。
###接下來我們就可以直接跳到
pool <- pool %||% rownames(x = object)
data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
data.avg <- data.avg[order(data.avg)] ###排序,從小到大
data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30,
n = nbin, labels = FALSE, right = FALSE)
###rnorm(n, mean = 0, sd = 1) n 為產生隨機值個數(長度),mean 是平均數, sd 是標準差 椰于,也就是說這里將數據切成了nbin份(由小到大)怠益。
names(x = data.cut) <- names(x = data.avg)
ctrl.use <- vector(mode = "list", length = cluster.length) ##空的列表
for (i in 1:cluster.length) {
features.use <- features[[i]]
for (j in 1:length(x = features.use)) {
ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut ==
data.cut[features.use[j]])], size = ctrl, replace = FALSE)))
}
}
ctrl.use <- lapply(X = ctrl.use, FUN = unique)
ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use),
ncol = ncol(x = object)) ####空的
for (i in 1:length(ctrl.use)) {
features.use <- ctrl.use[[i]]
ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use,
])
} #######細胞的平均值
features.scores.use <- features.scores - ctrl.scores
rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
features.scores.use <- as.data.frame(x = t(x = features.scores.use))
rownames(x = features.scores.use) <- colnames(x = object)
object[[colnames(x = features.scores.use)]] <- features.scores.use
CheckGC()
DefaultAssay(object = object) <- assay.old
走到這里,相信大家應該都明白了瘾婿,也就是我們感興趣的基因蜻牢,抽出來烤咧,每一個細胞算一個這些基因表達的平均值,
背景基因的平均值在于找每個基因的所在的bin抢呆,在該bin內隨機抽取相應的ctrl個基因作為背景煮嫌,最后所有的目標基因算一個平均值,所有的背景基因算一個平均值抱虐,兩者相減就是該gene set 的score值昌阿。
至于生物學意義,仁者見仁智者見智了梯码。