Seurat中FindMarker尋找兩個(gè)cell type差異基因的若干方法

加載示例數(shù)據(jù)

1.運(yùn)行Seurat

pbmc.data <- read.csv("GSE117988_raw.expMatrix_PBMC.csv",header=TRUE,row.names = 1)
pbmc.data <- log2(1 + sweep(pbmc.data, 2, median(colSums(pbmc.data))/colSums(pbmc.data), '*'))
pbmc<- CreateSeuratObject(counts = pbmc.data, project = "10X pbmc", min.cells = 1, min.features = 0)

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

sce=pbmc
sce <- NormalizeData(sce, normalization.method =  "LogNormalize", scale.factor = 10000)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000) 

sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.2)
table(sce@meta.data$RNA_snn_res.0.2) 

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)

# 差異分析
markers_df <- FindMarkers(object = sce, ident.1 = 0, ident.2 = 1, min.pct = 0.25)

2.差異分析計(jì)算logFC值

object = sce
ident.1 = 0
ident.2 = 1
group.by = NULL
subset.ident = NULL
assay = NULL
slot = 'data'
reduction = NULL
features = NULL
logfc.threshold = 0.25
test.use = "wilcox"
min.pct = 0.25
min.diff.pct = -Inf
verbose = TRUE
only.pos = FALSE
max.cells.per.ident = Inf
random.seed = 1
latent.vars = NULL
min.cells.feature = 3
min.cells.group = 3
pseudocount.use = 1
mean.fxn = NULL
fc.name = NULL
base = 2
densify = FALSE
  
# select which data to use
assay <- assay %||% DefaultAssay(object = object)
data.use <- object[[assay]]
cellnames.use <-  colnames(x = data.use)

# IdentsToCells是提取細(xì)胞亞型的函數(shù),可以提取某一亞型的細(xì)胞barcodes
cells <- IdentsToCells(
  object = object,
  ident.1 = ident.1,
  ident.2 = ident.2,
  cellnames.use = cellnames.use
)
# check normalization method
norm.command <- paste0("NormalizeData.", assay)
norm.method <- Command(
  object = object,
  command = norm.command,
  value = "normalization.method"
)

# 讀取相應(yīng)的數(shù)據(jù)
data.slot <- ifelse(
  test = test.use %in% DEmethods_counts(),
  yes = 'counts',
  no = slot
)
data.use <-  GetAssayData(object = object, slot = data.slot)
counts <- switch(
  EXPR = data.slot,
  'scale.data' = GetAssayData(object = object, slot = "counts"),
  numeric()
)

# 計(jì)算兩個(gè)cell type直接的logFC值
fc.results <- FoldChange(
  object = object,
  slot = data.slot,
  ident.1 = 0,
  ident.2 = 1,
  features = features,
  pseudocount.use = pseudocount.use,
  mean.fxn = mean.fxn,
  fc.name = fc.name,
  base = base
)

其中:

  1. 對(duì)象 cells
    cells

    包括兩種cell type的細(xì)胞barcodes(即每一個(gè)單獨(dú)的細(xì)胞)

  2. 對(duì)象 fc.results
    fc.results

    計(jì)算了上述兩種細(xì)胞類型每個(gè)基因的平均log2FC值

那么軟件是如何計(jì)算平均差異表達(dá)的logFC的呢?

base = 2
pseudocount.use = 1
mean.fxn <- function(x) {
  return(log(x = rowMeans(x = x) + pseudocount.use, base = base))
}

# 加載數(shù)據(jù)
## data.use代表所有基因的表達(dá)矩陣
object = data.use

# Calculate percent expressed
## 定義閾值
thresh.min <- 0
# 過(guò)濾掉 cell type 1 基因在所有細(xì)胞表達(dá)都為 0 的基因
pct.1 <- round(
  x = rowSums(x = object[, cells.1, drop = FALSE] > thresh.min) /
    length(x = cells.1),
  digits = 3
)
# 過(guò)濾掉 cell type 2 基因在所有細(xì)胞表達(dá)都為 0 的基因
pct.2 <- round(
  x = rowSums(x = object[, cells.2, drop = FALSE] > thresh.min) /
    length(x = cells.2),
  digits = 3
)
# Calculate fold change
## mean.fxn 計(jì)算的是某基因在其中一種 cell type 中所有細(xì)胞表達(dá)量的對(duì)數(shù)平均值
data.1 <- mean.fxn(object[, cells.1, drop = FALSE])
data.2 <- mean.fxn(object[, cells.2, drop = FALSE])
# 求logFC
fc <- (data.1 - data.2)
fc.results <- as.data.frame(x = cbind(fc, pct.1, pct.2))
colnames(fc.results) <- c(fc.name, "pct.1", "pct.2")

接下來(lái)就是計(jì)算差異logFC的p_val

# 差異基因p_val計(jì)算
## 初始化對(duì)象
object = data.use
slot = data.slot
counts = counts
cells.1 = cells.1
cells.2 = cells.2
features = features
logfc.threshold = logfc.threshold
test.use = test.use
min.pct = min.pct
min.diff.pct = min.diff.pct
verbose = verbose
only.pos = only.pos
max.cells.per.ident = max.cells.per.ident
random.seed = random.seed
latent.vars = latent.vars
min.cells.feature = min.cells.feature
min.cells.group = min.cells.group
pseudocount.use = pseudocount.use
fc.results = fc.results
densify = densify

ValidateCellGroups(
  object = object,
  cells.1 = cells.1,
  cells.2 = cells.2,
  min.cells.group = min.cells.group
)
features <- features %||% rownames(x = object)
data <- switch(
  EXPR = slot,
  'scale.data' = counts,
  object
)
# feature selection (based on percentages)
alpha.min <- pmax(fc.results$pct.1, fc.results$pct.2)
names(x = alpha.min) <- rownames(x = fc.results)
features <- names(x = which(x = alpha.min >= min.pct))

alpha.diff <- alpha.min - pmin(fc.results$pct.1, fc.results$pct.2)
features <- names(
  x = which(x = alpha.min >= min.pct & alpha.diff >= min.diff.pct)
)

# feature selection (based on logFC)
## 根據(jù)logFC的情況篩選基因熏兄,該例子的 logfc.threshold = 0.25
total.diff <- fc.results[, 1] #first column is logFC
names(total.diff) <- rownames(fc.results)
features.diff <- names(x = which(x = abs(x = total.diff) >= logfc.threshold))

features <- intersect(x = features, y = features.diff)

# subsample cell groups if they are too large
## 計(jì)算差異顯著性
de.results <- PerformDE(
  object = object,
  cells.1 = cells.1,
  cells.2 = cells.2,
  features = features,
  test.use = test.use,
  verbose = verbose,
  min.cells.feature = min.cells.feature,
  latent.vars = latent.vars,
  densify = densify,
)

其中data.use代表單細(xì)胞的表達(dá)矩陣

data.use

行代表特征基因背伴,列代表不同的細(xì)胞

3.FindMarkers計(jì)算差異顯著性的方法

# 初始化
## 這里的data.use代表所有基因的表達(dá)矩陣
object = data.use
cells.1 = cells.1
cells.2 = cells.2
features = features
test.use = test.use
verbose = verbose
min.cells.feature = min.cells.feature
latent.vars = latent.vars
densify = densify

#提取特征基因的表達(dá)譜
## data.use代表篩選的特征基因的表達(dá)矩陣
data.use <- object[features, c(cells.1, cells.2), drop = FALSE]

#不同的檢測(cè)差異基因的方法
de.results <- switch(
  EXPR = test.use,
  'wilcox' = WilcoxDETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    verbose = verbose,
    ...
  ),
  'bimod' = DiffExpTest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    verbose = verbose
  ),
  'roc' = MarkerTest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    verbose = verbose
  ),
  't' = DiffTTest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    verbose = verbose
  ),
  'negbinom' = GLMDETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    min.cells = min.cells.feature,
    latent.vars = latent.vars,
    test.use = test.use,
    verbose = verbose
  ),
  'poisson' = GLMDETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    min.cells = min.cells.feature,
    latent.vars = latent.vars,
    test.use = test.use,
    verbose = verbose
  ),
  'MAST' = MASTDETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    latent.vars = latent.vars,
    verbose = verbose,
    ...
  ),
  "DESeq2" = DESeq2DETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    verbose = verbose,
    ...
  ),
  "LR" = LRDETest(
    data.use = data.use,
    cells.1 = cells.1,
    cells.2 = cells.2,
    latent.vars = latent.vars,
    verbose = verbose
  ),
  stop("Unknown test: ", test.use)
)

由源代碼我們可以知道,這里一共提供了9種檢測(cè)差異基因的手段殿漠,那么接下來(lái)我們可以一種一種的討論:

[1].wilcox

這一部分分為兩種檢測(cè)方式供用戶選擇,一種是用limma的wilcox檢驗(yàn)

# 初始化
data.use = data.use
cells.1 = cells.1
cells.2 = cells.2
verbose = verbose

data.use <- data.use[, c(cells.1, cells.2), drop = FALSE]
#以cell type 1 的長(zhǎng)度作為索引佩捞,該例子中一共有兩類細(xì)胞绞幌,共計(jì)6737個(gè)細(xì)胞
j <- seq_len(length.out = length(x = cells.1))

# limma wilcox test  計(jì)算 p_val
## 對(duì)第一個(gè)基因進(jìn)行檢驗(yàn)
p_val = min(2 * min(limma::rankSumTestWithCorrelation(index = j, statistics = data.use[1, ])), 1)
### data.use[1, ]代表單細(xì)胞表達(dá)矩陣的第一個(gè)基因在兩個(gè)種類型細(xì)胞中的表達(dá)量;其余基因依次遍歷
limma wilcox test p_val

而另外一種則是用普通的wilcox test函數(shù)進(jìn)行檢驗(yàn)

data.use = data.use
cells.1 = cells.1
cells.2 = cells.2
verbose = verbose

data.use <- data.use[, c(cells.1, cells.2), drop = FALSE]
#以cell type 1 的長(zhǎng)度作為索引一忱,該例子中一共有兩類細(xì)胞莲蜘,共計(jì)6737個(gè)細(xì)胞
j <- seq_len(length.out = length(x = cells.1))

# 創(chuàng)立wilcox test的背景集
group.info <- data.frame(row.names = c(cells.1, cells.2))
group.info[cells.1, "group"] <- "Group1"
group.info[cells.2, "group"] <- "Group2"
group.info[, "group"] <- factor(x = group.info[, "group"])
data.use <- data.use[, rownames(x = group.info), drop = FALSE]

# wilcox test  計(jì)算 p_val
## 對(duì)第一個(gè)基因進(jìn)行檢驗(yàn)
p_val = wilcox.test(data.use[1, ] ~ group.info[, "group"])$p.value
### data.use[1, ]代表單細(xì)胞表達(dá)矩陣的第一個(gè)基因在兩個(gè)種類型細(xì)胞中的表達(dá)量谭确;其余基因依次遍歷
wilcox test p_val

可以看到兩種wilcox test 計(jì)算的p值沒(méi)有差別

[2].bimod

這種檢驗(yàn)方式巧用了兩個(gè)cell type中其中一個(gè)基因的表達(dá)量所構(gòu)成的似然值
根據(jù)文章:Data exploration, quality control and testing in single-cell qPCR-based gene expression experiments 的補(bǔ)充材料所述,對(duì)于單細(xì)胞表達(dá)數(shù)據(jù)票渠,構(gòu)造某基因在各個(gè)細(xì)胞中表達(dá)情況的似然函數(shù)如下:

其中:

  1. Πk 表示某基因 A 在細(xì)胞中表達(dá)的比例
  2. 1-Πk 表示某基因 A 在細(xì)胞中未表達(dá)的比例
  3. nk 表示某基因 A 在細(xì)胞中表達(dá)的細(xì)胞個(gè)數(shù)
  4. 1-nk 表示某基因 A 在細(xì)胞中表達(dá)的細(xì)胞個(gè)數(shù)
  5. g() 為某基因 A 在細(xì)胞中表達(dá)了的pdf函數(shù)
data.use = data.use
cells.1 = cells.1
cells.2 = cells.2
verbose = verbose

x = as.numeric(x = data.use[1, cells.1])
y = as.numeric(x = data.use[1, cells.2])

# 分別計(jì)算兩種 cell type 某基因表達(dá)的對(duì)數(shù)似然值
lrtX <- bimodLikData(x = x)
lrtY <- bimodLikData(x = y)
# 計(jì)算兩種 cell type 混合時(shí)某基因表達(dá)的對(duì)似然值
lrtZ <- bimodLikData(x = c(x, y))
# 構(gòu)造似然比逐哈,對(duì)數(shù)的加減相當(dāng)于原式的乘除
lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
# 進(jìn)行對(duì)數(shù)似然比檢驗(yàn)
p_val = pchisq(q = lrt_diff, df = 3, lower.tail = F)


bimodLikData <- function(x, xmin = 0) {
  # 提取某基因 A 未表達(dá)的細(xì)胞
  x1 <- x[x <= xmin]
  # 提取某基因 A 表達(dá)的細(xì)胞
  x2 <- x[x > xmin]

  xal <- MinMax(
    # 計(jì)算某基因 A 在細(xì)胞中表達(dá)的比例
    data = length(x = x2) / length(x = x),
    min = 1e-5,
    max = (1 - 1e-5)
  )
  # 計(jì)算 Πk^nk
  likA <- length(x = x1) * log(x = 1 - xal)

 # 計(jì)算分布的sd值
  if (length(x = x2) < 2) {
    mysd <- 1
  } else {
    mysd <- sd(x = x2)
  }
  # 其中  length(x = x2) * log(x = xal) 計(jì)算的是 (1-Πk)^(1-nk)
  # sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE) 計(jì)算的是某基因 A 在細(xì)胞中表達(dá),對(duì)應(yīng)于分布似然值的乘積(由于對(duì)數(shù)化问顷,加和表示乘積)
  likB <- length(x = x2) *
    log(x = xal) +
    sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
  return(likA + likB)
}

其中:(默認(rèn)取了對(duì)數(shù))

  1. length(x = x1) * log(x = 1 - xal) 計(jì)算的是 Πknk
  2. length(x = x2) * log(x = xal) 計(jì)算的是 (1-Πk)(1-nk)
  3. sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE) 計(jì)算的是了 ∏ g()

那么如何和利用分布間的對(duì)數(shù)似然和關(guān)系反應(yīng)差異呢昂秃?
假設(shè)紅,藍(lán)分別代表兩類細(xì)胞中基因A(有表達(dá)的情況)的表達(dá)量分布杜窄;黑色代表兩者合并的分布肠骆,我們分如下三種情況來(lái)討論這個(gè)問(wèn)題:

情況 1:
兩類細(xì)胞中基因A(有表達(dá)的情況)的表達(dá)量分布很遠(yuǎn);藍(lán) 的表達(dá)量比 低羞芍,則它們表達(dá)的均值關(guān)系就會(huì)是 mean(藍(lán)) << mean(c(藍(lán),紅)) << mean(紅)

X1 = seq(2, 11, by = .1)
X2 = seq(11, 17, by = .1)

Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)

v = c(X1,X2)
g = dnorm(v, mean = mean(v), sd = sd(v),log = T)

qq = data.frame(v,g)
ww = data.frame(X1,Y1)
ee = data.frame(X2,Y2)

ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
  geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
  geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))

sum(Y1)
[1] -217.0104
sum(Y2)
[1] -121.0672
sum(g)
[1] -439.0152

顯然三個(gè)分布的對(duì)數(shù)似然值之和滿足 Y1 + Y2 >> g哗戈,即 Y1 + Y2 的對(duì)數(shù)似然值之和與 g 的對(duì)數(shù)似然值之和相差很大

情況 2:
兩類細(xì)胞中基因A(有表達(dá)的情況)的表達(dá)量分布重合;藍(lán) 的表達(dá)量和 的相同荷科,則它們表達(dá)的均值關(guān)系就會(huì)是 mean(藍(lán)) = mean(c(藍(lán),紅)) = mean(紅)

X1 = seq(11, 15, by = .1)
X2 = seq(11, 15, by = .1)

Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)

v = c(X1,X2)
g = dnorm(v, mean = mean(v), sd = sd(v),log = T)

qq = data.frame(v,g)
ww = data.frame(X1,Y1)
ee = data.frame(X2,Y2)

ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
  geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
  geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))

sum(Y1)
[1] -65.08036
sum(Y2)
[1] -65.08036
sum(g)
[1] -130.1514

顯然三個(gè)分布的對(duì)數(shù)似然值之和滿足 Y1 + Y2 = g唯咬,即 Y1 + Y2的對(duì)數(shù)似然值之和與 g 的對(duì)數(shù)似然值之和一樣大

情況 3:
兩類細(xì)胞中基因A(有表達(dá)的情況)的表達(dá)量分布不是很遠(yuǎn);藍(lán) 的表達(dá)量比 低畏浆,則它們表達(dá)的均值關(guān)系就會(huì)是 mean(藍(lán)) < mean(c(藍(lán),紅)) < mean(紅)

X1 = seq(9, 13, by = .1)
X2 = seq(11, 15, by = .1)

Y1 = dnorm(X1, mean = mean(X1), sd = sd(X1),log = T)
Y2 = dnorm(X2, mean = mean(X2), sd = sd(X2),log = T)

v = c(X1,X2)
g = dnorm(v, mean = mean(v), sd = sd(v),log = T)

qq = data.frame(v,g)
ww = data.frame(X1,Y1)
ee = data.frame(X2,Y2)

ggplot() + geom_point(qq,mapping = aes(x = v, y = g)) + 
  geom_point(ww,mapping = aes(x = X1, y = Y1,colour='red')) + 
  geom_point(ee,mapping = aes(x = X2, y = Y2,colour='blue'))

sum(Y1)
[1] -65.08036
sum(Y2)
[1] -65.08036
sum(g)
[1] -152.2503

顯然三個(gè)分布的對(duì)數(shù)似然值之和滿足 Y1 + Y2 > g胆胰,即 Y1 + Y2的對(duì)數(shù)似然值之和與 g 的對(duì)數(shù)似然值之和相差一般大

所以綜上所述,Y1 + Y2g 的關(guān)系可以反應(yīng) Y1Y2 兩個(gè)分布差異的大锌袒瘛(也可以解釋為Y1Y2 兩個(gè)分布相離的距離)

對(duì)于:

x = as.numeric(x = data.use[1, cells.1])
y = as.numeric(x = data.use[1, cells.2])

lrtX <- bimodLikData(x = x)
lrtY <- bimodLikData(x = y)
lrtZ <- bimodLikData(x = c(x, y))
lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
pchisq(q = lrt_diff, df = 3, lower.tail = F)

我們就可以通過(guò)討論lrtX + lrtY 與 lrtZ的關(guān)系蜀涨,通過(guò)pchisq來(lái)計(jì)算p值,顯著代表lrtX + lrtY >> lrtZ蝎毡;不顯著代表lrtX + lrtY ≈ lrtZ

[3].roc

roc計(jì)算p_val的方式巧用了二分類器來(lái)計(jì)算AUC值厚柳,通過(guò)AUC值定義出p_val

data.use = data.use
cells.1 = cells.1
cells.2 = cells.2
verbose = verbose

data.use = data.use
cells.1 = cells.1
cells.2 = cells.2


data1 = data.use[, cells.1, drop = FALSE]
data2 = data.use[, cells.2, drop = FALSE]
mygenes = rownames(x = data.use)
print.bar = verbose

# 以兩類細(xì)胞某基因的表達(dá)量為特征值
x = as.numeric(x = data1[1, ])
y = as.numeric(x = data2[1, ])

# 進(jìn)行二分類的學(xué)習(xí)
prediction.use <- prediction(
  ## 以兩類細(xì)胞某基因的表達(dá)量為特征值
  predictions = c(x, y),
  ## 以兩類細(xì)胞作為分類標(biāo)簽
  labels = c(rep(x = 1, length(x = x)), rep(x = 0, length(x = y))),
  label.ordering = 0:1
)

# 預(yù)測(cè)
perf.use <- performance(prediction.obj = prediction.use, measure = "auc")
# 計(jì)算AUC值
auc.use <- round(x = perf.use@y.values[[1]], digits = 3)

# AUC排序
to.return = data.frame(myAUC = rev(x = order(toRet$myAUC)))
# 計(jì)算p_val
p_val = to.return$power <- abs(x = to.return$myAUC - 0.5) * 2
[4].t-test

還有一種計(jì)算p_val的方式是直接用t-test

data.use = data.use
cells.1 = cells$cells.1
cells.2 = cells$cells.2

# 兩類細(xì)胞中某基因表達(dá)量的t-test
t.test(x = data.use[1, cells.1], y = data.use[1, cells.2])$p.value
[5].廣義線性回歸模型

這里分為三種情況,一種是某基因A在兩類細(xì)胞中的表達(dá)量滿足負(fù)二項(xiàng)分布沐兵,利用負(fù)二項(xiàng)分布廣義線性回歸模型計(jì)算p_val别垮;第二種是某基因A在兩類細(xì)胞中的表達(dá)量滿足泊松分布,利用泊松分布廣義線性回歸模型計(jì)算p_val扎谎;最后一種是零膨脹負(fù)二項(xiàng)分布的廣義線性回歸模型

對(duì)于負(fù)二項(xiàng)分布和泊松分布的廣義線性回歸模型:

#數(shù)據(jù)準(zhǔn)備
data.use = data.use
cells.1 = cells$cells.1
cells.2 = cells$cells.2
verbose = verbose

min.cells = 3
latent.vars = NULL
test.use = NULL

# 定義分組信息
group.info <- data.frame(
  group = rep(
    x = c('Group1', 'Group2'),
    times = c(length(x = cells.1), length(x = cells.2))
  )
)
# 分組信息因子化
rownames(group.info) <- c(cells.1, cells.2)
group.info[, "group"] <- factor(x = group.info[, "group"])

# 定義兩類細(xì)胞與分組之間的關(guān)系
latent.vars <- if (is.null(x = latent.vars)) {
  group.info
} else {
  cbind(x = group.info, latent.vars)
}

# 將某基因A的表達(dá)量加入到latent.vars中
latent.var.names <- colnames(x = latent.vars)
latent.vars[, "GENE"] <- as.numeric(x = data.use[1, ])

# 定義廣義線性回歸模型方程碳想,回歸響應(yīng)變量和決策變量為  "GENE ~ group"
fmla <- as.formula(object = paste(
  "GENE ~",
  paste(latent.var.names, collapse = "+")
))
p.estimate <- 2
# 負(fù)二項(xiàng)分布
if (test.use == "negbinom") {
    expr = p.estimate <- summary(
      object = glm.nb(formula = fmla, data = latent.vars)
    )$coef[2, 4]
} 
# 泊松分布
else if (test.use == "poisson") {
  expr = summary(object = glm(
    formula = fmla,
    data = latent.vars,
    family = "poisson"
  ))$coef[2,4]
}
## coef[2,4] 為模型的p_val

其中:

  1. latent.vars 為:

    第一列代表兩類細(xì)胞的barcodes;第二列為兩類細(xì)胞的分組信息毁靶;第三列為某基因A在兩類細(xì)胞中的表達(dá)量

  2. 廣義線性模型回歸的響應(yīng)變量和決策變量為 "GENE ~ group"
  3. coef[2,4] 為模型的p_val

對(duì)于零膨脹負(fù)二項(xiàng)分布的廣義線性回歸模型:
首先單細(xì)胞的表達(dá)矩陣是一個(gè)稀疏矩陣胧奔,里面包含著大量的 0 值,因此零膨脹模型會(huì)區(qū)分零表達(dá)和有表達(dá)的情況预吆,從而扣除零表達(dá)對(duì)數(shù)據(jù)擬合造成的影響龙填,利用有表達(dá)的細(xì)胞進(jìn)行數(shù)據(jù)擬合,再建立廣義線性回歸模型

# 讀出數(shù)據(jù)
data.use = data.use
cells.1 = cells$cells.1
cells.2 = cells$cells.2
verbose = verbose

latent.vars = NULL
verbose = TRUE


group.info <- data.frame(row.names = c(cells.1, cells.2))
latent.vars <- latent.vars %||% group.info
# 對(duì)兩類細(xì)胞分組,分為Group1和Group2
group.info[cells.1, "group"] <- "Group1"
group.info[cells.2, "group"] <- "Group2"
group.info[, "group"] <- factor(x = group.info[, "group"])
latent.vars.names <- c("condition", colnames(x = latent.vars))
latent.vars <- cbind(latent.vars, group.info)
latent.vars$wellKey <- rownames(x = latent.vars)
fdat <- data.frame(rownames(x = data.use))
colnames(x = fdat)[1] <- "primerid"
rownames(x = fdat) <- fdat[, 1]

# 創(chuàng)建數(shù)據(jù)矩陣岩遗,這里是一次性讀入所有的基因
sca <- MAST::FromMatrix(
  exprsArray = as.matrix(x = data.use),
  check_sanity = FALSE,
  cData = latent.vars,
  fData = fdat
)

# 定義回歸因子
cond <- factor(x = SummarizedExperiment::colData(sca)$group)
cond <- relevel(x = cond, ref = "Group1")
SummarizedExperiment::colData(sca)$condition <- cond

# 建立響應(yīng)變量和決策變量之間的關(guān)系
fmla <- as.formula(
  object = paste0(" ~ ", paste(latent.vars.names, collapse = "+"))
)
# 建立模型
zlmCond <- MAST::zlm(formula = fmla, sca = sca)
summaryCond <- summary(object = zlmCond, doLRT = 'conditionGroup2')
summaryDt <- summaryCond$datatable
# 計(jì)算p_val
p_val <- summaryDt[summaryDt[, "component"] == "H", 4]

其中:對(duì)于summaryDt
summaryDt

第四列為p_val

利用廣義線性回歸模型計(jì)算兩組差異的p_val的原理可以參照:

  1. 因子變量回歸模型
  2. 類別型決策變量的線性回歸

至于是哪一種分布類型胶背,則根據(jù)數(shù)據(jù)的擬合分布情況來(lái)決定

[6].DESeq2差異表達(dá)計(jì)算p_val

還有一種方式是利用DESeq2兩組比較的方式,計(jì)算每個(gè)基因的p_val

#讀入數(shù)據(jù)
data.use = data.use
cells.1 = cells$cells.1
cells.2 = cells$cells.2

# DESeq2的標(biāo)準(zhǔn)流程
## 細(xì)胞分組
group.info <- data.frame(row.names = c(cells.1, cells.2))
group.info[cells.1, "group"] <- "Group1"
group.info[cells.2, "group"] <- "Group2"
group.info[, "group"] <- factor(x = group.info[, "group"])
group.info$wellKey <- rownames(x = group.info)
## 創(chuàng)建數(shù)據(jù)矩陣喘先,這里是一次性讀入所有的基因
dds1 <- DESeq2::DESeqDataSetFromMatrix(
  countData = data.use,
  colData = group.info,
  design = ~ group
)
dds1 <- DESeq2::estimateSizeFactors(object = dds1)
dds1 <- DESeq2::estimateDispersions(object = dds1, fitType = "local")
dds1 <- DESeq2::nbinomWaldTest(object = dds1)
res <- DESeq2::results(
  object = dds1,
  contrast = c("group", "Group1", "Group2"),
  alpha = 0.05
)
# 計(jì)算p_val
to.return <- data.frame(p_val = res$pvalue, row.names = rownames(res))

其實(shí)DESeq2的底層邏輯也是利用廣義線性回歸模型計(jì)算p_val的,故理解可參照 [5].廣義線性回歸模型

[7].回歸模型似然比檢驗(yàn)

這一部分的原理參見(jiàn):DESeq2中的似然比檢驗(yàn)(LRT)
直接上代碼:

# 讀取數(shù)據(jù)
data.use = data.use
cells.1 = cells$cells.1
cells.2 = cells$cells.2
verbose = verbose

latent.vars = NULL
verbose = TRUE

# 設(shè)立分組信息
group.info <- data.frame(row.names = c(cells.1, cells.2))
group.info[cells.1, "group"] <- "Group1"
group.info[cells.2, "group"] <- "Group2"
group.info[, "group"] <- factor(x = group.info[, "group"])
data.use <- data.use[, rownames(group.info), drop = FALSE]
latent.vars <- latent.vars[rownames(group.info), , drop = FALSE]

# 建立模型廷粒,以第一個(gè)基因在兩類細(xì)胞中的表達(dá)量為例建模
model.data <- cbind(GENE = data.use[1, ], group.info)
# 以基因?yàn)闆Q策變量窘拯,二元變量group為響應(yīng)變量
fmla <- as.formula(object = "group ~ GENE")
# 對(duì)照模型
fmla2 <- as.formula(object = "group ~ 1")

# 以負(fù)二項(xiàng)分布建立glm
model1 <- glm(formula = fmla, data = model.data, family = "binomial")
model2 <- glm(formula = fmla2, data = model.data, family = "binomial")
# model1和model2的似然比檢驗(yàn),計(jì)算p_val
p_val <- lrtest <- lrtest(model1, model2)

其中:

  1. 對(duì)于model.data來(lái)說(shuō)
    model.data

    第一列代表兩類細(xì)胞的barcodes坝茎;第二列為某基因A在兩類細(xì)胞中的表達(dá)量涤姊;第三列為兩類細(xì)胞的分組信息

  2. 對(duì)于model1:
# 以基因?yàn)闆Q策變量,二元變量group為響應(yīng)變量
fmla <- as.formula(object = "group ~ GENE")
model1 <- glm(formula = fmla, data = model.data, family = "binomial")

model1

橫坐標(biāo)為基因表達(dá)的情況(基因的表達(dá)數(shù)值)嗤放,縱坐標(biāo)為二元變量(Group1和Group2)思喊;如果兩組細(xì)胞某基因A的表達(dá)相差很大,那么model1更 fit

  1. 對(duì)于model2:
# 對(duì)照模型
fmla2 <- as.formula(object = "group ~ 1")
model2 <- glm(formula = fmla, data = model.data, family = "binomial")

model2

model2 不fit

如果兩組細(xì)胞某基因A的表達(dá)相差很大次酌,那么model1更 fit恨课,model1與model2的似然比遠(yuǎn)大于1;
如果兩組細(xì)胞某基因A的表達(dá)相差不大岳服,那么model1和model2 都 不fit剂公,model1與model2的似然比約等于1
顯然,根據(jù)實(shí)際的表達(dá)情況吊宋,model1比model2更 fit纲辽,所以以似然比檢驗(yàn)的p_val為最終的p_val,更為顯著

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
  • 序言:七十年代末璃搜,一起剝皮案震驚了整個(gè)濱河市拖吼,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌这吻,老刑警劉巖吊档,帶你破解...
    沈念sama閱讀 206,126評(píng)論 6 481
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場(chǎng)離奇詭異橘原,居然都是意外死亡籍铁,警方通過(guò)查閱死者的電腦和手機(jī),發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,254評(píng)論 2 382
  • 文/潘曉璐 我一進(jìn)店門(mén)趾断,熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái)拒名,“玉大人,你說(shuō)我怎么就攤上這事芋酌≡鱿裕” “怎么了?”我有些...
    開(kāi)封第一講書(shū)人閱讀 152,445評(píng)論 0 341
  • 文/不壞的土叔 我叫張陵,是天一觀的道長(zhǎng)同云。 經(jīng)常有香客問(wèn)我糖权,道長(zhǎng),這世上最難降的妖魔是什么炸站? 我笑而不...
    開(kāi)封第一講書(shū)人閱讀 55,185評(píng)論 1 278
  • 正文 為了忘掉前任星澳,我火速辦了婚禮,結(jié)果婚禮上旱易,老公的妹妹穿的比我還像新娘禁偎。我一直安慰自己,他們只是感情好阀坏,可當(dāng)我...
    茶點(diǎn)故事閱讀 64,178評(píng)論 5 371
  • 文/花漫 我一把揭開(kāi)白布如暖。 她就那樣靜靜地躺著,像睡著了一般忌堂。 火紅的嫁衣襯著肌膚如雪盒至。 梳的紋絲不亂的頭發(fā)上,一...
    開(kāi)封第一講書(shū)人閱讀 48,970評(píng)論 1 284
  • 那天士修,我揣著相機(jī)與錄音枷遂,去河邊找鬼。 笑死棋嘲,一個(gè)胖子當(dāng)著我的面吹牛登淘,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播封字,決...
    沈念sama閱讀 38,276評(píng)論 3 399
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼黔州,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼!你這毒婦竟也來(lái)了阔籽?” 一聲冷哼從身側(cè)響起流妻,我...
    開(kāi)封第一講書(shū)人閱讀 36,927評(píng)論 0 259
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤,失蹤者是張志新(化名)和其女友劉穎笆制,沒(méi)想到半個(gè)月后绅这,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 43,400評(píng)論 1 300
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡在辆,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 35,883評(píng)論 2 323
  • 正文 我和宋清朗相戀三年证薇,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片匆篓。...
    茶點(diǎn)故事閱讀 37,997評(píng)論 1 333
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡浑度,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出鸦概,到底是詐尸還是另有隱情箩张,我是刑警寧澤,帶...
    沈念sama閱讀 33,646評(píng)論 4 322
  • 正文 年R本政府宣布,位于F島的核電站先慷,受9級(jí)特大地震影響饮笛,放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜论熙,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 39,213評(píng)論 3 307
  • 文/蒙蒙 一福青、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧脓诡,春花似錦素跺、人聲如沸。這莊子的主人今日做“春日...
    開(kāi)封第一講書(shū)人閱讀 30,204評(píng)論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)刊愚。三九已至踊跟,卻和暖如春,著一層夾襖步出監(jiān)牢的瞬間鸥诽,已是汗流浹背商玫。 一陣腳步聲響...
    開(kāi)封第一講書(shū)人閱讀 31,423評(píng)論 1 260
  • 我被黑心中介騙來(lái)泰國(guó)打工, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留牡借,地道東北人拳昌。 一個(gè)月前我還...
    沈念sama閱讀 45,423評(píng)論 2 352
  • 正文 我出身青樓,卻偏偏與公主長(zhǎng)得像钠龙,于是被迫代替她去往敵國(guó)和親炬藤。 傳聞我的和親對(duì)象是個(gè)殘疾皇子,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 42,722評(píng)論 2 345

推薦閱讀更多精彩內(nèi)容