練習(xí)數(shù)據(jù)集的GEO編號:GSE139324 (Immune Landscape of Viral- and Carcinogen-Driven Head and Neck Cancer)。原數(shù)據(jù)集有63個(gè)scRNA的數(shù)據(jù)础浮,本次選取10個(gè)練習(xí)榨惠。
library(Seurat)
library(harmony)
library(tidyverse)
library(patchwork)
rm(list = ls())
一. 多個(gè)單細(xì)胞樣本的合并
1. 讀取并合并數(shù)據(jù)
- 1.1 讀取數(shù)據(jù)
## 批量讀取數(shù)據(jù)
### 設(shè)置數(shù)據(jù)路徑與樣本名稱
assays <- dir("GSE139324/")
dir <- paste0("GSE139324/", assays)
# 按文件順序給樣本命名,名稱不要以數(shù)字開頭,中間不能有空格
samples_name = c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
'HNC20TIL', 'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')
- 1.2 批量創(chuàng)建seurat對象
scRNAlist <- list()
for(i in 1:length(dir)){
counts <- Read10X(data.dir = dir[i])
#不設(shè)置min.cells過濾基因會導(dǎo)致CellCycleScoring報(bào)錯(cuò):
#Insufficient data values to produce 24 bins.
scRNAlist[[i]] <- CreateSeuratObject(counts, project=samples_name[i],
min.cells=3, min.features = 200)
#給細(xì)胞barcode加個(gè)前綴褐健,防止合并后barcode重名
scRNAlist[[i]] <- RenameCells(scRNAlist[[i]], add.cell.id = samples_name[i])
#計(jì)算線粒體基因比例
if(T){
scRNAlist[[i]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^MT-")
}
#計(jì)算核糖體基因比例
if(T){
scRNAlist[[i]][["percent.rb"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^RP[SL]")
}
#計(jì)算紅細(xì)胞基因比例
if(T){
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB.genes <- CaseMatch(HB.genes, rownames(scRNAlist[[i]]))
scRNAlist[[i]][["percent.HB"]]<-PercentageFeatureSet(scRNAlist[[i]], features=HB.genes)
}
}
### 給列表命名并保存數(shù)據(jù)
dir.create("Integrate")
setwd("./Integrate")
names(scRNAlist) <- samples_name
#system.time(save(scRNAlist, file = "Integrate/scRNAlist0.Rdata"))
system.time(saveRDS(scRNAlist, file = "scRNAlist0.rds"))
這一步得到了一個(gè)包含了十個(gè)樣本Seurat對象的list
- 1.3 使用
merge
函數(shù)將scRNAlist合成一個(gè)Seurat對象
scRNA <- merge(scRNAlist[[1]], scRNAlist[2:length(scRNAlist)])
scRNA
# An object of class Seurat
# 18818 features across 19738 samples within 1 assay
# Active assay: RNA (18818 features, 0 variable features)
table(scRNA$orig.ident)
# HNC01PBMC HNC01TIL HNC10PBMC HNC10TIL HNC20PBMC
# 1721 1298 1750 1383 1525
# HNC20TIL PBMC1 PBMC2 Tonsil1 Tonsil2
# 1148 2444 2436 3324 2709
save(scRNA,file = 'scRNA_orig.Rdata')
#scRNAlist <- SplitObject(scRNA, split.by = "orig.ident") #分割Seurat對象
2. 數(shù)據(jù)質(zhì)控
### 繪制質(zhì)控小提琴圖
# 設(shè)置可能用到的主題
theme.set2 = theme(axis.title.x=element_blank())
# 設(shè)置繪圖元素
plot.featrures = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb", "percent.HB")
group = "orig.ident"
# 質(zhì)控前小提琴圖
plots = list()
for(i in seq_along(plot.featrures)){
plots[[i]] = VlnPlot(scRNA, group.by=group, pt.size = 0,
features = plot.featrures[i]) + theme.set2 + NoLegend()}
violin <- wrap_plots(plots = plots, nrow=2)
dir.create("QC")
ggsave("QC/vlnplot_before_qc.pdf", plot = violin, width = 9, height = 8)
### 設(shè)置質(zhì)控標(biāo)準(zhǔn)
minGene=500
maxGene=4000
maxUMI=15000
pctMT=10
pctHB=1
### 數(shù)據(jù)質(zhì)控并繪制小提琴圖
scRNA <- subset(scRNA, subset = nCount_RNA < maxUMI & nFeature_RNA > minGene &
nFeature_RNA < maxGene & percent.mt < pctMT & percent.HB < pctHB)
plots = list()
for(i in seq_along(plot.featrures)){
plots[[i]] = VlnPlot(scRNA, group.by=group, pt.size = 0,
features = plot.featrures[i]) + theme.set2 + NoLegend()}
violin <- wrap_plots(plots = plots, nrow=2)
ggsave("QC/vlnplot_after_qc.pdf", plot = violin, width = 10, height = 8)
3. 查看批次效應(yīng)(對merge后的Seurat對象進(jìn)行標(biāo)準(zhǔn)化和降維)
# 降維聚類
scRNA <- NormalizeData(scRNA) %>% FindVariableFeatures(nfeatures = 3000) %>% ScaleData()
scRNA <- RunPCA(scRNA, verbose = F)
ElbowPlot(scRNA, ndims = 50)
pc.num=1:30
scRNA <- scRNA %>% RunTSNE(dims=pc.num) %>% RunUMAP(dims=pc.num)
scRNA <- FindNeighbors(scRNA, dims=pc.num) %>% FindClusters()
DimPlot(scRNA, label = T)
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples.pdf", p, width = 8, height = 6)
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split.pdf", p, width = 18, height = 12)
saveRDS(scRNA, "scRNA.rds")
二. 數(shù)據(jù)整合
數(shù)據(jù)整合的目的:
1. 畫出tsne/umap圖英岭,以輔助細(xì)胞類型鑒定(把不同組之間的一類細(xì)胞整合在一起)
2. 軌跡分析可能需要用到umap圖
??數(shù)據(jù)整合只影響降維聚類,不影響差異分析(原始count矩陣沒有改變)湿右。
1. seurat錨點(diǎn)整合
參考:https://satijalab.org/seurat/articles/integration_large_datasets.html
Seurat2的整合主要用的是CCA
(canonical correlation analysis诅妹,典型關(guān)聯(lián)分析)的方法,Seurat3和Seurat4用的是CCA
+MNN
(mutual nearest neighbor毅人,互近鄰)
錨點(diǎn)整合操作速度很慢吭狡,且常常會過度整合,因此在實(shí)際操作中丈莺,跨物種整合的時(shí)候或不同的數(shù)據(jù)類型如ATAC划煮、蛋白組的數(shù)據(jù)和單細(xì)胞的數(shù)據(jù)整合的時(shí)候,可以使用錨點(diǎn)整合缔俄。單純單細(xì)胞數(shù)據(jù)的整合弛秋,可以使用Harmony蟹略。
- 1.1 讀入數(shù)據(jù)挖炬,拆分樣本
rm(list=ls())
library(future) #Seurat并行計(jì)算的一個(gè)包,不加載這個(gè)包不能進(jìn)行并行計(jì)算
options(future.globals.maxSize = 50 * 1024^3) #將全局變量上限調(diào)至50G(錨點(diǎn)整合很占內(nèi)存)
##重新創(chuàng)建沒有處理的經(jīng)過降維等處理的數(shù)據(jù)
scRNA <- readRDS("scRNA.rds")
cellinfo <- subset(scRNA@meta.data, select = c("orig.ident", "percent.mt", "percent.rb", "percent.HB"))
scRNA <- CreateSeuratObject(scRNA@assays$RNA@counts, meta.data = cellinfo)
#做錨點(diǎn)整合需要把樣本處理成單個(gè)的Seurat對象來兩兩組合
scRNAlist <- SplitObject(scRNA, split.by = "orig.ident")
#也可以按別的指標(biāo)(metadata中的)來進(jìn)行拆分,比如可以按不同的分組來拆分樣本碴倾,再進(jìn)行整合异雁。
- 1.2 標(biāo)準(zhǔn)化(由于錨點(diǎn)整合會把單個(gè)樣本兩兩組合纲刀,所以需要單獨(dú)做標(biāo)準(zhǔn)化)
#SCTransform標(biāo)準(zhǔn)化(??使用log標(biāo)準(zhǔn)化還是SCT標(biāo)準(zhǔn)化差別不大)
#如果用log標(biāo)準(zhǔn)化,后面FindIntegrationAnchors和IntegrateData函數(shù)的normalization.method參數(shù)選'LogNormalize'
scRNAlist <- parallel::mclapply(scRNAlist, FUN=function(x) SCTransform(x), mc.cores = 10) #10個(gè)對象最好寫10個(gè)核,沒有10個(gè)核少寫幾個(gè)也可以展哭。top命令可以查看服務(wù)器有幾個(gè)核匪傍,mc.core設(shè)置為1就每次處理一個(gè)對象秧饮。
# mclapply是lapply的多核版本
- 1.3 選擇用于整合的高變基因(三步)
### FindAnchors
### 每個(gè)樣本的高變基因不完全一樣柑船,SelectIntegrationFeatures可以整合這些高變基因,選出3000個(gè)
scRNA.features <- SelectIntegrationFeatures(scRNAlist, nfeatures = 3000)
### 將每個(gè)樣本的高變基因都調(diào)整成上一步選出的3000個(gè)
scRNAlist <- PrepSCTIntegration(scRNAlist, anchor.features = scRNA.features)
##尋找錨點(diǎn),運(yùn)行速度非常慢锐极,至少需要1-2小時(shí)
plan("multisession", workers = 10)
scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist,
normalization.method = "SCT", #如果前面是log標(biāo)準(zhǔn)化,這里改成LogNormalize
anchor.features = scRNA.features)
- 1.4 錨點(diǎn)整合
### Integrate 運(yùn)行速度慢
scRNA.sct.int <- IntegrateData(scRNA.anchors, normalization.method="SCT") #速度慢
plan("sequential") #把并行計(jì)算改為單核計(jì)算
- 1.5 降維,可視化
### redunction
scRNA <- RunPCA(scRNA.sct.int, npcs = 50, verbose = FALSE)
ElbowPlot(scRNA, ndims=50)
pc.num=1:20
scRNA <- scRNA %>% RunTSNE(dims=pc.num) %>% RunUMAP(dims=pc.num)
### Visual
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples_integr.pdf", p, width = 8, height = 6)
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split_integr.pdf", p, width = 18, height = 12)
##save seurat object
saveRDS(scRNA, "scRNA_SCT_int.rds")
2. harmony整合
Harmony整合的官網(wǎng)教程及其原理此前已經(jīng)介紹過:Harmony
- 2.1 準(zhǔn)備數(shù)據(jù)
rm(list = ls())
scRNA <- readRDS("scRNA.rds")
cellinfo <- subset(scRNA@meta.data, select = c("orig.ident", "percent.mt", "percent.rb", "percent.HB"))
scRNA <- CreateSeuratObject(scRNA@assays$RNA@counts, meta.data = cellinfo)
- 2.2 數(shù)據(jù)標(biāo)準(zhǔn)化(和錨點(diǎn)整合不同揍异,不需拆分樣本柿菩,直接標(biāo)準(zhǔn)化)
### SCT標(biāo)準(zhǔn)化數(shù)據(jù)
scRNA <- SCTransform(scRNA)
- 2.3 使用harmony整合數(shù)據(jù)
### PCA
scRNA <- RunPCA(scRNA, npcs=50, verbose=FALSE)
### 整合方法1:單個(gè)樣本間進(jìn)行整合(推薦懦胞,效果更好)
scRNA <- RunHarmony(scRNA, group.by.vars="orig.ident", assay.use="SCT", max.iter.harmony = 20)
# group.by.vars參數(shù)是設(shè)置按哪個(gè)分組來整合
# max.iter.harmony設(shè)置迭代次數(shù)后众,默認(rèn)是10。運(yùn)行RunHarmony結(jié)果會提示在迭代多少次后完成了收斂。
#??RunHarmony函數(shù)中有個(gè)lambda參數(shù),默認(rèn)值是1,決定了Harmony整合的力度蹂窖。lambda值調(diào)小鸦致,整合力度變大抗碰,反之。(只有這個(gè)參數(shù)影響整合力度,調(diào)整范圍一般在0.5-2之間)
###整合方法2:按其他分類如不同分組來校正(實(shí)測效果不如方法1)
if(F){
scRNA$batches <- scRNA$orig.ident
scRNA$batches <- recode(scRNA$batches,
"HNC01PBMC" = "batch1",
"HNC01TIL" = "batch2",
"HNC10PBMC" = "batch1",
"HNC10TIL" = "batch2",
"HNC20PBMC" = "batch1",
"HNC20TIL" = "batch2",
"PBMC1" = "batch1",
"PBMC2" = "batch1",
"Tonsil1" = "batch3",
"Tonsil2" = "batch3")
scRNA2 <- RunHarmony(scRNA, group.by.vars="batches", assay.use="SCT")
}
- 2.4 降維聚類
ElbowPlot(scRNA, ndims = 50)
pc.num=1:30
scRNA <- RunTSNE(scRNA, reduction="harmony", dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num)
#scRNA2 <- RunTSNE(scRNA2, reduction="harmony", dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num)
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples_harmony.pdf", p, width = 8, height = 6)
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split_harmony.pdf", p, width = 18, height = 12)
##save seurat object
saveRDS(scRNA, "scRNA_SCT_harmony.rds")
3. 整合結(jié)果評估
library(SingleR)
source("sc_function.R")
- 3.1 Seurat錨點(diǎn)整合結(jié)果評估
scRNA <- readRDS("scRNA_SCT_int.rds")
scRNA <- FindNeighbors(scRNA, dims = 1:20) %>% FindClusters()
load("ref_Hematopoietic.RData")
DefaultAssay(scRNA) <- "RNA"
scRNA <- cell_identify(scRNA, ref_Hematopoietic) #cell_identify是自己寫的函數(shù)删顶,ref_Hematopoietic是SingleR中的參考數(shù)據(jù)集
p <- DimPlot(scRNA, group.by = "SingleR", label = T)
ggsave("SingleR_Seurat.pdf", p, width = 8, height = 6)
DefaultAssay(scRNA) <- "integrated"
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Seurat_int.pdf", p, width = 18, height = 16)
DefaultAssay(scRNA) <- "SCT"
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Seurat_sct.pdf", p, width = 18, height = 16)
- 3.2 harmony整合結(jié)果評估
scRNA <- readRDS("scRNA_SCT_harmony.rds")
scRNA <- FindNeighbors(scRNA, dims = 1:30) %>% FindClusters()
load("ref_Hematopoietic.RData")
scRNA <- cell_identify(scRNA, ref_Hematopoietic)
p <- DimPlot(scRNA, group.by = "SingleR", label = T)
ggsave("SingleR_Harmony.pdf", p, width = 8, height = 6)
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Harmony.pdf", p, width = 18, height = 16)
saveRDS(scRNA, "scRNA.classified.rds")
4. 最后的吐槽
錨點(diǎn)整合是真的很容易過整合啊
同樣的數(shù)據(jù),左邊是Harmony整合,右邊是錨點(diǎn)整合。錨點(diǎn)整合完Ccr2+的細(xì)胞群都沒了呢??♀?