這篇文章介紹的是有分組的單細胞數(shù)據(jù)怎樣分析,數(shù)據(jù)來自GEO的GSE231920,有3個treat辉阶,3個control樣本,代碼完整瘩扼,可以自行下載數(shù)據(jù)跑一跑谆甜,但請注意細胞數(shù)量是6w,對計算資源要求較高集绰,自己的電腦跑不動规辱,需要在服務(wù)器上跑。
1.整理數(shù)據(jù)
因為數(shù)據(jù)組織的不是每個樣本一個文件夾的形式栽燕,所以需要自行整理罕袋,參考代碼如下,注意這段改名的代碼不要反復運行:
#untar("GSE231920_RAW.tar",exdir = "GSE231920_RAW")
#unlink("GSE231920_RAW.tar")
library(stringr)
fs = paste0("GSE231920_RAW/",dir("GSE231920_RAW/"))
fs
samples = dir("GSE231920_RAW/") %>% str_split_i("_",2) %>% unique();samples
#為每個樣本創(chuàng)建單獨的文件夾
lapply(samples, function(s){
ns = paste0("01_data/",s)
if(!file.exists(ns))dir.create(ns,recursive = T)
})
#每個樣本的三個文件復制到單獨的文件夾
lapply(fs, function(s){
#s = fs[1]
for(i in 1:length(samples)){
#i = 1
if(str_detect(s,samples[[i]])){
file.copy(s,paste0("01_data/",samples[[i]]))
}
}
})
#文件名字修改
on = paste0("01_data/",dir("01_data/",recursive = T));on
nn = str_remove(on,"GSM\\d+_sample\\d_");nn
file.rename(on,nn)
代碼主要參考:
https://satijalab.org/seurat/articles/integration_introduction
rm(list = ls())
2.批量讀取
rm(list = ls())
library(Seurat)
f = dir("01_data/")
scelist = list()
for(i in 1:length(f)){
pda <- Read10X(paste0("01_data/",f[[i]]))
scelist[[i]] <- CreateSeuratObject(counts = pda,
project = f[[i]])
print(dim(scelist[[i]]))
}
## [1] 33538 10218
## [1] 33538 8931
## [1] 33538 8607
## [1] 33538 12733
## [1] 33538 11038
## [1] 33538 10821
sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all)
head(sce.all@meta.data)
## orig.ident nCount_RNA nFeature_RNA
## AAACCCACACAAATAG-1_1 sample1 5624 1539
## AAACCCACAGGCTCTG-1_1 sample1 36854 1798
## AAACCCACAGGTCCCA-1_1 sample1 5659 1463
## AAACCCACAGTCGGTC-1_1 sample1 4243 1256
## AAACCCAGTTTGGCTA-1_1 sample1 5105 1563
## AAACCCATCCCATAAG-1_1 sample1 3817 1495
table(sce.all$orig.ident)
##
## sample1 sample2 sample3 sample4 sample5 sample6
## 10218 8931 8607 12733 11038 10821
sum(table(Idents(sce.all)))
## [1] 62348
3.質(zhì)控指標
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")
head(sce.all@meta.data, 3)
## orig.ident nCount_RNA nFeature_RNA percent.mt percent.rp
## AAACCCACACAAATAG-1_1 sample1 5624 1539 6.4544808 34.726174
## AAACCCACAGGCTCTG-1_1 sample1 36854 1798 0.1438107 7.640962
## AAACCCACAGGTCCCA-1_1 sample1 5659 1463 3.4811804 30.040643
## percent.hb
## AAACCCACACAAATAG-1_1 0.00000000
## AAACCCACAGGCTCTG-1_1 0.00000000
## AAACCCACAGGTCCCA-1_1 0.01767097
VlnPlot(sce.all,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rp",
"percent.hb"),
ncol = 3,pt.size = 0, group.by = "orig.ident")
sce.all = subset(sce.all,percent.mt<25)
4.整合降維聚類分群
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
sce.all = sce.all %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(pc.genes = VariableFeatures(.)) %>%
RunHarmony("orig.ident") %>%
FindNeighbors(dims = 1:15, reduction = "harmony") %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15,reduction = "harmony") %>%
RunTSNE(dims = 1:15,reduction = "harmony")
save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
UMAPPlot(sce.all,label = T)
TSNEPlot(sce.all,label = T)
5.注釋
library(celldex)
library(SingleR)
ls("package:celldex")
## [1] "BlueprintEncodeData" "DatabaseImmuneCellExpressionData"
## [3] "HumanPrimaryCellAtlasData" "ImmGenData"
## [5] "MonacoImmuneData" "MouseRNAseqData"
## [7] "NovershternHematopoieticData"
f = "ref_HumanPrimaryCellAtlasData.RData"
if(!file.exists(f)){
ref <- celldex::HumanPrimaryCellAtlasData()
save(ref,file = f)
}
ref <- list(get(load("ref_BlueprintEncode.RData")),
get(load("ref_HumanPrimaryCellAtlasData.RData")))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = list(ref[[1]]$label.main,ref[[2]]$label.main),
clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
## [1] "CD8+ T-cells" "B-cells" "Fibroblasts"
## [4] "CD4+ T-cells" "CD8+ T-cells" "Neutrophils"
## [7] "Endothelial_cells" "Fibroblasts" "Monocytes"
## [10] "Macrophages" "Fibroblasts" "Adipocytes"
## [13] "B-cells" "NK cells" "Monocytes"
## [16] NA "Endothelial cells" "Monocytes"
## [19] "Neutrophils" "Fibroblasts" "Adipocytes"
#查看注釋準確性
#plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
new.cluster.ids[is.na(new.cluster.ids)] = "unknown"
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20"
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
## [1] "CD8+ T-cells" "B-cells" "Fibroblasts"
## [4] "CD4+ T-cells" "Neutrophils" "Endothelial_cells"
## [7] "Monocytes" "Macrophages" "Adipocytes"
## [10] "NK cells" "unknown" "Endothelial cells"
p2 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p2
6.分組可視化及組件細胞比例比較
scRNA$seurat_annotations = Idents(scRNA)
table(scRNA$orig.ident)
##
## sample1 sample2 sample3 sample4 sample5 sample6
## 10109 8303 8510 12593 10874 10623
library(tinyarray)
pd = geo_download("GSE231920")$pd
pd$title
## [1] "IgG4-RD1" "IgG4-RD2" "IgG4-RD3" "Control1" "Control2" "Control3"
scRNA$group = ifelse(scRNA$orig.ident %in% c("sample1","sample2","sample3"), "treat","control")
DimPlot(scRNA, reduction = "umap", group.by = "group")
可以計算每個亞群的細胞數(shù)量和占全部細胞的比例
# 每種細胞的數(shù)量和比例
cell_counts <- table(Idents(scRNA))
cell.all <- cbind(cell_counts = cell_counts,
cell_Freq = round(prop.table(cell_counts)*100,2))
#各組中每種細胞的數(shù)量和比例
cell.num.group <- table(Idents(scRNA), scRNA$group)
cell.freq.group <- round(prop.table(cell.num.group, margin = 2) *100,2)
cell.all = cbind(cell.all,cell.num.group,cell.freq.group)
cell.all = cell.all[,c(1,3,4,2,5,6)]
colnames(cell.all) = paste(rep(c("all","control","treat"),times = 2),
rep(c("count","freq"),each = 3),sep = "_")
cell.all
## all_count control_count treat_count all_freq control_freq
## CD8+ T-cells 15077 5518 9559 24.71 16.19
## B-cells 9089 2870 6219 14.90 8.42
## Fibroblasts 12356 10773 1583 20.25 31.60
## CD4+ T-cells 7073 2303 4770 11.59 6.76
## Neutrophils 5513 4114 1399 9.04 12.07
## Endothelial_cells 3236 2360 876 5.30 6.92
## Monocytes 2948 1934 1014 4.83 5.67
## Macrophages 2093 1661 432 3.43 4.87
## Adipocytes 1468 1322 146 2.41 3.88
## NK cells 1200 695 505 1.97 2.04
## unknown 531 186 345 0.87 0.55
## Endothelial cells 428 354 74 0.70 1.04
## treat_freq
## CD8+ T-cells 35.51
## B-cells 23.10
## Fibroblasts 5.88
## CD4+ T-cells 17.72
## Neutrophils 5.20
## Endothelial_cells 3.25
## Monocytes 3.77
## Macrophages 1.60
## Adipocytes 0.54
## NK cells 1.88
## unknown 1.28
## Endothelial cells 0.27
7.差異分析
找某種細胞在不同組間的差異基因
if(!require("multtest"))BiocManager::install('multtest')
if(!require("metap"))install.packages('metap')
table(scRNA$seurat_annotations)
##
## CD8+ T-cells B-cells Fibroblasts CD4+ T-cells
## 15077 9089 12356 7073
## Neutrophils Endothelial_cells Monocytes Macrophages
## 5513 3236 2948 2093
## Adipocytes NK cells unknown Endothelial cells
## 1468 1200 531 428
sub.markers <- FindConservedMarkers(scRNA, ident.1 = "NK cells", grouping.var = "group", min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(sub.markers)
## treat_p_val treat_avg_log2FC treat_pct.1 treat_pct.2 treat_p_val_adj
## GNLY 0 6.695047 0.949 0.032 0
## KLRD1 0 5.818575 0.950 0.037 0
## GZMB 0 5.418138 0.909 0.039 0
## NKG7 0 4.850398 0.998 0.148 0
## PRF1 0 5.655445 0.897 0.054 0
## CTSW 0 4.974234 0.865 0.046 0
## control_p_val control_avg_log2FC control_pct.1 control_pct.2
## GNLY 0 6.275650 0.944 0.036
## KLRD1 0 5.566265 0.961 0.041
## GZMB 0 5.895107 0.937 0.033
## NKG7 0 5.118054 1.000 0.103
## PRF1 0 5.515456 0.902 0.046
## CTSW 0 4.268425 0.919 0.094
## control_p_val_adj max_pval minimump_p_val
## GNLY 0 0 0
## KLRD1 0 0 0
## GZMB 0 0 0
## NKG7 0 0 0
## PRF1 0 0 0
## CTSW 0 0 0
組間比較的氣泡圖
markers.to.plot = c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
"CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
"GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一組感興趣的基因
#如果idents有NA會報錯https://github.com/satijalab/seurat/issues/8772
#scRNA <- subset(scRNA, seurat_annotations %in% na.omit(scRNA$seurat_annotations))
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"),
dot.scale = 8, split.by = "group") +
RotatedAxis()
FeaturePlot(scRNA, features = c("CD3D", "GNLY", "IFI6"), split.by = "group", max.cutoff = 3, cols = c("grey",
"red"), reduction = "umap")
plots <- VlnPlot(scRNA, features = c("LYZ", "ISG15", "CXCL10"), split.by = "group", group.by = "seurat_annotations",
pt.size = 0, combine = FALSE)
library(patchwork)
wrap_plots(plots = plots, ncol = 1)
8.偽bulk 轉(zhuǎn)錄組差異分析
每個組要有多個樣本才能做
https://satijalab.org/seurat/articles/parsebio_sketch_integration
bulk <- AggregateExpression(scRNA, return.seurat = T, slot = "counts", assays = "RNA", group.by = c("seurat_annotations","orig.ident", "group"))
sub <- subset(bulk, seurat_annotations == "CD8+ T-cells")
Idents(sub) <- "group"
de_markers <- FindMarkers(sub, ident.1 = "treat", ident.2 = "control", slot = "counts", test.use = "DESeq2",
verbose = F)
de_markers$gene <- rownames(de_markers)
k1 = de_markers$avg_log2FC< -1 & de_markers$p_val <0.01
k2 = de_markers$avg_log2FC> 1 & de_markers$p_val <0.01
de_markers$change <- ifelse(k1,"down",ifelse(k2,"up","not"))
library(ggplot2)
library(ggrepel)
ggplot(de_markers, aes(avg_log2FC, -log10(p_val),color = change)) +
geom_point(size = 0.5, alpha = 0.5) +
geom_vline(xintercept = c(1,-1),linetype = 4)+
geom_hline(yintercept = -log10(0.01),linetype = 4)+
scale_color_manual(values = c("blue","grey","red"))+
theme_bw() +
ylab("-log10(unadjusted p-value)")