單細胞組間比較分析

這篇文章介紹的是有分組的單細胞數(shù)據(jù)怎樣分析,數(shù)據(jù)來自GEO的GSE231920,有3個treat辉阶,3個control樣本,代碼完整瘩扼,可以自行下載數(shù)據(jù)跑一跑谆甜,但請注意細胞數(shù)量是6w,對計算資源要求較高集绰,自己的電腦跑不動规辱,需要在服務(wù)器上跑。

1.整理數(shù)據(jù)

因為數(shù)據(jù)組織的不是每個樣本一個文件夾的形式栽燕,所以需要自行整理罕袋,參考代碼如下,注意這段改名的代碼不要反復運行:

#untar("GSE231920_RAW.tar",exdir = "GSE231920_RAW")
#unlink("GSE231920_RAW.tar")
library(stringr)
fs = paste0("GSE231920_RAW/",dir("GSE231920_RAW/"))
fs
samples = dir("GSE231920_RAW/") %>% str_split_i("_",2) %>% unique();samples

#為每個樣本創(chuàng)建單獨的文件夾
lapply(samples, function(s){
  ns = paste0("01_data/",s)
  if(!file.exists(ns))dir.create(ns,recursive = T)
})

#每個樣本的三個文件復制到單獨的文件夾
lapply(fs, function(s){
  #s = fs[1]
  for(i in 1:length(samples)){
    #i = 1
    if(str_detect(s,samples[[i]])){
      file.copy(s,paste0("01_data/",samples[[i]]))
    }
  }
})

#文件名字修改
on = paste0("01_data/",dir("01_data/",recursive = T));on
nn = str_remove(on,"GSM\\d+_sample\\d_");nn
file.rename(on,nn)

代碼主要參考:

https://satijalab.org/seurat/articles/integration_introduction

rm(list = ls())

2.批量讀取

rm(list = ls())
library(Seurat)
f = dir("01_data/")
scelist = list()
for(i in 1:length(f)){
  pda <- Read10X(paste0("01_data/",f[[i]]))
  scelist[[i]] <- CreateSeuratObject(counts = pda, 
                                     project = f[[i]])
  print(dim(scelist[[i]]))
}

## [1] 33538 10218
## [1] 33538  8931
## [1] 33538  8607
## [1] 33538 12733
## [1] 33538 11038
## [1] 33538 10821

sce.all = merge(scelist[[1]],scelist[-1])
sce.all = JoinLayers(sce.all)
head(sce.all@meta.data)

##                      orig.ident nCount_RNA nFeature_RNA
## AAACCCACACAAATAG-1_1    sample1       5624         1539
## AAACCCACAGGCTCTG-1_1    sample1      36854         1798
## AAACCCACAGGTCCCA-1_1    sample1       5659         1463
## AAACCCACAGTCGGTC-1_1    sample1       4243         1256
## AAACCCAGTTTGGCTA-1_1    sample1       5105         1563
## AAACCCATCCCATAAG-1_1    sample1       3817         1495

table(sce.all$orig.ident)

## 
## sample1 sample2 sample3 sample4 sample5 sample6 
##   10218    8931    8607   12733   11038   10821

sum(table(Idents(sce.all)))

## [1] 62348

3.質(zhì)控指標

sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")

head(sce.all@meta.data, 3)

##                      orig.ident nCount_RNA nFeature_RNA percent.mt percent.rp
## AAACCCACACAAATAG-1_1    sample1       5624         1539  6.4544808  34.726174
## AAACCCACAGGCTCTG-1_1    sample1      36854         1798  0.1438107   7.640962
## AAACCCACAGGTCCCA-1_1    sample1       5659         1463  3.4811804  30.040643
##                      percent.hb
## AAACCCACACAAATAG-1_1 0.00000000
## AAACCCACAGGCTCTG-1_1 0.00000000
## AAACCCACAGGTCCCA-1_1 0.01767097

VlnPlot(sce.all, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt",
                     "percent.rp",
                     "percent.hb"),
        ncol = 3,pt.size = 0, group.by = "orig.ident")
sce.all = subset(sce.all,percent.mt<25)

4.整合降維聚類分群

f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
  sce.all = sce.all %>% 
    NormalizeData() %>%  
    FindVariableFeatures() %>%  
    ScaleData(features = rownames(.)) %>%  
    RunPCA(pc.genes = VariableFeatures(.))  %>%
    RunHarmony("orig.ident") %>%
    FindNeighbors(dims = 1:15, reduction = "harmony") %>% 
    FindClusters(resolution = 0.5) %>% 
    RunUMAP(dims = 1:15,reduction = "harmony") %>% 
    RunTSNE(dims = 1:15,reduction = "harmony")
  save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)
UMAPPlot(sce.all,label = T)
TSNEPlot(sce.all,label = T)

5.注釋

library(celldex)
library(SingleR)
ls("package:celldex")

## [1] "BlueprintEncodeData"              "DatabaseImmuneCellExpressionData"
## [3] "HumanPrimaryCellAtlasData"        "ImmGenData"                      
## [5] "MonacoImmuneData"                 "MouseRNAseqData"                 
## [7] "NovershternHematopoieticData"

f = "ref_HumanPrimaryCellAtlasData.RData"
if(!file.exists(f)){
  ref <- celldex::HumanPrimaryCellAtlasData()
  save(ref,file = f)
}
ref <- list(get(load("ref_BlueprintEncode.RData")),
            get(load("ref_HumanPrimaryCellAtlasData.RData")))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = list(ref[[1]]$label.main,ref[[2]]$label.main), 
                      clusters = scRNA@active.ident)
pred.scRNA$pruned.labels

##  [1] "CD8+ T-cells"      "B-cells"           "Fibroblasts"      
##  [4] "CD4+ T-cells"      "CD8+ T-cells"      "Neutrophils"      
##  [7] "Endothelial_cells" "Fibroblasts"       "Monocytes"        
## [10] "Macrophages"       "Fibroblasts"       "Adipocytes"       
## [13] "B-cells"           "NK cells"          "Monocytes"        
## [16] NA                  "Endothelial cells" "Monocytes"        
## [19] "Neutrophils"       "Fibroblasts"       "Adipocytes"

#查看注釋準確性 
#plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
new.cluster.ids[is.na(new.cluster.ids)] = "unknown"
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)

##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "17" "18" "19" "20"

scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)

##  [1] "CD8+ T-cells"      "B-cells"           "Fibroblasts"      
##  [4] "CD4+ T-cells"      "Neutrophils"       "Endothelial_cells"
##  [7] "Monocytes"         "Macrophages"       "Adipocytes"       
## [10] "NK cells"          "unknown"           "Endothelial cells"

p2 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p2

6.分組可視化及組件細胞比例比較

scRNA$seurat_annotations = Idents(scRNA)
table(scRNA$orig.ident)

## 
## sample1 sample2 sample3 sample4 sample5 sample6 
##   10109    8303    8510   12593   10874   10623

library(tinyarray)
pd = geo_download("GSE231920")$pd
pd$title

## [1] "IgG4-RD1" "IgG4-RD2" "IgG4-RD3" "Control1" "Control2" "Control3"

scRNA$group = ifelse(scRNA$orig.ident %in% c("sample1","sample2","sample3"), "treat","control")
DimPlot(scRNA, reduction = "umap", group.by = "group")

可以計算每個亞群的細胞數(shù)量和占全部細胞的比例

# 每種細胞的數(shù)量和比例
cell_counts <- table(Idents(scRNA))
cell.all <- cbind(cell_counts = cell_counts, 
                  cell_Freq = round(prop.table(cell_counts)*100,2))
#各組中每種細胞的數(shù)量和比例
cell.num.group <- table(Idents(scRNA), scRNA$group) 
cell.freq.group <- round(prop.table(cell.num.group, margin = 2) *100,2)
cell.all = cbind(cell.all,cell.num.group,cell.freq.group)
cell.all = cell.all[,c(1,3,4,2,5,6)]
colnames(cell.all) = paste(rep(c("all","control","treat"),times = 2),
      rep(c("count","freq"),each = 3),sep = "_")
cell.all

##                   all_count control_count treat_count all_freq control_freq
## CD8+ T-cells          15077          5518        9559    24.71        16.19
## B-cells                9089          2870        6219    14.90         8.42
## Fibroblasts           12356         10773        1583    20.25        31.60
## CD4+ T-cells           7073          2303        4770    11.59         6.76
## Neutrophils            5513          4114        1399     9.04        12.07
## Endothelial_cells      3236          2360         876     5.30         6.92
## Monocytes              2948          1934        1014     4.83         5.67
## Macrophages            2093          1661         432     3.43         4.87
## Adipocytes             1468          1322         146     2.41         3.88
## NK cells               1200           695         505     1.97         2.04
## unknown                 531           186         345     0.87         0.55
## Endothelial cells       428           354          74     0.70         1.04
##                   treat_freq
## CD8+ T-cells           35.51
## B-cells                23.10
## Fibroblasts             5.88
## CD4+ T-cells           17.72
## Neutrophils             5.20
## Endothelial_cells       3.25
## Monocytes               3.77
## Macrophages             1.60
## Adipocytes              0.54
## NK cells                1.88
## unknown                 1.28
## Endothelial cells       0.27

7.差異分析

找某種細胞在不同組間的差異基因

if(!require("multtest"))BiocManager::install('multtest')
if(!require("metap"))install.packages('metap')
table(scRNA$seurat_annotations)

## 
##      CD8+ T-cells           B-cells       Fibroblasts      CD4+ T-cells 
##             15077              9089             12356              7073 
##       Neutrophils Endothelial_cells         Monocytes       Macrophages 
##              5513              3236              2948              2093 
##        Adipocytes          NK cells           unknown Endothelial cells 
##              1468              1200               531               428

sub.markers <- FindConservedMarkers(scRNA, ident.1 = "NK cells", grouping.var = "group", min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(sub.markers)

##       treat_p_val treat_avg_log2FC treat_pct.1 treat_pct.2 treat_p_val_adj
## GNLY            0         6.695047       0.949       0.032               0
## KLRD1           0         5.818575       0.950       0.037               0
## GZMB            0         5.418138       0.909       0.039               0
## NKG7            0         4.850398       0.998       0.148               0
## PRF1            0         5.655445       0.897       0.054               0
## CTSW            0         4.974234       0.865       0.046               0
##       control_p_val control_avg_log2FC control_pct.1 control_pct.2
## GNLY              0           6.275650         0.944         0.036
## KLRD1             0           5.566265         0.961         0.041
## GZMB              0           5.895107         0.937         0.033
## NKG7              0           5.118054         1.000         0.103
## PRF1              0           5.515456         0.902         0.046
## CTSW              0           4.268425         0.919         0.094
##       control_p_val_adj max_pval minimump_p_val
## GNLY                  0        0              0
## KLRD1                 0        0              0
## GZMB                  0        0              0
## NKG7                  0        0              0
## PRF1                  0        0              0
## CTSW                  0        0              0

組間比較的氣泡圖

markers.to.plot = c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
    "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
    "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一組感興趣的基因
#如果idents有NA會報錯https://github.com/satijalab/seurat/issues/8772
#scRNA <- subset(scRNA, seurat_annotations %in% na.omit(scRNA$seurat_annotations))
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"), 
        dot.scale = 8, split.by = "group") +
    RotatedAxis()
FeaturePlot(scRNA, features = c("CD3D", "GNLY", "IFI6"), split.by = "group", max.cutoff = 3, cols = c("grey",
    "red"), reduction = "umap")
plots <- VlnPlot(scRNA, features = c("LYZ", "ISG15", "CXCL10"), split.by = "group", group.by = "seurat_annotations",
    pt.size = 0, combine = FALSE)
library(patchwork)
wrap_plots(plots = plots, ncol = 1)

8.偽bulk 轉(zhuǎn)錄組差異分析

每個組要有多個樣本才能做

https://satijalab.org/seurat/articles/parsebio_sketch_integration

bulk <- AggregateExpression(scRNA, return.seurat = T, slot = "counts", assays = "RNA", group.by = c("seurat_annotations","orig.ident", "group"))

sub <- subset(bulk, seurat_annotations == "CD8+ T-cells")
Idents(sub) <- "group"
de_markers <- FindMarkers(sub, ident.1 = "treat", ident.2 = "control", slot = "counts", test.use = "DESeq2",
    verbose = F)
de_markers$gene <- rownames(de_markers)
k1 = de_markers$avg_log2FC< -1 & de_markers$p_val <0.01
k2 = de_markers$avg_log2FC> 1 & de_markers$p_val <0.01
de_markers$change <- ifelse(k1,"down",ifelse(k2,"up","not"))
library(ggplot2)
library(ggrepel)
ggplot(de_markers, aes(avg_log2FC, -log10(p_val),color = change)) + 
  geom_point(size = 0.5, alpha = 0.5) + 
  geom_vline(xintercept = c(1,-1),linetype = 4)+
  geom_hline(yintercept = -log10(0.01),linetype = 4)+
  scale_color_manual(values = c("blue","grey","red"))+
  theme_bw() +
  ylab("-log10(unadjusted p-value)") 
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
  • 序言:七十年代末碍岔,一起剝皮案震驚了整個濱河市浴讯,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌蔼啦,老刑警劉巖榆纽,帶你破解...
    沈念sama閱讀 206,126評論 6 481
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場離奇詭異询吴,居然都是意外死亡掠河,警方通過查閱死者的電腦和手機,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,254評論 2 382
  • 文/潘曉璐 我一進店門猛计,熙熙樓的掌柜王于貴愁眉苦臉地迎上來唠摹,“玉大人,你說我怎么就攤上這事奉瘤」蠢” “怎么了?”我有些...
    開封第一講書人閱讀 152,445評論 0 341
  • 文/不壞的土叔 我叫張陵盗温,是天一觀的道長藕赞。 經(jīng)常有香客問我,道長卖局,這世上最難降的妖魔是什么斧蜕? 我笑而不...
    開封第一講書人閱讀 55,185評論 1 278
  • 正文 為了忘掉前任,我火速辦了婚禮砚偶,結(jié)果婚禮上批销,老公的妹妹穿的比我還像新娘洒闸。我一直安慰自己,他們只是感情好均芽,可當我...
    茶點故事閱讀 64,178評論 5 371
  • 文/花漫 我一把揭開白布丘逸。 她就那樣靜靜地躺著,像睡著了一般掀宋。 火紅的嫁衣襯著肌膚如雪深纲。 梳的紋絲不亂的頭發(fā)上,一...
    開封第一講書人閱讀 48,970評論 1 284
  • 那天劲妙,我揣著相機與錄音湃鹊,去河邊找鬼卖漫。 笑死主卫,一個胖子當著我的面吹牛蔫缸,可吹牛的內(nèi)容都是我干的册舞。 我是一名探鬼主播,決...
    沈念sama閱讀 38,276評論 3 399
  • 文/蒼蘭香墨 我猛地睜開眼聋丝,長吁一口氣:“原來是場噩夢啊……” “哼!你這毒婦竟也來了?” 一聲冷哼從身側(cè)響起肛搬,我...
    開封第一講書人閱讀 36,927評論 0 259
  • 序言:老撾萬榮一對情侶失蹤,失蹤者是張志新(化名)和其女友劉穎毕贼,沒想到半個月后温赔,有當?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 43,400評論 1 300
  • 正文 獨居荒郊野嶺守林人離奇死亡鬼癣,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點故事閱讀 35,883評論 2 323
  • 正文 我和宋清朗相戀三年陶贼,在試婚紗的時候發(fā)現(xiàn)自己被綠了。 大學時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片待秃。...
    茶點故事閱讀 37,997評論 1 333
  • 序言:一個原本活蹦亂跳的男人離奇死亡拜秧,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出章郁,到底是詐尸還是另有隱情枉氮,我是刑警寧澤,帶...
    沈念sama閱讀 33,646評論 4 322
  • 正文 年R本政府宣布暖庄,位于F島的核電站聊替,受9級特大地震影響,放射性物質(zhì)發(fā)生泄漏培廓。R本人自食惡果不足惜惹悄,卻給世界環(huán)境...
    茶點故事閱讀 39,213評論 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一處隱蔽的房頂上張望肩钠。 院中可真熱鬧泣港,春花似錦暂殖、人聲如沸。這莊子的主人今日做“春日...
    開封第一講書人閱讀 30,204評論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽。三九已至惫东,卻和暖如春莉给,著一層夾襖步出監(jiān)牢的瞬間,已是汗流浹背廉沮。 一陣腳步聲響...
    開封第一講書人閱讀 31,423評論 1 260
  • 我被黑心中介騙來泰國打工颓遏, 沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留,地道東北人滞时。 一個月前我還...
    沈念sama閱讀 45,423評論 2 352
  • 正文 我出身青樓叁幢,卻偏偏與公主長得像,于是被迫代替她去往敵國和親坪稽。 傳聞我的和親對象是個殘疾皇子曼玩,可洞房花燭夜當晚...
    茶點故事閱讀 42,722評論 2 345

推薦閱讀更多精彩內(nèi)容