參考:生信會(huì)客廳
harmony原理
Harmony需要輸入低維空間的坐標(biāo)值(embedding)弯院,一般使用PCA的降維結(jié)果缸兔。Harmony導(dǎo)入PCA的降維數(shù)據(jù)后瘟裸,會(huì)采用soft k-means clustering算法將細(xì)胞聚類学辱。常用的聚類算法僅考慮細(xì)胞在低維空間的距離缎除,但是soft clustering算法會(huì)考慮我們提供的校正因素域慷。這就好比我們的高考加分制度荒辕,小明高考成績(jī)本來(lái)達(dá)不到A大學(xué)的錄取分?jǐn)?shù)線,但是他有一項(xiàng)省級(jí)競(jìng)賽一等獎(jiǎng)加10分就夠線了犹褒。同樣的道理抵窒,細(xì)胞c2距離cluster1有點(diǎn)遠(yuǎn),本來(lái)不能算作cluster1的一份子叠骑;但是c2和cluster1的細(xì)胞來(lái)自不同的數(shù)據(jù)集李皇,因?yàn)槲覀兤谕煌臄?shù)據(jù)集融合,所以破例讓它加入cluster1了宙枷。聚類之后先計(jì)算每個(gè)cluster內(nèi)各個(gè)數(shù)據(jù)集的細(xì)胞的中心點(diǎn)掉房,然后根據(jù)這些中心點(diǎn)計(jì)算各個(gè)cluster的中心點(diǎn)。最后通過(guò)算法讓cluster內(nèi)的細(xì)胞向中心聚集慰丛,實(shí)在收斂不了的離群細(xì)胞就過(guò)濾掉卓囚。調(diào)整之后的數(shù)據(jù)重復(fù):聚類—計(jì)算cluster中心點(diǎn)—收斂細(xì)胞—聚類的過(guò)程,不斷迭代直至聚類效果趨于穩(wěn)定诅病。
- 從第三方的評(píng)測(cè)結(jié)果來(lái)看哪亿,harmony的整合效果可以與seurat的錨點(diǎn)整合方法媲美,并且運(yùn)行時(shí)間方面具有明顯優(yōu)勢(shì)贤笆,不過(guò)內(nèi)存消耗好像沒(méi)有那么夸張的優(yōu)秀蝇棉。(哈哈我的電腦用seurat錨點(diǎn)整合被卡住了,試了一下harmony芥永,時(shí)間挺快的篡殷,而且電腦也帶得動(dòng)(記錄一下學(xué)習(xí)筆記)
library(harmony)
install_github("immunogenomics/harmony")
library(devtools)
##==準(zhǔn)備seurat對(duì)象列表==##
library(Seurat)
library(tidyverse)
library(patchwork)
rm(list=ls())
dir = c('Data/GSE139324/GSE139324_SUB/GSM4138110',
'Data/GSE139324/GSE139324_SUB/GSM4138111',
'Data/GSE139324/GSE139324_SUB/GSM4138128',
'Data/GSE139324/GSE139324_SUB/GSM4138129',
'Data/GSE139324/GSE139324_SUB/GSM4138148',
'Data/GSE139324/GSE139324_SUB/GSM4138149',
'Data/GSE139324/GSE139324_SUB/GSM4138162',
'Data/GSE139324/GSE139324_SUB/GSM4138163',
'Data/GSE139324/GSE139324_SUB/GSM4138168',
'Data/GSE139324/GSE139324_SUB/GSM4138169') #GSE139324是存放數(shù)據(jù)的目錄
sample_name <- c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
'HNC20TIL', 'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')
scRNAlist <- list()
for(i in 1:length(dir)){
counts <- Read10X(data.dir = dir[i])
scRNAlist[[i]] <- CreateSeuratObject(counts, project=sample_name[i], min.cells=3, min.features = 200)
scRNAlist[[i]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^MT-")
scRNAlist[[i]] <- subset(scRNAlist[[i]], subset = percent.mt < 10)
}
saveRDS(scRNAlist, "scRNAlist.rds")
##==harmony整合多樣本==##
library(harmony)
scRNAlist <- readRDS("scRNAlist.rds")
##PCA降維
scRNA_harmony <- merge(scRNAlist[[1]], y=c(scRNAlist[[2]], scRNAlist[[3]], scRNAlist[[4]], scRNAlist[[5]],
scRNAlist[[6]], scRNAlist[[7]], scRNAlist[[8]], scRNAlist[[9]], scRNAlist[[10]]))
scRNA_harmony <- NormalizeData(scRNA_harmony) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose=FALSE)
##整合
system.time({scRNA_harmony <- RunHarmony(scRNA_harmony, group.by.vars = "orig.ident")})
# 用戶 系統(tǒng) 流逝
# 34.308 0.024 34.324
#user system elapsed
#62.90 4.06 72.08
#降維聚類
scRNA_harmony <- RunUMAP(scRNA_harmony, reduction = "harmony", dims = 1:30)
scRNA_harmony <- FindNeighbors(scRNA_harmony, reduction = "harmony", dims = 1:30) %>% FindClusters()
##作圖
#group_by_cluster
plot1 = DimPlot(scRNA_harmony, reduction = "umap", label=T)
#group_by_sample
plot2 = DimPlot(scRNA_harmony, reduction = "umap", group.by='orig.ident')
#combinate
plotc <- plot1+plot2
plotc
練習(xí)另外一個(gè)數(shù)據(jù)
#harmonny
library(harmony)
library(Seurat)
library(tidyverse)
library(dplyr)
library(patchwork)
setwd("D:\\genetic_r\\study\\geo\\lesson7")
library(harmony)
scRNA.2=Read10X("D:\\genetic_r\\study\\geo\\lesson7\\GSE152048_BC2.matrix\\BC2")
scRNA.3=Read10X("D:\\genetic_r\\study\\geo\\lesson7\\GSE152048_BC3.matrix\\BC3")
scRNA.2 = CreateSeuratObject(scRNA.2 ,project="sample_2",min.cells = 3, min.features = 200)
scRNA.3 = CreateSeuratObject(scRNA.3 ,project="sample_3",min.cells = 3, min.features = 200)
scRNA_harmony <- merge(scRNA.2, y=c(scRNA.3 ))
Warning message:
In CheckDuplicateCellNames(object.list = objects) :
Some cell names are duplicated across objects provided. Renaming to enforce unique cell names.
scRNA_harmony <- NormalizeData(scRNA_harmony) %>% FindVariableFeatures() %>% ScaleData() %>% RunPCA(verbose=FALSE)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Centering and scaling data matrix
|============================================================| 100%
#開(kāi)始整合
system.time({scRNA_harmony <- RunHarmony(scRNA_harmony, group.by.vars = "orig.ident")})
Harmony 1/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 2/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 3/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 4/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 5/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony 6/10
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Harmony converged after 6 iterations
Warning: Invalid name supplied, making object name syntactically valid. New object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details on syntax validity
用戶 系統(tǒng) 流逝
67.39 5.28 88.92
降維聚類
scRNA_harmony <- FindNeighbors(scRNA_harmony, reduction = "harmony", dims = 1:15) %>% FindClusters(resolution = 0.5)
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 14760
Number of edges: 529809
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9244
Number of communities: 12
Elapsed time: 3 seconds
降維可視化
scRNA_harmony <- RunUMAP(scRNA_harmony, reduction = "harmony", dims = 1:15)
11:41:02 UMAP embedding parameters a = 0.9922 b = 1.112
11:41:02 Read 14760 rows and found 15 numeric columns
11:41:02 Using Annoy for neighbor search, n_neighbors = 30
11:41:02 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:41:09 Writing NN index file to temp file C:\Users\ASUS\AppData\Local\Temp\RtmpO8LMz0\file2da4113d7aa5
11:41:09 Searching Annoy index using 1 thread, search_k = 3000
11:41:16 Annoy recall = 100%
11:41:39 Commencing smooth kNN distance calibration using 1 thread
11:41:45 Initializing from normalized Laplacian + noise
11:41:47 Commencing optimization for 200 epochs, with 640848 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:42:07 Optimization finished
畫圖看看整合效果
#group_by_cluster
plot1 = DimPlot(scRNA_harmony, reduction = "umap", label=T)
#group_by_sample
plot2 = DimPlot(scRNA_harmony, reduction = "umap", group.by='orig.ident')
#combinate
plotc <- plot1+plot2
plotc
對(duì)比一下Seurat整合樣本,Seurat是采用錨定的方法來(lái)整合埋涧。
數(shù)據(jù)歸一化處理
scRNA.list <- list(scRNA.2,scRNA.3)
for (i in 1:length(scRNA.list)) {
scRNA.list[[i]] <- NormalizeData(scRNA.list[[i]])
scRNA.list[[i]] <- FindVariableFeatures(scRNA.list[[i]], selection.method = "vst")
}
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
尋找錨點(diǎn)
system.time({scRNA.anchors <- FindIntegrationAnchors(object.list = scRNA.list)})
Computing 2000 integration features
Scaling features for provided objects
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=13s
Finding all pairwise anchors
| | 0 % ~calculating Running CCA
Merging objects
Finding neighborhoods
Finding anchors
Found 12929 anchors
Filtering anchors
Retained 4266 anchors
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=15m 55s
user system elapsed
365.04 207.64 973.92
#錨定后整合
system.time({scRNA_seurat <- IntegrateData(anchorset = scRNA.anchors)})
scRNA_seurat<- IntegrateData(anchorset = scRNA.anchors)
Merging dataset 1 into 2
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
save(scRNA.anchors,file='scRNA.anchors.RData')
rm(list = ls())
降維聚類
scRNA_seurat <- ScaleData(scRNA_seurat) %>% RunPCA(verbose=FALSE)
Centering and scaling data matrix
|===========================================================| 100%
scRNA_seurat <- FindNeighbors(scRNA_seurat, dims = 1:15) %>% FindClusters(resolution = 0.5)
Computing nearest neighbor graph
Computing SNN
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 14760
Number of edges: 523184
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9068
Number of communities: 12
Elapsed time: 3 seconds
scRNA_seurat <- ScaleData(scRNA_seurat, features = VariableFeatures(scRNA_seurat))
scRNA_seurat <- RunPCA(scRNA_seurat, features = VariableFeatures(scRNA_seurat))
plot1 <- DimPlot(scRNA_seurat, reduction = "pca", group.by="orig.ident")
plot2 <- ElbowPlot(scRNA_seurat, ndims=30, reduction="pca")
plotc <- plot1+plot2
pc.num=1:20
scRNA_seurat <- FindNeighbors(scRNA_seurat, dims = pc.num)
scRNA_seurat <- FindClusters(scRNA_seurat, resolution = 0.5)
table(scRNA_seurat@meta.data$seurat_clusters)
metadata <- scRNA_seurat@meta.data
cell_cluster <- data.frame(cell_ID=rownames(metadata), cluster_ID=metadata$seurat_clusters)
#tSNE
scRNA_seurat = RunTSNE(scRNA_seurat, dims = pc.num)
#group_by_cluster
plot1 = DimPlot(scRNA_seurat, reduction = "tsne", label=T)
#group_by_sample
plot2 = DimPlot(scRNA_seurat, reduction = "tsne", group.by='orig.ident')
#combinate
plotc <- plot1+plot2
#UMAP
scRNA_seurat <- RunUMAP(scRNA_seurat, dims = pc.num)
#group_by_cluster
plot3 = DimPlot(scRNA_seurat, reduction = "umap", label=T)
#group_by_sample
plot4 = DimPlot(scRNA_seurat, reduction = "umap", group.by='orig.ident')
#combinate
plotc <- plot3+plot4
plotc <- plot3+plot4+ plot_layout(guides = 'collect')
哈哈 暫時(shí)還看不出harmony與Seurat誰(shuí)的整合效果好贴唇。。飞袋。