這里是佳奧脓钾!
我們進(jìn)入到作者的GitHub下載一下代碼來(lái)看看吧。
https://github.com/KPLab/SCS_CAF
1 作者原始代碼
當(dāng)然桩警,由于package的版本日新月異可训,除非安裝相同版本的package,我就不運(yùn)行了生真,作為學(xué)習(xí)沉噩。
2 復(fù)現(xiàn)文章分析結(jié)果
作者沒(méi)有使用三大R包,這里我們使用之前講到的R包來(lái)復(fù)現(xiàn)文章中的圖柱蟀。
step 1-qc
讀入質(zhì)控文件
主要是讀取作者RNA-seq上游分析的一些結(jié)果找出離群的那些細(xì)胞川蒙。
qc1=read.table('qc/SS2_15_0048_qc.txt',header = T)
qc2=read.table('qc/SS2_15_0049_qc.txt',header = T)
qc=rbind(qc1,qc2)
DT::datatable(qc1)
DT::datatable(qc2)
DT::datatable(qc)
粗略的看一看各個(gè)細(xì)胞的一些統(tǒng)計(jì)學(xué)指標(biāo)的分布情況
library(ggplot2)
library(cowplot)
box <- lapply(colnames(qc[,3:12]),function(i) {
dat <- qc[,i,drop=F]
dat$sample=rownames(dat)
## 畫(huà)boxplot
ggplot(dat, aes('all cells', get(i))) +
geom_boxplot() +
xlab(NULL)+ylab(i)
})
plot_grid(plotlist=box, ncol=2 )
# ggsave(file="stat_all_cells.pdf")
批量過(guò)濾指標(biāo)
因?yàn)檫M(jìn)行了簡(jiǎn)單探索,對(duì)表型數(shù)據(jù)就有了把握长已,接下來(lái)可以進(jìn)行一定程度的過(guò)濾畜眨,因?yàn)榧?xì)節(jié)太多,這里為了展現(xiàn)批量處理方式术瓮,就不考慮太多康聂。
pa <- colnames(qc[,5:12])
tf <- lapply(pa,function(i) {
# i=pa[1]
dat <- qc[,i]
# dat <- abs(log10(dat))
fivenum(dat)
(up <- mean(dat)+2*sd(dat))
(down <- mean(dat)- 2*sd(dat) )
valid <- ifelse(dat > down & dat < up, 1,0 )
})
tf <- do.call(cbind,tf)
choosed_cells <- apply(tf,1,function(x) all(x==1))
qc=qc[choosed_cells,]
檢查隨便QC和作者詳細(xì)QC的區(qū)別
last_qc=read.table('qc/qc_2plates.filtered_cells.txt',header = T)
intersect(substr(rownames(last_qc),1,11),qc$experiment)
[1] "SS2_15_0048" "SS2_15_0049"
可以看到簡(jiǎn)單粗暴的隨意QC和作者用心QC結(jié)果類似,作者刪除的52個(gè)細(xì)胞我們隨便QC也刪除掉了其中的50個(gè)胞四。
不過(guò)恬汁,我們簡(jiǎn)單粗暴的QC,刪除的細(xì)胞數(shù)量稍微多了一點(diǎn)辜伟。
顯示運(yùn)行環(huán)境
sessionInfo()
step 2-get matrix
把counts值進(jìn)行過(guò)濾并且儲(chǔ)存為R數(shù)據(jù)
需要讀取作者GitHub項(xiàng)目的文件SS2_15_0048里面的counts值矩陣文件氓侧,然后進(jìn)行一系列的過(guò)濾手段。
考慮到GitHub文件大小限制导狡,下面的部分代碼不運(yùn)行约巷。
https://github.com/KPLab/SCS_CAF
#ENDOGENOUS GENES
# created by Nikolay Oskolkov, nikolay.oskolkov@scilifelabs.se, 2017
Path_Main<-"/CAF_SCS/"
Path_Main<-"./"
plate1_raw<-read.delim(paste(Path_Main,"/SS2_15_0048/counts.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_raw$gene<-make.unique(as.character(plate1_raw$gene))
colnames(plate1_raw)[2:length(colnames(plate1_raw))]<-paste("SS2_15_0048_",colnames(plate1_raw)[2:length(colnames(plate1_raw))],sep="")
plate2_raw<-read.delim(paste(Path_Main,"/SS2_15_0049/counts.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_raw$gene<-make.unique(as.character(plate2_raw$gene))
colnames(plate2_raw)[2:length(colnames(plate2_raw))]<-paste("SS2_15_0049_",colnames(plate2_raw)[2:length(colnames(plate2_raw))],sep="")
expr_raw<-merge(plate1_raw,plate2_raw,by="gene",all=TRUE)
rownames(expr_raw)<-as.character(expr_raw$gene)
expr_raw$gene<-NULL
sum(expr_raw==0)/(dim(expr_raw)[1]*dim(expr_raw)[2])
#SPIKES
plate1_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0048/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_raw_ercc$gene<-make.unique(as.character(plate1_raw_ercc$gene))
colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))]<-paste("SS2_15_0048_",colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))],sep="")
plate2_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0049/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_raw_ercc$gene<-make.unique(as.character(plate2_raw_ercc$gene))
colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))]<-paste("SS2_15_0049_",colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))],sep="")
expr_raw_ercc<-merge(plate1_raw_ercc,plate2_raw_ercc,by="gene",all=TRUE)
rownames(expr_raw_ercc)<-as.character(expr_raw_ercc$gene)
expr_raw_ercc$gene<-NULL
sum(expr_raw_ercc==0)/(dim(expr_raw_ercc)[1]*dim(expr_raw_ercc)[2])
barplot(sort(as.numeric(colSums(expr_raw_ercc)),decreasing=TRUE),ylab="SPIKE LIBRARY SIZE",xlab="CELL INDEX")
hist(as.numeric(colSums(expr_raw_ercc)),col="brown",main="Distribution of Spike Library Sizes",xlab="Spike Library Size",breaks=20)
#MERGING ENDOGENOUS GENES WITH SPIKES
print(paste0("There are ",dim(expr_raw)[1]," endogenous genes"))
print(paste0("There are ",dim(expr_raw_ercc)[1]," spikes"))
all.counts.raw<-rbind(expr_raw,expr_raw_ercc)
sum(all.counts.raw==0)/(dim(all.counts.raw)[1]*dim(all.counts.raw)[2])
dim(all.counts.raw[rowSums(all.counts.raw)==0,])
cell_QC<-read.delim(paste(Path_Main,"/qc/qc_2plates.filtered_cells.txt",sep=""),row.names=1,header=TRUE,sep="\t")
rownames(cell_QC)<-gsub("__","_",rownames(cell_QC))
all.counts.raw<-subset(all.counts.raw,select=colnames(all.counts.raw)[!colnames(all.counts.raw)%in%rownames(cell_QC)])
expr_raw<-subset(expr_raw,select=colnames(expr_raw)[!colnames(expr_raw)%in%rownames(cell_QC)])
expr_raw_ercc<-subset(expr_raw_ercc,select=colnames(expr_raw_ercc)[!colnames(expr_raw_ercc)%in%rownames(cell_QC)])
#FILTER OUT GENES HAVING LESS THAN 1 COUNT IN AVERAGE OVER ALL CELLS
all.counts.raw<-all.counts.raw[rowMeans(all.counts.raw)>0,]
expr_raw<-expr_raw[rowMeans(expr_raw)>=1,]
expr_raw_ercc<-expr_raw_ercc[rowMeans(expr_raw_ercc)>0,]
save(expr_raw,expr_raw_ercc,file = 'expr_raw_counts.Rdata')
QC過(guò)后的counts矩陣被保存為R對(duì)象保存為文件,所以現(xiàn)在可以檢查一下過(guò)濾后的矩陣
load(file = 'expr_raw_counts.Rdata')
expr_raw[1:4,1:4]
#PLOT CV^2 vs. MEAN RAW COUNT
library("matrixStats")
mean_expr_raw<-as.numeric(rowMeans(expr_raw,na.rm=TRUE))
sd_expr_raw<-rowSds(as.matrix(expr_raw),na.rm=TRUE)
cv_squared_expr_raw<-(sd_expr_raw/mean_expr_raw)^2
plot(log10(cv_squared_expr_raw)~log10(mean_expr_raw),pch=20,cex=0.5,xlab="log10 ( mean raw count )",ylab="log10 ( CV? )",main="RAW COUNTS")
mean_expr_raw_ercc<-as.numeric(rowMeans(expr_raw_ercc,na.rm=TRUE))
sd_expr_raw_ercc<-rowSds(as.matrix(expr_raw_ercc),na.rm=TRUE)
cv_squared_expr_raw_ercc<-(sd_expr_raw_ercc/mean_expr_raw_ercc)^2
points(log10(cv_squared_expr_raw_ercc)~log10(mean_expr_raw_ercc),col="red",pch=20,cex=1.5)
#FIT SPIKEINS WITH A CURVE
fit_expr_raw_ercc<-loess(log10(cv_squared_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))],span=1)
j<-order(log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))])
lines(fit_expr_raw_ercc$fitted[j]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))][j],col="red",lwd=3)
pred_expr_raw<-predict(fit_expr_raw_ercc,log10(mean_expr_raw))
#DETERMINE VARIABLE GENES THAT ARE ABOVE THE SPIKEINS CURVE
filtered_expr_raw<-expr_raw[log10(cv_squared_expr_raw)>=pred_expr_raw,]
filtered_expr_raw<-filtered_expr_raw[grepl("NA",rownames(filtered_expr_raw))==FALSE,]
COUNTS=filtered_expr_raw
save(COUNTS,file='last-COUNTS.Rdata')
對(duì)最后的counts矩陣進(jìn)行詳細(xì)檢查
load(file='last-COUNTS.Rdata')
counts=COUNTS
counts[1:4,1:4]
fivenum(apply(counts,1,function(x) sum(x>0) ))
boxplot(apply(counts,1,function(x) sum(x>0) ))
fivenum(apply(counts,2,function(x) sum(x>0) ))
hist(apply(counts,2,function(x) sum(x>0) ))
把RPKM值進(jìn)行過(guò)濾并且存儲(chǔ)為R數(shù)據(jù)
需要讀取作者GitHub項(xiàng)目的文件SS2_15_0048里面的counts值矩陣文件旱捧,然后進(jìn)行一系列的過(guò)濾手段独郎。
plate1_rpkm<-read.delim(paste(Path_Main,"/SS2_15_0048/rpkms.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_rpkm$gene<-make.unique(as.character(plate1_rpkm$gene))
colnames(plate1_rpkm)[2:length(colnames(plate1_rpkm))]<-paste("SS2_15_0048_",colnames(plate1_rpkm)[2:length(colnames(plate1_rpkm))],sep="")
plate2_rpkm<-read.delim(paste(Path_Main,"/SS2_15_0049/rpkms.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_rpkm$gene<-make.unique(as.character(plate2_rpkm$gene))
colnames(plate2_rpkm)[2:length(colnames(plate2_rpkm))]<-paste("SS2_15_0049_",colnames(plate2_rpkm)[2:length(colnames(plate2_rpkm))],sep="")
expr_rpkm<-merge(plate1_rpkm,plate2_rpkm,by="gene",all=TRUE)
rownames(expr_rpkm)<-as.character(expr_rpkm$gene)
expr_rpkm$gene<-NULL
sum(expr_rpkm==0)/(dim(expr_rpkm)[1]*dim(expr_rpkm)[2])
#SPIKES
plate1_rpkm_ercc<-read.delim(paste(Path_Main,"/SS2_15_0048/rpkms-ercc.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate1_rpkm_ercc$gene<-make.unique(as.character(plate1_rpkm_ercc$gene))
colnames(plate1_rpkm_ercc)[2:length(colnames(plate1_rpkm_ercc))]<-paste("SS2_15_0048_",colnames(plate1_rpkm_ercc)[2:length(colnames(plate1_rpkm_ercc))],sep="")
plate2_rpkm_ercc<-read.delim(paste(Path_Main,"/SS2_15_0049/rpkms-ercc.tab", sep=""),header=TRUE,check.names=FALSE,sep="\t")
plate2_rpkm_ercc$gene<-make.unique(as.character(plate2_rpkm_ercc$gene))
colnames(plate2_rpkm_ercc)[2:length(colnames(plate2_rpkm_ercc))]<-paste("SS2_15_0049_",colnames(plate2_rpkm_ercc)[2:length(colnames(plate2_rpkm_ercc))],sep="")
expr_rpkm_ercc<-merge(plate1_rpkm_ercc,plate2_rpkm_ercc,by="gene",all=TRUE)
rownames(expr_rpkm_ercc)<-as.character(expr_rpkm_ercc$gene)
expr_rpkm_ercc$gene<-NULL
#MERGING ENDOGENOUS GENES WITH SPIKES
print(paste0("There are ",dim(expr_rpkm)[1]," endogenous genes"))
print(paste0("There are ",dim(expr_rpkm_ercc)[1]," spikes"))
all.counts.rpkm<-rbind(expr_rpkm,expr_rpkm_ercc)
sum(all.counts.rpkm==0)/(dim(all.counts.rpkm)[1]*dim(all.counts.rpkm)[2])
dim(all.counts.rpkm[rowSums(all.counts.rpkm)==0,])
#PLOT BARPLOT OF TOP EXPRESSED GENES
mean_rpkms<-data.frame(Gene=rownames(all.counts.rpkm),Mean_rpkm=rowMedians(as.matrix(all.counts.rpkm)))
mean_rpkms<-mean_rpkms[order(-mean_rpkms$Mean_rpkm),]
head(mean_rpkms,20)
par(mfrow=c(1,1))
barplot(log10(as.numeric(mean_rpkms$Mean_rpkm)[1:50]),names.arg=mean_rpkms$Gene[1:50],las=2,ylim=c(0,5),cex.names=0.8,ylab="Median RPKM")
#FILTER POOR CELLS OUT
cell_QC<-read.delim(paste(Path_Main,"/qc/qc_2plates.filtered_cells.txt",sep=""),row.names=1,header=TRUE,sep="\t")
rownames(cell_QC)<-gsub("__","_",rownames(cell_QC))
all.counts.rpkm<-subset(all.counts.rpkm,select=colnames(all.counts.rpkm)[!colnames(all.counts.rpkm)%in%rownames(cell_QC)])
expr_rpkm<-subset(expr_rpkm,select=colnames(expr_rpkm)[!colnames(expr_rpkm)%in%rownames(cell_QC)])
expr_rpkm_ercc<-subset(expr_rpkm_ercc,select=colnames(expr_rpkm_ercc)[!colnames(expr_rpkm_ercc)%in%rownames(cell_QC)])
#FILTER OUT GENES HAVING LESS THAN 1 RPKM COUNT IN AVERAGE OVER ALL CELLS
RPKM.full<-all.counts.rpkm[rowSums(all.counts.rpkm)>0,]
save(RPKM.full,file='RPKM.full.Rdata') ## 找差異基因
all.counts.rpkm<-all.counts.rpkm[rowMeans(all.counts.rpkm)>=1,]
expr_rpkm<-expr_rpkm[rowMeans(expr_rpkm)>=1,]
expr_rpkm_ercc<-expr_rpkm_ercc[rowMeans(expr_rpkm_ercc)>=1,]
# save(expr_rpkm,expr_rpkm_ercc,file = 'expr_raw_rpkm.Rdata')
對(duì)最后的 RPKM 矩陣進(jìn)行詳細(xì)檢查
load(file='last-RPKM.Rdata')
dat=RPKM
dim(dat)
fivenum(apply(dat,1,function(x) sum(x>0) ))
boxplot(apply(dat,1,function(x) sum(x>0) ))
fivenum(apply(dat,2,function(x) sum(x>0) ))
hist(apply(dat,2,function(x) sum(x>0) ))
這個(gè)
RPKM
矩陣會(huì)進(jìn)入下一步分析, 主要是降維和聚類踩麦。
對(duì)最后的 RPKM.full 矩陣進(jìn)行詳細(xì)檢查
load(file='RPKM.full.Rdata')
dat=RPKM.full
dim(dat)
fivenum(apply(dat,1,function(x) sum(x>0) ))
boxplot(apply(dat,1,function(x) sum(x>0) ))
fivenum(apply(dat,2,function(x) sum(x>0) ))
hist(apply(dat,2,function(x) sum(x>0) ))
這個(gè)
RPKM.full
矩陣會(huì)進(jìn)入下一步分析, 主要是差異分析及表達(dá)量可視化。
顯示運(yùn)行環(huán)境
sessionInfo()
step 3-db scan
首先載入上一步的RPKM矩陣
load('last-RPKM.Rdata')
#run t-SNE 50 times and select the optimal run based on the itercosts parameter
#subgroups of cells are assigned by dbscan
library(Rtsne)
N_tsne <- 50
tsne_out <- list(length = N_tsne)
KL <- vector(length = N_tsne)
set.seed(1234)
然后進(jìn)行50次tsne降維
因?yàn)檫\(yùn)行耗時(shí)很可觀氓癌,所以我們保存運(yùn)行結(jié)果到文件谓谦,以便重復(fù)使用。
for(k in 1:N_tsne)
{
tsne_out[[k]]<-Rtsne(t(log10(RPKM+1)),initial_dims=30,verbose=FALSE,check_duplicates=FALSE,
perplexity=27, dims=2,max_iter=5000)
KL[k]<-tail(tsne_out[[k]]$itercosts,1)
print(paste0("FINISHED ",k," TSNE ITERATION"))
}
names(KL) <- c(1:N_tsne)
save(tsne_out,KL,file = 'tsne_out.Rdata')
查看最佳tsne結(jié)果
load(file = 'tsne_out.Rdata')
opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y
opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]
head(opt_tsne)
[,1] [,2]
[1,] 8.779314 -7.376137
[2,] 11.023719 -7.026307
[3,] 18.795992 -25.336875
[4,] -11.526798 30.196473
[5,] 11.672634 -26.438114
[6,] -8.153627 30.434412
對(duì)tsne結(jié)果進(jìn)行kmeans
table(kmeans(opt_tsne,centers = 4)$clust)
plot(opt_tsne, col=kmeans(opt_tsne,centers = 4)$clust, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
對(duì)tsne結(jié)果進(jìn)行dbscan
library(dbscan)
plot(opt_tsne, col=dbscan(opt_tsne,eps=3.1)$cluster, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
table(dbscan(opt_tsne,eps=3.1)$cluster)
# 比較兩個(gè)聚類算法區(qū)別
table(kmeans(opt_tsne,centers = 4)$clust,dbscan(opt_tsne,eps=3.1)$cluster)
得到最后的分組信息
有一個(gè)樣本是離群點(diǎn),把它歸入到細(xì)胞數(shù)量最多的那個(gè)組別。
library(dbscan)
CAFgroups<-dbscan(opt_tsne,eps=3.1)$cluster
CAFgroups_full<-dbscan(opt_tsne,eps=3.1)
CAFgroups[CAFgroups==0]<-1
CAFgroups_full$cluster[CAFgroups_full$cluster==0]<-1
plot(opt_tsne, col=CAFgroups, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
直接PCA看看dbscan聚類效果
RPKM.PCA<-prcomp(log2(t(RPKM)+1), center=TRUE)
plot(RPKM.PCA$x,main="first PCA", pch=19, col=CAFgroups)
save(CAFgroups,CAFgroups_full,file='CAFgroups.Rdata')
表達(dá)量的可視化(散點(diǎn)圖)
首先作者自定義了一個(gè)繪圖函數(shù), 接受基因名字击敌,tsne的坐標(biāo)矩陣涡匀,以及表達(dá)量矩陣。
#feature plots represent gene expression for each cell on the t-SNE display.
# It requires the name of the gene as a string,
# the output of Rtsne and the expression matrix with rownames representing the gene names.
plot.feature2<-function(gene, tsne.output=tsne.out, DATAuse=DATA){
plot.frame<-data.frame(x=tsne.output$Y[,1], y=tsne.output$Y[,2], log2expr=as.numeric(log2(DATAuse[gene,]+1)))
p<-ggplot(plot.frame,aes(x=x, y=y, col=log2expr))+
geom_point(size=1) +
labs(title=paste(gene))+
theme_classic()+
scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
theme(axis.title = element_blank())+
theme(axis.text = element_blank())+
theme(axis.line = element_blank())+
theme(axis.ticks = element_blank())+
theme(plot.title = element_text(size=20,face="italic"))+
theme(legend.title = element_blank())+
theme(legend.position = "none")
return(p)
}
就可以根據(jù)基因名践剂,從表達(dá)量矩陣?yán)锩娅@取該基因的表達(dá)情況鬼譬,然后把tsne的坐標(biāo)矩陣?yán)L制在畫(huà)布后,使用該基因的表達(dá)量化值來(lái)上顏色逊脯。
load(file = 'tsne_out.Rdata')
library(ggplot2)
opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y
opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]
load(file='RPKM.full.Rdata')
load(file='CAFgroups.Rdata')
plot.feature2("Pdgfra", opt_tsne_full, RPKM.full)
表達(dá)量的可視化(小提琴圖)
首先作者自定義了一個(gè)繪圖函數(shù), 接受基因名字优质,tsne的坐標(biāo)矩陣,以及表達(dá)量矩陣军洼。
#violin plots represent gene expression for each subpopulation. The color of each violin represents the mean gene expression after log2 transformation.
#gene: Gene name of interest as string. DATAuse: Gene expression matrix with rownames containing gene names. tsne.popus = dbscan output, axis= if FALSE no axis is printed. legend_position= default "none" indicates where legend is plotted. gene_name = if FALSE gene name will not be included in the plot.
plot.violin2 <- function(gene, DATAuse, tsne.popus, axis=FALSE, legend_position="none", gene_name=FALSE){
testframe<-data.frame(expression=as.numeric(DATAuse[paste(gene),]), Population=tsne.popus$cluster)
testframe$Population <- as.factor(testframe$Population)
colnames(testframe)<-c("expression", "Population")
col.mean<-vector()
for(i in levels(testframe$Population)){
col.mean<-c(col.mean,mean(testframe$expression[which(testframe$Population ==i)]))
}
col.mean<-log2(col.mean+1)
col.means<-vector()
for(i in testframe$Population){
col.means<-c(col.means,col.mean[as.numeric(i)])
}
testframe$Mean<-col.means
testframe$expression<-log2(testframe$expression+1)
p <- ggplot(testframe, aes(x=Population, y=expression, fill= Mean, color=Mean))+
geom_violin(scale="width") +
labs(title=paste(gene), y ="log2(expression)", x="Population")+
theme_classic() +
scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
scale_fill_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+
theme(axis.title.y = element_blank())+
theme(axis.ticks.y = element_blank())+
theme(axis.line.y = element_blank())+
theme(axis.text.y = element_blank())+
theme(axis.title.x = element_blank())+
theme(legend.position=legend_position )
if(axis == FALSE){
p<-p+
theme( axis.line.x=element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
}
if(gene_name == FALSE){
p<-p+ theme(plot.title = element_blank())
}else{ p<-p + theme(plot.title = element_text(size=10,face="bold"))}
p
}
定義好繪圖函數(shù)后巩螃,理論上可以繪制任意基因的表達(dá)量在不同的聚類的分組表達(dá)情況
plot.violin2(gene = "Pdgfra", DATAuse = RPKM.full, tsne.popus = CAFgroups_full)
顯示運(yùn)行環(huán)境
sessionInfo()
step 4-DEG
文獻(xiàn)描述是:
Differentually expressed genes are detected using Reproducibility-optimized test statistic (ROTS), for each subgroup compared to the other subgroups.
也就是說(shuō)作者并沒(méi)有使用3大單細(xì)胞轉(zhuǎn)錄組R包。
載入RPKM矩陣
load(file='RPKM.full.Rdata')
load(file='CAFgroups.Rdata')
分開(kāi)做4次差異分析
運(yùn)行速度非常慢匕争,建議過(guò)夜等待避乏,然后保存結(jié)果,方便重復(fù)使用甘桑。
library(ROTS)
library(plyr)
groups<-CAFgroups
groups[groups!=1]<-234
ROTS_input<-RPKM.full[rowMeans(RPKM.full)>=1,]
ROTS_input<-as.matrix(log2(ROTS_input+1))
Sys.time()
results_pop1 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop1<-data.frame(summary(results_pop1, fdr=1))
groups<-CAFgroups
groups[groups!=2]<-134
Sys.time()
results_pop2 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop2<-data.frame(summary(results_pop2, fdr=1))
groups<-CAFgroups
groups[groups!=3]<-124
Sys.time()
results_pop3 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop3<-data.frame(summary(results_pop3, fdr=1))
groups<-CAFgroups
groups[groups!=4]<-123
Sys.time()
results_pop4 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234)
Sys.time()
summary_pop4<-data.frame(summary(results_pop4, fdr=1))
save(summary_pop1,summary_pop2,summary_pop3,summary_pop4,
file = 'ROTS_summary_pop.Rdata')
這里直接載入 ROTS 的差異分析結(jié)果即可拍皮。
load(file = 'ROTS_summary_pop.Rdata')
head(summary_pop1)
head(summary_pop2)
head(summary_pop3)
head(summary_pop4)
每個(gè)亞群挑選top18的差異基因繪制熱圖
population_subset<-c(rownames(summary_pop1[summary_pop1$ROTS.statistic<0,])[1:18],rownames(summary_pop2[summary_pop2$ROTS.statistic<0,])[1:18],rownames(summary_pop3[summary_pop3$ROTS.statistic<0,])[1:18],rownames(summary_pop4[summary_pop4$ROTS.statistic<0,])[1:18])
RPKM_heatmap<-RPKM.full[population_subset,]
RPKM_heatmap<-RPKM_heatmap[,order(CAFgroups_full$cluster)]
RPKM_heatmap<-log2(RPKM_heatmap+1)
popul.col<-sort(CAFgroups_full$cluster)
popul.col<-replace(popul.col, popul.col==1,"#1C86EE" )
popul.col<-replace(popul.col, popul.col==2,"#00EE00" )
popul.col<-replace(popul.col, popul.col==3,"#FF9912" )
popul.col<-replace(popul.col, popul.col==4,"#FF3E96" )
library(gplots)
#pdf("heatmap_genes_population.pdf")
heatmap.2(as.matrix(RPKM_heatmap),ColSideColors = as.character(popul.col), tracecol = NA, dendrogram = "none",col=bluered, labCol = FALSE, scale="none", key = TRUE, symkey = F, symm=F, key.xlab = "", key.ylab = "", density.info = "density", key.title = "log2(RPKM+1)", keysize = 1.2, denscol="black", Colv=FALSE)
# dev.off()
顯示運(yùn)行環(huán)境
sessionInfo()
step 5-run Seurat
rm(list = ls())
Sys.setenv(R_MAX_NUM_DLLS=999)
## 首先counts數(shù)據(jù)
load(file='expr_raw_counts.Rdata')
counts=expr_raw
# using raw counts is the easiest way to process data through Seurat.
counts[1:4,1:4];dim(counts)
fivenum(apply(counts,1,function(x) sum(x>0) ))
boxplot(apply(counts,1,function(x) sum(x>0) ))
fivenum(apply(counts,2,function(x) sum(x>0) ))
hist(apply(counts,2,function(x) sum(x>0) ))
dat=log2(edgeR::cpm(counts)+1)
hc=hclust(dist(t(dat)))
plot(hc,labels = FALSE)
clus = cutree(hc, 4)
group_list= as.factor(clus)
table(group_list)
group_list
1 2 3 4
351 151 161 53
#提取批次信息
colnames(dat) #取列名
library(stringr)
plate=str_split(colnames(dat),'_',simplify = T)[,3] #取列名,以'_'號(hào)分割跑杭,提取第三列铆帽。
#str_split()函數(shù)可以分割字符串
table(plate)
n_g = apply(counts,2,function(x) sum(x>1)) #統(tǒng)計(jì)每個(gè)樣本有表達(dá)的有多少行(基因)
# 這里我們定義, reads數(shù)量大于1的那些基因?yàn)橛斜磉_(dá)德谅,一般來(lái)說(shuō)單細(xì)胞轉(zhuǎn)錄組過(guò)半數(shù)的基因是不會(huì)表達(dá)的爹橱。
# 而且大部分單細(xì)胞轉(zhuǎn)錄組技術(shù)很爛,通常超過(guò)75%的基因都沒(méi)辦法檢測(cè)到窄做。
df=data.frame(g=group_list,plate=plate,n_g=n_g) #新建數(shù)據(jù)框(細(xì)胞的屬性信息)
library(stringr)
meta=df
head(meta)
# 上面得到的 counts 和 meta 兩個(gè)變量愧驱,Seurat 需要使用
library(Seurat)
# https://satijalab.org/seurat/mca.html
# 構(gòu)建 Seurat 需要的對(duì)象
# 其中 min.cells 和 min.genes 兩個(gè)參數(shù)是經(jīng)驗(yàn)值
sce <- CreateSeuratObject(counts,
meta.data =meta,
min.cells = 5,
min.genes = 1000,
project = "sce")
# 參考:https://github.com/satijalab/seurat/issues/668
sce
?seurat
table(apply(counts,2,function(x) sum(x>0) )>1000)
table(apply(counts,1,function(x) sum(x>0) )>4)
## 可以看到上面的過(guò)濾參數(shù)是如何發(fā)揮作用的
head(sce@meta.data)
#dim(sce@data)
## 默認(rèn)使用細(xì)胞名字字符串的切割第一列來(lái)對(duì)細(xì)胞進(jìn)行分組
# 所以在這里只有一個(gè)SS2分組信息, 我們就增加一個(gè) group.by 參數(shù)
VlnPlot(sce,
features = c("nFeature_RNA", "nCount_RNA" ),
group.by = 'plate',
ncol = 2)
VlnPlot(sce,
features = c("nFeature_RNA", "nCount_RNA" ),
group.by = 'g',
nCol = 2)
sce <- NormalizeData(object = sce,
normalization.method = "LogNormalize",
scale.factor = 10000)
sce@data[1:4,1:4]
# 這里需要理解 dispersion 值
#sce <- FindVariableGenes(object = sce,
mean.function = ExpMean,
dispersion.function = LogVMR )
sce <- FindVariableFeatures(sce, selection.method = "vst")
all.genes <- rownames(sce)
## 根據(jù)經(jīng)驗(yàn)閾值挑選的變化基因個(gè)數(shù)。
#length( sce@var.genes)
length(rownames(sce))
## 去除一些技術(shù)誤差浸策,比如 nUMI或者ERCC
head(sce@meta.data)
#sce <- ScaleData(object = sce,
vars.to.regress = c("nUMI",'nGene' ))
sce <- ScaleData(sce, features = all.genes,
vars.to.regress = c("nCount_RNA","percent.ercc" ))
# 后面就不需要考慮ERCC序列了冯键。
sce@meta.data[1:4,1:4]
## 普通的PCA降維
sce <- RunPCA(sce, features = VariableFeatures(object = sce),
ndims.print = 1:5, nfeatures.print = 5)
#它利用的也就是sce@reductions$pca@feature.loadings結(jié)果
VizDimLoadings(sce, dims = 1:2, reduction = "pca")
## 根據(jù)參數(shù)來(lái)調(diào)整最后的分組個(gè)數(shù)
#其中包含了一個(gè)resolution的選項(xiàng),它會(huì)設(shè)置一個(gè)”間隔“值庸汗,該值越大惫确,間隔越大,得到的cluster數(shù)量越多
sce <- FindNeighbors(sce, dims = 1:15)
sce1 <- FindClusters(sce, resolution = 0.4)
## resolution 是最關(guān)鍵的參數(shù)
sce=sce1
sce <- RunTSNE(object = sce, dims.use = 1:10, do.fast = TRUE)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = sce)
## 對(duì)每個(gè)類別細(xì)胞都找到自己的marker基因
# Then find markers for every cluster compared to all remaining cells, report # only the positive ones
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE,
min.pct = 0.25,
thresh.use = 0.25)
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
head(meta)
# 下面的基因是文章作者給出的
gs=read.table('top18-genes-in-4-subgroup.txt')[,1]
gs
library(pheatmap)
intersect(top10$gene,gs)
# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name
DoHeatmap(sce, features = top10$gene) + NoLegend()
# save(sce.markers,sce,file='sce_seurat.Rdata')
# load(file='sce_seurat.Rdata')
step 6-SC3
之前已做繪圖,這部分代碼僅作展示改化。
####### SC3 Heatmaps #########
library(scater)
library(SC3)
anno<-data.frame(Plate = unlist(lapply(strsplit(colnames(RPKM),"_"),function(x) x[3])), Population = CAFgroups_full$cluster)
rownames(anno)<-colnames(RPKM)
pd <- new("AnnotatedDataFrame", data = anno)
sceset <- SingleCellExperiment(assays = list(normcounts = as.matrix(RPKM)) ,colData =anno)
counts(sceset)<-normcounts(sceset)
logcounts(sceset)<- log2(normcounts(sceset)+1)
rowData(sceset)$feature_symbol <- rownames(sceset)
sceset<-sceset[!duplicated(rownames(sceset)),]
sceset <- sc3(sceset, ks = 3:5, biology = TRUE)
pdf("sc3_heatmap.pdf", useDingbats = FALSE, width=6, height = 4.5)
sc3_plot_consensus(
sceset, k=4,
show_pdata = c(
"Plate",
"Population"
)
)
dev.off()
matrisome<-as.character(read.table("GitHub/170210_naba_matrisome.txt")[,1])
matrisome<-lookuptable[lookuptable[,1] %in% matrisome,4]
sce.matrisome <- SingleCellExperiment(assays = list(normcounts = as.matrix(RPKM.full[matrisome,])), colData = anno)
counts(sce.matrisome)<-normcounts(sce.matrisome)
logcounts(sce.matrisome)<- log2(normcounts(sce.matrisome)+1)
rowData(sce.matrisome)$feature_symbol <- rownames(sce.matrisome)
sceset<-sce.matrisome[!duplicated(rownames(sce.matrisome)),]
sce.matrisome <- sc3(sce.matrisome, ks = 4, biology = TRUE)
pdf("sce_matrisome_expression.pdf",useDingbats = FALSE, width=6, height = 4.5)
sc3_plot_expression(
sce.matrisome, k=4,
show_pdata = c(
"Plate",
"Population"
)
)
dev.off()
有關(guān)復(fù)現(xiàn)文章分析結(jié)果就到這里掩蛤。
下一篇我們進(jìn)入新章節(jié),關(guān)于公共數(shù)據(jù)陈肛。
我們下一篇再見(jiàn)揍鸟!