【LGG數(shù)據(jù)下載】
從CGGA數(shù)據(jù)官網(wǎng)下載RNA-seq數(shù)據(jù),選擇其中包含325個樣本的測序數(shù)據(jù)因苹,下載表達矩陣和臨床信息
表達矩陣:CGGA.mRNAseq_325.RSEM-genes.20200506.txt
臨床信息:CGGA.mRNAseq_325_clinical.20200506.txt
lncRNA注釋文件:gencode.v35.long_noncoding_RNAs.gtf
鏈接:http://www.cgga.org.cn/
讀取到Rstudio里摸袁,并提取m6a相關(guān)基因的表達矩陣当船,以及所有l(wèi)ncRNA的表達矩陣
## 讀取表達矩陣
expr<-read.table("CGGA.mRNAseq_325.RSEM-genes.20200506.txt",header = T,check.names = F)
# 讀取需要提取的m6a基因文件
m6a_gene<-read.csv("m6a.csv",header = T,check.names = F)
## 根據(jù)基因名提取m6a基因的表達矩陣
m6a_expr<-expr[expr$Gene_Name%in%m6a_gene$`Gene name`,]
# 保存為csv文件
write.csv(m6a_expr,file = "m6a_expr.csv")
## 提取lncRNA的表達矩陣
library("rtracklayer") #載入rtracklayer包用于讀取gtf文件
lnc_gtf<-rtracklayer::import("gencode.v35.long_noncoding_RNAs.gtf")
lnc_gtf<-as.data.frame(lnc_gtf)
## 從lnc_gft提取需要的信息
lnc_id<- lnc_gtf[,10:12]
unique(lnc_id$gene_type)
## 提取lncRNA對應的基因名
# 給注釋文件按基因名去重
lnc_id<-lnc_id[!duplicated(lnc_id$gene_name),]
# 按lncRNA基因名提取表達矩陣
lnc_expr<-expr[expr$Gene_Name%in%lnc_id$gene_name,]
write.csv(lnc_expr,file = "lnc_expr.csv")
結(jié)果:
20個m6a相關(guān)基因的表達矩陣
1076個lncRNA基因的表達矩陣
【結(jié)合臨床信息篩選需要的樣本】
因為后面需要做單因素cox柒莉,lasso回歸纲熏,列線圖碾盟,需要有預后生存信息的樣本棚辽,所以需要提取哪些對應的樣本
## 讀取臨床信息
clinic<-read.table("CGGA.mRNAseq_325_clinical.20200506.txt",header = T,check.names = F,sep = "\t")
clinic_II<-clinic[clinic$Grade=="WHO II",c(1,7,8)]
clinic_III<-clinic[clinic$Grade=="WHO III",c(1,7,8)]
## 選擇低級別膠質(zhì)瘤
clinic<-rbind(clinic_II,clinic_III)
## LGG patients with missing OS values or OS < 30 days were excluded in order to reduce statistical bias in our analysis.
clinic<-clinic[!is.na(clinic$OS),]
clinic<-clinic[!is.na(clinic$`Censor (alive=0; dead=1)`),]
clinic<-clinic[!clinic$OS<30,]
## 提取匹配臨床信息的樣本表達矩陣
m6a_expr<-m6a_expr[,c(T,colnames(m6a_expr)[-1]%in%clinic$CGGA_ID)]
lnc_expr<-lnc_expr[,c(T,colnames(lnc_expr)[-1]%in%clinic$CGGA_ID)]
save(lnc_expr,m6a_expr,clinic,file = "lnc_m6a_clinic.Rdata")
結(jié)果:
171個帶有臨床信息LGG樣本,并匹配了m6a和lncRNA
【pearson相關(guān)性分析】
篩選條件:pvalue<0.001&abs(correlation)>0.5
## 作pearson相關(guān)性分析
library(dplyr)
library(tidyr)
rownames(m6a_expr)<-m6a_expr$Gene_Name;m6a_expr<-m6a_expr[,-1]
rownames(lnc_expr)<-lnc_expr$Gene_Name;lnc_expr<-lnc_expr[,-1]
## for循環(huán)
#lnc_expr1<-lnc_expr[!apply(lnc_expr, 1,mean)==0,]
## for循環(huán)
# i<-1
cor_result<-list()
for (i in 1:ncol(m6a_expr)){
exp_cor<-rbind(m6a_expr[i,],lnc_expr)
## exp_cor已就緒
y <- as.numeric(exp_cor[1,])
colnames <- rownames(exp_cor)
cor_data_df <- data.frame(colnames)
for (m in 1:length(colnames)){
test <- cor.test(as.numeric(exp_cor[m,]),y,type="pearson")
cor_data_df[m,2] <- test$estimate
cor_data_df[m,3] <- test$p.value
}
names(cor_data_df) <- c("symbol","correlation","pvalue")
## pvalue<0.001&abs(correlation)>0.5
cor_data_sig <- cor_data_df %>%
filter(pvalue < 0.001) %>%
filter(abs(correlation)>0.5)
cor_result[[i]]<-cor_data_sig
print(i)
}
save(cor_result,file = "cor_result.Rdata")
cor_result[[3]]
## 將相關(guān)性分析后得到的基因名提取出來
cor_gene<-matrix(data = NA,ncol = 1)
for (i in 1:20) {
mt<-cor_result[[i]]$symbol[-1]
mt<-as.matrix(mt[!mt%in%cor_gene])
cor_gene<-rbind(cor_gene,mt)
print(i)
}
cor_gene<-cor_gene[-1,]
length(unique(cor_gene))
## 162個
cor_gene_num<-unique(cor_gene)
## 保存txt文件
write.table(cor_gene_num,file = "cor_gene_num.txt")
【單因素cox分析】
篩選條件:pvalue<0.05
## 單因素cox分析
## 數(shù)據(jù)準備:162個lncRNA的表達矩陣和之前匹配好的臨床信息
lnc<-read.table("cor_gene_num.txt",header = T,sep = " ")
exp_lnc<-t(lnc_expr)
exp_lnc<-exp_lnc[,colnames(exp_lnc)%in%lnc$x]
## 提取477個有表達矩陣的樣本的臨床信息
ovtime01<-clinic[match(rownames(exp_lnc),clinic$CGGA_ID),]
colnames(ovtime01)<-c("Patient_ID","OStime","OSstatus")
## 為單因素cox分析做準備
unicox<-cbind(ovtime01,exp_lnc)
## 創(chuàng)建一個空白的數(shù)據(jù)框用于存儲分析結(jié)果
univar_out <- data.frame(matrix(NA,ncol(unicox)-3,6))
univar_out[,1]<-colnames(unicox)[-(1:3)]
colnames(univar_out)<-c("Genesymbol","Coeffcient","HR","lower .95","upper .95","P-value")
## cox_data即為輸入文件,關(guān)于為什么不用unicox冰肴,因為matrix把數(shù)據(jù)類型給弄亂了而且相關(guān)性分析需要數(shù)據(jù)服從方差齊正太分布特點
cox_data<-unicox[,c(4:ncol(unicox))]
class(cox_data)
cox_data<-as.matrix(cox_data)
cox_data<-apply(cox_data, c(1,2),as.numeric)
# cox_data<-log2(cox_data + 0.01)# 左偏轉(zhuǎn)正太分布屈藐,好像不用轉(zhuǎn)換
cox_data<-cbind(unicox[,1:3],cox_data)
cox_data$OSstatus<-as.numeric(cox_data$OSstatus)
library(survival)
for (i in 1:(ncol(cox_data)-3)){
cox = coxph(Surv(OStime,OSstatus) ~ cox_data[,i+3],data = cox_data)
cox_summ = summary(cox)
univar_out[i,2]=cox_summ$coefficients[,1]
univar_out[i,3]=cox_summ$coefficients[,2]
univar_out[i,4]=cox_summ$conf.int[,3]
univar_out[i,5]=cox_summ$conf.int[,4]
univar_out[i,6]=cox_summ$coefficients[,5]
}
save(univar_out,file = "univar_out.RData")
univar_out_0.05 = univar_out[univar_out[,6]<0.05,]
write.table(univar_out_0.05,file = "univar_out_0.05.txt",sep = "\t",quote = F)
結(jié)果:
最后得到68個與LGG預后相關(guān)的lncRNA
【給TCGA和CGGA單因素cox回歸得到的基因取交集,繪制熱圖】
lnc_cor<-read.table("VEEN-lnc.txt",header = F)
class(cor_result[[2]])
## 創(chuàng)建兩個表格存儲相關(guān)系數(shù)和pvalue
cor<-matrix(NA,nrow = nrow(lnc_cor),ncol = 21) ## TCGA是21個m6a基因熙尉,CGGA是20個m6a基因
rownames(cor)<-lnc_cor$V1
pvalue<-matrix(NA,nrow = nrow(lnc_cor),ncol = 21)
rownames(pvalue)<-lnc_cor$V1
## 給這兩個表格標列名
colnames<-do.call(cbind,lapply(cor_result, function(data){data[1,1]}))
colnames(cor)<-colnames[1,]
colnames(pvalue)<-colnames[1,]
library(tidyr)
## 提取相關(guān)系數(shù)
for (i in 1:ncol(cor)) {
cor[rownames(cor)%in%cor_result[[i]]$symbol,i]<-cor_result[[i]]$correlation[cor_result[[i]]$symbol%in%rownames(cor)]
}
cor<-replace_na(cor,0)
write.csv(cor,file = "cor_matrix.csv")
## 提取p值
for (i in 1:ncol(pvalue)) {
pvalue[rownames(pvalue)%in%cor_result[[i]]$symbol,i]<-cor_result[[i]]$pvalue[cor_result[[i]]$symbol%in%rownames(pvalue)]
}
pvalue<-replace_na(pvalue,1)
write.csv(pvalue,file = "pvalue_matrix.csv")
## 繪制熱圖
#install.packages("reshape2")
#install.packages("RColorBrewer")
library(reshape2)
library(RColorBrewer)
options(stringsAsFactors = F)
up <- pvalue
dn <- cor
dn=t(dn)
up=t(up)
#設(shè)置顏色
colVector=c("#AB221F","#FFFADD","#3878C1")
#設(shè)置標簽
m6a.level <- as.character(rownames(dn))
lnc.level <- as.character(colnames(dn))
#把行轉(zhuǎn)為列
dn.long <- setNames(melt(dn), c('Gene', 'lnc', 'Frequency'))
dn.long$Categrory <- "DN"
up.long <- setNames(melt(up), c('Gene', 'lnc', 'Frequency'))
up.long$Categrory <- "UP"
#右下角顏色
dn.long$range <- cut(dn.long$Frequency,
breaks = seq(floor(min(dn.long$Frequency)),
ceiling(max(dn.long$Frequency)),0.01))
rangeMat1 <- levels(dn.long$range)
rbPal1 <- colorRampPalette(colors = c(colVector[3],"white",colVector[1]))
col.vec1 <- rbPal1(length(rangeMat1)); names(col.vec1) <- rangeMat1
dn.long$color <- col.vec1[as.character(dn.long$range)]
#左上角顏色
up.long$range <- cut(up.long$Frequency, breaks = seq(floor(min(up.long$Frequency)),ceiling(max(up.long$Frequency)),0.01))
rangeMat2 <- levels(up.long$range)
rbPal2 <- colorRampPalette(colors = c(colVector[3],colVector[2]))
col.vec2 <- rbPal2(length(rangeMat2)); names(col.vec2) <- rangeMat2
up.long$color <- col.vec2[as.character(up.long$range)]
#合并右上角和左上角
heatmat <- rbind.data.frame(dn.long,up.long)
pdf("heatmap.pdf",width = 7,height = 6)
layout(mat=matrix(c(1,0,1,2,1,0,1,3,1,0),5,2,byrow=T),widths=c(length(lnc.level),2))
#繪制熱圖區(qū)域
par(bty="n", mgp = c(2,0.5,0), mar = c(5.0, 5.5, 3, 3),tcl=-.25,xpd = T)
x=as.numeric(factor(heatmat$lnc,levels = lnc.level))
y=as.numeric(factor(heatmat$Gene,levels = m6a.level))
#創(chuàng)建空白畫布
plot(1,xlim=c(1,length(unique(x))+1),ylim=c(1,length(unique(y))+1),
xaxs="i", yaxs="i",xaxt="n",yaxt="n",
type="n",bty="n",xlab="",ylab="",
main = "Correlation between m6a genes and lncRNA",cex.main=2)
#填充顏色
for(i in 1:nrow(heatmat)) {
if(heatmat$Categrory[i]=="DN") polygon(x[i]+c(0,1,1),y[i]+c(0,0,1),col=heatmat$color[i])
if(heatmat$Categrory[i]=="UP") {
polygon(x[i]+c(0,1,0),y[i]+c(0,1,1),col=heatmat$color[i])
if(heatmat$Frequency[i]<0.001){
text(x[i]+0.5,y[i]+0.8,"***",cex=0.8)
}else if(heatmat$Frequency[i]<0.01){
text(x[i]+0.5,y[i]+0.8,"**",cex=0.8)
}else if(heatmat$Frequency[i]<0.05){
text(x[i]+0.5,y[i]+0.8,"*",cex=0.8)
}
}
}
#基因名和癌癥
axis(1,at = sort(unique(x)) + 0.5,labels = lnc.level,lty = 0,las = 2)
axis(2,at = sort(unique(y)) + 0.5,labels = m6a.level,lty = 0,las = 1)
#mtext("lnc types",side = 1,line = 3.5,cex=1.2)
#繪制圖例
par(mar=c(0,0,0,5),xpd = T,cex.axis=1.6)
barplot(rep(1,length(col.vec2)),border = NA, space = 0,ylab="",xlab="",ylim=c(1,length(col.vec2)),horiz=TRUE,
axes = F, col=col.vec2) # Loss
axis(4,at=c(1,ceiling(length(col.vec2)/2),length(col.vec2)),c(round(min(up),1),'Pvalue',round(max(up),1)),tick=FALSE)
par(mar=c(0,0,0,5),xpd = T,cex.axis=1.6)
barplot(rep(1,length(col.vec1)),border = NA, space = 0,ylab="",xlab="",ylim=c(1,length(col.vec1)),horiz=TRUE,
axes = F, col=col.vec1) # Gain
axis(4,at=c(1,ceiling(length(col.vec1)/2),length(col.vec1)),c(round(min(dn),1),'Cor',round(max(dn),1)),tick=FALSE)
dev.off()