Title:An Integrated Three-Long Non-coding RNA Signature Predicts Prognosis in Colorectal Cancer Patients
本文的內(nèi)容是用GDC下載并整理表達矩陣和臨床信息數(shù)據(jù)愈魏。
1.從網(wǎng)頁選擇數(shù)據(jù),下載manifest文件
數(shù)據(jù)存放網(wǎng)站:https://portal.gdc.cancer.gov/
在Repository勾選自己需要的case和file類型萄唇。以CHOL為例:
case-Project選擇TCGA-CHOL巩检。
file-選擇如圖:
左右分別是mrna 和clinical的樣本選擇截圖。選好后椅寺,點擊右側(cè)manifest鍵下載對應的清單文件浑槽。
2.使用gdc-client工具下載
注意:
將gdc-client(mac)或gdc-client.exe(windows)放在工作目錄下;
將manifest文件放在工作目錄下返帕。
options(stringsAsFactors = F)
cancer_type="TCGA-COAD"
if(!dir.exists("clinical"))dir.create("clinical")
if(!dir.exists("mrna"))dir.create("mrna")
dir()
#下面兩行命令在terminal完成
#./gdc-client download -m gdc_manifest.clinical.txt -d clinical
#./gdc-client download -m gdc_manifest.COAD_RNAseq.txt -d mrna
length(dir("./clinical/")) #結(jié)果是459.說明有459個病人
length(dir("./mrna/")) #結(jié)果是521桐玻,說明有521個樣本
可以看到,下載的文件是按樣本存放的荆萤,我們需要得到的是表格镊靴,需要將他們批量讀入R語言并整理。
3.整理臨床信息
library(XML)
result <- xmlParse("./clinical/007b27c5-d36e-4bec-9321-b03b773fd3b8/nationwidechildrens.org_clinical.TCGA-A6-6651.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmlToDataFrame(rootnode[2])))
xmls = dir("clinical/",pattern = "*.xml$",recursive = T)
td = function(x){
result <- xmlParse(file.path("clinical/",x))
rootnode <- xmlRoot(result)
xmldataframe <- xmlToDataFrame(rootnode[2])
return(t(xmldataframe))
}
cl = lapply(xmls,td)
cl_df <- t(do.call(cbind,cl))
cl_df[1:3,1:3]
clinical = data.frame(cl_df)
clinical[1:4,1:4]
4.整理表達矩陣
探索數(shù)據(jù):先任選兩個counts文件讀取链韭,并觀察geneid的順序是否一致偏竟。
options(stringsAsFactors = F)
x = read.table("mrna/02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz")
x2 = read.table("mrna/0826add5-571b-41be-b186-936959fc9d79/dfeb2d54-28da-4521-b488-bdac246bb59b.htseq.counts.gz")
identical(x$V1,x2$V1)
由此可知,他們的geneid順序是一致的敞峭,可以直接cbind踊谋,不會導致順序錯亂。
批量讀取所有的counts.gz文件旋讹。
count_files = dir("mrna/",pattern = "*.htseq.counts.gz$",recursive = T)
ex = function(x){
result <- read.table(file.path("mrna/",x),row.names = 1,sep = "\t")
return(result)
}
head(ex("02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"))
exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp) #60488 521 有6萬多行殖蚕,521個樣本
exp[1:4,1:4]
發(fā)現(xiàn)問題:這樣產(chǎn)生出來的表達矩陣沒有列名。
解決辦法:找到一個文件名與樣本ID一一對應的文件沉迹。cart-json文件睦疫。
meta <- jsonlite::fromJSON("metadata.cart.2020-04-23.json")
colnames(meta)
ids <- meta$associated_entities;class(ids)
ids[[1]]
class(ids[[1]][,3]) #并不是每一個數(shù)據(jù)都是第3列,需要自己手動檢查
#不過也可以用str_detect(ids[[1]],"TCGA") 返回值為TRUE就是我們要的列
可以看到鞭呕,meta$associated_entities是個列表笼痛,這個列表里包含數(shù)據(jù)框,數(shù)據(jù)框的第一列內(nèi)容就是tcga樣本id琅拌。
注意缨伊,換了數(shù)據(jù)需要自己探索存放在哪一列。不一定是完全一樣的进宝,需要確認清楚刻坊。
ID = sapply(ids,function(x){x[,3]})
file2id = data.frame(file_name = meta$file_name,
ID = ID)
文件名與TCGA樣本ID的對應關系已經(jīng)得到,接下來是將其添加到表達矩陣中党晋,成為行名谭胚。需要找到讀取文件的順序徐块,一一對應修改。
head(file2id$file_name)
head(count_files)
count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2]
count_files2[1] %in% file2id$file_name
count_files2的順序就是列名的順序灾而,根據(jù)它來調(diào)整file2id的順序胡控。此處需要再次理解一下match函數(shù)。
file2id = file2id[match(count_files2,file2id$file_name),]
colnames(exp) = file2id$ID
exp[1:4,1:4]
表達矩陣整理完成,需要過濾一下那些在很多樣本里表達量都為0的基因旁趟。過濾標準不唯一昼激。
dim(exp)
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 41), ]
dim(exp)
exp[1:4,1:4]
分組信息
根據(jù)樣本ID的第14-15位,給樣本分組(tumor和normal)
table(stringr::str_sub(colnames(exp),14,15))
group_list = ifelse(as.numeric(substr(colnames(exp),14,15)) < 10,'tumor','normal')
group_list = factor(group_list,levels = c("normal","tumor"))
table(group_list) #normal:41 tumor:480
save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata"))
接下來就是差異分析锡搜,但是差異分析之前要先把lncRNA挑出來橙困,然后還有配對樣本挑出來,是非常難的兩步8汀!(小潔老師說的)
01.需求:文中作者把配對數(shù)據(jù)以及LncRNA都挑出來了
TCGA的RNA-seq數(shù)據(jù)使用的geneid是ensembl id肠缔,里面不僅有mRNA夏跷,也有非編碼基因和其他類型。
所以明未,如何從TCGA得到的表達矩陣中分別提取出mRNA和lncRNA的表達量呢槽华?
02.思路
1.找到TCGA數(shù)據(jù)對應的參考基因組版本。
2.下載該版本的參考基因組文件亚隅,找到mRNA和lncRNA對應的ensembl id
3.在表達矩陣中篩選硼莽。
03.動起來
1.找參考基因組版本
gdc首頁的support
about the data - GDC Reference Files
可以看到庶溶,使用的參考基因組版本是genecode的v22煮纵。(版本很多,這個是14年的版本了)
2.找區(qū)分類型的列
在gtf文件里并不是直接分出了lncRNA偏螺,需要找gtf文件里對biotype的說明行疏,不看不知道,一看發(fā)現(xiàn)這是一個很長的表格套像。
其中對lncRNA的說明是:
Generic long non-coding RNA biotype that replaced the following biotypes: 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic and sense_overlapping.
所以需要將genetype里這些類型對應的行挑出來酿联,就是lncRNA了。
然后與表達矩陣行名進行匹配替換夺巩,就可以分別得到mRNA和lncRNA的矩陣了贞让。
全部樣本的表達矩陣,供生存分析用
#step1:讀取并探索gtf文件----
options(stringsAsFactors = F)
if(!file.exists("anno.Rdata")){
#BiocManager::install("rtracklayer")
library(rtracklayer)
x = rtracklayer::import("gencode.v22.annotation.gtf")
class(x)
x2 = as.data.frame(x);dim(x2)
colnames(x2)
tj = as.data.frame(table(x2$type));tj
tj2 = as.data.frame(table(x2$gene_type));tj2
#step2:先篩選出gene對應的行
nrow(x2);x2 = x2[x2$type=="gene",];nrow(x2)
tj3 = as.data.frame(table(x2$gene_type));tj3
#step3:提取lnc和mRNA及其對應的ensambelid和symbol
lnc_bype = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
table(x2$gene_type %in% lnc_bype)
table(x2$gene_type == "protein_coding")
lnc_anno = x2[x2$gene_type %in% lnc_bype,c("gene_name","gene_id","gene_type")]
mRNA_anno = x2[x2$gene_type == "protein_coding",c("gene_name","gene_id","gene_type")]
save(lnc_anno,mRNA_anno,file = "anno.Rdata")
}
load("anno.Rdata")
#step4:表達矩陣拆分和注釋
load("TCGA-COADgdc.Rdata")
table(rownames(exp) %in% mRNA_anno$gene_id)
mRNA_exp = exp[rownames(exp) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改變順序柳譬,可以確認一下
identical(tmp$gene_id,rownames(exp))
table(!duplicated(x$gene_name))
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp[1:4,1:4] #這里的mRNA是所有樣本521個樣本的mRNA
mRNA_exp = na.omit(mRNA_exp)
#同樣辦法拆出lncRNA:也是所有樣本的(521個樣本)
lnc_exp = exp[rownames(exp) %in% lnc_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,lnc_anno,by = "gene_id")
identical(tmp$gene_id,rownames(exp))
table(!duplicated(x$gene_name))
lnc_exp = lnc_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(lnc_exp) = x$gene_name
lnc_exp[1:4,1:4]
lnc_exp = na.omit(lnc_exp)
配對樣本的lnc喳张、mRNA 表達矩陣,供差異分析用
挑選樣本美澳,是選列销部,與行(基因)無關
1.選出與normal樣本匹配的tumor樣本它呀,合并為新的表達矩陣
#挑選出來normal的衷佃,然后匹配樣本名
exp2 = rbind(mRNA_exp,lnc_exp) #mRNA和lncRNA合并,不改變順序
identical(rownames(mRNA_exp),head(rownames(exp2),nrow(mRNA_exp)))
table(stringr::str_sub(colnames(exp2),14,15)) #正常樣本個數(shù)?41個大于10的那就是41個正常樣本
library(stringr)
exp2_nor = exp2[,str_sub(colnames(exp2),14,15)==11]
exp2_tum = exp2[,str_sub(colnames(exp2),14,15)!=11]
patient = str_sub(colnames(exp2_nor),1,12) #有配對樣本的病人id
table(str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient))
# 看看腫瘤樣本有重復嗎:
exp2_tum = exp2_tum[,str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)]
exp2_tum = exp2_tum[,str_sort(colnames(exp2_tum))] #str_sort是排序
exp2_tum = exp2_tum[,!duplicated(str_sub(colnames(exp2_tum),1,12) )] #排序去重,這樣樣本有01A和01B可選時土辩,就會留下01A。
#讓正常樣本順序與腫瘤樣本順序一致,這樣才方便寫配對向量
tmp = match(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_nor = exp2_nor[,tmp]
identical(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_small = cbind(exp2_nor,exp2_tum)
拆分回lnc和mRNA
mRNA_exp_small = head(exp2_small,nrow(mRNA_exp))
lnc_exp_small = tail(exp2_small,nrow(lnc_exp))
得到了幾個表達矩陣丢氢,捋一下:6個矩陣
dim(exp) #原始ensambel矩陣 35267 521
dim(exp2) #lncRNA和mRNA矩陣合并得到的symbol矩陣 26682 521
dim(lnc_exp) #lncRNA 8714 521
dim(mRNA_exp) #mRNA 17968 521
dim(lnc_exp_small) #配對的lncRNA矩陣 8714 82
dim(mRNA_exp_small)#配對的lncRNA矩陣 17968 82
save(lnc_exp,mRNA_exp,file = paste0(cancer_type,"lmexp.Rdata"))
save(lnc_exp_small,mRNA_exp_small,file = paste0(cancer_type,"lmexp_small.Rdata"))
然后就是差異分析了:
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(edgeR))BiocManager::install('edgeR')
1.lnc_RNA-edgeR 配對差異分析
rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(lnc_exp_small)/2)
pairinfo = rep(1:(ncol(lnc_exp_small)/2),times = 2)
table(group_list)
library(edgeR)
dge <- DGEList(counts=lnc_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)
fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
table(DEG$change)
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = lnc_exp_small[cg,]
table(head(DEG[cg,"change"],580))
if(!require(tinyplanet))devtools::install_local("tinyplanet-master.zip",upgrade = F)
library(ggplot2)
library(tinyplanet)
lnc_exp_small[1:4,1:4]
#PCA
dat = log(lnc_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#熱圖火山圖
x = lnc_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"lnccg.Rdata"))
2.mRNA-edgeR 配對差異分析 可以搜索edgeR paired 網(wǎng)上有代碼寫配對怎么分析
rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(mRNA_exp_small)/2)
pairinfo = rep(1:41,times = 2)
table(group_list)
library(edgeR)
dge <- DGEList(counts=mRNA_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)
fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
table(DEG$change)
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = mRNA_exp_small[cg,]
table(head(DEG[cg,"change"],580))
if(!require(tinyplanet))devtools::install_local("tinyplanet-master.zip",upgrade = F)
library(ggplot2)
library(tinyplanet)
mRNA_exp_small[1:4,1:4]
#PCA
dat = log(mRNA_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#熱圖火山圖
x = mRNA_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"mRNAcg.Rdata"))
以上代碼出的圖在下面:
下面是生存分析:
生存分析只需要tumor數(shù)據(jù)已旧,不要normal,將其去掉歼指,新表達矩陣數(shù)據(jù)命名為exprSet爹土;
clinical信息需要進一步整理,成為生存分析需要的格式踩身,新臨床信息數(shù)據(jù)命名為meta胀茵。
由于不同癌癥的臨床信息表格組織形式不同,這里的代碼需要根據(jù)實際情況修改挟阻。
rm(list=ls())
options(stringsAsFactors = F)
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp.Rdata")
load("TCGA-COADlnccg.Rdata")
library(stringr)
#clinical通常有幾十列琼娘,選出其中有用的幾列。
tmp = data.frame(colnames(clinical))
clinical = clinical[,c(
'bcr_patient_barcode',
'vital_status',
'days_to_death',
'days_to_last_followup',
'race_list',
'age_at_initial_pathologic_diagnosis',
'gender' ,
'stage_event'
)]
#其實days_to_last_followup應該是找age_at_initial_pathologic_diagnosis附鸽,這表格里沒有脱拼,用days_to_birth計算一下年齡,暫且替代坷备。
dim(clinical)
#rownames(clinical) <- NULL
rownames(clinical) <- clinical$bcr_patient_barcode
clinical[1:4,1:4]
meta = clinical[clinical$days_to_death != 0,]
#簡化meta的列名
colnames(meta)=c('ID','event','death','last_followup','race','age','gender','stage')
#表達矩陣處理
exprSet=lnc_exp[cg,group_list=='tumor']
table(str_sub(colnames(exprSet),1,12) %in% meta$ID)
exprSet = exprSet[,str_sub(colnames(exprSet),1,12) %in% meta$ID]
exprSet = exprSet[,str_sort(colnames(exprSet))]
exprSet = exprSet[,!duplicated(str_sub(colnames(exprSet),1,12) )]
#調(diào)整meta的ID順序與exprSet列名一致
meta=meta[match(str_sub(colnames(exprSet),1,12),meta$ID),]
all(meta$ID==str_sub(colnames(exprSet),1,12))
#整理生存分析的輸入數(shù)據(jù)----
#1.1由隨訪時間和死亡時間計算生存時間(月)
is.empty.chr = function(x){
ifelse(stringr::str_length(x)==0,T,F)
}
is.empty.chr(meta[2,3])
meta[,3][is.empty.chr(meta[,3])]=0
meta[,4][is.empty.chr(meta[,4])]=0
meta$time=(as.numeric(meta[,3])+as.numeric(meta[,4]))/30
#1.2 根據(jù)生死定義event熄浓,活著是0,死的是1
meta$event=ifelse(meta$event=='Alive',0,1)
#1.3 年齡和年齡分組
meta$age=as.integer(meta$age)
meta$age_group=ifelse(meta$age>median(meta$age),'older','younger')
#1.4 stage
library(stringr)
meta$stage
tmp = str_split(meta$stage,' ',simplify = T)[,2]
str_count(tmp,"T")
str_locate(tmp,"T")[,1]
tmp = str_sub(tmp,1,str_locate(tmp,"T")[,1]-1)
table(tmp)
table(is.na(tmp))
meta$stage = tmp
#1.5 race
table(meta$race)
save(meta,exprSet,group_list,cancer_type,file = paste0(cancer_type,"sur_model.Rdata"))
下面就是生存分析的可視化:logrank批量生存分析和cox批量生存分析
1.logrank批量生存分析
logrankfile = paste0(cancer_type,"log_rank_p.Rdata")
if(!file.exists(logrankfile)){
mySurv=with(meta,Surv(time, event))
log_rank_p <- apply(exprSet , 1 , function(gene){
# gene=exprSet[1,]
meta$group=ifelse(gene>median(gene),'high','low')
data.survdiff=survdiff(mySurv~group,data=meta)
p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
return(p.val)
})
log_rank_p=sort(log_rank_p)
save(log_rank_p,file = logrankfile)
}
load(logrankfile)
table(log_rank_p<0.01)
table(log_rank_p<0.05)
2.cox批量生存分析
coxfile = paste0(cancer_type,"cox.Rdata")
if(!file.exists(coxfile)){
mySurv=with(meta,Surv(time, event))
cox_results <-apply(exprSet , 1 , function(gene){
# gene= exprSet[1,]
group=ifelse(gene>median(gene),'high','low')
survival_dat <- data.frame(group=group,stage=meta$stage,age=meta$age,
gender=meta$gender,
stringsAsFactors = F)
m=coxph(mySurv ~ gender + age + stage + group, data = survival_dat)
beta <- coef(m)
se <- sqrt(diag(vcov(m)))
HR <- exp(beta)
HRse <- HR * se
#summary(m)
tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
HR = HR, HRse = HRse,
HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
return(tmp['grouplow',])
})
cox_results=t(cox_results)
save(cox_results,file = coxfile)
}
load(coxfile)
table(cox_results[,4]<0.01)
table(cox_results[,4]<0.05)
lr = names(log_rank_p)[log_rank_p<0.05]
cox = rownames(cox_results)[cox_results[,4]<0.05]
length(intersect(lr,cox))
save(cox,lr,file = paste0(cancer_type,"cox_lr.Rdata"))
然后是Lasso回歸
1.準備輸入數(shù)據(jù)
load("TCGA-COADsur_model.Rdata")
#load("TCGA-CHOLsur_model.Rdata")
load("TCGA-COADcox_lr.Rdata")
exprSet = exprSet[cox,]
2.構(gòu)建lasso回歸模型
輸入數(shù)據(jù)是表達矩陣(僅含tumor樣本)和對應的生死省撑。
x=t(log2(exprSet+1))
y=meta$event
library(glmnet)
model_lasso <- glmnet(x, y,nlambda=10, alpha=1)
print(model_lasso)
這里是舉例子赌蔑,所以只計算了10個λ值,解釋一下輸出結(jié)果三列的意思竟秫。
- Df 是自由度
- 列%Dev代表了由模型解釋的殘差的比例娃惯,對于線性模型來說就是模型擬合的R^2(R-squred)。它在0和1之間肥败,越接近1說明模型的表現(xiàn)越好趾浅,如果是0,說明模型的預測結(jié)果還不如直接把因變量的均值作為預測值來的有效馒稍。
- Lambda 是構(gòu)建模型的重要參數(shù)皿哨。
解釋的殘差百分比越高越好,但是構(gòu)建模型使用的基因的數(shù)量也不能太多纽谒,需要取一個折中值证膨。
2.1挑選合適的λ值
計算1000個,畫圖佛舱,篩選表現(xiàn)最好的λ值
set.seed(13098)
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)
兩條虛線分別指示了兩個特殊的λ值,一個是lambda.min,一個是lambda.1se,這兩個值之間的lambda都認為是合適的椎例。lambda.1se構(gòu)建的模型最簡單挨决,即使用的基因數(shù)量少,而lambda.min則準確率更高一點订歪,使用的基因數(shù)量更多一點脖祈。
2.2 用這兩個λ值重新建模
model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
這兩個值體現(xiàn)在參數(shù)lambda上。有了模型刷晋,可以將篩選的基因挑出來了盖高。所有基因存放于模型的子集beta中,用到的基因有一個s0值眼虱,沒用的基因只記錄了“.”喻奥,所以可以用下面代碼挑出用到的基因。
head(model_lasso_min$beta)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
length(choose_gene_1se)
load("TCGA-COADcox_lr.Rdata")
for_cox = intersect(choose_gene_min,lr)
save(for_cox,choose_gene_min,file = "for_cox.Rdata")
3.模型預測和評估
3.1自己預測自己
newx參數(shù)是預測對象捏悬。輸出結(jié)果lasso.prob是一個矩陣撞蚕,第一列是min的預測結(jié)果,第二列是1se的預測結(jié)果过牙,預測結(jié)果是概率甥厦,或者說百分比,不是絕對的0和1寇钉。
將每個樣本的生死和預測結(jié)果放在一起刀疙,直接cbind即可。
lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(y ,lasso.prob)
head(re)
3.2 箱線圖
對預測結(jié)果進行可視化扫倡。以實際的生死作為分組谦秧,畫箱線圖整體上查看預測結(jié)果。
re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr)
p1 = ggboxplot(re, x = "event", y = "prob_min",
color = "event", palette = "jco",
add = "jitter")+ stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
color = "event", palette = "jco",
add = "jitter")+ stat_compare_means()
library(patchwork)
p1+p2
可以看到撵溃,真實結(jié)果是死(0)的樣本疚鲤,預測的值就小一點(靠近0),真實結(jié)果是活著(1)的樣本征懈,預測的值就大一點(靠近1)石咬,整體上趨勢是對的揩悄,但不是完全準確卖哎,模型是可用的。
對比兩個λ值構(gòu)建的模型删性,差別不大亏娜,1se的預測值準確一點。
3.3 ROC曲線
計算AUC取值范圍在0.5-1之間蹬挺,越接近于1越好维贺。可以根據(jù)預測結(jié)果繪制ROC曲線巴帮。
library(ROCR)
library(caret)
# 自己預測自己
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
plot(perf_min,colorize=FALSE, col="blue")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_min,3)))
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
plot(perf_1se,colorize=FALSE, col="red")
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)))
- 強迫癥選項溯泣,想把兩個模型畫一起虐秋。
plot(perf_min,colorize=FALSE, col="blue")
plot(perf_1se,colorize=FALSE, col="red",add = T)
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.3, labels = paste0("AUC = ",round(auc_min,3)),col = "blue")
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)),col = "red")
-還想用ggplot2畫的更好看一點
tpr_min = performance(pred_min,"tpr")@y.values[[1]]
tpr_1se = performance(pred_1se,"tpr")@y.values[[1]]
dat = data.frame(tpr_min = perf_min@y.values[[1]],
fpr_min = perf_min@x.values[[1]],
tpr_1se = perf_1se@y.values[[1]],
fpr_1se = perf_1se@x.values[[1]])
library(ggplot2)
ggplot() +
geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") +
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
annotate("text",x = .75, y = .25,
label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
scale_x_continuous(name = "fpr")+
scale_y_continuous(name = "tpr")
ggplot() +
geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") +
geom_line(data = dat,aes(x = fpr_1se, y = tpr_1se),color = "red")+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
annotate("text",x = .75, y = .25,
label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
annotate("text",x = .75, y = .15,label = paste("AUC of 1se = ",round(auc_1se,2)),color = "red")+
scale_x_continuous(name = "fpr")+
scale_y_continuous(name = "tpr")
然后是cox-風險森林圖
1.準備輸入數(shù)據(jù)
load("TCGA-COADsur_model.Rdata")
load("for_cox.Rdata")
library(stringr)
2.挑選感興趣的基因構(gòu)建coxph模型
出自文章Integrated genomic analysis identifies subclasses and prognosis signatures of kidney cancer中,五個miRNA是miR-21,miR-143,miR-10b,miR-192,miR-183
將他們從表達矩陣中取出來垃沦,就得到了5個基因在522個腫瘤樣本中的表達量客给,可作為列添加在meta表噶的后面,組成的數(shù)據(jù)框賦值給dat。
e=t(exprSet[for_cox,])
e=log2(e+1)
colnames(e)=str_replace(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)
用survival::coxph()函數(shù)構(gòu)建模型
library(survival)
library(survminer)
#s=Surv(time, event) ~ gender + stage + age + RP5_884M6.1+RP11_148K1.12+RP11_108K3.2+RP1_79C4.4+AC004510.3
# s = as.formula(paste("Surv(time, event) ~ ",
# paste(c(colnames(meta)[c(7,8,6)],
# colnames(e)),
# collapse = "+")
# )
# )
#s=Surv(time, event) ~ RP5_884M6.1+RP11_148K1.12+RP11_108K3.2+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(colnames(e),collapse = "+")))
model <- coxph(s, data = dat )
# 去掉不顯著的基因
summary(model)$concordance
summary(model)
tmp = summary(model)$coefficients[,5]
re = names(tmp[tmp<0.05])
#s=Surv(time, event) ~ RP5_884M6.1+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(re,collapse = "+")))
model <- coxph(s, data = dat )
summary(model)
3.模型可視化-森林圖
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)
4.模型預測
fp <- predict(model,type = "risk")
boxplot(fp)
library(Hmisc)
options(scipen=200)
with(dat,rcorr.cens(fp,Surv(time, event)))
# 若要找到最佳模型肢簿,我們可以進行變量選擇靶剑,可以采用逐步回歸法進行分析
這里只是舉個栗子,自己預測自己的C-index是1-with(dat,rcorr.cens(fp,Surv(time, event)))[[1]]
,實戰(zhàn)應該是拿另一個數(shù)據(jù)集來預測池充,或者將一個數(shù)據(jù)集分兩半桩引,一半構(gòu)建模型,一半驗證,可以使用機器學習的R包切割數(shù)據(jù)收夸。
C-index用于計算生存分析中的COX模型預測值與真實之間的區(qū)分度(discrimination)坑匠,也稱為Harrell's concordanceindex。C-index在0.5-1之間卧惜。0.5為完全不一致,說明該模型沒有預測作用,1為完全一致,說明該模型預測結(jié)果與實際完全一致笛辟。
5.劃分高低風險并畫生存分析圖
names(fp) = rownames(dat)
ri = ifelse(fp<median(fp),"lowrisk","highrisk")
names(ri) = names(fp)
ri = factor(ri)
sfit <- survfit(Surv(time, event)~ri, data=meta)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
risk.table =TRUE,pval =TRUE,
conf.int =TRUE,xlab ="Time in months",
ggtheme =theme_light(),
ncensor.plot = TRUE)
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
time=meta[names(sort(fp)),'time'] ,
event=meta[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一個圖----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二個圖
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三個圖
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap::pheatmap(tmp,
col= mycolors,
show_colnames = F,
cluster_cols = F)
#拼圖實現(xiàn)三圖聯(lián)動
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
c(rep(2,7)),
c(rep(3,7))) #布局矩陣
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1, 2,3))
按照預測值劃分高低風險,加上注釋
fp_dat=data.frame(patientid=1:length(fp),
fp=as.numeric(sort(fp)),
ri= ri[order(fp)])
sur_dat=data.frame(patientid=1:length(fp),
time=meta[names(sort(fp)),'time'] ,
event=meta[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一個圖----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
geom_point(aes(color = ri))+
scale_color_manual(values = c("red","blue"))+
theme_bw();p1
#第二個圖
p2=ggplot(sur_dat,aes(x=patientid,y=time))+
geom_point(aes(col=event))+
scale_color_manual(values = c("red","blue"))+
theme_bw();p2
#第三個圖
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
annotation_col = data.frame(group = ri,
row.names = names(ri))
ann_colors = list(
group = c(lowrisk="blue", highrisk="red")
)
p3=pheatmap::pheatmap(tmp,
col= mycolors,
show_colnames = F,
cluster_cols = F,
annotation_col = annotation_col,
annotation_colors =ann_colors,
annotation_legend = F)
#拼圖實現(xiàn)三圖聯(lián)動
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7)),
c(rep(2,7)),
c(rep(3,7))) #布局矩陣
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1,2,3))