基于R對TCGA患者臨床數(shù)據(jù)整理和Cox分析

一者祖、數(shù)據(jù)整理

1 導(dǎo)入數(shù)據(jù)

利用data.table包導(dǎo)入數(shù)據(jù),以LUAD為例

library(data.table)
cl = fread('/Users/xena/TCGA-LUAD.GDC_phenotype.tsv.gz')
colnames(cl) 
dim(cl)
tmp=as.data.frame(colnames(cl)) 

2 選擇感興趣的臨床信息

一般常規(guī)項包括submitter_id姆打,vital_status鲫懒,tumor_stage,pathologic_M醉者,pathologic_N但狭,pathologic_T,age_at_initial_pathologic_diagnosis撬即,days_to_death立磁,days_to_last_follow_up,race剥槐,gender

先看所選參數(shù)在cl的位置唱歧,table()查看有幾個level,有無缺失值

which(colnames(cl)=='submitter_id.samples')
which(colnames(cl)=='vital_status.demographic')
table(cl[,'age_at_initial_pathologic_diagnosis'])
colnames(cl[,c(1,80,98,42:44,6,76,86,78,61,103,104,102,36,53)])
meta=as.data.frame(cl[,c(1,80,98,42:44,6,76,86,78,61,103,104,102,36,53)])
colnames(meta)
meta[1:4,1:4]
save(cl, meta, file = 'GDC_TCGA_LUAD_clinical_df.Rdata')
load(file = 'GDC_TCGA_LUAD_clinical_df.Rdata')

3 初步整理參數(shù)

去掉lost follow up的樣本

phe <- meta
table(phe[,15])
phe <- phe[!phe$lost_follow_up == 'YES',]

計算生存時間time

phe[,8][is.na(phe[,8])]=0
phe[,9][is.na(phe[,9])]=0
phe$days <- as.numeric(phe[,9])+as.numeric(phe[,8])
phe$time <- phe$days/30

劃分age_group

phe$age_at_initial_pathologic_diagnosis=as.numeric(phe$age_at_initial_pathologic_diagnosis)
phe$age_group=ifelse(phe$age_at_initial_pathologic_diagnosis>median(phe$age_at_initial_pathologic_diagnosis,na.rm=T),'older','younger')
table(phe$age_at_initial_pathologic_diagnosis)
table(phe$age_group)

stage

table(phe$stage)
phe <- phe[phe$stage != 'not reported',]
table(phe$stage)
phe$stage <- gsub('[stage]','',phe$stage)
phe$stage <- gsub('[a-c]','',phe$stage)
phe$stage <- toupper(phe$stage)

T

levels(factor(phe$T))
phe <- phe[!phe$T == '',]
phe$T <- gsub('[a-b]','',phe$T)
levels(factor(phe$T))

N

table(phe$N)
levels(factor(phe$N))
phe <- phe[!phe$N == 'NX',]
phe$N=as.factor(phe$N)

M

table(phe$M)
levels(factor(phe$M))
phe <- phe[!phe$M == '',]
phe <- phe[!phe$M == 'MX',]
phe$M <- gsub('[a-b]','',phe$M)
levels(factor(phe$M))

save

save(phe, file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')

二粒竖、整理患者信息表

load(file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')
clinical_info <- clin
head(clinical_info) 
library(tableone)

1 初步整理

對需要觀測的臨床特質(zhì)值進行重新編碼

clinical_info$age<-as.numeric(clinical_info$age)
clinical_info$AGE<-factor(ifelse(clinical_info$age>60,'>60','<=60'),ordered = T)
clinical_info$Gender<-factor(toupper(clinical_info$gender),ordered = T)
table(clinical_info$stage)
clinical_info$stage<-factor(clinical_info$stage,ordered = T) 
clinical_info$T<-factor(clinical_info$T,ordered = T)
clinical_info$N<-factor(clinical_info$N,ordered = T)
clinical_info$M<-factor(clinical_info$M,ordered = T) 
clinical_info$event<-factor(toupper(clinical_info$event),ordered = T)

去除不需要的臨床信息

colnames(clinical_info)
clinical_info <- clinical_info[,-c(7:8,10:16)]
clinical_info
group_list=ifelse(as.numeric(substr(clinical_info$ID,14,15)) < 10,'Tumor','Normal')
table(group_list)
clinical_info <- clinical_info[group_list== 'Tumor',]
dput(names(clinical_info))

Vector of variables to summarize

myVars <- dput(names(clinical_info))

Vector of categorical variables that need transformation

 catVars <- myVars[c(2:8)]

2 三線表類型之一 訓(xùn)練集和數(shù)據(jù)集

切割數(shù)據(jù)

library(caret)
set.seed(123456789)
sam<- createDataPartition(clinical_info$event, p = .5,list = FALSE)
train <- clinical_info[sam,]
test <- clinical_info[-sam,]
#查看兩組一些臨床參數(shù)切割比例
prop.table(table(train$stage))
prop.table(table(test$stage))
#添加分組
train$group<-'training datasets'
test$group<-'testing datasets'
clinical_info<-rbind(train,test)
clinical_info$group<-factor(clinical_info$group)

save(clinical_info,test,train, file = 'GDC_TCGA_LUAD-clin-group-test-train.Rdata')

最重要的三線表通常是以訓(xùn)練集和數(shù)據(jù)集來區(qū)分:group

##生成三線表
vars <-colnames(clinical_info)[c(2:8)]
library(tableone)
tb_group<-CreateTableOne(vars = myVars, strata = c("group"), 
                         data = clinical_info,factorVars = catVars) 
#tab1<-print(tb_group, nonnormal = c('age','time'),exact = c(myVars,'AGE'), smd = TRUE)
tab1<-print(tb_group,  smd = TRUE)
summary(tab1)
tab_out<-print(tb_group, catDigits = 1, contDigits = 2, pDigits = 3,
           quote = FALSE, missing = T, explain = TRUE, printToggle = TRUE,
           test = TRUE, smd = T, noSpaces = FALSE, padColnames = FALSE,
           varLabels = FALSE, format = c("fp", "f", "p", "pf")[1],
           showAllLevels = T, cramVars = NULL, dropEqual = FALSE,
           exact = NULL, nonnormal = NULL, minMax = FALSE)

Save to a CSV file

write.csv(tab_out, file = "TCGA-LUAD-phe_clinical_tables_group.csv")

3 需要一個全部病人信息的臨床三線表

tb_all<-CreateTableOne(vars = myVars,
                          data = clinical_info,
                          factorVars = catVars) 
tab2<-print(tb_all)
tab_out2<-print(tb_all, catDigits = 1, contDigits = 2, pDigits = 3,
               quote = FALSE, missing = T, explain = TRUE, 
                printToggle = TRUE,
               test = TRUE, smd = T, noSpaces = FALSE, 
                padColnames = FALSE,
               varLabels = FALSE, format = c("fp", "f", "p", "pf")[1],
               showAllLevels = T, cramVars = NULL, dropEqual = FALSE,
               exact = NULL, nonnormal = NULL, minMax = FALSE)

Save to a CSV file

write.csv(tab_out2, file = "TCGA-LUAD-phe_clinical_tables_all.csv")

三颅崩、COX回歸分析

1 整理分期 data as numeric

load(file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')
clin <- phe

event

clin$event=ifelse(clin$event=='Alive',0,1)

stage

levels(factor(clin$stage))
clin$stage=as.factor(clin$stage)
clin$stage=as.numeric(clin$stage)
levels(factor(clin$stage))

age

levels(factor(clin$age_group))
clin$age_group=as.factor(clin$age_group)
clin$age_group=as.numeric(clin$age_group)

gender

levels(factor(clin$gender))
clin$gender=as.factor(clin$gender)
clin$gender=as.numeric(clin$gender)

T

levels(factor(clin$T))
clin$T=as.factor(clin$T)
clin$T=as.numeric(clin$T)
levels(factor(clin$T))

N

table(clin$N)
levels(factor(clin$N))
clin$N=as.factor(clin$N)
clin$N=as.numeric(clin$N)
levels(factor(clin$N))

M

table(clin$M)
levels(factor(clin$M))
clin$M=as.factor(clin$M)
clin$M=as.numeric(clin$M)
levels(factor(clin$M))

save

save(clin, file = 'GDC_TCGA_LUAD_clinical_cox.Rdata')

2 cox

載入數(shù)據(jù)

library(plyr)
library(survival)
library(survminer)
load(file = 'GDC_TCGA_LUAD_clinical_cox.Rdata')
load(file = 'GDC_TCGA_LUAD-clin-group-test-train.Rdata')

選擇載入不同組數(shù)據(jù)分別進行cox分析

### 1 total

group_list=ifelse(as.numeric(substr(clin$ID,14,15)) < 10,'Tumor','Normal')
table(group_list)
clin <- clin[group_list== 'Tumor',]

### 2 test

clin <- clin[clin$ID %in% test$ID, ]
clin$time <- as.numeric(clin$time)
clin$event <- as.numeric(clin$event)

### 3 training

clin <- clin[clin$ID %in% train$ID, ]
clin$time <- as.numeric(clin$time)
clin$event <- as.numeric(clin$event)

unicox

BaSurv <- Surv(time = clin$time, event = clin$event)
clin$BaSurv <- with(clin,BaSurv)
GCox <- coxph(BaSurv~gender, data = clin)
GSum <- summary(GCox)
GSum$coefficients
HR <- round(GSum$coefficients[,2],2)
PValue <- round(GSum$coefficients[,5],3)
CI <- paste0(round(GSum$conf.int[,3:4],2), collapse = '-')
Unicox <- data.frame('Characteristics'= 'gender',
                     'Hazard Ratio' = HR,
                     'CI95' = CI,
                     'P Value' = PValue)
UniCox <- function(x){
  FML <- as.formula(paste0('BaSurv~',x))
  GCox <- coxph(FML, data = clin)
  GSum <- summary(GCox)
  HR <- round(GSum$coefficients[,2],2)
  PValue <- round(GSum$coefficients[,5],3)
  CI <- paste0(round(GSum$conf.int[,3:4],2), collapse = '-')
  Unicox <- data.frame('Characteristics'= x,
                       'Hazard Ratio' = HR,
                       'CI95' = CI,
                       'P Value' = PValue)
  return(Unicox)
}
UniCox('gender')
UniCox('T')
colnames(clin)
VarNames <- colnames(clin)[c(3:6,8:9,16:17)]
UniVar <- lapply(VarNames, UniCox)
UniVar <- ldply(UniVar, data.frame)
UniVar$Characteristics[UniVar$P.Value < 0.05]

write.csv(UniVar, file = 'GDC_TCGA_LUAD-clin-UniCox.csv')

multicox

fml <- as.formula(paste0('BaSurv~',
                       paste0(UniVar$Characteristics[UniVar$P.Value<0.05],
                                collapse = '+')))
Multicox <- coxph(fml, data = clin)
Multisum <- summary(Multicox)
MultiName <- as.character(UniVar$Characteristics[UniVar$P.Value < 0.05])
MHR <- round(Multisum$coefficients[,2],2)
MPV <- round(Multisum$coefficients[,5],3)
MCIL <- round(Multisum$conf.int[,3],2)
MCIU <- round(Multisum$conf.int[,4],2)
MCI <- paste0(MCIL,'-',MCIU)
MulCox <- data.frame('Characteristics'= MultiName,
                     'Hazard Ratio' = MHR,
                     'CI95' = MCI,
                     'P Value' = MPV)
paste0(UniVar$Characteristics[UniVar$P.Value < 0.05], collapse = '+')
Multicox <- coxph(BaSurv~stage+N+T+gender+smoking_history+EvsA,data = clin)

整合表格

Final <- merge.data.frame(UniVar,MulCox,by='Characteristics',
                          all = T, sort = T)
#輸出
write.csv(Final,file = 'GDC_TCGA_LUAD_clin_cox-result-all.csv')

3 KM生存曲線

library(RColorBrewer)

以stage為例,其他參數(shù)一樣做

surv <- Surv(time =  clin$time, event = clin$event)
surv
fit <- survfit(surv~clin$stage, data=clin)
survdiff(surv~clin$stage)
summary(fit)
fit
ggsurvplot(fit, conf.int=F, pval=TRUE)
ggsurvplot(fit, 
           pval = T, # 在圖上添加log rank檢驗的p值
           legend.labs=c("I","II",'III','IV'), #表頭標(biāo)簽注釋男女
           legend = c(0.8, 0.85),
           ylab="Cumulative survival (percentage)",xlab = "Months",
           linetype = "strata", # 生存曲線的線型
           surv.median.line = "hv", # 標(biāo)注出中位生存時間
           ggtheme = theme_bw(), #背景布局
           palette =  brewer.pal(7, "Set1")[c(3,2,4,1)],# 圖形顏色風(fēng)格
           font.main = c(20, "plain"),
           font.x = c(20, "plain"),
           font.y = c(20, "plain"),
           font.tickslab = c(18, "plain"),
           font.legend = c(18, "plain"),
           font.pval = c(20, "plain")
) 
ggsave(filename = 'survival_stage-train.pdf',width = 8,height = 6)

致謝:生信技能樹

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
  • 序言:七十年代末蕊苗,一起剝皮案震驚了整個濱河市沿后,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌岁歉,老刑警劉巖得运,帶你破解...
    沈念sama閱讀 206,839評論 6 482
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場離奇詭異锅移,居然都是意外死亡熔掺,警方通過查閱死者的電腦和手機,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,543評論 2 382
  • 文/潘曉璐 我一進店門非剃,熙熙樓的掌柜王于貴愁眉苦臉地迎上來置逻,“玉大人,你說我怎么就攤上這事备绽∪耄” “怎么了?”我有些...
    開封第一講書人閱讀 153,116評論 0 344
  • 文/不壞的土叔 我叫張陵肺素,是天一觀的道長恨锚。 經(jīng)常有香客問我,道長倍靡,這世上最難降的妖魔是什么猴伶? 我笑而不...
    開封第一講書人閱讀 55,371評論 1 279
  • 正文 為了忘掉前任,我火速辦了婚禮塌西,結(jié)果婚禮上他挎,老公的妹妹穿的比我還像新娘。我一直安慰自己捡需,他們只是感情好办桨,可當(dāng)我...
    茶點故事閱讀 64,384評論 5 374
  • 文/花漫 我一把揭開白布。 她就那樣靜靜地躺著站辉,像睡著了一般呢撞。 火紅的嫁衣襯著肌膚如雪。 梳的紋絲不亂的頭發(fā)上饰剥,一...
    開封第一講書人閱讀 49,111評論 1 285
  • 那天狸相,我揣著相機與錄音,去河邊找鬼捐川。 笑死脓鹃,一個胖子當(dāng)著我的面吹牛,可吹牛的內(nèi)容都是我干的古沥。 我是一名探鬼主播瘸右,決...
    沈念sama閱讀 38,416評論 3 400
  • 文/蒼蘭香墨 我猛地睜開眼,長吁一口氣:“原來是場噩夢啊……” “哼岩齿!你這毒婦竟也來了太颤?” 一聲冷哼從身側(cè)響起,我...
    開封第一講書人閱讀 37,053評論 0 259
  • 序言:老撾萬榮一對情侶失蹤盹沈,失蹤者是張志新(化名)和其女友劉穎龄章,沒想到半個月后吃谣,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 43,558評論 1 300
  • 正文 獨居荒郊野嶺守林人離奇死亡做裙,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點故事閱讀 36,007評論 2 325
  • 正文 我和宋清朗相戀三年岗憋,在試婚紗的時候發(fā)現(xiàn)自己被綠了。 大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片锚贱。...
    茶點故事閱讀 38,117評論 1 334
  • 序言:一個原本活蹦亂跳的男人離奇死亡仔戈,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出拧廊,到底是詐尸還是另有隱情监徘,我是刑警寧澤,帶...
    沈念sama閱讀 33,756評論 4 324
  • 正文 年R本政府宣布吧碾,位于F島的核電站凰盔,受9級特大地震影響,放射性物質(zhì)發(fā)生泄漏倦春。R本人自食惡果不足惜廊蜒,卻給世界環(huán)境...
    茶點故事閱讀 39,324評論 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一處隱蔽的房頂上張望溅漾。 院中可真熱鬧山叮,春花似錦、人聲如沸添履。這莊子的主人今日做“春日...
    開封第一講書人閱讀 30,315評論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽暮胧。三九已至锐借,卻和暖如春,著一層夾襖步出監(jiān)牢的瞬間往衷,已是汗流浹背钞翔。 一陣腳步聲響...
    開封第一講書人閱讀 31,539評論 1 262
  • 我被黑心中介騙來泰國打工, 沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留席舍,地道東北人布轿。 一個月前我還...
    沈念sama閱讀 45,578評論 2 355
  • 正文 我出身青樓,卻偏偏與公主長得像来颤,于是被迫代替她去往敵國和親汰扭。 傳聞我的和親對象是個殘疾皇子,可洞房花燭夜當(dāng)晚...
    茶點故事閱讀 42,877評論 2 345

推薦閱讀更多精彩內(nèi)容