0你画、R包準備
先加載所需要的R包(提前下載好)
# 加載R包
library(TCGAbiolinks)
library(tibble)
library(SummarizedExperiment)
library("stringr")
library(dplyr)
library(stats4)
library(BiocGenerics)
library(parallel)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(survival)
library(survminer)
檢查下TCGAbiolinks版本(如果版本過低可能會導致TCGA數(shù)據(jù)讀取失敗)
packageVersion("TCGAbiolinks")
‘2.29.6’
1拍棕、數(shù)據(jù)下載
以TCGA中肝癌的RNA數(shù)據(jù)為示例(可直接將數(shù)據(jù)下載到本地)
TCGAbiolinks:::getGDCprojects()$project_id
cancer_type="TCGA-LIHC"
clinical=GDCquery_clinic(project=cancer_type,type="clinical")
dim(clinical)
clinical[1:4,1:4]
head(colnames(clinical))
query <- GDCquery(project =cancer_type,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"
)
GDCdownload(query, method = "api", files.per.chunk = 50)
expdat <- GDCprepare(query)###讀取下載好的數(shù)據(jù)
2祷舀、數(shù)據(jù)整理
counts <- as.data.frame(assay(expdat))#默認提取counts數(shù)據(jù)
head(counts)###矩陣預覽
counts數(shù)據(jù)
group_list=ifelse(as.numeric(substr(colnames(counts),14,15)) < 10,'tumor','normal')
table(group_list)
#僅取tumor樣本
exp_tumor=counts[,group_list=='tumor']
為什么小于10的是tumor吟宦,大于10的是normal呢疫衩,因為Sample號從01-29的钮呀,其中01-09是tumor扛邑,也就是癌癥樣本怜浅;其中10-29是normal,也就是癌旁
TCGA編號
meta = clinical
#第一列作為行名
meta <- column_to_rownames(meta,var = "submitter_id")
colnames(meta)
#篩選我們感興趣的臨床信息
meta=meta[,colnames(meta) %in% c("vital_status",
"days_to_last_follow_up",
"days_to_death",
"race",
"gender",
"age_at_index",
"tumor_stage")]
#調(diào)整蔬崩、篩選臨床樣本信息數(shù)量與順序與表達矩陣相同
meta=meta[match(substr(colnames(exp_tumor),1,12),rownames(meta)),]
計算生存時間
#
#days_to_last_follow_up:距離死亡的時間
#day_to_last_follow_up:最后一個隨訪時間到開始時間
meta$days_to_death[is.na(meta$days_to_death)] <- 0 #缺失值標記為0
meta$days_to_last_follow_up[is.na(meta$days_to_last_follow_up)] <- 0
head(meta)
mata數(shù)據(jù)預覽
#時間以月份記恶座,保留兩位小數(shù)
meta$time=round(meta$days/30,2)
meta$time = as.numeric(meta$time)
#2、根據(jù)生死定義活著是0沥阳,死的是1
meta$event=ifelse(meta$vital_status=='Alive',0,1)
table(meta$event)
#3 年齡分組(部分樣本缺失跨琳,考慮可能的影響應該不大)
meta$age_at_index[is.na(meta$age_at_index)] <- 0
meta$age_at_index=as.numeric(meta$age_at_index)
meta$age_group=ifelse(meta$age_at_index>median(meta$age_at_index),'older','younger')
3鹅经、將ENS號轉(zhuǎn)換為基因名
#####將ENS號轉(zhuǎn)換為基因名
ENSEMBL=unlist(str_split(rownames(exprSet),"[.]",simplify=T))[,1]####去除ENS的版本號
df<-as.data.frame(ENSEMBL)
df$symbol <- mapIds(org.Hs.eg.db,
keys=ENSEMBL,
column="SYMBOL",
keytype="ENSEMBL")
轉(zhuǎn)換完成
將基因名添加到exprSet數(shù)據(jù)中并作為行名
exprSet$symbol<-df$symbol
exprSet <- exprSet[!duplicated(exprSet$symbol)& !is.na(exprSet$symbol), ]####去除重復并且刪掉NA
rownames(exprSet)<-exprSet$symbol
exprSet <- dplyr::select(exprSet,-symbol)
將基因名作為數(shù)據(jù)行名
接下來選擇目標基因進行生存分析即可
exprSet2<-as.matrix(exprSet)###不轉(zhuǎn)矩陣無法進行后續(xù)的計算
#######選擇目標基因進行生存分析
meta$gene = ifelse(exprSet2["MS4A1",]>median(exprSet2["MS4A1",]),'high','low')
sfit1=survfit(Surv(time, event)~gene, data=meta)
ggsurvplot(sfit1,pval =TRUE, data = meta, risk.table = TRUE)
生存分析