在前面系列教程中分享了如何下載TCGA表達矩陣(2022新版TCGA數(shù)據(jù)下載與整理)帮碰,通過R進行數(shù)據(jù)合并就能得到TCGA泛癌表達矩陣了市埋。有時候需要分析基因表達與TMB之間的相關性,就會涉及到TCGA突變數(shù)據(jù)下載和TMB計算灵迫。TCGA突變數(shù)據(jù)maf以前可以直接在TCGA下載浑劳,也可以用TCGAbiolinks包的GDCquery_Maf函數(shù)下載秋泄,后來TCGA不讓下載了琐馆,TCGAbiolinks的GDCquery_Maf函數(shù)在新版本里面也被拋棄了。但是TCGAbiolinks包的作者提供了替代方案恒序,具體見TCGAbiolinks: Searching, downloading and visualizing mutation files瘦麸,作者提供了hg38和hg19兩個版本的下載方法,以hg38的TCGA-CHOL為例:
library("TCGAbiolinks")
query <- GDCquery(
project = "TCGA-CHOL",
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
得到的maf就可以用于后續(xù)TMB計算了歧胁,TMB的計算參考這個鏈接(TCGA 數(shù)據(jù)分析實戰(zhàn) —— TMB 與免疫浸潤聯(lián)合分析):
get_TMB <- function(file) {
# 需要用到的列
use_cols <- c(
"Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count"
)
# 刪除這些突變類型
mut_type <- c(
"5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region"
)
# 讀取文件
df <- read_csv(file, col_select = use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 計算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
為了得到泛癌的TMB數(shù)據(jù)滋饲,我將代碼寫成循環(huán),并稍微做了些調整喊巍。
rm(list=ls())
library(TCGAbiolinks)
library(dplyr)
library(stringr)
projects <- getGDCprojects()$project_id
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
projects <- projects[order(projects)]
pan_TMB=list()
for (project in projects){
query <- GDCquery(
project = project,
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
## TMB calculation
# Code Reference https://zhuanlan.zhihu.com/p/394609586
get_TMB <- function(file) {
use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count")
# 刪除這些突變類型
mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region")
# 讀取文件
df <- select(file, use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 計算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
pan_TMB[[project]]=get_TMB(maf)
}
pan_TMB=do.call(rbind,pan_TMB)
pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
pan_TMB=pan_TMB[,c(5,1,3)]
colnames(pan_TMB)=c("Cancer","ID","TMB")
save(pan_TMB,file="pan_TMB.Rdata")
但是在循環(huán)到TCGA-MESO環(huán)節(jié)屠缭,總是會報"Error: Failure to retrieve index 140/0",然后R就重啟了崭参。沒辦法呵曹,把TCGA-MESO單獨出來,再合并阵翎。中間處理TCGA-MESO時在用maftools的read.maf時也有報錯"Error in read.maf(x, isTCGA = TRUE) : No non-synonymous mutations found Check vc_nonSyn`` argumet in
read.maffor details"逢并,后來按照Github上的方法直接讀入表格([No non-synonymous mutations found Check vc_nonSyn argumet in read.maf
for details #838](https://github.com/PoisonAlien/maftools/issues/838)
最終代碼如下:
rm(list=ls())
library(TCGAbiolinks)
library(dplyr)
library(stringr)
projects <- getGDCprojects()$project_id
projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
projects <- projects[order(projects)]
projects=projects[-19]
pan_TMB=list()
for (project in projects){
query <- GDCquery(
project = project,
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
## TMB calculation
# Code Reference https://zhuanlan.zhihu.com/p/394609586
get_TMB <- function(file) {
use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode",
"HGVSc", "t_depth", "t_alt_count")
# 刪除這些突變類型
mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
"Silent", "RNA", "Splice_Region")
# 讀取文件
df <- select(file, use_cols)
data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
# 計算 VAF
mutate(vaf = t_alt_count / t_depth) %>%
group_by(Tumor_Sample_Barcode) %>%
summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
return(data)
}
pan_TMB[[project]]=get_TMB(maf)
}
# For TCGA-MESO
query <- GDCquery(
project = "TCGA-MESO",
data.category = "Simple Nucleotide Variation",
access = "open",
legacy = FALSE,
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
# Reference:https://github.com/PoisonAlien/maftools/issues/838
mafFilePath2 = dir(path = "GDCdata/TCGA-MESO",pattern = "masked.maf.gz$",full.names = T,recursive=T)
TCGA_MESO = lapply(mafFilePath2, data.table::fread, skip = "Hugo_Symbol")
TCGA_MESO = data.table::rbindlist(l = TCGA_MESO, use.names = TRUE, fill = TRUE)
pan_TMB[["TCGA-MESO"]]=get_TMB(TCGA_MESO)
pan_TMB=do.call(rbind,pan_TMB)
pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
pan_TMB=pan_TMB[,c(5,1,3)]
colnames(pan_TMB)=c("Cancer","ID","TMB")
pan_TMB$TMB=round(pan_TMB$TMB,1) # 保留一位小數(shù)
pan_TMB=pan_TMB[order(pan_TMB$Cancer),]
save(pan_TMB,file="pan_TMB.Rdata")
最終數(shù)據(jù)如下:
查看下每個腫瘤的突變情況。
table(pan_TMB$Cancer)
# ACC BLCA BRCA CESC CHOL COAD DLBC ESCA GBM HNSC KICH KIRC KIRP LAML LGG LIHC LUAD LUSC MESO OV PAAD PCPG
# 90 414 986 289 51 454 47 184 461 510 66 357 282 128 521 371 616 544 78 462 170 182
# PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC UCS UVM
# 493 161 238 469 431 146 494 121 518 57 80
通過ID可以和TCGA表達矩陣合并郭卫,再根據(jù)TCGA基因與免疫評分帶側邊密度圖的相關性點圖的方法 就可以進行TMB與基因表達相關性分析了。