第一步:進入TCGA官網下載數據
網(wǎng)址:https://portal.gdc.cancer.gov
-
進入官網數據下載界面後,點擊Repository
-
在Files和Cases兩個界面中選擇自己需要的數據
-
FPKM數據格式在Workflow選項中選擇
-
選擇完數據後,點擊Add All Files to Cart
-
點擊右上角的Cart,Cart旁邊的數字表示我們選擇了多少個樣本的數據
-
點擊Download按鈕,然後出現兩個選項:Manifest表示使用GDC Data Transfer Tool讀取Manifest文件下載數據,Cart表示直接通過瀏覽器鏈接下載數據。同時還需要點擊Metadata按鈕,下載Metadata文件
注:GDC Data Transfer Tool下載數據方法見 http://www.reibang.com/p/f4e92d226e6d
第二步:使用R合成表達矩陣
- 下載下來的數據是很多個文件夾,每個文件夾是一個樣本的數據,因此文件夾個數應該等於Cart中的個數,如果不等,代表我們下載數據時有丟失。我們將所有的數據文件夾拷貝到一個文件夾中,命名為rawdata
- 在R中處理,代碼如下:
# Step 2, part 1: gather every sample's FPKM file from rawdata/ into all_data/.
#
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; it does not detach packages or reset options. Restarting R is
# the more reliable way to get a clean session.
rm(list = ls())
options(stringsAsFactors = FALSE)

# Set the working directory yourself first, then copy the rawdata folder
# (one sub-folder per downloaded sample) into it.
if (!dir.exists("all_data")) {
  dir.create("all_data")
}
for (dirname in dir("rawdata/")) {
  sample_dir <- file.path(getwd(), "rawdata", dirname)
  # Skip stray entries in rawdata/ that are not sample folders.
  if (!dir.exists(sample_dir)) {
    next
  }
  # `pattern` is a regular expression, not a shell glob. Match a literal
  # ".FPKM" followed by "." or end-of-name so ".FPKM-UQ." files are excluded.
  file <- list.files(sample_dir, pattern = "\\.FPKM(\\.|$)")
  if (length(file) == 0L) {
    warning("No FPKM file found in ", sample_dir, call. = FALSE)
    next
  }
  file.copy(file.path(sample_dir, file), "all_data")
}
if (!dir.exists("unpacked_FPKM")) {
  dir.create("unpacked_FPKM")
}
# Every per-sample file has now been copied into all_data/, but the files are
# still compressed. Manual step: use an archive tool to decompress all of them
# into the unpacked_FPKM/ folder before running the rest of the script.
# Step 2, part 2: build the lookup table between unpacked file names and
# TCGA sample IDs from the GDC metadata JSON.
metadata <- jsonlite::fromJSON("metadata.cart.2020-04-24.json")
library(dplyr)
metadata_id <- metadata %>%
  dplyr::select(c(file_name, associated_entities))

# Pull entity_submitter_id out of each record's associated_entities entry.
submitter_ids <- lapply(
  metadata_id$associated_entities,
  function(entities) entities$entity_submitter_id
)
if (any(lengths(submitter_ids) != 1L)) {
  warning(
    "Some metadata records do not have exactly one associated entity; ",
    "the first entity_submitter_id is used (NA when there is none).",
    call. = FALSE
  )
}

# naid_df stores the mapping between file name and TCGA_id.
# Built in one vectorised step so that an empty metadata table yields an
# empty data frame instead of failing inside a 1:0 loop.
naid_df <- data.frame(
  # Drop only a trailing ".gz" so the name matches the decompressed file.
  filename = sub("\\.gz$", "", metadata_id$file_name),
  TCGA_id = vapply(
    submitter_ids,
    function(ids) if (length(ids) == 0L) NA_character_ else as.character(ids[[1L]]),
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
# Step 2, part 3: read every unpacked table and bind the value columns into a
# single expression table keyed by gene_id.
files <- dir("unpacked_FPKM")
if (length(files) == 0L) {
  stop("No files found in unpacked_FPKM/; decompress all_data/ into it first.",
       call. = FALSE)
}

# Read one file from unpacked_FPKM/ and return its second column as a
# one-column data.table.
#   files: a single file name relative to unpacked_FPKM/.
myfread <- function(files) {
  data.table::fread(paste0("unpacked_FPKM/", files))[, 2]
}
f <- lapply(files, myfread)

# Column-binding is only meaningful when every table has the same rows.
row_counts <- vapply(f, nrow, integer(1))
if (length(unique(row_counts)) != 1L) {
  stop("Files in unpacked_FPKM/ do not all have the same number of rows.",
       call. = FALSE)
}
f <- do.call(cbind, f)

# Reorder the lookup table to follow the order in which files were read.
rownames(naid_df) <- naid_df[, 1]
unmatched <- setdiff(files, naid_df$filename)
if (length(unmatched) > 0L) {
  # Indexing by a missing row name would silently yield NA column names.
  stop("No metadata entry for: ", paste(unmatched, collapse = ", "),
       call. = FALSE)
}
naid_df <- naid_df[files, ]
colnames(f) <- naid_df$TCGA_id

# Gene identifiers are taken from the first column of the first file.
gene_id <- data.table::fread(paste0("unpacked_FPKM/", files[1]))$V1
expr_df <- cbind(gene_id = gene_id, f)
save(expr_df, naid_df, file = "FPKM_ENSG_exprdf.Rdata")
第三步:將ensembl數據庫的ENSG編號轉換成gene symbol
- 在ensembl數據庫中下載數據,網址:http://asia.ensembl.org/index.html
-
點擊Downloads→databases→Human選項中的GTF
-
下載圖示文件
- R中處理:
# Step 3, part 1: load the GTF annotation and the saved expression table, then
# strip the version suffix from the gene IDs so they can be joined.
#
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; it does not detach packages or reset options.
rm(list = ls())
options(stringsAsFactors = FALSE)

# Import the annotation, flatten it to a data frame and cache it on disk.
gtf <- rtracklayer::import("Homo_sapiens.GRCh38.100.chr.gtf")
gtf_df <- as.data.frame(gtf)
save(gtf_df, file = "gtf_df.Rdata")

# Restores expr_df and naid_df written by the previous step.
load("FPKM_ENSG_exprdf.Rdata")
metadata <- naid_df[, -1]
metadata <- data.frame(TCGA_id = metadata)

library(dplyr)
library(tidyr)

# Keep only the part of gene_id before the first "."; extra = "drop" discards
# the remaining pieces without emitting a warning for every row.
expr_df_nopoint <- expr_df %>%
  tidyr::separate(gene_id, into = c("gene_id"), sep = "\\.", extra = "drop")
# Extract protein-coding genes: keep gene-level GTF records whose biotype is
# protein_coding, attach the expression columns by gene_id, and merge the
# three annotation fields into one "name | id | biotype" label column.
mRNA_exprSet <- local({
  coding_genes <- dplyr::filter(
    gtf_df,
    type == "gene",
    gene_biotype == "protein_coding"
  )
  coding_annot <- dplyr::select(
    coding_genes,
    c(gene_name, gene_id, gene_biotype)
  )
  coding_expr <- dplyr::inner_join(
    coding_annot,
    expr_df_nopoint,
    by = "gene_id"
  )
  tidyr::unite(
    coding_expr,
    gene_id, gene_name, gene_id, gene_biotype,
    sep = " | "
  )
})
save(mRNA_exprSet, file = "mRNA_exprSet.Rdata")
# Extract lncRNA genes.
#
# Transcript biotypes treated as lncRNA. The first eight are the legacy
# labels this script has always used. The last three are added because the
# labelling changed between annotation releases: recent Ensembl/GENCODE GTFs
# (including the release-100 file imported in this script) merge the legacy
# classes into a single "lncRNA" biotype, so without it the filter below keeps
# almost nothing. "antisense" and "macro_lncRNA" cover older spellings.
# NOTE(review): confirm against the GTF in use with
# table(gtf_df$transcript_biotype).
ncRNA <- c("sense_overlapping", "lincRNA", "3prime_overlapping_ncRNA",
           "processed_transcript", "sense_intronic",
           "bidirectional_promoter_lncRNA", "non_coding",
           "antisense_RNA",
           "antisense", "macro_lncRNA", "lncRNA")

# Keep transcript-level records with an lncRNA biotype, collapse them to one
# row per (gene_name, gene_id, transcript_biotype), attach the expression
# columns by gene_id, and merge the annotation fields into one
# "name | id | biotype" label column.
LncRNA_exprSet <- gtf_df %>%
  dplyr::filter(type == "transcript", transcript_biotype %in% ncRNA) %>%
  dplyr::select(c(gene_name, gene_id, transcript_biotype)) %>%
  dplyr::distinct() %>%
  dplyr::inner_join(expr_df_nopoint, by = "gene_id") %>%
  tidyr::unite(gene_id, gene_name, gene_id, transcript_biotype, sep = " | ")
save(LncRNA_exprSet, file = "LncRNA_exprSet.Rdata")