- #00 R包安裝
~~~R
# Package setup: make sure BiocManager and TCGAbiolinks are available, then
# attach TCGAbiolinks for the download steps that follow.

# NOTE(review): rm(list = ls()) wipes every object in the current workspace,
# including anything the user had loaded before sourcing this script. Kept to
# preserve the original behavior; consider removing it.
rm(list = ls())

# Install BiocManager from CRAN only when it is missing. The body is braced so
# that the install call (and not a help lookup) is what the condition guards.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install TCGAbiolinks through BiocManager only when it is missing, so the
# script does not trigger a reinstall on every run.
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  BiocManager::install("TCGAbiolinks")
}

library(TCGAbiolinks)
~~~
## 1.1 臨床數據下載和整理
~~~R
# Build the project identifier used by the calls below ("TCGA-STAD").
cancer_type <- paste("TCGA", "STAD", sep = "-")
print(cancer_type)

# Clinical data, method 1: fetch the clinical table for the project with
# TCGAbiolinks::GDCquery_clinic().
clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")

# Write the table to CSV and read it back from the SAME path. The file name is
# computed once so that the write and the read cannot drift apart.
clinical_csv <- paste(cancer_type, "clinical.csv", sep = "-")
write.csv(clinical, file = clinical_csv)
cl_df1 <- read.csv(clinical_csv, header = TRUE)
# View(cl_df1)
# Clinical data, method 2 (recommended by the original author because the
# clinical files are small): download the clinical XML files from the GDC
# website (cart download), then assemble them into one table here.
# The R working directory must be the folder that contains the downloaded
# XML files (they are searched for recursively).
library("XML")
library("methods")

# Collect every file whose name ends in ".xml" below the working directory.
# `pattern` is a regular expression, so the dot is escaped and no glob "*"
# is used.
all_fiels <- list.files(path = "./", pattern = "\\.xml$", recursive = TRUE)
# head(all_fiels)

# Fail early with a clear message instead of an obscure error further down
# when no XML file is found.
if (length(all_fiels) == 0) {
  stop("No .xml files found under the working directory: ", getwd(),
       call. = FALSE)
}

# Parse each XML file into a one-patient table (transposed so that the
# results can be column-bound below).
cl <- lapply(all_fiels, function(x) {
  # x <- all_fiels[1]  # handy for interactive debugging
  result <- xmlParse(file = file.path("./", x))
  rootnode <- xmlRoot(result)
  # Per the original author's note: the XML root has two child nodes and the
  # second one holds the patient information.
  xmldataframe <- xmlToDataFrame(rootnode[2])
  t(xmldataframe)
})

# Bind all patients together, transpose back to one row per patient, and drop
# duplicated rows.
cl_df <- unique(t(do.call(cbind, cl)))
# View(cl_df)

# Cache the assembled table (the original author suggests keeping it in the
# parent folder) and reload it.
save(cl_df, file = "TCGA_STAD_clinical_df.Rdata")
load(file = "TCGA_STAD_clinical_df.Rdata")
# write.csv(cl_df, file = "TCGA_STAD_clinical_df.csv")
# write.table(cl_df, file = "TCGA_STAD_clinical_df.txt")  # recommended
~~~
## 1.2 臨床數據整理
~~~R
# Inspect the available columns, then keep the subset used downstream.
# NOTE(review): columns are picked by position; confirm the indices against
# the printed column names if the input files change.
colnames(cl_df)
cl_df_select <- as.data.frame(cl_df[, c(5, 6, 8, 9, 11, 12, 37, 38)])
# write.csv(cl_df_select, file = "cl_df_select.csv")
# cl_df_select <- read.csv(file = "cl_df_select.csv", header = TRUE)
# View(cl_df_select)

# Split the stage_event column into stage / T / N / M.
# tidyr::separate is called with an explicit namespace at every step so the
# code does not depend on tidyr or a pipe operator being attached.
# Step 1: text before the first "T" is the stage, the rest is the TNM string.
stage_split <- tidyr::separate(
  cl_df_select, stage_event,
  into = c("stage", "TMN"), sep = "T"
)
# Step 2: text before "N" is the T value; the remainder is "<N>M<M>".
t_split <- tidyr::separate(
  stage_split, TMN,
  into = c("T", "NM"), sep = "N"
)
# Step 3: in "<N>M<M>" the piece BEFORE "M" is the N value and the piece
# AFTER it is the M value, so the new columns are named N then M.
cl_df_select_new <- tidyr::separate(
  t_split, NM,
  into = c("N", "M"), sep = "M"
)

# Drop the two records whose TNM staging is unclear (per the original author).
# NOTE(review): these are hard-coded row positions; verify them if the input
# data set changes.
cl_df_select_new <- cl_df_select_new[-c(68, 389), ]
# View(cl_df_select_new)
##刪除
{
cl_df_select_new<- cl_df_select_new[!cl_df_select_new[,7]=="7th",]
cl_df_select_new<- cl_df_select_new[!cl_df_select_new[,7]=="6th",]
cl_df_select_new<-tidyr::separate(cl_df_select_new,stage,into = c("th","stage"),sep="h")
cl_df_select_new<-cl_df_select_new