做植物是一件比較艱苦的事情啥刻,不但資源少贮缕,而且有限的資源未必還能用的好腮猖,就拿Bioconductor上的注釋包來說吧税稼,我在「Bioconductor」不要輕易相信AnnotationHub的物種注釋包, 里面就提到擬南芥的物種包用的注釋其實(shí)一直都沒有更新胆胰。究其原因狞贱,是因?yàn)閿M南芥的物種包里的注釋一直是從TAIR的FTP下載,而我另一篇文章TAIR周期性更新的注釋原來不在FTP服務(wù)器上也說了蜀涨,最新的擬南芥注釋信息是要在另外的地方進(jìn)行下載瞎嬉。所以,我寫了「Bioconductor」再次提醒厚柳,研究植物的不要輕易相信你用的注釋包氧枣, 讓大家嘗試用enricher
解決問題。
但是生活不能茍且别垮,我好歹在生信圈搬了幾年磚便监,遇到困難不能退縮,于是我決定自己構(gòu)建一個(gè)擬南芥的物種包碳想。代碼如下:
library(RSQLite)
library(AnnotationForge)
options(stringsAsFactors = F)
# GENE-GO注釋的數(shù)據(jù)框
# ATH_GO_TERM.txt were create
# by `cat ATH_GO_GOSLIM.txt | cut -f 1,6,8,10 > ATH_GO_TERM.txt`
go_df <- read.table("./ATH_GO_TERM.txt",
sep="\t", header = FALSE,
as.is = TRUE)
go_df$V3 <- ifelse(go_df$V3 == "C", "CC",
ifelse(go_df$V3 == "P", "BP",
ifelse(go_df$V3 == "F", "MF", "")))
# http://www.geneontology.org/page/guide-go-evidence-codes
# select high confidence evidence
go_df <- go_df[! go_df$V4 %in% "IEA",]
colnames(go_df) <- c("GID","GO","ONTOLOGY","EVIDENCE")
# GENE-PUB的數(shù)據(jù)框
pub_df <- read.table("./Locus_Published_20180330.txt.gz",
sep="\t",
header = TRUE)
## 只選擇AT開頭的基因
pub_df <- pub_df[grepl(pattern = "^AT\\d", pub_df$name),]
pub_df <- cbind(GID=do.call(rbind,strsplit(pub_df$name, split = "\\."))[,1],
pub_df)
## pubmed_id 不能為空
pub_df <- pub_df[!is.na(pub_df$PMID),]
colnames(pub_df) <- c("GID","GENEID","REFID",
"PMID","PUBYEAR")
# GENE-SYMBOL的注釋數(shù)據(jù)庫(kù)
symbol_df <- read.table("./gene_aliases_20180330.txt.gz",
sep = "\t",
header = TRUE)
symbol_df <- symbol_df[grepl(pattern = "^AT\\d", symbol_df$name),]
colnames(symbol_df) <- c("GID","SYMBOL","FULL_NAME")
# GENE-FUNCTION
func_df <- read.table("./Araport11_functional_descriptions_20180330.txt.gz",
sep = "\t",
header=TRUE)
func_df <- func_df[grepl(pattern = "^AT\\d", func_df$name),]
func_df <- cbind(GID=do.call(rbind,strsplit(func_df$name, split = "\\."))[,1],
func_df)
colnames(func_df) <- c("GID","TXID","GENE_MODEL_TYPE",
"SHORT_DESCRIPTION",
"CURATOR_SUMMARY",
"COMPUTATIONAL_DESCRIPTION")
## 去重復(fù)行
go_df <- go_df[!duplicated(go_df),]
go_df <- go_df[,c(1,2,4)]
pub_df <- pub_df[!duplicated(pub_df),]
symbol_df <- symbol_df[!duplicated(symbol_df),]
func_df <- func_df[!duplicated(func_df),]
# no duplicated row
# all GID should be same type, be aware of factor
file_path <- file.path( getwd())
makeOrgPackage(go=go_df,
pub_info = pub_df,
symbol_info = symbol_df,
function_info = func_df,
version = "0.1",
maintainer = "xuzhougeng <xuzhougeng@163.com>",
author="xuzhogueng <xuzhougeng@163.com>",
outputDir = file_path,
tax_id = "3702",
genus = "At",
species = "tair10",
goTable = "go"
)
#install.packages("./org.Atair10.eg.db", repos = NULL,
# type = "source")
最后會(huì)在指定目錄下生成"org.Atair10.eg.db", 然后就可以用
install.packages("./org.Atair10.eg.db", repos = NULL,
type = "source")
而且我測(cè)試了烧董,能和Y叔的clusterProfiler完美結(jié)合
library(org.Atair10.eg.db)
org <- org.Atair10.eg.db
ego_down <-enrichGO(gene = DEG_GENES,
OrgDb = org,
keyType = "GID",
ont = "BP"
)
目前我是自己用為主,如果你們有需要胧奔,可以按照如下代碼進(jìn)行安裝
# 解決依賴包的問題
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("org.At.tair.db", version = "3.8")
# 安裝我的注釋包
install.packages("https://raw.githubusercontent.com/xuzhougeng/org.At.tair.db/master/org.Atair10.eg.db.tgz", repos=NULL, type="source")
出現(xiàn)問題逊移,歡迎在我的GitHubhttps://github.com/xuzhougeng/org.At.tair.db上提出issue