本文大部分內(nèi)容來(lái)源于網(wǎng)絡(luò),寫(xiě)此文章僅是為了自己后面查找方便煤率,如有侵權(quán)仰冠,聯(lián)系刪除。
參考來(lái)自于:
自己構(gòu)建物種包OrgDb蝶糯,然后用clusterProfiler富集分析 - 簡(jiǎn)書(shū) (jianshu.com)
構(gòu)建物種的OrgDb教程
eggnog 網(wǎng)站信息http://eggnog-mapper.embl.de/洋只, 可以直接提交蛋白序列,如果提交的為多個(gè)轉(zhuǎn)錄本的蛋白序列昼捍,預(yù)測(cè)結(jié)果為各個(gè)轉(zhuǎn)錄本的注釋信息识虚,需要自己將各個(gè)轉(zhuǎn)錄本的注釋信息整合為單個(gè)基因的注釋信息
1. 構(gòu)建系統(tǒng)OrgDb數(shù)據(jù)庫(kù)
Step1 加載需要的庫(kù),若無(wú)妒茬,則用install.packages()安裝
library(clusterProfiler)
library(tidyverse)
library(stringr)
library(KEGGREST)
library(AnnotationForge)
library(clusterProfiler)
library(dplyr)
library(jsonlite)
library(purrr)
library(RCurl)
options(stringsAsFactors = F)
emapper <-read.table("your_eggnog_result",header = TRUE, sep = "\t")
#讀入自己的基因組注釋信息担锤,直接提交到eggnog網(wǎng)站,如果是區(qū)分轉(zhuǎn)錄本進(jìn)行的注釋?zhuān)枰獙⒆⑨屝畔⒄蠟榛虻淖⑨屝畔?#該文檔主要是四列乍钻,包括query Preferred_name GOs KEGG_ko
emapper[emapper==""]<-NA #將缺省值替換為NA
step2 提取GO信息
提取emapper中的query列肛循、Preferred_name列以及GOs列
gene_info <- emapper %>% dplyr::select(GID = query,GENENAME = Preferred_name) %>% na.omit()
gos <- emapper %>% dplyr::select(query, GOs) %>% na.omit()
構(gòu)建一個(gè)空的gene2go數(shù)據(jù)框,為后面填充數(shù)據(jù)用
gene2go = data.frame(GID =character(),GO = character(),EVIDENCE = character())
對(duì)gene2go空數(shù)據(jù)框進(jìn)行填充银择,循環(huán)的作用簡(jiǎn)單來(lái)說(shuō)就單行變多行
for (row in 1:nrow(gos)) {
the_gid <- gos[row, "query"][[1]]
the_gos <- str_split(gos[row,"GOs"], ",", simplify = FALSE)[[1]]
df_temp <- data_frame(GID=rep(the_gid, length(the_gos)),
GO = the_gos,
EVIDENCE = rep("IEA", length(the_gos)))
gene2go <- rbind(gene2go, df_temp)}
將gene2go中GOs列中的“-”替換為NA育拨,然后使用na.omit()刪除含有NA的行
gene2go$GO[gene2go$GO=="-"]<-NA
gene2go<-na.omit(gene2go)
step3 提取KEGG信息
#把Emapper中的query列、KEGG_ko列提出出來(lái)
gene2ko <- emapper %>% dplyr::select(GID =query, Ko = KEGG_ko) %>% na.omit()
#將gene2ko的Ko列中的“Ko:”刪除欢摄,不然后面找pathway會(huì)報(bào)錯(cuò)
gene2ko$Ko <- gsub("ko:","",gene2ko$Ko)
#下載KO的json文件并【放到當(dāng)前文件夾下】熬丧,下載地址:https://www.genome.jp/kegg-bin/get_htext?ko00001
update_kegg <- function(json = "ko00001.json") {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))}}}
save(pathway2name, ko2pathway, file = "kegg_info.RData")}
調(diào)用函數(shù)后在本地創(chuàng)建kegg_info.RData文件
update_kegg()
載入kegg_info.RData文件
load(file = "kegg_info.RData")
根據(jù) gene2ko中的ko信息將ko2pathway中的pathway列提取出來(lái)
gene2pathway <- gene2ko %>% left_join(ko2pathway,by = "Ko") %>% dplyr::select(GID, Pathway) %>% na.omit()
在網(wǎng)站https://www.ncbi.nlm.nih.gov/taxonomy上查詢(xún)物種信息
tax_id="4577"
genus="zea"
species="mays"
去除重復(fù)
gene2go <- unique(gene2go)
gene2go <- gene2go[!duplicated(gene2go),]
gene2ko <- gene2ko[!duplicated(gene2ko),]
gene2pathway <- gene2pathway[!duplicated(gene2pathway),]
構(gòu)建OrgDb數(shù)據(jù)庫(kù)
makeOrgPackage(gene_info=gene_info,
go=gene2go,
ko=gene2ko,
pathway=gene2pathway,
version="0.0.1", #版本
maintainer = "landuomayi<********@gmail.com>", #修改為你的名字和郵箱
author = "landuomayi<********@gmail.com>", #修改為你的名字和郵箱
outputDir = ".", #輸出文件位置
tax_id=tax_id,
genus=genus,
species=species,
goTable="go")
2 進(jìn)行culsterprofiler GO、KEGG分析
2.1 導(dǎo)入自己構(gòu)建的 OrgDb
install.packages("org.Zmays.eg.db", repos=NULL, type="sources")
library(org.Zmays.eg.db)
columns(org.Zmays.eg.db)
2.2 讀入自己的差異表達(dá)基因列表
gene_list <- rownames(DEGs) #讀入自己的差異表達(dá)基因列表
2.3 KEGG分析
從 OrgDB 提取 Pathway 和基因的對(duì)應(yīng)關(guān)系
pathway2gene <- AnnotationDbi::select(org.Zmays.eg.db,
keys = keys(org.Zmays.eg.db),
columns = c("Pathway","Ko")) %>%
na.omit() %>%
dplyr::select(Pathway, GID)
導(dǎo)入 Pathway 與名稱(chēng)對(duì)應(yīng)關(guān)系
load("kegg_info.RData")
KEGG pathway 富集
kegg <- enricher(gene_list,
TERM2GENE = pathway2gene,
TERM2NAME = pathway2name,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
minGSSize = 1)
kegg_results <- as.data.frame(kegg)
kegg_bar <-barplot(kegg_results, showCategory=20,color="pvalue", font.size=10) #繪制柱狀圖
ggsave("kegg_bar.png",kegg_bar, width = 16, height = 9, units = "in", dpi = 600)# 保存圖片
kegg_dot <-dotplot(kegg_results, showCategory=20,color="pvalue", font.size=10) #繪制點(diǎn)狀圖
ggsave("kegg_dot.png",kegg_dot, width = 16, height = 9, units = "in", dpi = 600)# 保存圖片
2.4 GO分析
分子功能(Molecular Function)
erich.go.MF <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "MF",keyType = "GID")
g_MF<-barplot(erich.go.MF,showCategory = 15)
ggsave("GO_MF.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)
生物過(guò)程(Biological Process)
erich.go.BP <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "BP",keyType = "GID")
g_BP<-barplot(erich.go.BP,showCategory = 15)
ggsave("GO_BP.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)
細(xì)胞組成(Cellular Component)
erich.go.CC <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "CC",keyType = "GID")
g_CC<-barplot(erich.go.CC,showCategory = 15)
ggsave("GO_CC.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)