有時(shí)候我們想看某一特定通路的富集情況纹磺,這時(shí)就需要獲取該通路的基因集。
一钮惠、從 GO 中獲取
下載腳本 getGoTerm.R茅糜,獲取所有的 GO geneSet,然后保存為RData素挽,因?yàn)?get_GO_data
這一步非常耗時(shí)蔑赘。
source("getGoTerm.R")
GO_DATA <- get_GO_data("org.Hs.eg.db", "ALL", "SYMBOL")
save(GO_DATA, file = "GO_DATA.RData")
然后,我寫了兩個(gè)小函數(shù) findGO
和 getGO
分別用于尋找 GO ID 和獲取 GO geneSet。
findGO <- function(pattern, method = "key"){
if(!exists("GO_DATA"))
load("GO_DATA.RData")
if(method == "key"){
pathways = cbind(GO_DATA$PATHID2NAME[grep(pattern, GO_DATA$PATHID2NAME)])
} else if(method == "gene"){
pathways = cbind(GO_DATA$PATHID2NAME[GO_DATA$EXTID2PATHID[[pattern]]])
}
colnames(pathways) = "pathway"
if(length(pathways) == 0){
cat("No results!\n")
} else{
return(pathways)
}
}
getGO <- function(ID){
if(!exists("GO_DATA"))
load("GO_DATA.RData")
allNAME = names(GO_DATA$PATHID2EXTID)
if(ID %in% allNAME){
geneSet = GO_DATA$PATHID2EXTID[ID]
names(geneSet) = GO_DATA$PATHID2NAME[ID]
return(geneSet)
} else{
cat("No results!\n")
}
}
用法示例
load("GO_DATA.RData") # 載入數(shù)據(jù) GO_DATA
findGO("insulin") # 尋找 含有指定關(guān)鍵字的 pathway name 的 pathway
findGO("INS", method = "gene") # 尋找含有指定基因名的 pathway
getGO("GO:1901142") # 獲取指定 GO ID 的 gene set
$`insulin metabolic process`
[1] "CEACAM1" "CPE" "ERN1" "IDE" "PCSK2" "ERO1B"
二缩赛、從KEGG 中獲取
1. 獲取目標(biāo)通路的ID號(hào)
以人類糖尿病相關(guān)通路為例耙箍,訪問 https://www.kegg.jp/kegg/pathway.html ,在 Select prefix
中輸入人類的縮寫 hsa
酥馍,Enter keywords
中輸入關(guān)鍵詞 diabetes
辩昆,檢索到了8個(gè)相關(guān)通路,點(diǎn)進(jìn)第一個(gè) Type II diabetes mellitus
旨袒,可以查看詳細(xì)信息汁针,ID號(hào)為 hsa04930。
2. 開始獲取
# 方法一
if(!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("KEGGREST", version = "3.10")
library("KEGGREST")
gsInfo = keggGet('hsa04930')[[1]]
names(gsInfo)
geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
geneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
names(geneSet) = gsInfo$NAME
> geneSet
$`Type II diabetes mellitus - Homo sapiens (human)`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
# 方法二
library("rjson")
download.file("http://togows.dbcls.jp/entry/pathway/hsa04930/genes.json", "hsa04930.json")
json = fromJSON(file ="hsa04930.json")
geneSet = list(as.character(sapply(json[[1]], function(x) sapply(strsplit(x[1], ";"), function(x) x[1]))))
> geneSet
[[1]]
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
不過第二種方法不能獲取geneSet的 NAME
砚尽,因此更推薦第一種方法施无。
同樣將以上步驟包裝為函數(shù):
getKEGG <- function(ID){
library("KEGGREST")
gsList = list()
for(xID in ID){
gsInfo = keggGet(xID)[[1]]
if(!is.null(gsInfo$GENE)){
geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
xgeneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
NAME = sapply(strsplit(gsInfo$NAME, " - "), function(x) x[1])
names(xgeneSet) = NAME
gsList[NAME] = xgeneSet
} else{
cat(" ", xID, "No corresponding gene set in specific database.\n")
}
}
return(gsList)
}
> getKEGG('hsa04930')
$`Type II diabetes mellitus - Homo sapiens (human)`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
> getKEGG(c("hsa04930", "hsa04940", "hsa04950"))
$`Type II diabetes mellitus`
[1] "INS" "INSR" "IRS1" "IRS2" "IRS4" "PIK3R1" "PIK3R2"
[8] "PIK3R3" "PIK3CA" "PIK3CD" "PIK3CB" "SLC2A4" "ADIPOQ" "MAPK1"
[15] "MAPK3" "MTOR" "PRKCZ" "SOCS1" "SOCS2" "SOCS3" "SOCS4"
[22] "IKBKB" "MAPK8" "MAPK10" "MAPK9" "TNF" "PRKCD" "PRKCE"
[29] "PDX1" "MAFA" "SLC2A2" "HK3" "HK1" "HK2" "HKDC1"
[36] "GCK" "PKM" "PKLR" "KCNJ11" "ABCC8" "CACNA1C" "CACNA1D"
[43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
$`Type I diabetes mellitus`
[1] "INS" "GAD1" "GAD2" "PTPRN" "PTPRN2" "CPE"
[7] "HSPD1" "ICA1" "HLA-DMA" "HLA-DMB" "HLA-DOA" "HLA-DOB"
[13] "HLA-DPA1" "HLA-DPB1" "HLA-DQA1" "HLA-DQA2" "HLA-DQB1" "HLA-DRA"
[19] "HLA-DRB1" "HLA-DRB3" "HLA-DRB4" "HLA-DRB5" "CD80" "CD86"
[25] "CD28" "IL12A" "IL12B" "IL2" "IFNG" "HLA-A"
[31] "HLA-B" "HLA-C" "HLA-F" "HLA-G" "HLA-E" "FASLG"
[37] "FAS" "PRF1" "GZMB" "LTA" "TNF" "IL1A"
[43] "IL1B"
$`Maturity onset diabetes of the young`
[1] "HHEX" "MNX1" "ONECUT1" "PDX1" "NR5A2" "NEUROG3" "NKX2-2"
[8] "NKX6-1" "PAX6" "PAX4" "NEUROD1" "RFX6" "HES1" "HNF1B"
[15] "FOXA2" "MAFA" "HNF4A" "HNF1A" "HNF4G" "FOXA3" "PKLR"
[22] "SLC2A2" "INS" "IAPP" "GCK" "BHLHA15"