僅針對(duì)人類而言。
雖然一直對(duì)GO和KEGG不感冒艳悔,但流程化的分析還是要做的易阳。主要包括兩部分:
- 將基因編號(hào)轉(zhuǎn)為ENTREZID(具有唯一性):基因編號(hào)來(lái)自ANNOVAR的注釋結(jié)果,建議別用SYMBOL滤灯,因?yàn)檫@種名稱特異性較差巍杈,在轉(zhuǎn)成ENTREZID時(shí)可能出現(xiàn)不唯一的現(xiàn)象忧饭。symbol與entrezID并不是絕對(duì)的一一對(duì)應(yīng)的
- 利用ClusterProfiler進(jìn)行富集分析:Y叔更新快,不用擔(dān)心數(shù)據(jù)庫(kù)過(guò)時(shí)筷畦,操作方便眷昆,出圖好看易調(diào)節(jié)。
各類數(shù)據(jù)庫(kù)的ID汁咏,非常多,常見(jiàn)(用)的基本都包含在library(org.Hs.eg.db)中了作媚。
![這么多種ID][1]
01 基因編號(hào)轉(zhuǎn)換
人類的數(shù)據(jù)可以利用library(org.Hs.eg.db)來(lái)進(jìn)行轉(zhuǎn)換攘滩。另外,org.db上有20個(gè)物種的數(shù)據(jù)庫(kù)可供使用
http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
***強(qiáng)烈建議看下官方文檔纸泡,針對(duì)數(shù)據(jù)庫(kù)的使用有非常詳細(xì)的介紹漂问,好用~
將ENSEMBL編號(hào)轉(zhuǎn)換為ENTREZID
輸入文件example_ensGene:ENSEMBL的列表
source("http://bioconductor.org/biocLite.R")
biocLite(org.Hs.eg.db) #此為人類基因編號(hào)系統(tǒng);mouse為org.Mm.eg.db
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db) #查看基因編號(hào)系統(tǒng)名稱
EG2Ensembl=toTable(org.Hs.egENSEMBL) #將ENTREZID和ENSEMBL對(duì)應(yīng)的數(shù)據(jù)存入該變量
#gene_id ensembl_id
#1 ENSG00000121410
ens=read.table("example_ensGene")
#ENSG00000092621
ens=ens$V1
geneLists=data.frame(ensembl_id=ens)
results=merge(geneLists,EG2Ensembl,by='ensembl_id',all.x=T)
id=na.omit(results$gene_id) #提取出非NA的ENTREZID
gene=id
#[1] "728642" "728642" "728642" "728642" "5725" "23357"
將SYMBOL轉(zhuǎn)為ENTREZID
自定義函數(shù)女揭,然后批量轉(zhuǎn)換蚤假,來(lái)自生信技能樹(shù)中JIMMY的貼子,親測(cè)可用吧兔。
geneIDannotation <- function(geneLists=c(1,2,9),name=T,map=T,ensemble=F,accnum=F){
## input ID type : So far I just accept entrezID or symbol
## default, we will annotate the entrezID and symbol, chromosone location and gene name
suppressMessages(library("org.Hs.eg.db"))
all_EG=mappedkeys(org.Hs.egSYMBOL)
EG2Symbol=toTable(org.Hs.egSYMBOL)
if( all(! geneLists %in% all_EG) ){
inputType='symbol'
geneLists=data.frame(symbol=geneLists)
results=merge(geneLists,EG2Symbol,by='symbol',all.x=T)
}else{
inputType='entrezID'
geneLists=data.frame(gene_id=geneLists)
results=merge(geneLists,EG2Symbol,by='gene_id',all.x=T)
}
if ( name ){
EG2name=toTable(org.Hs.egGENENAME)
results=merge(results,EG2name,by='gene_id',all.x=T)
}
if(map){
EG2MAP=toTable(org.Hs.egMAP)
results=merge(results,EG2MAP,by='gene_id',all.x=T)
}
if(ensemble){
EG2ENSEMBL=toTable(org.Hs.egENSEMBL)
results=merge(results,EG2ENSEMBL,by='gene_id',all.x=T)
}
if(accnum){
EG2accnum=toTable(org.Hs.egREFSEQ)
results=merge(results,EG2MAP,by='gene_id',all.x=T)
}
return(results)
}
geneIDannotation()
geneIDannotation(c('TP53','BRCA1','KRAS','PTEN'))
一個(gè)超級(jí)方便快捷的ID轉(zhuǎn)換方法
利用ClusterProfiler中的bitr
require(ClusterProfiler)
x <- c("GPX3", "GLRX", "LBP", "CRYAB", "DEFB1", "HCLS1", "SOD2", "HSPA2",
"ORM1", "IGFBP1", "PTHLH", "GPC3", "IGFBP3","TOB1", "MITF", "NDRG1",
"NR1H4", "FGFR3", "PVR", "IL6", "PTPRM", "ERBB2", "NID2", "LAMB1",
"COMP", "PLS3", "MCAM", "SPP1", "LAMC1", "COL4A2", "COL4A1", "MYOC",
"ANXA4", "TFPI2", "CST6", "SLPI", "TIMP2", "CPM", "GGT1", "NNMT",
"MAL", "EEF1A2", "HGD", "TCN2", "CDA", "PCCA", "CRYM", "PDXK",
"STC1", "WARS", "HMOX1", "FXYD2", "RBP4", "SLC6A12", "KDELR3", "ITM2B")
eg = bitr(x, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
head(eg)
02 GO富集分析
over-representation test:enrichGO()
library(clusterProfiler)
ALL <- enrichGO(gene, "org.Hs.eg.db", keyType = "ENTREZID",ont = 'ALL',pvalueCutoff = 0.05,pAdjustMethod = "BH", qvalueCutoff = 0.1, readable=T) #一步到位
BP<-enrichGO(gene, "org.Hs.eg.db", keyType = "ENTREZID",ont = "BP",pvalueCutoff = 0.05,pAdjustMethod = "BH",qvalueCutoff = 0.1, readable=T) #3種分開(kāi)進(jìn)行富集
MF <- enrichGO(gene, "org.Hs.eg.db", keyType = "ENTREZID",ont = "MF",pvalueCutoff = 0.05,pAdjustMethod = "BH",qvalueCutoff = 0.1, readable=T)
CC <- enrichGO(gene, "org.Hs.eg.db", keyType = "ENTREZID",ont = "CC",pvalueCutoff = 0.05,pAdjustMethod = "BH",qvalueCutoff = 0.1, readable=T)
#gene: a vector of entrez gene id
#"org.Hs.eg.db":OrgDb
#keyType:keytype of input gene
#ont: One of "MF", "BP", and "CC" subontologies.
#pvalueCutoff:Cutoff value of pvalue.
#pAdjustMethod: one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
#universe: background genes
#qvalueCutoff: qvalue cutoff
#minGSSize: minimal size of genes annotated by Ontology term for testing.
#maxGSSize: maximal size of genes annotated for testing
#readable: whether mapping gene ID to gene Name(Symbol)
#pool: If ont=’ALL’, whether pool 3 GO sub-ontologies
write.table(as.data.frame(ALL@result), file="GOALL.txt",quote=FALSE)
#可以對(duì)富集結(jié)果進(jìn)行去冗余磷仰,方便查看關(guān)鍵信息;無(wú)法針對(duì)ALL進(jìn)行去冗余(我也不曉得為啥)
CC_simp <- simplify(CC, cutoff=0.7,by="p.adjust",select_fun=min)
BP_simp <- simplify(BP, cutoff=0.7,by="p.adjust",select_fun=min)
MF_simp <- simplify(MF, cutoff=0.7,by="p.adjust",select_fun=min)
write.table(as.data.frame(CC_simp@result), file="GO_simp.txt")
write.table(as.data.frame(BP_simp@result), file="GO_simp.txt",append=T,col.names=F)
write.table(as.data.frame(MF_simp@result), file="GO_simp.txt",append=T,col.names=F)
做出好看的泡泡圖
p=dotplot(ALL,x="count",
showCategory = 14,colorBy="pvalue") #showCategory即展示幾個(gè)分類,最好都展示(取ALL@result的行數(shù))
library(ggplot2)
library(Cairo)
CairoPDF("enrichGO.pdf",width=10,height=8) #PDF格式非常棒,可在PS中調(diào)整dpi
p=dotplot(ALL,showCategory = 14,
colorBy="pvalue",
font.size=18)
p + scale_size(range=c(2, 8)) #設(shè)置點(diǎn)的大小
#showCategory即展示幾個(gè)分類境蔼,最好都展示(取ALL@result的行數(shù))
#font.size設(shè)置文字大小
dev.off()
![enrichGO.png-182.3kB][2]
#條形圖
barplot(ALL, showCategory=15,title="EnrichmentGO_ALL")#條狀圖灶平,按p從小到大排的
(這個(gè)圖...好丑)
![Rplot.jpeg-143kB][3]
03 KEGG富集
kk <- enrichKEGG(gene = gene,
organism ='hsa',
pvalueCutoff = 0.05,
qvalueCutoff = 0.1,
minGSSize = 1,
#readable = TRUE ,
use_internal_data =FALSE)
write.table(as.data.frame(kk@result), file="test_kk.txt")
p=dotplot(kk,showCategory = 14,colorBy="pvalue",font.size=18)
效果同上圖
04 Reactome pathway enrichment analysis
#install
source("https://bioconductor.org/biocLite.R")
biocLite("ReactomePA")
require(org.Mm.eg.db)
keytypes(org.Mm.eg.db)
EG2Ensembl=toTable(org.Mm.egENSEMBL)
#convert gene ID
query=read.table("geneList.txt")
#ENSMUSG00000043572
#ENSMUSG00000040035
#ENSMUSG00000019564
#ENSMUSG00000042446
query=query$V1
geneLists=data.frame(ensembl_id=query)
results=merge(geneLists,EG2Ensembl,by='ensembl_id',all.x=T)
id=na.omit(results$gene_id) #提取出非NA的ENTREZID
gene=id
#Pathway enrichment analysis
require(ReactomePA)
x <- enrichPathway(gene=gene,pvalueCutoff=0.05, readable=T,organism = "mouse")
head(as.data.frame(x))
# ID Description GeneRatio BgRatio pvalue p.adjust qvalue
#R-MMU-422475 R-MMU-422475 Axon guidance 12/74 278/9315 1.658138e-06 0.0005073902 0.0004415883
#R-MMU-163685 R-MMU-163685 Integration of energy metabolism 7/74 90/9315 6.607488e-06 0.0010109457 0.0008798392
write.table(x,file="ReactomePA.xls",quote=F,sep="\t")
#visulization
barplot(x,showCategory=24)
dotplot(x,showCategory=24)
emapplot(x) #enrichment map
cnetplot(x, categorySize="pvalue", foldChange=gene) #cnetplot(x, categorySize="pvalue", foldChange=geneList)
Note: 這兩個(gè)包也可以用來(lái)做GSEA
[1]: http://static.zybuluo.com/fatlady/panpe77ea0yw9iqmgt0vt768/image_1casoc2uevrp6gr1f5b88mrqr9.png
[2]: http://static.zybuluo.com/fatlady/lmu9a8kk8bm1bm3psomi4hpu/rare_enrichGO.png
[3]: http://static.zybuluo.com/fatlady/x4btt326zwh3ay0tcj4bk5y3/Rplot.jpeg