復(fù)盤一下荡陷,找基因所在的通路下
新的任務(wù)缩筛,找到這 276 genes encompassed nine major DDR pathways:
base excision repair (BER),
nucleotide excision repair (NER),
mismatch repair (MMR),
the Fanconi anemia (FA) pathway,
homology-dependent recombination (HR),
non-homologous DNA end joining (NHEJ),
direct damage reversal repair (DR),
translesion DNA synthesis (TLS),
nucleotide pool maintenance (NP)
然后搞清楚每個(gè)基因出現(xiàn)在多少個(gè)通路会油,跟上次的任務(wù)比較像蛤高,來自于Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlashttps://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf
文章在線網(wǎng)址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/
根據(jù)Jimmy老師提示KEGGREST包含KEGG通路里的所有基因,那么反過來理論上也是可行的吕世。所以查找KEGGREST包的說明彰触,了解這個(gè)包的使用。
了解一下數(shù)據(jù)
讀入數(shù)據(jù)
rm (list=ls())
Sys.setenv(LANGUAGE = "en") #顯示英文報(bào)錯(cuò)信息
options(stringsAsFactors = FALSE) #禁止chr轉(zhuǎn)成factor
{ #install.packages
options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if(!require("xlsx")) install.packages("xlsx",update = F,ask = F)
if(!require("ggplot2")) install.packages("ggplot2",update = F,ask = F)
if(!require("tidyverse")) install.packages("tidyverse",update = F,ask = F)
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
if(!require("KEGGREST")) BiocManager::install("KEGGREST",update = F,ask = F)
if(!require("clusterProfiler")) BiocManager::install("clusterProfiler",update = F,ask = F)
if(!require("UpSetR")) BiocManager::install("UpSetR",update = F,ask = F)
}
# 載入數(shù)據(jù)命辖,補(bǔ)充材料表1
library("xlsx")
res<- read.xlsx("1-s2.0-S2211124718304376-mmc2.xlsx", 1, header=F, colClasses=NA)[-(1:2),1:2]
head(res)
# colnames(res) <- res[1,]
# res <- res[-1,]
colnames(res) <- c("Entrez_Gene_ID","Gene_Symbol")
head(res)
這里可以查看差異基因在哪些KEGG通路
library(KEGGREST)
listDatabases()
[1] "pathway" "brite" "module" "ko" "genome" "vg" "ag" "compound"
[9] "glycan" "reaction" "rclass" "enzyme" "disease" "drug" "dgroup" "environ"
[17] "genes" "ligand" "kegg"
#
res1 <- paste("hsa:",res[-1, 1]) #根據(jù)包說明書對(duì)變量重命名
res2 <- gsub(" ", "", res1) #正則表達(dá)式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
分析流程:
res1 <- paste("hsa:",res[-1, 1]) #根據(jù)包說明書對(duì)變量重命名
res2 <- gsub(" ", "", res1) #正則表達(dá)式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
dim(as.data.frame(res3))
#開始進(jìn)行分析
DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
# 批量提取幾個(gè)pathway的基因
DDRgene <- lapply(DDR.list, function(i){
#i="hsa03410"
d1 <- KEGGREST::keggGet(i)[[1]]$GENE
gene <- sapply(seq(2, length(d1), 2), function(x){
d2 <- unlist(strsplit(d1[x], ";"))[1]
})
})
#有三個(gè)通路在KEGG數(shù)據(jù)庫中找不到况毅,在其他數(shù)據(jù)庫
#https://reactome.org/PathwayBrowser/找到的分蓖,但是與原文有些不一致
#該數(shù)據(jù)庫的使用方法參考陳同老師的科學(xué)網(wǎng)博客科學(xué)網(wǎng)—沒錢買KEGG怎么辦?REACTOME開源通路更強(qiáng)大 - 陳同的博文 http://blog.sciencenet.cn/blog-118204-1142737.html
#direct damage reversal/repair (DR)
#PS:下邊雖然找到了該通路尔许,由于該數(shù)據(jù)庫通路名字的原因可能并沒有找全
#molecule選項(xiàng)里面的
'
Proteins P16455 UniProt:P16455 MGMT
Proteins Q6NS38 UniProt:Q6NS38 ALKBH2
Proteins Q6P6C2 UniProt:Q6P6C2 ALKBH5
Proteins Q9C0B1 UniProt:Q9C0B1 FTO
Proteins Q9H1I8 UniProt:Q9H1I8 ASCC2
Proteins Q8N3C0 UniProt:Q8N3C0 ASCC3
Proteins Q8N9N2 UniProt:Q8N9N2 ASCC1
Proteins Q96Q83 UniProt:Q96Q83 ALKBH3'
# genenames <- 'Gene.Name'
# ddr <- c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
Gene.Name <- c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
DDR.list2 <- list(Gene.Name=Gene.Name)
#同樣的方法分別構(gòu)建translesion DNA synthesis (TLS) 和 and nucleotide pool maintenance (NP)的通路基因
#translesion DNA synthesis (TLS)
#由于TLS這個(gè)通路里面有32個(gè)基因么鹤,所以可以從文本里面導(dǎo)入。
#與參考博文不一致母债;參考博文是85個(gè)午磁,他應(yīng)該是把該條通路下面的所有基因匯總了尝抖。
#我覺得沒有必要毡们,上邊的都沒有匯總
TLS <- read.xlsx("TLS.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
TLS <- data.frame(TLS)
colnames(TLS) <- c("Gene.Name")
library(stringr)
TLS <- str_split(as.character(TLS$Gene.Name)," ",simplify = T)[,2]
TLS <- list(Gene.Name =TLS)
DDR.list3 <- TLS
#nucleotide pool maintenance (NP)
#這個(gè)沒有找到非常匹配的
#按照提示,選取包括基因最多的通路DNA Repair
NP <- read.xlsx("NP.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
NP <- data.frame(NP)
colnames(NP) <- c("Gene.Name")
NP <- str_split(as.character(NP$Gene.Name)," ",simplify = T)[,2]
NP <- list(Gene.Name =NP)
DDR.list4 <- NP
class(NP)
merged.list <- c( DDRgene , DDR.list2, DDR.list3, DDR.list4)
#names(merged.list) <- c(1:9)
# 文章中276個(gè)基因和5條通路的交集
# overlap <- lapply(1:5, function(x){
# res <- intersect(gene$Gene_Symbol, DDRgene[[x]])
# })
intersect(res[-1, 2], DDRgene[[1]])
overlap <- lapply(1:9, function(x){
res <- intersect(res[-1, 2], merged.list[[x]])
})
Reduce(intersect,overlap) # 也沒有overlap
library(ggplot2)
library(tidyverse)
count <- unlist(overlap) %>% table()
count <- as.data.frame(count) %>% base::subset(Freq > 1)
names(count)[1] <- "gene"
ggplot(count, aes(gene, Freq),size =1)+
geom_bar(stat="identity")+
coord_flip()+
theme_bw()
ggplot(count, aes(gene, Freq, colour=Freq, size =Freq))+
geom_point(stat="identity")+
coord_flip()+
theme_bw()
韋恩圖
#沒有生成圖片
if(F){
library(UpSetR)
listinput <- list(dfgene = res[-1, 2],
BER = merged.list[[1]],
NER = merged.list[[2]],
MMR = merged.list[[3]],
HR = merged.list[[4]],
NHEJ = merged.list[[5]],
FA = merged.list[[6]],
NP = merged.list[[7]],
TLS = merged.list[[8]],
DR = merged.list[[9]])
pdf(file='upset.pdf',height = 8,width = 8)
p <- upset(fromList(listinput),nsets = 9, order.by = "freq")
dev.off()
}
單個(gè)基因/單個(gè)通路
keggLink("pathway", "hsa:7157" )#單個(gè)基因昧辽,比如TP53
png <- keggGet("path:hsa01522", "image")# 看下通路
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
注意:這個(gè)任務(wù)中最關(guān)鍵的是如何確定九條信號(hào)通路中有哪些基因衙熔,但是各個(gè)數(shù)據(jù)庫并不一致,這是造成與作者原圖有出路的主要原因搅荞,作者在補(bǔ)充材料表格3中有276個(gè)基因的分類红氯,可以參考。
參考文獻(xiàn)
Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
KEGG學(xué)習(xí)筆記
簡(jiǎn)介Bioconductor中的幾個(gè)注釋信息數(shù)據(jù)庫