新的任務(wù),找到這 276 genes encompassed nine major DDR pathways:
base excision repair (BER),
nucleotide excision repair (NER),
mismatch repair (MMR),
the Fanconi anemia (FA) pathway,
homology-dependent recombination (HR),
non-homologous DNA end joining (NHEJ),
direct damage reversal repair (DR),
translesion DNA synthesis (TLS),
nucleotide pool maintenance (NP)
然后搞清楚每個基因出現(xiàn)在多少個通路塑陵,跟上次的任務(wù)比較像衬潦,來自于Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlashttps://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf
文章在線網(wǎng)址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/
276個基因在補充材料表格1回右,截圖如下:
根據(jù)Jimmy老師提示KEGGREST包含KEGG通路里的所有基因,那么反過來理論上也是可行的漱挚。所以查找KEGGREST包的說明翔烁,了解這個包的使用。
rm (list=ls())
Sys.setenv(LANGUAGE = "en") #顯示英文報錯信息
options(stringsAsFactors = FALSE) #禁止chr轉(zhuǎn)成factor
{ #install.packages
options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if(!require("xlsx")) install.packages("xlsx",update = F,ask = F)
if(!require("ggplot2")) install.packages("ggplot2",update = F,ask = F)
if(!require("tidyverse")) install.packages("tidyverse",update = F,ask = F)
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
if(!require("KEGGREST")) BiocManager::install("KEGGREST",update = F,ask = F)
if(!require("clusterProfiler")) BiocManager::install("clusterProfiler",update = F,ask = F)
if(!require("UpSetR")) BiocManager::install("UpSetR",update = F,ask = F)
}
# 載入數(shù)據(jù)旨涝,補充材料表1
library("xlsx")
res<- read.xlsx("1-s2.0-S2211124718304376-mmc2.xlsx", 1, header=F, colClasses=NA)[-(1:2),1:2]
這里可以查看差異基因在哪些KEGG通路
library(KEGGREST)
listDatabases()
res1 <- paste("hsa:",res[-1, 1]) #根據(jù)包說明書對變量重命名
res2 <- gsub(" ", "", res1) #正則表達(dá)式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
開始進(jìn)行分析
DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
# 批量提取幾個pathway的基因
DDRgene <- lapply(DDR.list, function(i){
#i="hsa03410"
d1 <- KEGGREST::keggGet(i)[[1]]$GENE
gene <- sapply(seq(2, length(d1), 2), function(x){
d2 <- unlist(strsplit(d1[x], ";"))[1]
})
})
#有三個通路在KEGG數(shù)據(jù)庫中找不到蹬屹,在其他數(shù)據(jù)庫
#Reactome/wikipathways/SMPDB/Reactome/BioCarta Pathway/Pathway Commons找到的,但是與原文有些不一致
DDR.list2 <- as.list(read.csv("NP.csv"))
DDR.list2
# $Gene.Name
# [1] "NUDT1" "NUDT15" "NUDT18" "RRM1" "RRM2"
DDR.list3 <- as.list(read.csv("TLS.csv"))
DDR.list3
#$Gene.Name
[1] "REV3L" "SPRTN" "DDX11" "TACC3" "NUP160" "SPDL1" "PARP3"
[8] "TPR" "POLQ" "EIF2AK2" "NDC1" "ZW10" "AURKA" "TTC28"
[15] "POLE2" "NIN" "CEP128" "PABPN1" "PABPC1L" "RAE1" "CDC25B"
[22] "FAM83D" "POLI" "STAG2" "EMD" "BEX4" "PARP4" "RPA3"
[29] "POLE4" "MAD2L2" "ADPRHL2" "CDC20" "NEK2" "CENPF" "RPA2"
[36] "POLK" "DLGAP5" "PARP2" "RPA1" "VPS4A" "PCNA" "REV1"
[43] "FLNB" "CKAP2" "NUPL2" "ODF2" "CDK5RAP2" "NUMA1" "MAPKBP1"
[50] "TUBGCP4" "RMDN3" "PPM1B" "ACTR2" "KIF11" "KIF20B" "PARP9"
[57] "EIF4A3" "PARP1" "POLE3" "TUT1" "DSN1" "LATS2" "UBC"
[64] "TIPARP" "RCHY1" "TOPBP1" "POC1A" "MAD2L1" "CEP44" "PAPD4"
[71] "VCP" "UBB" "NUDCD2" "POLH" "USP32" "WDR73" "POLE"
[78] "CETN1" "UBE2N" "CALML3" "CALML5" "PARP10" "PARPBP" "BRCC3"
[85] "PARG"
DDR.list4<- as.list (read.csv("DR.csv"))
DDR.list4
# $Gene.Name
# [1] "ALKBH5" "ASCC2" "ASCC3" "ASCC1" "FTO" "ALKBH3" "MGMT" "ALKBH2"
merged.list <- c( DDRgene , DDR.list2, DDR.list3, DDR.list4)
#names(merged.list) <- c(1:9)
intersect(res[-1, 2], DDRgene[[1]])
overlap <- lapply(1:9, function(x){
res <- intersect(res[-1, 2], merged.list[[x]])
})
Reduce(intersect,overlap) # 也沒有overlap
library(ggplot2)
library(tidyverse)
count <- unlist(overlap) %>% table()
count <- as.data.frame(count) %>% base::subset(Freq > 1)
names(count)[1] <- "gene"
ggplot(count, aes(gene, Freq,colour=Freq, size =Freq))+
geom_point(stat="identity")+coord_flip()
做個韋恩圖
library(UpSetR)
listinput <- list(dfgene = res[-1, 2],
BER = merged.list[[1]],
NER = merged.list[[2]],
MMR = merged.list[[3]],
HR = merged.list[[4]],
NHEJ = merged.list[[5]],
FA = merged.list[[6]],
NP = merged.list[[7]],
TLS = merged.list[[8]],
DR = merged.list[[9]])
pdf(file='upset.pdf',height = 8,width = 8)
p <- upset(fromList(listinput),nsets = 9, order.by = "freq")
dev.off()
單個基因/單個通路
keggLink("pathway", "hsa:7157" )#單個基因白华,比如TP53
png <- keggGet("path:hsa01522", "image")# 看下通路
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
做個富集分析
library(clusterProfiler)
kegg.result <- enrichKEGG(gene=res[-1,1],
organism="hsa",
pvalueCutoff=0.05,
pAdjustMethod="BH",
qvalueCutoff=0.1,
keyType = "kegg")
barplot(kegg.result)
dotplot(kegg.result)
write.csv(kegg.result,"kegg.csv")
注意:這個任務(wù)中最關(guān)鍵的是如何確定九條信號通路中有哪些基因哩治,但是各個數(shù)據(jù)庫并不一致,這是造成與作者原圖有出路的主要原因衬鱼,作者在補充材料表格3中有276個基因的分類业筏,可以參考。
特別感謝內(nèi)蒙古生信菜鳥團尋找DDR通路中關(guān)鍵基因
鸟赫,部分代碼參考他寫的帖子蒜胖!
參考文獻(xiàn)
Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
KEGG學(xué)習(xí)筆記
簡介Bioconductor中的幾個注釋信息數(shù)據(jù)庫
生信技能樹公益視頻合輯:學(xué)習(xí)順序是linux,r抛蚤,軟件安裝台谢,geo,小技巧岁经,ngs組學(xué)朋沮!
B站鏈接:https://m.bilibili.com/space/338686099
YouTube鏈接:https://m.youtube.com/channel/UC67sImqK7V8tSWHMG8azIVA/playlists
生信工程師入門最佳指南:https://mp.weixin.qq.com/s/vaX4ttaLIa19MefD86WfUA
學(xué)徒培養(yǎng):https://mp.weixin.qq.com/s/3jw3_PgZXYd7FomxEMxFmw
生信技能樹 - 簡書 http://www.reibang.com/u/d645f768d2d5
https://www.wikipathways.org/index.php/Pathway:WP1928
https://www.cnblogs.com/muchen/p/5412278.html