1. Gene_ID轉(zhuǎn)換
手動(dòng)把差異表格中基因那一列(如gene-Q0020牛隅,gene-替換掉 ),在gene_id那一列加上列名ENSEMBL酌泰,另存為csv格式再上傳至服務(wù)器媒佣。
#進(jìn)入R
load("diff.RData")
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("org.Sc.sgd.db")
BiocManager::install("clusterProfiler")
#首次使用需要install
這時(shí)候如果用系統(tǒng)的R,可能安裝不了"org.Sc.sgd.db"陵刹,可以用conda下載R4.0到自己服務(wù)器默伍,org.Sc.sgd.db就可以正常加載啦。如果把自己的R4.0加載到環(huán)境變量了,后續(xù)使用要注意R的環(huán)境也糊,會(huì)不會(huì)和系統(tǒng)R時(shí)有沖突炼蹦。
library(org.Sc.sgd.db)
library(clusterProfiler)
upTable <- read.csv("SY14_up_2.csv", header = TRUE)
head(upTable)
aTable <- bitr(upTable[,1],fromType = 'ENSEMBL', toType = c('ENTREZID','GENENAME'), OrgDb = 'org.Sc.sgd.db')
#轉(zhuǎn)換
tTable <- merge(aTable,upTable, by= "ENSEMBL")
#合并
head(tTable)
write.csv(tTable,file ="up_symbol.csv")
#輸出up_symbol,最后兩列增加了ENTREZID狸剃,GENENAME
downTable <- read.csv("SY14_down_2.csv", header = TRUE)
head(downTable)
bTable <- bitr(downTable[,1],fromType = 'ENSEMBL', toType = c('ENTREZID','GENENAME'), OrgDb = 'org.Sc.sgd.db')
uTable <- merge(bTable,downTable, by="ENSEMBL")
head(uTable)
write.csv(uTable,file = "down_symbol.csv")
#同理輸出down_symbol
2. GO富集分析
上調(diào)基因
vi go_up.R
#!bin/bash
library(org.Sc.sgd.db)
library(clusterProfiler)
UP <- read.csv("up_symbol.csv", header=TRUE)
head(UP)
a<- UP[,3]
head(a)
GO_UP <- enrichGO(UP[,3], keyType = "ENTREZID", OrgDb = "org.Sc.sgd.db", ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05)
head(GO_UP)
#<0 rows> (or 0-length row.names)
#由于差異基因太少掐隐,未富集到
下調(diào)基因
padj由0.001調(diào)整為0.05,再做GO分析钞馁。
down <- read.csv("down_symbol.csv", header=TRUE)
head(down)
#a<- down[,3]
#head(a)
GO_down <- enrichGO(down[,3], keyType = "ENTREZID", OrgDb = "org.Sc.sgd.db", ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05)
head(GO_down)
pdf("GO_DOWN.pdf",width = 25, height = 25)
dotplot(GO_down, showCategory=10,title="EnrichmentGO")
dev.off()
#showCategory指定展示的GO Terms的個(gè)數(shù)虑省,默認(rèn)展示顯著富集的top10個(gè),即p.adjust最小的10個(gè)僧凰。
3 GSEA (Gene Set Enrichment Analysis) 基因集富集分析
GSEA 是一種計(jì)算方法探颈,使用預(yù)先定義的基因集gene set(比如想關(guān)注基因是否參與DNA REPAIR,可選擇hallmark gene set)训措,將基因按照在兩類樣本中的FoldChange排序得到gene list(按照差異表達(dá)倍數(shù)從大到小排序)伪节,觀察預(yù)先定義的基因集是在這個(gè)gene list的頂端還是底端富集。若參與某通路的基因密集排列在gene list頂端(Leading edge subset)绩鸣,即顯著上調(diào)怀大,排列在底端即顯著下調(diào)。
3.1 準(zhǔn)備排序后的gene list
BiocManager::install("msigdbr")
#msigdbr 包提供多個(gè)物種的MSigDB (Explore the Molecular Signatures Database)數(shù)據(jù)全闷,是注釋基因集的集合
BiocManager::install("dplyr")
#dplyr是R語言的數(shù)據(jù)分析包,能對(duì)dataframe類型的數(shù)據(jù)做很方便的數(shù)據(jù)處理和分析操作萍启。
library(clusterProfiler)
library(org.Sc.sgd.db)
library(msigdbr)
library(dplyr)
library(ggplot2)
library(stringr)
exp <- read.csv("SY14_VSBY4742_2.csv", header=TRUE)
head(exp)
gene_ID <- bitr(exp$ENSEMBL, fromType = 'ENSEMBL', toType = c('ENTREZID','GENENAME'), OrgDb = 'org.Sc.sgd.db')
#gene_ID轉(zhuǎn)換
head(gene_ID)
dim(gene_ID)
#5915 3
diff <-merge(exp,gene_ID, by = "ENSEMBL")
head(diff)
glist <- diff$log2FoldChange
head(glist)
names(glist) = diff$ENTREZID #定義好glist总珠,再輸入這一列,否則報(bào)錯(cuò)行數(shù)不一致
head(glist)
glist = sort(glist, decreasing = T)
head(glist)
#準(zhǔn)備好的genelist按ENTREZID和FoldChange排序
3.2 準(zhǔn)備功能集 gene set
The MSigDB gene sets are divided into 9 major collections:H, C1 ~ C8.
H, hallmark gene sets, 聚合許多MSigDB基因集來表達(dá)明確的生物狀態(tài)或過程而獲得的一些特征勘纯。
Sc <- msigdbr(species = "Saccharomyces cerevisiae")
table(Sc$gs_cat)
#查看目錄
Sc[str_detect(Sc$gs_name,"HALLMARK_DNA_REPAIR"),]
#查看Sc中HALLMARK_DNA_REPAIR基因集的gene
#A tibble: 97 × 18
gs_cat gs_subcat gs_name gene_symbol entrez_gene ensembl_gene
<chr> <chr> <chr> <chr> <int> <chr>
1 H "" HALLMARK_DNA_REPAIR AAH1 855581 YNL141W
2 H "" HALLMARK_DNA_REPAIR ADK2 856917 YER170W
3 H "" HALLMARK_DNA_REPAIR APT1 854986 YML022W
hallmark <- msigdbr(species = "Saccharomyces cerevisiae",category = "H") %>% dplyr::select(gs_name,entrez_gene)
#%>%來自dplyr包的管道函數(shù)局服,其作用是將前一步的結(jié)果直接傳參給下一步的函數(shù),
#select(gs_name,entrez_gene)驳遵,篩選gs_name和entrez_gene之間的所有列淫奔,
head(hallmark)
#gs_name entrez_gene
<chr> <int>
1 HALLMARK_ADIPOGENESIS 851013
2 HALLMARK_ADIPOGENESIS 852667
dim(hallmark)
#[1] 2759 2
gsea <- GSEA(glist, TERM2GENE = hallmark,verbose=FALSE, pvalueCutoff = 0.2)
head(gsea)
#查看富集到的geneset
library(enrichplot)
pdf("gsea_INTERFERON_ALPHA_RESPONSE.pdf",width = 20, height = 15)
#這里選INTERFERON_ALPHA_RESPONSE 基因集作圖
gseaplot2(gsea, geneSetID="HALLMARK_INTERFERON_ALPHA_RESPONSE",color = "firebrick",rel_heights=c(1.5, 0.5, 1),subplots=1:3,pvalue_table = TRUE,ES_geom = "line" )
dev.off()
core_enrichment,即leading edge subset堤结,定義其中對(duì)Enrichment score貢獻(xiàn)最大的基因?yàn)楹诵幕颉?br> 我選的一個(gè)基因集結(jié)果~~