與傳統(tǒng)的Over Representation Analysis: the up- or down-regulated DEGs和Gene Set Enrichment Analysis: All the ranked genes相比晌区,Gene Set Variation Analysis分析直接使用gene expression或RNA-seq profile matrix計算基因在通路中的得分诅诱,并且這種計算可以基于單樣本處理。
-
GSVA方法圖解
輸入表達譜數(shù)據(jù)和基因集數(shù)據(jù)霍转。計算其得分刨啸。
The GSVA package implements a non-parametric unsupervised method, called Gene Set Variation
Analysis (GSVA), for assessing gene set enrichment (GSE) in gene expression microarray and RNA-
seq data. In contrast to most GSE methods, GSVA performs a change in coordinate systems,
transforming the data from a gene by sample matrix to a gene set by sample matrix. Thereby
allowing for the evaluation of pathway enrichment for each sample. This transformation is done
without the use of a phenotype, thus facilitating very powerful and open-ended analyses in a now
pathway centric manner.
加載R包
knitr::opts_chunk$set(warning = F, message = F)
library(dplyr)
library(tibble)
library(ggplot2)
library(data.table)
library(GSVA)
rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)
grp <- c("S1", "S2")
grp.col <- c("#6C326C", "#77A2D1")
load data
- gene Expression DESeq2 object
phen <- fread("../../Result/phenotype/phenotype_cluster.csv")
geneExp <- fread("../../Result/profile/geneExp_filter.tsv")
# 處理數(shù)據(jù)成整型
geneExp <- geneExp %>% column_to_rownames("V1") %>% as.matrix()
geneExp <- round(geneExp) %>% data.frame() %>% rownames_to_column("V1")
pathways_hallmark_kegg <- fgsea::gmtPathways("../../Result/GeneID/msigdb.v7.1.symbols_KEGG.gmt")
pathways_hallmark_GO <- fgsea::gmtPathways("../../Result/GeneID/msigdb.v7.1.symbols_GO.gmt")
Curation Function
get_ExprSet <- function(x=phen,
y=geneExp){
# x=phen
# y=geneExp
sid <- intersect(x$Barcode, colnames(y))
# phenotype
phe <- x %>% filter(Barcode%in%sid) %>%
mutate(Cluster=factor(as.character(Cluster))) %>%
column_to_rownames("Barcode")
# profile
prf <- y %>% column_to_rownames("V1") %>%
dplyr::select(all_of(sid))
# determine the right order between profile and phenotype
for(i in 1:ncol(prf)){
if (!(colnames(prf)[i] == rownames(phe)[i])) {
stop(paste0(i, " Wrong"))
}
}
require(convert)
exprs <- as.matrix(prf)
adf <- new("AnnotatedDataFrame", data=phe)
experimentData <- new("MIAME",
name="Hua Zou", lab="Hua Lab",
contact="zouhua1@outlook.com",
title="KRIC Experiment",
abstract="The ExpressionSet",
url="www.zouhua.top",
other=list(notes="Created from text files"))
expressionSet <- new("ExpressionSet", exprs=exprs,
phenoData=adf,
experimentData=experimentData)
return(expressionSet)
}
get_score <- function(dataset=get_ExprSet(x=phen, y=geneExp),
geneset=pathways_hallmark_kegg,
methods="ssgsea"){
# dataset=get_ExprSet(x=phen, y=geneExp)
# geneset=pathways_hallmark_kegg
# methods="ssgsea"
dat_fit <- gsva(expr = dataset,
gset.idx.list = geneset,
method = methods,
min.sz = 5,
max.sz = 500,
kcdf = "Poisson")
res <- exprs(dat_fit) %>% t() %>%
data.frame() %>%
rownames_to_column("SampleID")
return(res)
}
# Differential methylation/copyNumber/methylation
get_limma <- function(dataset=methylation_set,
group_col=grp,
tag="methylation",
fc=1,
pval=0.05){
# dataset=methylation_set
# group_col=grp
# tag="methylation"
# fc=1
# pval=0.05
pheno <- pData(dataset)
if(tag == "methylation"){
# transform the beta value into M values via lumi package
require(lumi)
edata <- beta2m(exprs(dataset))
}else{
edata <- exprs(dataset)
}
require(limma)
design <- model.matrix(~0 + pheno$Cluster)
rownames(design) <- rownames(pheno)
colnames(design) <- group_col
exprSet <- edata
# show distribution
boxplot(exprSet)
plotDensities(exprSet)
# linear fitting
fit <- lmFit(exprSet, design, method = 'ls')
contrast <- makeContrasts("S1-S2", levels = design)
# eBayes
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
qqt(fit2$t, df = fit2$df.prior + fit2$df.residual, pch = 16, cex = 0.2)
abline(0, 1)
# differential features
diff_feature <- topTable(fit2, number = Inf, adjust.method = 'BH') %>%
rownames_to_column("Feature")
# prof[rownames(prof)%in%"EVI2A", ] %>% data.frame() %>% setNames("EVI2A") %>%
# rownames_to_column("SampleID") %>%
# inner_join(pheno %>%rownames_to_column("SampleID"), by = "SampleID") -> a1
# wilcox.test(EVI2A~Cluster, data = a1)
# ggplot(a1, aes(x=Cluster, y=EVI2A))+geom_boxplot()
diff_feature[which(diff_feature$logFC >= fc &
diff_feature$adj.P.Val < pval), "Enrichment"] <- group_col[1]
diff_feature[which(diff_feature$logFC <= -fc &
diff_feature$adj.P.Val < pval), "Enrichment"] <- group_col[2]
diff_feature[which(abs(diff_feature$logFC) < fc |
diff_feature$adj.P.Val >= pval), "Enrichment"] <- "Nonsignif"
diff_res <- diff_feature %>%
setNames(c("Feature", "log2FoldChange", "baseMean", "t",
"pvalue", "padj", "B", "Enrichment")) %>%
dplyr::select(Feature, everything()) %>%
arrange(padj, log2FoldChange) %>%
column_to_rownames("Feature")
res <- list(fit=fit2, diff_res=diff_res)
return(res)
}
gsva score
kegg_gsva <- get_score(dataset=get_ExprSet(x=phen, y=geneExp),
geneset=pathways_hallmark_kegg,
methods="gsva")
if(!dir.exists("../../Result/pathway/")){
dir.create("../../Result/pathway/", recursive = T)
}
write.csv(kegg_gsva, "../../Result/pathway/KEGG_gsva.csv", row.names = F)
# Differential Test
kegg_gsva_exprset <- get_ExprSet(x=phen, y=kegg_gsva %>% column_to_rownames("SampleID") %>% t() %>% data.frame() %>%rownames_to_column("V1"))
Diff_gsva <- get_limma(dataset =kegg_gsva_exprset, tag = "gsva")
DT::datatable(Diff_gsva$diff_res)
網(wǎng)盤地址: https://pan.baidu.com/s/1AGsvOD6klfljvu0T2A-akA 玲献;提取碼:vw7z
version
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19042)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
system code page: 936
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] GSVA_1.36.3 data.table_1.13.6 ggplot2_3.3.3 tibble_3.0.3 dplyr_1.0.2
loaded via a namespace (and not attached):
[1] fgsea_1.14.0 colorspace_1.4-1 ellipsis_0.3.1
[4] ggridges_0.5.2 qvalue_2.20.0 XVector_0.28.0
[7] GenomicRanges_1.40.0 rstudioapi_0.11 farver_2.0.3
[10] urltools_1.7.3 graphlayouts_0.7.1 ggrepel_0.9.1.9999
[13] DT_0.16 bit64_0.9-7 AnnotationDbi_1.50.3
[16] scatterpie_0.1.5 xml2_1.3.2 splines_4.0.2
[19] GOSemSim_2.14.2 knitr_1.30 shinythemes_1.1.2
[22] polyclip_1.10-0 jsonlite_1.7.1 annotate_1.66.0
[25] GO.db_3.11.4 graph_1.66.0 ggforce_0.3.2
[28] shiny_1.5.0 BiocManager_1.30.10 compiler_4.0.2
[31] httr_1.4.2 rvcheck_0.1.8 Matrix_1.3-2
[34] fastmap_1.0.1 later_1.1.0.1 tweenr_1.0.1
[37] htmltools_0.5.0 prettyunits_1.1.1 tools_4.0.2
[40] igraph_1.2.5 gtable_0.3.0 glue_1.4.2
[43] GenomeInfoDbData_1.2.3 reshape2_1.4.4 DO.db_2.9
[46] fastmatch_1.1-0 Rcpp_1.0.5 enrichplot_1.8.1
[49] Biobase_2.48.0 vctrs_0.3.4 ggraph_2.0.3
[52] xfun_0.19 stringr_1.4.0 mime_0.9
[55] lifecycle_0.2.0 XML_3.99-0.5 DOSE_3.14.0
[58] europepmc_0.4 MASS_7.3-53 zlibbioc_1.34.0
[61] scales_1.1.1 tidygraph_1.2.0 hms_0.5.3
[64] promises_1.1.1 parallel_4.0.2 SummarizedExperiment_1.18.2
[67] RColorBrewer_1.1-2 yaml_2.2.1 memoise_1.1.0
[70] gridExtra_2.3 triebeard_0.3.0 stringi_1.5.3
[73] RSQLite_2.2.0 S4Vectors_0.26.1 BiocGenerics_0.34.0
[76] BiocParallel_1.22.0 GenomeInfoDb_1.24.2 rlang_0.4.7
[79] pkgconfig_2.0.3 bitops_1.0-6 matrixStats_0.57.0
[82] evaluate_0.14 lattice_0.20-41 purrr_0.3.4
[85] htmlwidgets_1.5.2 cowplot_1.1.1 bit_4.0.3
[88] tidyselect_1.1.0 GSEABase_1.50.1 plyr_1.8.6
[91] magrittr_1.5 R6_2.5.0 IRanges_2.22.2
[94] generics_0.1.0 DelayedArray_0.14.1 DBI_1.1.0
[97] pillar_1.4.6 withr_2.3.0 RCurl_1.98-1.2
[100] crayon_1.3.4 rmarkdown_2.5 viridis_0.5.1
[103] progress_1.2.2 grid_4.0.2 blob_1.2.1
[106] digest_0.6.25 xtable_1.8-4 tidyr_1.1.2
[109] httpuv_1.5.4 gridGraphics_0.5-0 stats4_4.0.2
[112] munsell_0.5.0 viridisLite_0.3.0 ggplotify_0.0.5