# 1. 初步使用
- 運(yùn)行下面幾行代碼,初步了解ChIPpeakAnno使用
library(ChIPpeakAnno)
## import the MACS output
macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno")
macsOutput <- toGRanges(macs, format="MACS")
## annotate the peaks with precompiled ensembl annotation
data(TSS.human.GRCh38)
macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38)
## add gene symbols
library(org.Hs.eg.db)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
IDs2Add="symbol")
if(interactive()){## annotate the peaks with UCSC annotation
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
macs.anno <- annotatePeakInBatch(macsOutput,
AnnotationData=ucsc.hg38.knownGene)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
feature_id_type="entrez_id",
IDs2Add="symbol")
}
# 2. 詳細(xì)使用例子
ChIPpeakAnno使用GRanges 保存peak數(shù)據(jù)。ChIPpeakAnno內(nèi)置toGRanges 函數(shù)可以將BED, GFF等格式轉(zhuǎn)換為GRanges呛牲。
- toGRanges()函數(shù)
toGRanges(data, format=c("BED", "GFF", "GTF",
"MACS", "MACS2", "MACS2.broad",
"narrowPeak", "broadPeak",
"others"),
header=FALSE, comment.char="#", colNames=NULL, ...)
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE)
## one can also try import from rtracklayer
library(rtracklayer)
gr1.import <- import(bed, format="BED")
identical(start(gr1), start(gr1.import))
# [1] TRUE
gr1[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | score
<Rle> <IRanges> <Rle> | <numeric>
MACS_peak_1 chr1 28341-29610 * | 160.81
MACS_peak_2 chr1 90821-91234 * | 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr1.import[1:2] #note the name slot is different from gr1
GRanges object with 2 ranges and 2 metadata columns:
seqnames ranges strand | name score
<Rle> <IRanges> <Rle> | <character> <numeric>
[1] chr1 28341-29610 * | MACS_peak_1 160.81
[2] chr1 90821-91234 * | MACS_peak_2 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
ol <- findOverlapsOfPeaks(gr1, gr2)
- 韋恩圖展示overlap結(jié)果
makeVennDiagram(ol,
fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2"))
- 餅圖展示結(jié)果
pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature))
- 找到overlap peak后榴都,可以使用annotatePeakInBatch在maxgap使用AnnotationData中的基因組特征對(duì)peak進(jìn)行注釋窄驹,在以下示例maxgap=5kb。
overlaps <- ol$peaklist[["gr1///gr2"]]
## ============== old style ===========
## data(TSS.human.GRCh37)
## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
## output="overlapping", maxgap=5000L)
## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol")
## ============== new style ===========
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
annoData[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | gene_name
<Rle> <IRanges> <Rle> | <character>
ENSG00000223972 chr1 11869-14412 + | DDX11L1
ENSG00000227232 chr1 14363-29806 - | WASH7P
-------
seqinfo: 273 sequences from GRCh37 genome
- annotatePeakInBatch進(jìn)行注釋
獲取距離最近的TSS台盯、miRNA亮蛔、外顯子等的距離
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
output="overlapping", maxgap=5000L)
overlaps.anno$gene_name <-
annoData$gene_name[match(overlaps.anno$feature,
names(annoData))]
head(overlaps.anno)
GRanges object with 6 ranges and 11 metadata columns:
seqnames ranges strand |
<Rle> <IRanges> <Rle> |
X001.ENSG00000228327 chr1 713791-715578 * |
X001.ENSG00000237491 chr1 713791-715578 * |
X002.ENSG00000237491 chr1 724851-727191 * |
X003.ENSG00000272438 chr1 839467-840090 * |
X004.ENSG00000223764 chr1 856361-856999 * |
X004.ENSG00000187634 chr1 856361-856999 * |
peakNames
<CharacterList>
X001.ENSG00000228327 gr1__MACS_peak_13,gr2__001,gr2__002
X001.ENSG00000237491 gr1__MACS_peak_13,gr2__001,gr2__002
X002.ENSG00000237491 gr2__003,gr1__MACS_peak_14
X003.ENSG00000272438 gr1__MACS_peak_16,gr2__004
X004.ENSG00000223764 gr1__MACS_peak_17,gr2__005
X004.ENSG00000187634 gr1__MACS_peak_17,gr2__005
peak feature
<character> <character>
X001.ENSG00000228327 001 ENSG00000228327
X001.ENSG00000237491 001 ENSG00000237491
X002.ENSG00000237491 002 ENSG00000237491
X003.ENSG00000272438 003 ENSG00000272438
X004.ENSG00000223764 004 ENSG00000223764
X004.ENSG00000187634 004 ENSG00000187634
start_position end_position
<integer> <integer>
X001.ENSG00000228327 700237 714006
X001.ENSG00000237491 714150 745440
X002.ENSG00000237491 714150 745440
X003.ENSG00000272438 840214 851356
X004.ENSG00000223764 852245 856396
X004.ENSG00000187634 860260 879955
feature_strand insideFeature
<character> <character>
X001.ENSG00000228327 - overlapStart
X001.ENSG00000237491 + overlapStart
X002.ENSG00000237491 + inside
X003.ENSG00000272438 + upstream
X004.ENSG00000223764 - overlapStart
X004.ENSG00000187634 + upstream
distancetoFeature shortestDistance
<numeric> <integer>
X001.ENSG00000228327 215 215
X001.ENSG00000237491 -359 359
X002.ENSG00000237491 10701 10701
X003.ENSG00000272438 -747 124
X004.ENSG00000223764 35 35
X004.ENSG00000187634 -3899 3261
fromOverlappingOrNearest gene_name
<character> <character>
X001.ENSG00000228327 Overlapping RP11-206L10.2
X001.ENSG00000237491 Overlapping RP11-206L10.9
X002.ENSG00000237491 Overlapping RP11-206L10.9
X003.ENSG00000272438 Overlapping RP11-54O7.16
X004.ENSG00000223764 Overlapping RP11-54O7.3
X004.ENSG00000187634 Overlapping SAMD11
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
- 繪制peak與最近特征(如轉(zhuǎn)錄起始位點(diǎn)TSS)的距離分布碍岔。
gr1.copy <- gr1
gr1.copy$score <- 1
binOverFeature(gr1, gr1.copy, annotationData=annoData,
radius=5000, nbins=10, FUN=c(sum, length),
ylab=c("score", "count"),
main=c("Distribution of aggregated peak scores around TSS",
"Distribution of aggregated peak numbers around TSS"))
- assignChromosomeRegion()可以統(tǒng)計(jì) exon, intron, enhancer, proximal promoter, 5’ UTR 和3’ UTR在peak中的分布浴讯。
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage)
}
# 3. ChIPpeakAnno 各種運(yùn)用場景
- loading called peaks from BED, GFF, or other formats;
- evaluating and visualizing the concordance among the biological replicates;
- combining peaks from replicates; preparing genomic annotation(s) as GRanges;
- associating/annotating peaks with the annotation(s); summarizing peak distributions over exon, intron, enhancer, proximal promoter, 5’UTR and 3’UTR regions;
- retrieving the sequences around the peaks; and enrichment analysis of GO and biological pathway.
- plot the heatmap of given peak ranges
- perform permutation test to determine if there is a significant overlap between two sets of peaks.
## 3. 計(jì)算不同peak數(shù)據(jù)的overlap,并使用韋恩圖展示
- 通過尋找重復(fù)實(shí)驗(yàn)peak的overlap來評(píng)估實(shí)驗(yàn)之間的peak的一致性蔼啦。也可以通過這種不同實(shí)驗(yàn)之間peak一致性來判斷兩個(gè)實(shí)驗(yàn)是不是存在一定的聯(lián)系榆纽。
peaks1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6",
"2", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869,
3123260, 3857501, 201089, 1543200,
1557200, 1563000, 1569800, 167889600),
end= c(967754, 2010997, 2496804, 3075969,
3123360, 3857601, 201089, 1555199,
1560599, 1565199, 1573799, 167893599),
names=paste("Site", 1:12, sep="")),
strand="+")
peaks2 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3",
"4", "5", "6", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967659, 2010898, 2496700,
3075866, 3123260, 3857500,
96765, 201089, 249670, 307586,
312326, 385750, 1549800,
1554400, 1565000, 1569400,
167888600),
end=c(967869, 2011108, 2496920,
3076166,3123470, 3857780,
96985, 201299, 249890, 307796,
312586, 385960, 1550599, 1560799,
1565399, 1571199, 167888999),
names=paste("t", 1:17, sep="")),
strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-",
"-", "-", "-", "+", "+", "+", "+", "+"))
ol <- findOverlapsOfPeaks(peaks1, peaks2, maxgap=1000)
peaklist <- ol$peaklist
attributes(ol )
$names
[1] "venn_cnt" "peaklist" "uniquePeaks"
[4] "mergedPeaks" "peaksInMergedPeaks" "overlappingPeaks"
[7] "all.peaks"
$class
[1] "overlappingPeaks"
olpeaks1///peaks2
中存放peak1和peak2的overlap peak
overlappingPeaks <- ol$overlappingPeaks
overlappingPeaks[["peaks1///peaks2"]][1:2, ]
## peaks1 seqnames start end width strand peaks2 seqnames start
## Site1_t1 Site1 1 967654 967754 101 + t1 1 967659
## Site2_t2 Site2 2 2010897 2010997 101 + t2 2 2010898
## end width strand overlapFeature shortestDistance
## Site1_t1 967869 211 + overlapStart 5
## Site2_t2 2011108 211 + overlapStart 1
pie1(table(overlappingPeaks[["peaks1///peaks2"]]$overlapFeature))
- 獲取merged overlapping peaks
peaklist[["peaks1///peaks2"]]
## GRanges object with 11 ranges and 1 metadata column:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## [1] 1 967654-967869 + |
## [2] 2 201089-201299 * |
## [3] 2 2010897-2011108 + |
## [4] 3 2496700-2496920 + |
## [5] 4 3075866-3076166 + |
## [6] 5 3123260-3123470 + |
## [7] 5 167888600-167893599 + |
## [8] 6 1543200-1560799 + |
## [9] 6 1563000-1565399 + |
## [10] 6 1569400-1573799 + |
## [11] 6 3857500-3857780 + |
## peakNames
## <CharacterList>
## [1] peaks1__Site1,peaks2__t1
## [2] peaks1__Site7,peaks2__t8
## [3] peaks1__Site2,peaks2__t2
## [4] peaks2__t3,peaks1__Site3
## [5] peaks2__t4,peaks1__Site4
## [6] peaks1__Site5,peaks2__t5
## [7] peaks2__t17,peaks1__Site12
## [8] peaks1__Site8,peaks2__t13,peaks2__t14,...
## [9] peaks1__Site10,peaks2__t15
## [10] peaks2__t16,peaks1__Site11
## [11] peaks2__t6,peaks1__Site6
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- peaks1 中宇peaks2沒有overlap的peak
peaklist[["peaks1"]]
## NULL
- peaks2 中宇peaks1沒有overlap的peak
peaklist[["peaks2"]]
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | peakNames
## <Rle> <IRanges> <Rle> | <CharacterList>
## [1] 1 96765-96985 - | peaks2__t7
## [2] 3 249670-249890 - | peaks2__t9
## [3] 4 307586-307796 - | peaks2__t10
## [4] 5 312326-312586 - | peaks2__t11
## [5] 6 385750-385960 - | peaks2__t12
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- 韋恩圖
并且輸出overlapping 是否顯著的p值
makeVennDiagram(ol, totalTest=1e+2,
fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2"))
$p.value
peaks1 peaks2 pval
[1,] 1 1 5.890971e-12
$vennCounts
peaks1 peaks2 Counts count.peaks1 count.peaks2
[1,] 0 0 83 0 0
[2,] 0 1 5 0 5
[3,] 1 0 0 0 0
[4,] 1 1 12 12 12
attr(,"class")
[1] "VennCounts"
- 使用R 包Vernerable繪制韋恩圖
# install.packages("Vennerable", repos="http://R-Forge.R-project.org",
# type="source")
# library(Vennerable)
# venn_cnt2venn <- function(venn_cnt){
# n <- which(colnames(venn_cnt)=="Counts") - 1
# SetNames=colnames(venn_cnt)[1:n]
# Weight=venn_cnt[,"Counts"]
# names(Weight) <- apply(venn_cnt[,1:n], 1, base::paste, collapse="")
# Venn(SetNames=SetNames, Weight=Weight)
# }
#
# v <- venn_cnt2venn(ol$venn_cnt)
# plot(v)
- findOverlapsOfPeaks 最多可以計(jì)算5 peak lists 的overlapping peak
peaks3 <- GRanges(seqnames=c("1", "2", "3", "4", "5",
"6", "1", "2", "3", "4"),
ranges=IRanges(start=c(967859, 2010868, 2496500, 3075966,
3123460, 3851500, 96865, 201189,
249600, 307386),
end= c(967969, 2011908, 2496720, 3076166,
3123470, 3857680, 96985, 201299,
249890, 307796),
names=paste("p", 1:10, sep="")),
strand=c("+", "+", "+", "+", "+",
"+", "-", "-", "-", "-"))
ol <- findOverlapsOfPeaks(peaks1, peaks2, peaks3, maxgap=1000,
connectedPeaks="min")
makeVennDiagram(ol, totalTest=1e+2,
fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color
cat.col=c("#D55E00", "#0072B2", "#E69F00"))
函數(shù)makeVennDiagram中的參數(shù)totalTest表示超幾何檢驗(yàn)中使用的潛在峰值總數(shù)。它應(yīng)該大于最大peak數(shù)捏肢。設(shè)置得越小奈籽,測(cè)試越嚴(yán)格。用于計(jì)算p值的時(shí)間并不取決于totalTest的值鸵赫。超幾何檢驗(yàn)要求用戶輸入給定TF的總潛在結(jié)合位點(diǎn)(峰值)的估計(jì)值衣屏。為了規(guī)避這一要求,ChIPpeakAnno 實(shí)現(xiàn)了一個(gè)名為permTest的置換檢驗(yàn)辩棒。
## 生成注釋數(shù)據(jù)
- peak一半需要注釋到基因組元件勾拉,ChIPpeakAnno 內(nèi)置有TSS.human.NCBI36, TSS.human.GRCh37, TSS.human.GRCh38, TSS.mouse.NCBIM37, TSS.mouse.GRCm38, TSS.rat.RGSC3.4, TSS.rat.Rnor_5.0, TSS.zebrafish.Zv8, and TSS.zebrafish.Zv9。使用data()即可調(diào)用盗温,data(TSS.human.GRCh38)。對(duì)于其他的注釋信息成肘,可以使用getAnnotation()函數(shù)設(shè)置參數(shù)featureType卖局,“Exon” to obtain the nearest exon, “miRNA” to find the nearest miRNA, and “5utr” or “3utr” to locate the overlapping “5’UTR” or “3’UTR”. 更多的參數(shù)與biomaRt類似。
類似使用TranscriptDb TxDb.Hsapiens.UCSC.hg19.knownGene 和toGRanges可以構(gòu)建注釋文件双霍。
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
annoData
## GRanges object with 23056 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 chr19 58858172-58874214 -
## 10 chr8 18248755-18258723 +
## 100 chr20 43248163-43280376 -
## 1000 chr18 25530930-25757445 -
## 10000 chr1 243651535-244006886 -
## ... ... ... ...
## 9991 chr9 114979995-115095944 -
## 9992 chr21 35736323-35743440 +
## 9993 chr22 19023795-19109967 -
## 9994 chr6 90539619-90584155 +
## 9997 chr22 50961997-50964905 -
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
## 注釋
data(myPeakList)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
AnnotationData = annoData)
annotatedPeak[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.100288069 chr1 556660-556760 * | X1_93_556427
## X1_41_559455.100288069 chr1 559774-559874 * | X1_41_559455
## X1_12_703729.100288069 chr1 703885-703985 * | X1_12_703729
## feature start_position end_position feature_strand
## <character> <integer> <integer> <character>
## X1_93_556427.100288069 100288069 700245 714068 -
## X1_41_559455.100288069 100288069 700245 714068 -
## X1_12_703729.100288069 100288069 700245 714068 -
## insideFeature distancetoFeature shortestDistance
## <character> <numeric> <integer>
## X1_93_556427.100288069 downstream 157408 143485
## X1_41_559455.100288069 downstream 154294 140371
## X1_12_703729.100288069 inside 10183 3640
## fromOverlappingOrNearest
## <character>
## X1_93_556427.100288069 NearestLocation
## X1_41_559455.100288069 NearestLocation
## X1_12_703729.100288069 NearestLocation
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
- 使用自定義的注釋文件
myPeak1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6",
"2", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869,
3123260, 3857501, 201089, 1543200,
1557200, 1563000, 1569800, 167889600),
end= c(967754, 2010997, 2496804, 3075969,
3123360, 3857601, 201089, 1555199,
1560599, 1565199, 1573799, 167893599),
names=paste("Site", 1:12, sep="")))
TFbindingSites <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2",
"3", "4", "5", "6", "6", "6", "6", "6",
"5"),
ranges=IRanges(start=c(967659, 2010898, 2496700,
3075866, 3123260, 3857500,
96765, 201089, 249670, 307586,
312326, 385750, 1549800,
1554400, 1565000, 1569400,
167888600),
end=c(967869, 2011108, 2496920,
3076166,3123470, 3857780,
96985, 201299, 249890, 307796,
312586, 385960, 1550599, 1560799,
1565399, 1571199, 167888999),
names=paste("t", 1:17, sep="")),
strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-",
"-", "-", "-", "+", "+", "+", "+", "+"))
annotatedPeak2 <- annotatePeakInBatch(myPeak1, AnnotationData=TFbindingSites)
annotatedPeak2[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak feature
## <Rle> <IRanges> <Rle> | <character> <character>
## Site1.t1 1 967654-967754 * | Site1 t1
## Site2.t2 2 2010897-2010997 * | Site2 t2
## Site3.t3 3 2496704-2496804 * | Site3 t3
## start_position end_position feature_strand insideFeature
## <integer> <integer> <character> <character>
## Site1.t1 967659 967869 + overlapStart
## Site2.t2 2010898 2011108 + overlapStart
## Site3.t3 2496700 2496920 + inside
## distancetoFeature shortestDistance fromOverlappingOrNearest
## <numeric> <integer> <character>
## Site1.t1 -5 5 NearestLocation
## Site2.t2 -1 1 NearestLocation
## Site3.t3 4 4 NearestLocation
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- 餅圖展示
pie1(table(as.data.frame(annotatedPeak2)$insideFeature))
- GenomicFeatures::promoters可以獲取啟動(dòng)子注釋
library(ChIPpeakAnno)
data(myPeakList)
data(TSS.human.NCBI36)
annotationData <- promoters(TSS.human.NCBI36, upstream=5000, downstream=500)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6,],
AnnotationData=annotationData,
output="overlapping")
annotatedPeak[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.ENSG00000209341 chr1 556660-556760 * | X1_93_556427
## X1_93_556427.ENSG00000209344 chr1 556660-556760 * | X1_93_556427
## X1_93_556427.ENSG00000209346 chr1 556660-556760 * | X1_93_556427
## feature start_position end_position
## <character> <integer> <integer>
## X1_93_556427.ENSG00000209341 ENSG00000209341 554314 559813
## X1_93_556427.ENSG00000209344 ENSG00000209344 555569 561068
## X1_93_556427.ENSG00000209346 ENSG00000209346 555643 561142
## feature_strand insideFeature distancetoFeature
## <character> <character> <numeric>
## X1_93_556427.ENSG00000209341 - inside 3153
## X1_93_556427.ENSG00000209344 - inside 4408
## X1_93_556427.ENSG00000209346 - inside 4482
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## X1_93_556427.ENSG00000209341 2346 Overlapping
## X1_93_556427.ENSG00000209344 1091 Overlapping
## X1_93_556427.ENSG00000209346 1017 Overlapping
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
- 除了將peak注釋到最近的基因之外砚偶,ChIPpeakAnno還可以通過在注釋PeakInBatch中設(shè)置output=“both”和maxgap來報(bào)告所有overlap和側(cè)翼基因。例如洒闸,如果設(shè)置maxgap=5000和output=“both”染坯,它輸出5kb內(nèi)所有重疊和側(cè)翼基因,再加上最近的基因丘逸。
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
AnnotationData = annoData,
output="both", maxgap=5000)
head(annotatedPeak)
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.100288069 chr1 556660-556760 * | X1_93_556427
## X1_41_559455.100288069 chr1 559774-559874 * | X1_41_559455
## X1_12_703729.100288069 chr1 703885-703985 * | X1_12_703729
## X1_20_925025.84808 chr1 926058-926158 * | X1_20_925025
## X1_11_1041174.54991 chr1 1041646-1041746 * | X1_11_1041174
## X1_14_1269014.1855 chr1 1270239-1270339 * | X1_14_1269014
## feature start_position end_position feature_strand
## <character> <integer> <integer> <character>
## X1_93_556427.100288069 100288069 700245 714068 -
## X1_41_559455.100288069 100288069 700245 714068 -
## X1_12_703729.100288069 100288069 700245 714068 -
## X1_20_925025.84808 84808 910579 917473 -
## X1_11_1041174.54991 54991 1017198 1051736 -
## X1_14_1269014.1855 1855 1270658 1284492 -
## insideFeature distancetoFeature shortestDistance
## <character> <numeric> <integer>
## X1_93_556427.100288069 downstream 157408 143485
## X1_41_559455.100288069 downstream 154294 140371
## X1_12_703729.100288069 inside 10183 3640
## X1_20_925025.84808 upstream -8585 8585
## X1_11_1041174.54991 inside 10090 9990
## X1_14_1269014.1855 downstream 14253 319
## fromOverlappingOrNearest
## <character>
## X1_93_556427.100288069 NearestLocation
## X1_41_559455.100288069 NearestLocation
## X1_12_703729.100288069 NearestLocation
## X1_20_925025.84808 NearestLocation
## X1_11_1041174.54991 NearestLocation
## X1_14_1269014.1855 Overlapping
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## Add other feature IDs to the annotated peaks
data(annotatedPeak)
library(org.Hs.eg.db)
addGeneIDs(annotatedPeak[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol"))
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## X1_11_100272487.ENSG00000202254 1 100272801-100272900 + |
## X1_11_108905539.ENSG00000186086 1 108906026-108906125 + |
## X1_11_110106925.ENSG00000065135 1 110107267-110107366 + |
## X1_11_110679983.ENSG00000197106 1 110680469-110680568 + |
## X1_11_110681677.ENSG00000197106 1 110682125-110682224 + |
## X1_11_110756560.ENSG00000116396 1 110756823-110756922 + |
## peak feature start_position
## <character> <character> <numeric>
## X1_11_100272487.ENSG00000202254 1_11_100272487 ENSG00000202254 100257218
## X1_11_108905539.ENSG00000186086 1_11_108905539 ENSG00000186086 108918435
## X1_11_110106925.ENSG00000065135 1_11_110106925 ENSG00000065135 110091233
## X1_11_110679983.ENSG00000197106 1_11_110679983 ENSG00000197106 110693108
## X1_11_110681677.ENSG00000197106 1_11_110681677 ENSG00000197106 110693108
## X1_11_110756560.ENSG00000116396 1_11_110756560 ENSG00000116396 110753965
## end_position insideFeature distancetoFeature
## <numeric> <character> <numeric>
## X1_11_100272487.ENSG00000202254 100257309 downstream 15582
## X1_11_108905539.ENSG00000186086 109013624 upstream -12410
## X1_11_110106925.ENSG00000065135 110136975 inside 16033
## X1_11_110679983.ENSG00000197106 110744824 upstream -12640
## X1_11_110681677.ENSG00000197106 110744824 upstream -10984
## X1_11_110756560.ENSG00000116396 110776666 inside 2857
## shortestDistance fromOverlappingOrNearest
## <numeric> <character>
## X1_11_100272487.ENSG00000202254 15491 NearestStart
## X1_11_108905539.ENSG00000186086 12310 NearestStart
## X1_11_110106925.ENSG00000065135 16033 NearestStart
## X1_11_110679983.ENSG00000197106 12540 NearestStart
## X1_11_110681677.ENSG00000197106 10884 NearestStart
## X1_11_110756560.ENSG00000116396 2857 NearestStart
## symbol
## <character>
## X1_11_100272487.ENSG00000202254 <NA>
## X1_11_108905539.ENSG00000186086 NBPF6
## X1_11_110106925.ENSG00000065135 GNAI3
## X1_11_110679983.ENSG00000197106 SLC6A17
## X1_11_110681677.ENSG00000197106 SLC6A17
## X1_11_110756560.ENSG00000116396 KCNC4
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
addGeneIDs(annotatedPeak$feature[1:6], orgAnn="org.Hs.eg.db",
IDs2Add=c("symbol"))
## ensembl_gene_id symbol
## 1 ENSG00000065135 GNAI3
## 2 ENSG00000116396 KCNC4
## 3 ENSG00000197106 SLC6A17
## 4 ENSG00000186086 NBPF6
## 5 ENSG00000202254 <NA>
## 獲取peak附近的序列
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
ranges=IRanges(start=c(100, 500),
end=c(300, 600),
names=c("peak1", "peak2")))
library(BSgenome.Ecoli.NCBI.20080805)
peaksWithSequences <- getAllPeakSequence(peaks, upstream=20,
downstream=20, genome=Ecoli)
保存為fa格式
write2FASTA(peaksWithSequences,"test.fa")
## Create heatmap for given feature/peak ranges
path <- system.file("extdata", package="ChIPpeakAnno")
files <- dir(path, "broadPeak")
data <- sapply(file.path(path, files), toGRanges, format="broadPeak")
names(data) <- gsub(".broadPeak", "", files)
ol <- findOverlapsOfPeaks(data)
#makeVennDiagram(ol)
features <- ol$peaklist[[length(ol$peaklist)]]
wid <- width(features)
feature.recentered <- feature.center <- features
start(feature.center) <- start(features) + floor(wid/2)
width(feature.center) <- 1
start(feature.recentered) <- start(feature.center) - 2000
end(feature.recentered) <- end(feature.center) + 2000
## here we also suggest importData function in bioconductor trackViewer package
## to import the coverage.
## compare rtracklayer, it will save you time when handle huge dataset.
library(rtracklayer)
files <- dir(path, "bigWig")
if(.Platform$OS.type != "windows"){
cvglists <- sapply(file.path(path, files), import,
format="BigWig",
which=feature.recentered,
as="RleList")
}else{## rtracklayer can not import bigWig files on Windows
load(file.path(path, "cvglist.rds"))
}
names(cvglists) <- gsub(".bigWig", "", files)
sig <- featureAlignedSignal(cvglists, feature.center,
upstream=2000, downstream=2000)
heatmap <- featureAlignedHeatmap(sig, feature.center,
upstream=2000, downstream=2000,
upper.extreme=c(3,.5,4))
featureAlignedDistribution(sig, feature.center,
upstream=2000, downstream=2000,
type="l")
## peak中motif 研究
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
ranges=IRanges(start=c(100, 500),
end=c(300, 600),
names=c("peak1", "peak2")))
filepath <- system.file("extdata", "examplePattern.fa", package="ChIPpeakAnno")
readLines(filepath)
## [1] ">ExamplePattern" "GGNCCK" ">ExamplePattern" "AACCNM"
library(BSgenome.Ecoli.NCBI.20080805)
summarizePatternInPeaks(patternFilePath=filepath, format="fasta", skip=0L,
BSgenomeName=Ecoli, peaks=peaks)
## peak注釋的基因進(jìn)行GO富集分析
library(org.Hs.eg.db)
over <- getEnrichedGO(annotatedPeak[1:500], orgAnn="org.Hs.eg.db",
maxP=0.01, minGOterm=10,
multiAdjMethod="BH",
condense=FALSE)
head(over[["bp"]][, -3])
## [1] go.id go.term Ontology
## [4] pvalue count.InDataset count.InGenome
## [7] totaltermInDataset totaltermInGenome BH.adjusted.p.value
## [10] EntrezID
## <0 rows> (or 0-length row.names)
head(over[["cc"]][, -3])
## [1] go.id go.term Ontology
## [4] pvalue count.InDataset count.InGenome
## [7] totaltermInDataset totaltermInGenome BH.adjusted.p.value
## [10] EntrezID
## <0 rows> (or 0-length row.names)
head(over[["mf"]][, -3])
egOrgMap("Mus musculus")
## [1] "org.Mm.eg.db"
egOrgMap("Homo sapiens")
## [1] "org.Hs.eg.db"
## Find peaks with bi-directional promoters
- 雙向啟動(dòng)子是位于兩個(gè)相鄰基因的轉(zhuǎn)錄起始位點(diǎn)(TSS)之間的DNA區(qū)域单鹿,它們?cè)谙喾捶较蛏限D(zhuǎn)錄,通常由該共享啟動(dòng)子區(qū)域共同調(diào)控深纲。下面是一個(gè)示例仲锄,用于查找雙向啟動(dòng)子附近的peak,并輸出雙向啟動(dòng)子周圍peak的百分比湃鹊。
data(myPeakList)
data(TSS.human.NCBI36)
seqlevelsStyle(TSS.human.NCBI36) <- seqlevelsStyle(myPeakList)
annotatedBDP <- peaksNearBDP(myPeakList[1:10,],
AnnotationData=TSS.human.NCBI36,
MaxDistance=5000)
annotatedBDP$peaksWithBDP
## GRangesList object of length 2:
## $`1`
## GRanges object with 2 ranges and 9 metadata columns:
## seqnames ranges strand | bdp_idx peak
## <Rle> <IRanges> <Rle> | <integer> <character>
## X1_93_556427 chr1 556660-556760 * | 1 X1_93_556427
## X1_93_556427 chr1 556660-556760 * | 1 X1_93_556427
## feature feature.ranges feature.strand distance
## <character> <IRanges> <Rle> <integer>
## X1_93_556427 ENSG00000209349 556240-556304 - 355
## X1_93_556427 ENSG00000209351 557933-557999 + 1172
## insideFeature distanceToSite description
## <character> <integer> <character>
## X1_93_556427 upstream 355
## X1_93_556427 upstream 1172
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
##
## $`8`
## GRanges object with 2 ranges and 9 metadata columns:
## seqnames ranges strand | bdp_idx peak
## <Rle> <IRanges> <Rle> | <integer> <character>
## X1_14_1300250 chr1 1300503-1300603 * | 8 X1_14_1300250
## X1_14_1300250 chr1 1300503-1300603 * | 8 X1_14_1300250
## feature feature.ranges feature.strand distance
## <character> <IRanges> <Rle> <integer>
## X1_14_1300250 ENSG00000175756 1298974-1300443 - 59
## X1_14_1300250 ENSG00000218550 1303908-1304275 + 3304
## insideFeature distanceToSite description
## <character> <integer> <character>
## X1_14_1300250 upstream 59 Aurora kinase A-inte..
## X1_14_1300250 upstream 3304
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
c(annotatedBDP$percentPeaksWithBDP,
annotatedBDP$n.peaks,
annotatedBDP$n.peaksWithBDP)
## [1] 0.2 10.0 2.0
## 置換檢驗(yàn)置換測(cè)試計(jì)算兩組peak之間的overlap的顯著性
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
cds <- unique(unlist(cdsBy(txdb)))
utr5 <- unique(unlist(fiveUTRsByTranscript(txdb)))
utr3 <- unique(unlist(threeUTRsByTranscript(txdb)))
set.seed(123)
utr3 <- utr3[sample.int(length(utr3), 1000)]
pt <- peakPermTest(utr3,
utr5[sample.int(length(utr5), 1000)],
maxgap=500,
TxDb=txdb, seed=1,
force.parallel=FALSE)
plot(pt)
## highly relevant peaks
ol <- findOverlaps(cds, utr3, maxgap=1)
pt1 <- peakPermTest(utr3,
c(cds[sample.int(length(cds), 500)],
cds[queryHits(ol)][sample.int(length(ol), 500)]),
maxgap=500,
TxDb=txdb, seed=1,
force.parallel=FALSE)
plot(pt1)
}
if(interactive()){
data(HOT.spots)
data(wgEncodeTfbsV3)
hotGR <- reduce(unlist(HOT.spots))
removeOl <- function(.ele){
ol <- findOverlaps(.ele, hotGR)
if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))]
.ele
}
temp <- tempfile()
download.file(file.path("http://hgdownload.cse.ucsc.edu",
"goldenPath", "hg19", "encodeDCC",
"wgEncodeRegTfbsClustered",
"wgEncodeRegTfbsClusteredV3.bed.gz"), temp)
data <- toGRanges(gzfile(temp, "r"), header=FALSE, format="others",
colNames = c("seqnames", "start", "end", "TF"))
unlink(temp)
data <- split(data, data$TF)
TAF1 <- removeOl(data[["TAF1"]])
TEAD4 <- removeOl(data[["TEAD4"]])
pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3), N=length(TAF1))
pt <- peakPermTest(TAF1, TEAD4, pool=pool, ntimes=1000)
plot(pt)
}