目錄
- 前言
- 方法簡介
- 從vcf文件里提取有用信息
- tidy vcfR
- vcf可視化1
- vcf可視化2
- 測序深度覆蓋度
- 窗口縮放
- 如何單獨(dú)分離染色體
- 利用vcf信息判斷物種染色體倍數(shù)
- CNV分析
vcf數(shù)據(jù)里除了位點(diǎn)的ATGC的對比,進(jìn)行純合/雜合判斷的以外袄简。還有一個重要的項(xiàng)目就是DP
,測序深度。測序深度不僅是看測序質(zhì)量的重要參考,也是對染色體倍數(shù)體以及基因拷貝數(shù)進(jìn)行評估的重要指標(biāo)豺型。
現(xiàn)重復(fù)一下之前的操作奋蔚,讀取數(shù)據(jù),提取必要的數(shù)據(jù)拳话。
提取矩陣數(shù)據(jù)
一般的VCF文件都很大,用手動提取里面的信息肯定不大現(xiàn)實(shí)种吸。用vcfR
就可以輕松實(shí)現(xiàn)弃衍。
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.8.0.9000
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
vcf_file <- system.file("extdata", "pinf_sc50.vcf.gz", package = "pinfsc50")
vcf <- read.vcfR(vcf_file, verbose = FALSE)
查看一下R讀取的數(shù)據(jù)。
vcf
## ***** Object of Class vcfR *****
## 18 samples
## 1 CHROMs
## 22,031 variants
## Object size: 22.4 Mb
## 7.929 percent missing data
## ***** ***** *****
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.1"
## [1] "##source=\"GATK haplotype Caller, phased with beagle4\""
## [1] "##FILTER=<ID=LowQual,Description=\"Low quality\">"
## [1] "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths fo [Truncated]"
## [1] "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read [Truncated]"
## [1] "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "Supercontig_1.50" "41" NA "AT" "A" "4784.43" NA
## [2,] "Supercontig_1.50" "136" NA "A" "C" "550.27" NA
## [3,] "Supercontig_1.50" "254" NA "T" "G" "774.44" NA
## [4,] "Supercontig_1.50" "275" NA "A" "G" "714.53" NA
## [5,] "Supercontig_1.50" "386" NA "T" "G" "876.55" NA
## [6,] "Supercontig_1.50" "462" NA "T" "G" "1301.07" NA
## [1]
## [1] "***** Genotype section *****"
## FORMAT BL2009P4_us23
## [1,] "GT:AD:DP:GQ:PL" "1|1:0,7:7:21:283,21,0"
## [2,] "GT:AD:DP:GQ:PL" "0|0:12,0:12:36:0,36,427"
## [3,] "GT:AD:DP:GQ:PL" "0|0:27,0:27:81:0,81,1117"
## [4,] "GT:AD:DP:GQ:PL" "0|0:29,0:29:87:0,87,1243"
## [5,] "GT:AD:DP:GQ:PL" "0|0:26,0:26:78:0,78,1034"
## [6,] "GT:AD:DP:GQ:PL" "0|0:23,0:23:69:0,69,958"
## DDR7602 IN2009T1_us22
## [1,] "1|1:0,6:6:18:243,18,0" "1|1:0,8:8:24:324,24,0"
## [2,] "0|0:20,0:20:60:0,60,819" "0|0:16,0:16:48:0,48,650"
## [3,] "0|0:26,0:26:78:0,78,1077" "0|0:23,0:23:69:0,69,946"
## [4,] "0|0:27,0:27:81:0,81,1158" "0|0:32,0:32:96:0,96,1299"
## [5,] "0|0:30,0:30:90:0,90,1242" "0|0:41,0:41:99:0,122,1613"
## [6,] "0|0:36,0:36:99:0,108,1556" "0|0:35,0:35:99:0,105,1467"
## LBUS5 NL07434
## [1,] "1|1:0,6:6:18:243,18,0" "1|1:0,12:12:36:486,36,0"
## [2,] "0|0:20,0:20:60:0,60,819" "0|0:28,0:28:84:0,84,948"
## [3,] "0|0:26,0:26:78:0,78,1077" "0|1:19,20:39:99:565,0,559"
## [4,] "0|0:27,0:27:81:0,81,1158" "0|1:19,19:38:99:523,0,535"
## [5,] "0|0:30,0:30:90:0,90,1242" "0|1:22,22:44:99:593,0,651"
## [6,] "0|0:36,0:36:99:0,108,1556" "0|1:29,25:54:99:723,0,876"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT:AD:DP:GQ:PL"
## [1]
選取我們需要的部分也就是Genotype Section里的DP
區(qū)域坚俗。
dp <- extract.gt(vcf, element='DP', as.numeric=TRUE)
測序深度箱狀圖
par(mar=c(8,4,1,1))
#boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth", log='y', las=2)
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth", las=2)
abline(h=seq(0,1e4, by=100), col="#C0C0C088")
眾所周知箱狀圖的特點(diǎn)就是(boxplot)包含了所有的信息镜盯,包括異常值outlier。正因?yàn)檫@個原因猖败,這張圖很大程度上受到了這些異常值的影響速缆,變得非常難懂。自己看看還可以恩闻,用來發(fā)表文章的話肯定不行艺糜。
測序深度小提琴圖
經(jīng)過log2轉(zhuǎn)換,我們可以得到理想的效果。
library(reshape2)
library(ggplot2)
dpf <- melt(dp, varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE)
head(dpf)
dpf <- dpf[ dpf$Depth > 0,]
p <- ggplot(dpf, aes(x=Sample, y=Depth)) + geom_violin(fill="#C0C0C0", adjust=1.0,
scale = "count", trim=TRUE)
p <- p + theme_bw()
p <- p + theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1, size=12))
p <- p + scale_y_continuous(trans=scales::log2_trans(),
breaks=c(1, 10, 100, 800),
minor_breaks=c(1:10, 2:10*10, 2:8*100))
p <- p + theme(axis.title.y = element_text(size=12))
p <- p + theme( panel.grid.major.y=element_line(color = "#A9A9A9", size=0.6) )
p <- p + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
p <- p + stat_summary(fun.y=median, geom="point", shape=23, size=2)
p
又或者不需要轉(zhuǎn)換倦踢,而是通過過濾數(shù)據(jù)來改善箱圖效果送滞。舉個例子,提取90%的信賴區(qū)間的數(shù)據(jù)來可視化辱挥。
sums <- apply(dp, MARGIN=2, quantile, probs=c(0.05, 0.95), na.rm=TRUE)
dp2 <- sweep(dp, MARGIN=2, FUN = "-", sums[1,])
dp[dp2 < 0] <- NA
dp2 <- sweep(dp, MARGIN=2, FUN = "-", sums[2,])
dp[dp2 > 0] <- NA
dp[dp < 4] <- NA
par(mar=c(8,4,1,1))
boxplot(dp, las=3, col=c("#C0C0C0", "#808080"), ylab="Depth")
abline(h=seq(0,200, by=20), col="#C0C0C088")
這樣也可以獲得類似的結(jié)果犁嗅。