一、準(zhǔn)備工作
需要安裝devtools驳遵、vcfR厅贪、binmapr
install.packages("devtools")
install.packages("vcfR")
#加載devtools時(shí)提示需要usethis包
install.packages("usethis")
library(usethis)
library(devtools)
library(vcfR)
devtools::install_github("xuzhougeng/binmapr")
- 安裝devtools時(shí)出現(xiàn)問題,詳見筆記:
R-Rstudio-server無(wú)法安裝devtools解決方法 - 簡(jiǎn)書 (jianshu.com) - 安裝binmapr時(shí)出現(xiàn)問題伺通,詳見筆記:
R-devtools無(wú)法安裝binmapr包 - 簡(jiǎn)書 (jianshu.com)
二箍土、讀取數(shù)據(jù)
#設(shè)置工作路徑
setwd("~/workspace/BSA/practice/")
#vcfR讀取VCF文件
vcf <- read.vcfR("4.variants_filter/snps.vcf")
#從VCF對(duì)象中提取兩個(gè)關(guān)鍵信息,AD(Allele Depth)和GT(Genotype)
gt <- extract.gt(vcf)
ad <- extract.gt(vcf, "AD")
gt
:基因型矩陣罐监,過濾后僅剩"0/0", "0/1","1/1"
ad
:等位基因的count數(shù).
> head(gt)
SRR6327815 SRR6327816 SRR6327817 SRR6327818
chr01_1151 "0/0" "1/1" "0/0" "0/1"
chr01_6918 "0/0" "1/1" "0/0" "0/1"
chr01_17263 "0/0" "1/1" "0/0" "0/1"
chr01_21546 "1/1" "1/1" "0/0" "0/1"
chr01_24732 "1/1" "0/0" "0/0" "0/0"
chr01_33667 "1/1" "1/1" "0/0" "0/1"
> head(ad)
SRR6327815 SRR6327816 SRR6327817 SRR6327818
chr01_1151 "14,0" "0,18" "10,0" "11,3"
chr01_6918 "31,0" "0,34" "32,0" "21,5"
chr01_17263 "46,0" "0,37" "20,0" "21,9"
chr01_21546 "0,26" "0,20" "17,0" "16,3"
chr01_24732 "0,30" "36,0" "22,0" "31,0"
chr01_33667 "0,28" "0,34" "28,0" "29,12"
SRR6327815(KY131), SRR6327816(DN422),
SRR6327817(T-pool), SRR6327818(S-pool)
需要測(cè)試是選擇KY131是"0/0", "DN422" 是 "1/1"的點(diǎn)
還是選擇KY131是 "1/1"吴藻,"DN422" 是"0/0"的點(diǎn)
#xxx15對(duì)應(yīng)KY131,xxx16對(duì)應(yīng)DN422
#選取gt中KY0/0,DN1/1
mask <- which(gt[,"SRR6327815"] == "0/0" & gt[,"SRR6327816"] == "1/1")
length(mask)#結(jié)果顯示[1] 140788
#選取gt中KY1/1,DN0/0
verse_mask <- which(gt[,"SRR6327815"] == "1/1" & gt[,"SRR6327816"] == "0/0")
length(mask)#結(jié)果顯示[1] 221712
#先測(cè)試0/0,1/1的點(diǎn)
ad_flt <- ad[mask,c("SRR6327817", "SRR6327818")]
colnames(ad_flt) <- c("T_Pool", "S_Pool")
接下來根據(jù)AD來計(jì)算SNP-index簡(jiǎn)單理解SNP-index就是與親本基因型不同的計(jì)數(shù)弓柱,即
SNP-index=AD(ALT)/AD(REF)+AD(ALT)
AD:allele depth,指樣本中每一種等位基因的reads覆蓋度沟堡。vcf文件中,一般對(duì)于二倍體或多倍體矢空,會(huì)用逗號(hào)進(jìn)行分隔航罗,前者是REF,后者是ALT
> freq <- calcFreqFromAd(ad_flt, min.depth = 10, max.depth = 100)
#彈出來了這個(gè)報(bào)錯(cuò)妇多,探索了半天伤哺,感覺可能是因?yàn)閍d_flt是matrix,不支持$選擇特定列者祖?
$ operator is invalid for atomic vectors
#輸入函數(shù)立莉,看看是什么
> calcFreqFromAd
function (x, min.depth = 10, max.depth = 200)
{
Ref_COUNT <- x$refmat #逗號(hào)前的數(shù)值
Alt_COUNT <- x$altmat #逗號(hào)后的數(shù)值
AD_Depth <- Ref_COUNT + Alt_COUNT #Allele Depth
NA_pos <- which(AD_Depth < min.depth | AD_Depth > max.depth) #超過最小和最大深度的值
Ref_COUNT[NA_pos] <- NA
Alt_COUNT[NA_pos] <- NA #將Ref和Altcount中這些超出閾值的值設(shè)置為NA
AD_freq <- round(Alt_COUNT/(Ref_COUNT + Alt_COUNT), 2) #計(jì)算SNP-index
return(AD_freq)
}
<bytecode: 0x557d38229660>
<environment: namespace:binmapr>
然后我發(fā)現(xiàn),vcfR包里是有相關(guān)函數(shù)(masplit)可以用來提取字符的
ref_count <- masplit(ad_flt, record = 1, sort = 0)
alt_count <- masplit(ad_flt, record = 2, sort = 0)
ad_depth <- ref_count + alt_count
NA_pos <- which(ad_depth < 10 | ad_depth >100)
ref_count[NA_pos] <- NA
alt_count[NA_pos] <- NA
freq <- round(alt_count/(ref_count + alt_count), 2)
#參照函數(shù)的步驟七问,成功得到了freq
freq2 <- freq[Matrix::rowSums(is.na(freq)) == 0, ]
接下來進(jìn)行畫圖
par(mfrow = c(3,4))
for (i in paste0("chr", formatC(1:12, width = 2, flag=0)) ){
#i對(duì)應(yīng)chr01蜓耻、chr02……chr12
#根據(jù)染色體編號(hào),將freq2分為12個(gè)矩陣
freq_flt <- freq2[grepl(i,row.names(freq2)), ]
#提取位置信息:substring從第七位(即除去染色體編號(hào)+“_”)開始的字符串
pos <- as.numeric(substring(row.names(freq_flt), 7))
#開始繪圖械巡,以pos為橫坐標(biāo)刹淌,freq_flt第二列和第一列差值為縱坐標(biāo)饶氏,
#y軸界限為-1~1
plot(pos, freq_flt[,2] - freq_flt[,1], ylim = c(-1,1),
#pch:指定描繪點(diǎn)時(shí)的符號(hào),cex:符號(hào)大小
pch = 20, cex = 0.2,
#x軸標(biāo)題和y軸標(biāo)題
xlab = i,
#expression有勾,注明數(shù)學(xué)符號(hào)△
ylab = expression(paste(Delta, " " ,"SNP index")))
}
KY1/1DN0/0中
以上為兩種情況下所做的△SNP_index圖
文獻(xiàn)中指出疹启,候選基因被定位在六號(hào)染色體的20-25M處,從圖中可知KY0/0蔼卡,DN1/1這種情況與文獻(xiàn)中情況類似喊崖,因此在最初的實(shí)驗(yàn)設(shè)計(jì)中應(yīng)該就是以KY0/0,DN1/1為標(biāo)準(zhǔn)
0:REF,1:ALT
文獻(xiàn)中除了用到了SNP-index雇逞,還用到了ED(Euclidean distance)方法進(jìn)行定位
where Aaa, Caa, Gaa and Taa represent the frequency of bases A, C, G and T in the T-pool, respectively. Aab, Cab, Gab and Tab represent the frequency of bases A, C, G and T in the S-pool, respectively.
Aaa荤懂、Caa、Gaa塘砸、Taa:T-pool的堿基
Aab节仿、Cab、Gab掉蔬、Tab:S-pool的堿基
The depth of each base in different pools and the ED value of each SNP loci were calculated.
計(jì)算不同池中的depth及每一個(gè)SNP位點(diǎn)的ED值
To eliminate the background noise, the ED value was powered, and the ED^5 was used as the associated value.
為消除噪音廊宪,返回值為ED^5
mask <- which(gt[,"SRR6327815"] == "0/0" & gt[,"SRR6327816"] == "1/1")
ad_flt <- ad[mask,c("SRR6327817", "SRR6327818")]
#apply函數(shù)避免for循環(huán),margin = 1:對(duì)行進(jìn)行操作
ED_list <- apply(ad_flt, 1, function(x){
#strsplit()眉踱,按逗號(hào)分割字符串挤忙,返回列表
#unlist(),將列表轉(zhuǎn)化為向量谈喳,as.numeric()將數(shù)據(jù)類型改為numeric
count <- as.numeric(unlist(strsplit(x, ",",fixed = TRUE,useBytes = TRUE)))
#depth1:T-pool册烈;depth2:S-pool
depth1 <- count[1] + count[2]
depth2 <- count[3] + count[4]
#sqrt():開方
#count[3]/depth2:S-pool
ED <- sqrt((count[3] / depth2 - count[1] / depth1)^2 +
(count[4] / depth2- count[2] /depth1)^2)
return(ED^5)
})
par(mfrow = c(3,4))
for (i in paste0("chr", formatC(1:12, width = 2, flag=0)) ){
ED_flt <- ED_list[grepl(i,names(ED_list))]
pos <- as.numeric(substring(names(ED_flt), 7))
plot(pos, ED_flt,
pch = 20, cex = 0.2,
xlab = i,
ylab = "ED")
}
與文獻(xiàn)對(duì)比