前期準備
- 不同物種的蛋白和cds序列:
os.pep, os.cds, sb.pep, sb.cds - 依賴程序:
blast+, clustalw2, ParaAT, KaKs_Calculator
注:所依賴程序需提前安裝好羊苟,并添加到環(huán)境變量中
- 所用腳本:
axt2one-line.py, calculate_4DTV_correction.pl
獲得不同物種之間的同源序列
對參考物種的蛋白序列構(gòu)建索引
makeblastdb -in sb.pep -dbtype prot
將目標物種的蛋白序列與參考物種進行比對喘垂,并保留最優(yōu)匹配結(jié)果
blastp -query os.pep -db sb.pep -evalue 1e-5 -max_target_seqs 1 -num_threads 12 -out os2sb.blastp_out.m6 -outfmt 6
提取最優(yōu)匹配的同源序列基因?qū)?/p>
cut -f1-2 os2sb.blastp_out.m6|sort|uniq > os2sb.homolog
合并目標物種和參考物種的蛋白序列和cds序列
cat os.cds sb.cds >os_sb.cds
cat os.pep sb.pep >os_sb.pep
使用ParaAT程序?qū)⒌鞍仔蛄斜葘Y(jié)果轉(zhuǎn)化為cds序列比對結(jié)果
# 新建proc文件, 設定12個線程
echo "12" >proc
# -m參數(shù)指定多序列比對程序為clustalw2,-p參數(shù)指定多線程文件央渣,-f參數(shù)指定輸出文件格式為axt
ParaAT.pl -h os2sb.homolog -n os_sb.cds -a os_sb.pep -m clustalw2 -p proc -f axt -o os2sb_out
計算kaks值和4dtv值
cd os2sb_out
# 使用KaKs_Calculator計算kaks值, -m參數(shù)指定kaks值的計算方法為YN模型
for i in `ls *.axt`;do KaKs_Calculator -i $i -o ${i}.kaks -m YN;done
# 將多行axt文件轉(zhuǎn)換成單行
for i in `ls *.axt`;do axt2one-line.py $i ${i}.one-line;done
# 使用calculate_4DTV_correction.pl腳本計算4dtv值
ls *.axt.one-line|while read id;do calculate_4DTV_correction.pl $id >${id%%one-line}4dtv;done
# 合并所有同源基因?qū)Φ?dtv
for i in `ls *.4dtv`;do awk 'NR>1{print $1"\t"$2}' $i >>all-4dtv.txt;done
# 合并所有同源基因?qū)Φ膋aks值
for i in `ls *.kaks`;do awk 'NR>1{print $1"\t"$3"\t"$4"\t"$5}' $i >>all-kaks.txt;done
# 給結(jié)果文件排序并去冗余
sort all-4dtv.txt|uniq >all-4dtv.results
sort all-kaks.txt|uniq >all-kaks.results
# 清除中間文件
rm *one-line
rm all-4dtv.txt
rm all-kaks.txt
# 將kaks結(jié)果文件和4dtv結(jié)果文件進行合并
join -1 1 -2 1 all-kaks.results all-4dtv.results >all-results.txt
# 給結(jié)果文件添加標題
sed -i '1i\Seq\tKa\tKs\tKa/Ks\t4dtv_corrected' all-results.txt
使用calc_kaks_4dtv.sh腳本一步計算kaks值和4dtv值
nohup sh calc_kaks_4dtv.sh os sb &
查看腳本文件
#!/bin/bash
set -e
set -u
if [ $# -lt 2 ];then
echo "Two parameters needed, but only $# given!"
echo "Usage: sh calc_kaks_4dtv.sh <Species1> <Species2>"
exit 1;
fi
Species1=$1
Species2=$2
# 對參考物種的蛋白序列構(gòu)建索引
makeblastdb -in ${Species2}.pep -dbtype prot
# 將目標物種的蛋白序列與參考物種進行比對,并保留最優(yōu)匹配結(jié)果
blastp -query ${Species1}.pep -db ${Species2}.pep -evalue 1e-5 -max_target_seqs 1 -num_threads 10 -out ${Species1}2${Species2}.blastp_out.m6 -outfmt 6
# 提取最優(yōu)匹配的同源序列基因?qū)?cut -f1-2 ${Species1}2${Species2}.blastp_out.m6|sort|uniq > ${Species1}2${Species2}.homolog
# 合并目標物種和參考物種的蛋白序列和cds序列
cat ${Species1}.cds ${Species2}.cds >${Species1}_${Species2}.cds
cat ${Species1}.pep ${Species2}.pep >${Species1}_${Species2}.pep
# 使用ParaAT程序?qū)⒌鞍仔蛄斜葘Y(jié)果轉(zhuǎn)化為cds序列比對結(jié)果
ParaAT.pl -h ${Species1}2${Species2}.homolog -n ${Species1}_${Species2}.cds -a ${Species1}_${Species2}.pep -p proc -m clustalw2 -f axt -o ${Species1}2${Species2}_out
cd ${Species1}2${Species2}_out
# 使用KaKs_Calculator計算kaks值
for i in `ls *.axt`;do KaKs_Calculator -i $i -o ${i}.kaks -m YN;done
# 將多行axt文件轉(zhuǎn)換成單行
for i in `ls *.axt`;do axt2one-line.py $i ${i}.one-line;done
# 使用calculate_4DTV_correction.pl腳本計算4dtv值
ls *.one-line|while read id;do calculate_4DTV_correction.pl $id >${id%%one-line}4dtv;done
# 合并所有同源基因?qū)Φ?dtv值
for i in `ls *.4dtv`;do awk 'NR>1{print $1"\t"$2}' $i >>all-4dtv.txt;done
# 合并所有同源基因?qū)Φ膋aks值
for i in `ls *.kaks`;do awk 'NR>1{print $1"\t"$3"\t"$4"\t"$5}' $i >>all-kaks.txt;done
# 排序并去冗余
sort all-4dtv.txt|uniq >all-4dtv.results
sort all-kaks.txt|uniq >all-kaks.results
# 清除中間文件
rm *one-line
rm all-4dtv.txt
rm all-kaks.txt
# 將kaks結(jié)果文件和4dtv結(jié)果文件進行合并
join -1 1 -2 1 all-kaks.results all-4dtv.results >all-results.txt
# 給結(jié)果文件添加標題
sed -i '1i\Seq\tKa\tKs\tKa/Ks\t4dtv_corrected' all-results.txt
腳本運行完后會生成以下結(jié)果文件
image.png
最終得到的結(jié)果在os2sb_out文件夾中的all-results.txt文件中
image.png
繪制kaks值和4dtv值的密度分布圖
setwd("/Users/Davey/Desktop")
library(ggplot2)
library(patchwork)
data <- read.table("all-results.txt",header=T,check.names=F)
head(data)
Seq Ka Ks Ka/Ks 4dtv_corrected
1 Os01g0276600-Sb03g022840 0.2954040 2.146660 0.1376110 0.3852584
2 Os01g0276700-Sb03g022860 0.0372221 0.586840 0.0634280 0.2272330
3 Os01g0276800-Sb03g022870 0.1293250 0.560683 0.2306560 0.2456600
4 Os01g0721900-Sb03g158830 0.0769183 0.849214 0.0905759 0.1966413
5 Os01g0723100-Sb03g158910 0.1109170 1.198870 0.0925176 0.4060341
6 Os01g0723200-Sb03g158920 0.0398003 0.941168 0.0422882 0.1695289
p1 <- ggplot(data,aes(Ks))+ geom_line(stat="density",color="red")+theme_classic()+theme(axis.title = element_text(size=24),axis.text = element_text(size = 22,color = "black"))
p2 <- ggplot(data,aes(Ka/Ks))+ geom_line(stat="density",color="blue")+theme_classic()+theme(axis.title = element_text(size=24),axis.text = element_text(size = 22,color = "black"))
p3 <- ggplot(data,aes(`4dtv_corrected`))+ geom_line(stat="density",color="orange")+theme_classic()+theme(axis.title = element_text(size=24),axis.text = element_text(size = 22,color = "black"))
p1 + p2 + p3
image.png