雙序列局部比對(duì)的算法和全局比對(duì)算法的步驟相似博个,只是賦值規(guī)則稍有不同,每個(gè)單元格的分值由前面對(duì)角線的分值和引入一個(gè)空位得到的分值的最大值決定末盔,但是分值不能為負(fù)值筑舅,如果為負(fù)值,則將其值設(shè)為0.
構(gòu)建矩陣時(shí)先將第一列與第一行的值設(shè)為0陨舱,而在雙序列全局比對(duì)中翠拣,此處的值為空位罰分的累加
除第一行和第一列以外每個(gè)位置的分值由一下四個(gè)值的最大值決定
1.S(i-1,j-1)+S(i,j) --匹配和不匹配
2.S(i,j-1)+gap -- 列引入空位
3.S(i-1,j)+gap -- 行引入空位
4.0
打分過程的目的是為了尋找比對(duì)的最大值,他代表了比對(duì)的結(jié)尾處游盲,最大值不一定在最后一行和最后一列心剥,這一點(diǎn)與全局比對(duì)不同
回溯過程從最大值開始,沿對(duì)角線向左直到碰到一位置(i,j)它的對(duì)角線上方背桐,左方优烧,上方的分值都小于匹配得分(在我的代碼中是5時(shí)),停止尋找链峭。
局部比對(duì)的比對(duì)區(qū)域較短畦娄,但相似性百分比較大
Python代碼如下:
mport sys
import numpy
import string
## Score matrix for nucleotide alignment
NUC44=numpy.array([[5,-4,-4,-4,-2],\
[-4,5,-4,-4,-2],\
[-4,-4,5,-4,-2],\
[-4,-4,-4,5,-2],\
[-2,-2,-2,-2,-1]])
NBET='ATGCN'
## define the function for calculating score matrix and arrow matrix:
def scoreMat(NUC44,NBET,seq1,seq2,gap=-8):
len1,len2=len(seq1),len(seq2)
scorMat=numpy.zeros((len1+1,len2+1),int)
arrowMat=numpy.zeros((len1+1,len2+1),int)
# scorMat[0,:]=numpy.arange(len2+1)*gap
# scorMat[:,0]=numpy.arange(len1+1)*gap
arrowMat[0,:]=numpy.ones(len2+1)
arrowMat[1:,0]=numpy.zeros(len1)
for i in range(1,len1+1):
for j in range(1,len2+1):
s_mat=numpy.zeros(4)
s_mat[0]=scorMat[i-1,j]+gap
s_mat[1]=scorMat[i,j-1]+gap
n1,n2=NBET.index(seq1[i-1]),NBET.index(seq2[j-1])
s_mat[2]=scorMat[i-1,j-1]+NUC44[n1,n2]
scorMat[i,j]=numpy.max(s_mat)
arrowMat[i,j]=numpy.argmax(s_mat)
return scorMat,arrowMat
## obtain the alignment of the sequences
def DynAlign(scorMat,arrow,seq1,seq2):
aln_seq1,aln_seq2='',''
flat_scorMat=numpy.ravel(scorMat)
v,h=divmod(numpy.argmax(flat_scorMat),len(seq2)+1)
print (v,h)
while True:
if arrow[v,h]==0:
aln_seq1+=seq1[v-1]
aln_seq2+='-'
v-=1
elif arrow[v,h]==1:
aln_seq1+='-'
aln_seq2+=seq2[h-1]
h-=1
elif arrow[v,h]==2:
aln_seq1+=seq1[v-1]
aln_seq2+=seq2[h-1]
v-=1
h-=1
elif arrow[v,h]==3:
break
if (v==0 and h==0) or scorMat[v,h]==0:
break
aln1=aln_seq1[::-1]
aln2=aln_seq2[::-1]
return aln1,aln2
sq1=input("Please input the sequence 1:")
sq2=input("Please input the sequence 2:")
scoreMatrix,arrowMatrix=scoreMat(NUC44,NBET,sq1,sq2,gap=-8)
aln1,aln2=DynAlign(scoreMatrix,arrowMatrix,sq1,sq2)
print('The score matrix is:')
print (scoreMatrix)
print ('The arrow matrix is:')
print (arrowMatrix)
print ('The aligned sequences are:')
print (aln1)
print (aln2)
C語言代碼如下:
#include <stdio.h>
#include <string.h>
char seq1_out[100] = "\0", seq2_out[100]= "\0";//定義比對(duì)后的輸出一維數(shù)組
int scores[50][50] = {0};//定義得分矩陣
int score_array[5][5] = { {5,-4,-4,-4,-2},{-4,5,-4,-4,-2},{-4,-4,5,-4,-2},{-4,-4,-4,5,-2},{-2,-2,-2,-2,-1} };//定義打分矩陣
int main()
{
void output(char seq1[], char seq2[]);//引用輸出函數(shù)
void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8);//引用打分函數(shù)
char seq1[100] = "\0", seq2[100] = "\0", normol[] = "ATCGN";
int i, j;
//輸入待比對(duì)序列
printf("Please input the sequence 1 :");
scanf("%s", seq1);
printf("Please input the sequence 2 :");
scanf("%s", seq2);
scoresMat(seq1, seq2, normol);
printf("the score array is:\n");
for (i = 0; i <= strlen(seq2); i++)
for (j = 0; j <= strlen(seq1); j++)
if (j == strlen(seq1))
printf("%d\n", scores[i][j]);
else
printf("%d\t", scores[i][j]);
output(seq1, seq2);
int out_len1 = strlen(seq1_out), out_len2 = strlen(seq2_out);
for (i = out_len1 - 1; i >= 0; i--)
printf("%c", seq1_out[i]);
printf("\n");
for (i = out_len2 - 1; i >= 0; i--)
printf("%c", seq2_out[i]);
printf("\n");
return 0;
}
void scoresMat(char seq1[], char seq2[], char normol[],int gap=-8)//打分函數(shù)
{
int find_index(char c, char normol[]);
int max(int score1, int score2, int score3);
int index1, index2;
int len1 = strlen(seq1), len2 = strlen(seq2);
int i = 0,j=0,k=0;
//將打分矩陣的第一行和第一列分值設(shè)為0
for (i = 1; i <= len1; i++)
scores[0][i] = 0;
for (i = 1; i <= len2; i++)
scores[i][0] = 0;
for(i=1;i<=len2;i++)
for (j = 1; j <= len1; j++)
{
index1 = 0, index2 = 0;
int score1 = 0, score2 = 0, score3 = 0,final_score=0;
index1 = find_index(seq2[i-1], normol);//當(dāng)i=1時(shí),其實(shí)是第二條序列的第一個(gè)堿基,在序列中的索引為0熙卡,所以要減一
index2 = find_index(seq1[j-1], normol);
score1 = scores[i-1][j-1]+score_array[index1][index2];
score2 = scores[i - 1][j] + gap;
score3 = scores[i][j - 1] + gap;
final_score = max(score1, score2, score3);
if (final_score > 0)
scores[i][j] = final_score;
else
scores[i][j] = 0;
}
}
int max(int score1, int score2, int score3)//求三個(gè)數(shù)最大值函數(shù)
{
if (score1 > score2)
if (score1 > score3)
return score1;
else
return score3;
else
if (score2 > score3)
return score2;
else
return score3;
}
int find_index(char c, char normol[])//尋找堿基在normol中的索引函數(shù)
{
int i=0, index=0;
for (i = 0; i < strlen(normol); i++)
if (c == normol[i])
index = i;
return index;
}
void output(char seq1[], char seq2[])//反向輸出最佳匹配序列函數(shù)
{
int out_num,seq1_num,seq2_num;
int i, j;
int score_sum = 0;
int max(int score1, int score2, int score3);
int len1 = strlen(seq1), len2 = strlen(seq2);
//找出打分矩陣中最置作為反向?qū)ふ易罴驯葘?duì)的起點(diǎn)
int start_i = 0, start_j = 0, max_score;
max_score = scores[0][0];
for (i = 0; i <= len2; i++)
for (j = 0; j <= len1; j++)
if (max_score <= scores[i][j])
{
max_score = scores[i][j];
start_i = i;
start_j = j;
}
//開始反向?qū)ふ易罴哑ヅ? out_num = 0;
seq1_num = start_j - 1;
seq2_num = start_i - 1;
score_sum += scores[start_i][start_j];
seq1_out[out_num] =seq1[seq1_num]; seq1_num--;
seq2_out[out_num] = seq2[seq2_num]; seq2_num--;
out_num++;
int score_max = 0, score1, score2 , score3;//score1為(i,j)左邊的分?jǐn)?shù)杖刷,score2為(i,j)上邊的分?jǐn)?shù),score3為(i,j)對(duì)角線上方的分?jǐn)?shù)
score1 = scores[start_i][start_j - 1];
score2 = scores[start_i - 1][start_j];
score3 = scores[start_i - 1][start_j - 1];
while (score1>=5 || score2>=5 || score3>=5)
{
if (score3 == score_max)
{
start_i = start_i - 1;
start_j = start_j - 1;
seq1_out[out_num] = seq1[seq1_num];
seq2_out[out_num] = seq2[seq2_num];
out_num++; seq1_num--; seq2_num--;
}
if (score1 == score_max && score2!=score_max)
{
start_j = start_j - 1;
seq1_out[out_num] = seq1[seq1_num];
seq2_out[out_num] = '-';
out_num++; seq1_num--;
}
if (score2 == score_max&&score1!=score_max)
{
start_i = start_i - 1;
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = '-';
out_num++; seq2_num--;
}
if ( score2 == score_max && score1== score_max)
{
start_i = start_i - 1;
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = '-';
out_num++; seq2_num--;
}
score1 = scores[start_i][start_j - 1];
score2 = scores[start_i - 1][start_j];
score3 = scores[start_i - 1][start_j - 1];
score_max = max(score1, score2, score3);
score_sum += score_max;
}
printf("The best scores is %d\n", score_sum);
}
運(yùn)行結(jié)果如下圖:
代碼僅供參考和學(xué)習(xí)驳癌,如果有問題滑燃,請(qǐng)聯(lián)系我,我們可以一起討論M窍省1砭健!