Smith-Waterman Algorithm in Python (for amino acid sequences)
by Forrest Sheng Bao http://fsbao.net
This code is for amino acid sequences rather than nucleic acid sequences. But you can modify it slightly, such as change the scoring matrix, to make it work for nucleic acid sequences.
#This software is a free software. Thus, it is licensed under GNU General Public License.
#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>
from numpy import *
from matplotlib import *
#read the first sequence
f1=open(sys.argv[1], 'r')
seq1=f1.readline()
seq1=string.strip(seq1)
#read the second sequence
f2=open(sys.argv[2], 'r')
seq2=f2.readline()
seq2=string.strip(seq2)
#read in BLOSUM62 matrix
f3=open(sys.argv[3],'r')
BLOSUM62=[]
for line in f3.readlines():
BLOSUM62.append(map(int, line.split()))
#define similar amino acids
similarAA=['ST','TS','SP','PS','SA','AS','SG','GS','TP','PT','TA','AT','TG','GT','PA','AP','PG','GP','AG','GA','ND','DN','NE','EN','NQ','QN','DE','ED','DQ',
'QD','EQ','QE','HR','RH','HK','KH','RK','KR','MI','IM','ML','LM','MV','VM','IL','LI','IV','VI','LV','VL','FY','YF','FW','WF','YW','WY'];
m,n = len(seq1),len(seq2) #length of two sequences
penalty=-4; #define the gap penalty
#generate DP table and traceback path pointer matrix
score=zeros((m+1,n+1)) #the DP table
pointer=zeros((m+1,n+1)) #to store the traceback path
P=0;
def match_score(alpha,beta,BLOSUM62): #the function to find match/dismatch score from BLOSUM62 by letters of AAs
alphabet={}
alphabet["A"] = 0
alphabet["R"] = 1
alphabet["N"] = 2
alphabet["D"] = 3
alphabet["C"] = 4
alphabet["Q"] = 5
alphabet["E"] = 6
alphabet["G"] = 7
alphabet["H"] = 8
alphabet["I"] = 9
alphabet["L"] = 10
alphabet["K"] = 11
alphabet["M"] = 12
alphabet["F"] = 13
alphabet["P"] = 14
alphabet["S"] = 15
alphabet["T"] = 16
alphabet["W"] = 17
alphabet["Y"] = 18
alphabet["V"] = 19
alphabet["B"] = 20
alphabet["Z"] = 21
alphabet["X"] = 22
lut_x=alphabet[alpha]
lut_y=alphabet[beta]
return BLOSUM62[lut_x][lut_y]
max_score=P; #initial maximum score in DP table
#calculate DP table and mark pointers
for i in range(1,m+1):
for j in range(1,n+1):
score_up=score[i-1][j]+penalty;
score_down=score[i][j-1]+penalty;
score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1],BLOSUM62);
score[i][j]=max(0,score_up,score_down,score_diagonal);
if score[i][j]==0:
pointer[i][j]=0; #0 means end of the path
if score[i][j]==score_up:
pointer[i][j]=1; #1 means trace up
if score[i][j]==score_down:
pointer[i][j]=2; #2 means trace left
if score[i][j]==score_diagonal:
pointer[i][j]=3; #3 means trace diagonal
if score[i][j]>=max_score:
max_i=i;
max_j=j;
max_score=score[i][j];
#END of DP table
align1,align2='',''; #initial sequences
i,j=max_i,max_j; #indices of path starting point
#traceback, follow pointers
while pointer[i][j]!=0:
if pointer[i][j]==3:
align1=align1+seq1[i-1];
align2=align2+seq2[j-1];
i=i-1;
j=j-1;
elif pointer[i][j]==2:
align1=align1+'-';
align2=align2+seq2[j-1];
j=j-1;
elif pointer[i][j]==1:
align1=align1+seq1[i-1];
align2=align2+'-';
i=i-1;
#END of traceback
align1=align1[::-1]; #reverse sequence 1
align2=align2[::-1]; #reverse sequence 2
i,j=0,0;
#calcuate identity, similarity, score and aligned sequeces
def result(align1,align2,BLOSUM62):
symbol='';
found=0;
score=0;
penalty=-4;
identity,similarity=0,0;
for i in range(0,len(align1)):
if align1[i]==align2[i]: #if two AAs are the same, then output the letter
symbol=symbol+align1[i];
identity=identity+1;
similarity=similarity+1;
score=score+match_score(align1[i],align2[i],BLOSUM62);
elif align1[i]!=align2[i] and align1[i]!='-' and align2[i]!='-': #if they are not identical and none of them is gap
score=score+match_score(align1[i],align2[i],BLOSUM62);
for j in range(0,len(similarAA)-1):
if align1[i]+align2[i]==similarAA[j]: #search whether these two AAs form a pair in similar AA table
found=1;
if found==1:
symbol=symbol+':'; #if they are similar AA, output ':'
similarity=similarity+1;
if found==0:
symbol=symbol+' '; #o/w, output a space
found=0;
elif align1[i]=='-' or align2[i]=='-': #if one of them is a gap, output a space
symbol=symbol+' ';
score=score+penalty;
identity=float(identity)/len(align1)*100;
similarity=float(similarity)/len(align2)*100;
print 'Identity =', "%3.3f" % identity, 'percent';
print 'Similarity =', "%3.3f" % similarity, 'percent';
print 'Score =', score;
print align1
print symbol
print align2
#END of function result
result(align1,align2,BLOSUM62)
The execution is like this:
forrest@rainbow:/forrest/work/BioInfomatics/SW$ python SW.py Q9GMA3 Q9M276 BLOSUM62
Identity = 35.294 percent
Similarity = 52.941 percent
Score = 118
KRKKRRHRTVFTAHQLEELEKAF-SEAHY-P-D-VY-AREMLAMKTELPEDRIQVWFQNRRAKWR-KR-EKRWGGSSVMAEYG-L
K KK F: Q: LE F SE:: P V ARE L:: : P : :WFQN:RA:W: K EK :: A:Y L
KMKKSNNQKRFSEEQIKSLELIFESETRLEPRKKVQVARE-LGL--Q-PR-QVAIWFQNKRARWKTKQLEKEY--NTLRANYNNL
Update@Jun. 20, 2010: Several users have written to me regarding the format of my BLOSUM62. It is as follows:
4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
-1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
-2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
-2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
-1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
-1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
-2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
-1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
-1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
-1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
-1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
-2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
-1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
-3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
-2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
-2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
-1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
-4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
To jest z jest strony.
http://narnia.cs.ttu.edu/drupal/node/104