algorytm smitha watermana python


Smith-Waterman Algorithm in Python (for amino acid sequences)

by Forrest Sheng Bao http://fsbao.net

This code is for amino acid sequences rather than nucleic acid sequences. But you can modify it slightly, such as change the scoring matrix, to make it work for nucleic acid sequences.

#This software is a free software. Thus, it is licensed under GNU General Public License.

#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.

#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>

import sys, string

from numpy import *

from matplotlib import *

#read the first sequence

f1=open(sys.argv[1], 'r')

seq1=f1.readline()

seq1=string.strip(seq1)

#read the second sequence

f2=open(sys.argv[2], 'r')

seq2=f2.readline()

seq2=string.strip(seq2)

#read in BLOSUM62 matrix

f3=open(sys.argv[3],'r')

BLOSUM62=[]

for line in f3.readlines():

BLOSUM62.append(map(int, line.split()))

#define similar amino acids

similarAA=['ST','TS','SP','PS','SA','AS','SG','GS','TP','PT','TA','AT','TG','GT','PA','AP','PG','GP','AG','GA','ND','DN','NE','EN','NQ','QN','DE','ED','DQ',

'QD','EQ','QE','HR','RH','HK','KH','RK','KR','MI','IM','ML','LM','MV','VM','IL','LI','IV','VI','LV','VL','FY','YF','FW','WF','YW','WY'];

m,n = len(seq1),len(seq2) #length of two sequences

penalty=-4; #define the gap penalty

#generate DP table and traceback path pointer matrix

score=zeros((m+1,n+1)) #the DP table

pointer=zeros((m+1,n+1)) #to store the traceback path

P=0;

def match_score(alpha,beta,BLOSUM62): #the function to find match/dismatch score from BLOSUM62 by letters of AAs

alphabet={}

alphabet["A"] = 0

alphabet["R"] = 1

alphabet["N"] = 2

alphabet["D"] = 3

alphabet["C"] = 4

alphabet["Q"] = 5

alphabet["E"] = 6

alphabet["G"] = 7

alphabet["H"] = 8

alphabet["I"] = 9

alphabet["L"] = 10

alphabet["K"] = 11

alphabet["M"] = 12

alphabet["F"] = 13

alphabet["P"] = 14

alphabet["S"] = 15

alphabet["T"] = 16

alphabet["W"] = 17

alphabet["Y"] = 18

alphabet["V"] = 19

alphabet["B"] = 20

alphabet["Z"] = 21

alphabet["X"] = 22

lut_x=alphabet[alpha]

lut_y=alphabet[beta]

return BLOSUM62[lut_x][lut_y]

max_score=P; #initial maximum score in DP table

#calculate DP table and mark pointers

for i in range(1,m+1):

for j in range(1,n+1):

score_up=score[i-1][j]+penalty;

score_down=score[i][j-1]+penalty;

score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1],BLOSUM62);

score[i][j]=max(0,score_up,score_down,score_diagonal);

if score[i][j]==0:

pointer[i][j]=0; #0 means end of the path

if score[i][j]==score_up:

pointer[i][j]=1; #1 means trace up

if score[i][j]==score_down:

pointer[i][j]=2; #2 means trace left

if score[i][j]==score_diagonal:

pointer[i][j]=3; #3 means trace diagonal

if score[i][j]>=max_score:

max_i=i;

max_j=j;

max_score=score[i][j];

#END of DP table

align1,align2='',''; #initial sequences

i,j=max_i,max_j; #indices of path starting point

#traceback, follow pointers

while pointer[i][j]!=0:

if pointer[i][j]==3:

align1=align1+seq1[i-1];

align2=align2+seq2[j-1];

i=i-1;

j=j-1;

elif pointer[i][j]==2:

align1=align1+'-';

align2=align2+seq2[j-1];

j=j-1;

elif pointer[i][j]==1:

align1=align1+seq1[i-1];

align2=align2+'-';

i=i-1;

#END of traceback

align1=align1[::-1]; #reverse sequence 1

align2=align2[::-1]; #reverse sequence 2

i,j=0,0;

#calcuate identity, similarity, score and aligned sequeces

def result(align1,align2,BLOSUM62):

symbol='';

found=0;

score=0;

penalty=-4;

identity,similarity=0,0;

for i in range(0,len(align1)):

if align1[i]==align2[i]: #if two AAs are the same, then output the letter

symbol=symbol+align1[i];

identity=identity+1;

similarity=similarity+1;

score=score+match_score(align1[i],align2[i],BLOSUM62);

elif align1[i]!=align2[i] and align1[i]!='-' and align2[i]!='-': #if they are not identical and none of them is gap

score=score+match_score(align1[i],align2[i],BLOSUM62);

for j in range(0,len(similarAA)-1):

if align1[i]+align2[i]==similarAA[j]: #search whether these two AAs form a pair in similar AA table

found=1;

if found==1:

symbol=symbol+':'; #if they are similar AA, output ':'

similarity=similarity+1;

if found==0:

symbol=symbol+' '; #o/w, output a space

found=0;

elif align1[i]=='-' or align2[i]=='-': #if one of them is a gap, output a space

symbol=symbol+' ';

score=score+penalty;

identity=float(identity)/len(align1)*100;

similarity=float(similarity)/len(align2)*100;

print 'Identity =', "%3.3f" % identity, 'percent';

print 'Similarity =', "%3.3f" % similarity, 'percent';

print 'Score =', score;

print align1

print symbol

print align2

#END of function result

result(align1,align2,BLOSUM62)

The execution is like this:

forrest@rainbow:/forrest/work/BioInfomatics/SW$ python SW.py Q9GMA3 Q9M276 BLOSUM62

Identity = 35.294 percent

Similarity = 52.941 percent

Score = 118

KRKKRRHRTVFTAHQLEELEKAF-SEAHY-P-D-VY-AREMLAMKTELPEDRIQVWFQNRRAKWR-KR-EKRWGGSSVMAEYG-L

K KK F: Q: LE F SE:: P V ARE L:: : P : :WFQN:RA:W: K EK :: A:Y L

KMKKSNNQKRFSEEQIKSLELIFESETRLEPRKKVQVARE-LGL--Q-PR-QVAIWFQNKRARWKTKQLEKEY--NTLRANYNNL

Update@Jun. 20, 2010: Several users have written to me regarding the format of my BLOSUM62. It is as follows:

4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4

-1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4

-2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4

-2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4

0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4

-1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4

-1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4

0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4

-2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4

-1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4

-1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4

-1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4

-1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4

-2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4

-1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4

1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4

0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4

-3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4

-2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4

0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4

-2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4

-1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4

0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4

-4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1

To jest z jest strony.

http://narnia.cs.ttu.edu/drupal/node/104



Wyszukiwarka

Podobne podstrony:
Układy Napędowe oraz algorytmy sterowania w bioprotezach
5 Algorytmy
5 Algorytmy wyznaczania dyskretnej transformaty Fouriera (CPS)
Tętniak aorty brzusznej algorytm
Algorytmy rastrowe
Algorytmy genetyczne
Teorie algorytmow genetycznych prezentacja
Algorytmy tekstowe
Algorytmy i struktury danych Wykład 1 Reprezentacja informacji w komputerze
ALGORYTM EUKLIDESA
Algorytmy z przykladami tp 7 0
ALGORYT8

więcej podobnych podstron