victor@0
|
1 '''
|
victor@0
|
2 @organization: Lancaster University & University of Leeds
|
victor@0
|
3 @version: 1.0
|
victor@0
|
4 Created on 11/12/2014
|
victor@0
|
5
|
victor@0
|
6 @author: Victor Padilla
|
victor@0
|
7 @contact: v.padilla@lancaster.ac.uk
|
victor@0
|
8
|
victor@0
|
9 Contains an implementation of the Needleman Wunsch algorithm.
|
victor@0
|
10 for two dimension arrays. It is a Cython file and should be
|
victor@0
|
11 compiled using the file setupCython.py
|
victor@0
|
12
|
victor@0
|
13 python setupCython.py build
|
victor@0
|
14
|
victor@0
|
15 The output will be in the build folder
|
victor@0
|
16 MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR
|
victor@0
|
17
|
victor@0
|
18 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
|
victor@0
|
19 # The score value will be between +1 to -1
|
victor@0
|
20 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
|
victor@0
|
21
|
victor@0
|
22 # 'isFast=true' means that the differences between foo00 and foo000 is -1
|
victor@0
|
23 # alignment are the sequences aligned
|
victor@0
|
24 # finalValue is the last value of the matrix
|
victor@0
|
25 # finalScore is the similarity of both sequences
|
victor@0
|
26
|
victor@0
|
27 usage:
|
victor@0
|
28
|
victor@0
|
29 seq1=["foo00","abc","123"]
|
victor@0
|
30 seq2=["foo000","abc","1234"]
|
victor@0
|
31
|
victor@0
|
32 faa=FastAlignmentArray()
|
victor@0
|
33 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
|
victor@0
|
34 # finalScore=-0.333333343267
|
victor@0
|
35 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
|
victor@0
|
36 # finalScore=0.722222208977
|
victor@0
|
37
|
victor@0
|
38 '''
|
victor@0
|
39
|
victor@0
|
40 import numpy as np
|
victor@0
|
41 cimport numpy as np
|
victor@0
|
42
|
victor@0
|
43 import NWunsch
|
victor@0
|
44
|
victor@0
|
45 # -*- coding: utf-8 -*-
|
victor@0
|
46
|
victor@0
|
47 from cpython cimport bool
|
victor@0
|
48
|
victor@0
|
49 # the three directions you can go in the traceback:
|
victor@0
|
50 cdef int DIAG = 0
|
victor@0
|
51 cdef int UP = 1
|
victor@0
|
52 cdef int LEFT = 2
|
victor@0
|
53 cdef float score=0
|
victor@0
|
54
|
victor@0
|
55 class FastAlignmentArrays:
|
victor@0
|
56
|
victor@0
|
57 def __needleman_wunsch_matrix(self,seq1,seq2,isFast):
|
victor@0
|
58 """
|
victor@0
|
59 fill in the DP matrix according to the Needleman-Wunsch algorithm.
|
victor@0
|
60 Returns the matrix of scores and the matrix of pointers
|
victor@0
|
61
|
victor@0
|
62 if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
|
victor@0
|
63 The score value will be between +1 to -1
|
victor@0
|
64 if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
|
victor@0
|
65 """
|
victor@0
|
66
|
victor@0
|
67 indel = -1 # indel penalty
|
victor@0
|
68
|
victor@0
|
69 cdef int n = len(seq1)
|
victor@0
|
70 cdef int m = len(seq2)
|
victor@0
|
71
|
victor@0
|
72 cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix
|
victor@0
|
73 cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers
|
victor@0
|
74
|
victor@0
|
75 ##### INITIALIZE SCORING MATRIX (base case) #####
|
victor@0
|
76
|
victor@0
|
77 cdef int i
|
victor@0
|
78 cdef int j
|
victor@0
|
79 for i in range(1, n+1) :
|
victor@0
|
80 s[i,0] = indel * i
|
victor@0
|
81 for j in range(1, m+1):
|
victor@0
|
82 s[0,j] = indel * j
|
victor@0
|
83
|
victor@0
|
84 ########## INITIALIZE TRACEBACK MATRIX ##########
|
victor@0
|
85
|
victor@0
|
86 # Tag first row by LEFT, indicating initial "-"s
|
victor@0
|
87 ptr[0,1:] = LEFT
|
victor@0
|
88
|
victor@0
|
89 # Tag first column by UP, indicating initial "-"s
|
victor@0
|
90 ptr[1:,0] = UP
|
victor@0
|
91
|
victor@0
|
92 #####################################################
|
victor@0
|
93
|
victor@0
|
94
|
victor@0
|
95 cdef int p
|
victor@0
|
96 cdef int q
|
victor@0
|
97 diagonalRange=350
|
victor@0
|
98 for i in range(1,n+1):
|
victor@0
|
99 p=i-diagonalRange
|
victor@0
|
100 q=i+diagonalRange
|
victor@0
|
101 if(p<1):
|
victor@0
|
102 p=1
|
victor@0
|
103 if(q>m+1):
|
victor@0
|
104 q=m+1
|
victor@0
|
105 for j in range(p,q):
|
victor@0
|
106 # match
|
victor@0
|
107 myseq1=seq1[i-1]
|
victor@0
|
108 myseq2=seq2[j-1]
|
victor@0
|
109 if isinstance(myseq1,list):
|
victor@0
|
110 myseq1=myseq1[0]
|
victor@0
|
111 if isinstance(myseq2,list):
|
victor@0
|
112 myseq2=myseq2[0]
|
victor@0
|
113
|
victor@0
|
114
|
victor@0
|
115 if(myseq1== myseq2):
|
victor@0
|
116 score=1
|
victor@0
|
117 else:
|
victor@0
|
118 score=-1
|
victor@0
|
119
|
victor@0
|
120 if len(myseq1)==0 or len(myseq2)==0:
|
victor@0
|
121 score=0
|
victor@0
|
122
|
victor@0
|
123 #####For double alignment###
|
victor@0
|
124 if isFast==False:
|
victor@0
|
125 if len(myseq1)==0 or len(myseq2)==0:
|
victor@0
|
126 score=0
|
victor@0
|
127 else:
|
victor@0
|
128 score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2)
|
victor@0
|
129 ############################
|
victor@0
|
130
|
victor@0
|
131 s[i,j] = s[i-1,j-1]+ score
|
victor@0
|
132
|
victor@0
|
133
|
victor@0
|
134 # indel penalty
|
victor@0
|
135 if s[i-1,j] + indel > s[i,j] :
|
victor@0
|
136 s[i,j] = s[i-1,j] + indel
|
victor@0
|
137 ptr[i,j] = UP
|
victor@0
|
138 # indel penalty
|
victor@0
|
139 if s[i, j-1] + indel > s[i,j]:
|
victor@0
|
140 s[i,j] = s[i, j-1] + indel
|
victor@0
|
141 ptr[i,j] = LEFT
|
victor@0
|
142
|
victor@0
|
143 return s, ptr
|
victor@0
|
144
|
victor@0
|
145 def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) :
|
victor@0
|
146 """
|
victor@0
|
147 Function that traces back the best path to get alignment
|
victor@0
|
148
|
victor@0
|
149 """
|
victor@0
|
150 #### TRACE BEST PATH TO GET ALIGNMENT ####
|
victor@0
|
151 align1 = []
|
victor@0
|
152 align2 = []
|
victor@0
|
153 align1_gap = []
|
victor@0
|
154 align2_gap = []
|
victor@0
|
155 gap1=[]
|
victor@0
|
156 gap2=[]
|
victor@0
|
157
|
victor@0
|
158 cdef int n,m
|
victor@0
|
159 n, m = (len(seq1), len(seq2))
|
victor@0
|
160 cdef int i,j,curr
|
victor@0
|
161 i = n
|
victor@0
|
162 j = m
|
victor@0
|
163 curr = ptr[i, j]
|
victor@0
|
164 while (i > 0 or j > 0):
|
victor@0
|
165 ptr[i,j] += 3
|
victor@0
|
166 if curr == DIAG :
|
victor@0
|
167 align1.append(seq1[i-1])
|
victor@0
|
168 align2.append(seq2[j-1])
|
victor@0
|
169 align1_gap.append(seq1[i-1])
|
victor@0
|
170 align2_gap.append(seq2[j-1])
|
victor@0
|
171 i -= 1
|
victor@0
|
172 j -= 1
|
victor@0
|
173 elif curr == LEFT:
|
victor@0
|
174 align1.append("*")
|
victor@0
|
175 align2.append(seq2[j-1])
|
victor@0
|
176 align1_gap.append("[GAP]")
|
victor@0
|
177 align2_gap.append(seq2[j-1])
|
victor@0
|
178 j -= 1
|
victor@0
|
179 elif curr == UP:
|
victor@0
|
180 align1.append(seq1[i-1])
|
victor@0
|
181 align2.append("*")
|
victor@0
|
182 align1_gap.append(seq1[i-1])
|
victor@0
|
183 align2_gap.append("[GAP]")
|
victor@0
|
184 i -= 1
|
victor@0
|
185
|
victor@0
|
186 curr = ptr[i,j]
|
victor@0
|
187
|
victor@0
|
188 align1.reverse()
|
victor@0
|
189 align2.reverse()
|
victor@0
|
190 align1_gap.reverse()
|
victor@0
|
191 align2_gap.reverse()
|
victor@0
|
192 #gaps
|
victor@0
|
193 for index in range(len(align1_gap)):
|
victor@0
|
194 if(align1_gap[index])=="[GAP]":
|
victor@0
|
195 gap1.append(index)
|
victor@0
|
196
|
victor@0
|
197 for index in range(len(align2_gap)):
|
victor@0
|
198 if(align2_gap[index])=="[GAP]":
|
victor@0
|
199 gap2.append(index)
|
victor@0
|
200
|
victor@0
|
201
|
victor@0
|
202 return align1, align2,gap1,gap2
|
victor@0
|
203
|
victor@0
|
204
|
victor@0
|
205 def needleman_wunsch(self,seq1, seq2,isFast=True) :
|
victor@0
|
206 """
|
victor@0
|
207 Computes an optimal global alignment of two sequences using the Needleman-Wunsch
|
victor@0
|
208 algorithm
|
victor@0
|
209 returns the alignment and its score
|
victor@0
|
210
|
victor@0
|
211 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
|
victor@0
|
212 # The score value will be between +1 to -1
|
victor@0
|
213 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
|
victor@0
|
214
|
victor@0
|
215 # 'isFast=true' means that the differences between foo00 and foo000 is -1
|
victor@0
|
216 # alignment are the sequences aligned
|
victor@0
|
217 # finalValue is the last value of the matrix
|
victor@0
|
218 # finalScore is the similarity of both sequences
|
victor@0
|
219
|
victor@0
|
220 usage:
|
victor@0
|
221
|
victor@0
|
222 seq1=["foo00","abc","123"]
|
victor@0
|
223 seq2=["foo000","abc","1234"]
|
victor@0
|
224
|
victor@0
|
225 faa=FastAlignmentArray()
|
victor@0
|
226 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
|
victor@0
|
227 # finalScore=-0.333333343267
|
victor@0
|
228 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
|
victor@0
|
229 # finalScore=0.722222208977
|
victor@0
|
230
|
victor@0
|
231 """
|
victor@0
|
232 s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast)
|
victor@0
|
233 alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr)
|
victor@0
|
234 cdef int maxlen=len(seq1)
|
victor@0
|
235 if len(seq2)>len(seq1):
|
victor@0
|
236 maxlen=len(seq2)
|
victor@0
|
237 cdef float finalscore=s[len(seq1), len(seq2)]/maxlen
|
victor@0
|
238 return alignment, s[len(seq1), len(seq2)],finalscore
|
victor@0
|
239
|
victor@0
|
240
|