Mercurial > hg > multiomr
comparison Alignment/FastAlignmentArrays.pyx @ 0:1eb6054a1f84
First Commit
author | Victor Padilla <victor.padilla.mc@gmail.com> |
---|---|
date | Mon, 04 May 2015 22:47:33 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1eb6054a1f84 |
---|---|
1 ''' | |
2 @organization: Lancaster University & University of Leeds | |
3 @version: 1.0 | |
4 Created on 11/12/2014 | |
5 | |
6 @author: Victor Padilla | |
7 @contact: v.padilla@lancaster.ac.uk | |
8 | |
9 Contains an implementation of the Needleman Wunsch algorithm. | |
10 for two dimension arrays. It is a Cython file and should be | |
11 compiled using the file setupCython.py | |
12 | |
13 python setupCython.py build | |
14 | |
15 The output will be in the build folder | |
16 MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR | |
17 | |
18 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. | |
19 # The score value will be between +1 to -1 | |
20 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) | |
21 | |
22 # 'isFast=true' means that the differences between foo00 and foo000 is -1 | |
23 # alignment are the sequences aligned | |
24 # finalValue is the last value of the matrix | |
25 # finalScore is the similarity of both sequences | |
26 | |
27 usage: | |
28 | |
29 seq1=["foo00","abc","123"] | |
30 seq2=["foo000","abc","1234"] | |
31 | |
32 faa=FastAlignmentArray() | |
33 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) | |
34 # finalScore=-0.333333343267 | |
35 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) | |
36 # finalScore=0.722222208977 | |
37 | |
38 ''' | |
39 | |
40 import numpy as np | |
41 cimport numpy as np | |
42 | |
43 import NWunsch | |
44 | |
45 # -*- coding: utf-8 -*- | |
46 | |
47 from cpython cimport bool | |
48 | |
49 # the three directions you can go in the traceback: | |
50 cdef int DIAG = 0 | |
51 cdef int UP = 1 | |
52 cdef int LEFT = 2 | |
53 cdef float score=0 | |
54 | |
55 class FastAlignmentArrays: | |
56 | |
57 def __needleman_wunsch_matrix(self,seq1,seq2,isFast): | |
58 """ | |
59 fill in the DP matrix according to the Needleman-Wunsch algorithm. | |
60 Returns the matrix of scores and the matrix of pointers | |
61 | |
62 if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. | |
63 The score value will be between +1 to -1 | |
64 if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) | |
65 """ | |
66 | |
67 indel = -1 # indel penalty | |
68 | |
69 cdef int n = len(seq1) | |
70 cdef int m = len(seq2) | |
71 | |
72 cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix | |
73 cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers | |
74 | |
75 ##### INITIALIZE SCORING MATRIX (base case) ##### | |
76 | |
77 cdef int i | |
78 cdef int j | |
79 for i in range(1, n+1) : | |
80 s[i,0] = indel * i | |
81 for j in range(1, m+1): | |
82 s[0,j] = indel * j | |
83 | |
84 ########## INITIALIZE TRACEBACK MATRIX ########## | |
85 | |
86 # Tag first row by LEFT, indicating initial "-"s | |
87 ptr[0,1:] = LEFT | |
88 | |
89 # Tag first column by UP, indicating initial "-"s | |
90 ptr[1:,0] = UP | |
91 | |
92 ##################################################### | |
93 | |
94 | |
95 cdef int p | |
96 cdef int q | |
97 diagonalRange=350 | |
98 for i in range(1,n+1): | |
99 p=i-diagonalRange | |
100 q=i+diagonalRange | |
101 if(p<1): | |
102 p=1 | |
103 if(q>m+1): | |
104 q=m+1 | |
105 for j in range(p,q): | |
106 # match | |
107 myseq1=seq1[i-1] | |
108 myseq2=seq2[j-1] | |
109 if isinstance(myseq1,list): | |
110 myseq1=myseq1[0] | |
111 if isinstance(myseq2,list): | |
112 myseq2=myseq2[0] | |
113 | |
114 | |
115 if(myseq1== myseq2): | |
116 score=1 | |
117 else: | |
118 score=-1 | |
119 | |
120 if len(myseq1)==0 or len(myseq2)==0: | |
121 score=0 | |
122 | |
123 #####For double alignment### | |
124 if isFast==False: | |
125 if len(myseq1)==0 or len(myseq2)==0: | |
126 score=0 | |
127 else: | |
128 score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2) | |
129 ############################ | |
130 | |
131 s[i,j] = s[i-1,j-1]+ score | |
132 | |
133 | |
134 # indel penalty | |
135 if s[i-1,j] + indel > s[i,j] : | |
136 s[i,j] = s[i-1,j] + indel | |
137 ptr[i,j] = UP | |
138 # indel penalty | |
139 if s[i, j-1] + indel > s[i,j]: | |
140 s[i,j] = s[i, j-1] + indel | |
141 ptr[i,j] = LEFT | |
142 | |
143 return s, ptr | |
144 | |
145 def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) : | |
146 """ | |
147 Function that traces back the best path to get alignment | |
148 | |
149 """ | |
150 #### TRACE BEST PATH TO GET ALIGNMENT #### | |
151 align1 = [] | |
152 align2 = [] | |
153 align1_gap = [] | |
154 align2_gap = [] | |
155 gap1=[] | |
156 gap2=[] | |
157 | |
158 cdef int n,m | |
159 n, m = (len(seq1), len(seq2)) | |
160 cdef int i,j,curr | |
161 i = n | |
162 j = m | |
163 curr = ptr[i, j] | |
164 while (i > 0 or j > 0): | |
165 ptr[i,j] += 3 | |
166 if curr == DIAG : | |
167 align1.append(seq1[i-1]) | |
168 align2.append(seq2[j-1]) | |
169 align1_gap.append(seq1[i-1]) | |
170 align2_gap.append(seq2[j-1]) | |
171 i -= 1 | |
172 j -= 1 | |
173 elif curr == LEFT: | |
174 align1.append("*") | |
175 align2.append(seq2[j-1]) | |
176 align1_gap.append("[GAP]") | |
177 align2_gap.append(seq2[j-1]) | |
178 j -= 1 | |
179 elif curr == UP: | |
180 align1.append(seq1[i-1]) | |
181 align2.append("*") | |
182 align1_gap.append(seq1[i-1]) | |
183 align2_gap.append("[GAP]") | |
184 i -= 1 | |
185 | |
186 curr = ptr[i,j] | |
187 | |
188 align1.reverse() | |
189 align2.reverse() | |
190 align1_gap.reverse() | |
191 align2_gap.reverse() | |
192 #gaps | |
193 for index in range(len(align1_gap)): | |
194 if(align1_gap[index])=="[GAP]": | |
195 gap1.append(index) | |
196 | |
197 for index in range(len(align2_gap)): | |
198 if(align2_gap[index])=="[GAP]": | |
199 gap2.append(index) | |
200 | |
201 | |
202 return align1, align2,gap1,gap2 | |
203 | |
204 | |
205 def needleman_wunsch(self,seq1, seq2,isFast=True) : | |
206 """ | |
207 Computes an optimal global alignment of two sequences using the Needleman-Wunsch | |
208 algorithm | |
209 returns the alignment and its score | |
210 | |
211 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. | |
212 # The score value will be between +1 to -1 | |
213 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) | |
214 | |
215 # 'isFast=true' means that the differences between foo00 and foo000 is -1 | |
216 # alignment are the sequences aligned | |
217 # finalValue is the last value of the matrix | |
218 # finalScore is the similarity of both sequences | |
219 | |
220 usage: | |
221 | |
222 seq1=["foo00","abc","123"] | |
223 seq2=["foo000","abc","1234"] | |
224 | |
225 faa=FastAlignmentArray() | |
226 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) | |
227 # finalScore=-0.333333343267 | |
228 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) | |
229 # finalScore=0.722222208977 | |
230 | |
231 """ | |
232 s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast) | |
233 alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr) | |
234 cdef int maxlen=len(seq1) | |
235 if len(seq2)>len(seq1): | |
236 maxlen=len(seq2) | |
237 cdef float finalscore=s[len(seq1), len(seq2)]/maxlen | |
238 return alignment, s[len(seq1), len(seq2)],finalscore | |
239 | |
240 |