comparison Alignment/FastAlignmentArrays.pyx @ 0:1eb6054a1f84

First Commit
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:47:33 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1eb6054a1f84
1 '''
2 @organization: Lancaster University & University of Leeds
3 @version: 1.0
4 Created on 11/12/2014
5
6 @author: Victor Padilla
7 @contact: v.padilla@lancaster.ac.uk
8
9 Contains an implementation of the Needleman Wunsch algorithm.
10 for two dimension arrays. It is a Cython file and should be
11 compiled using the file setupCython.py
12
13 python setupCython.py build
14
15 The output will be in the build folder
16 MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR
17
18 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
19 # The score value will be between +1 to -1
20 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
21
22 # 'isFast=true' means that the differences between foo00 and foo000 is -1
23 # alignment are the sequences aligned
24 # finalValue is the last value of the matrix
25 # finalScore is the similarity of both sequences
26
27 usage:
28
29 seq1=["foo00","abc","123"]
30 seq2=["foo000","abc","1234"]
31
32 faa=FastAlignmentArray()
33 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
34 # finalScore=-0.333333343267
35 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
36 # finalScore=0.722222208977
37
38 '''
39
40 import numpy as np
41 cimport numpy as np
42
43 import NWunsch
44
45 # -*- coding: utf-8 -*-
46
47 from cpython cimport bool
48
49 # the three directions you can go in the traceback:
50 cdef int DIAG = 0
51 cdef int UP = 1
52 cdef int LEFT = 2
53 cdef float score=0
54
55 class FastAlignmentArrays:
56
57 def __needleman_wunsch_matrix(self,seq1,seq2,isFast):
58 """
59 fill in the DP matrix according to the Needleman-Wunsch algorithm.
60 Returns the matrix of scores and the matrix of pointers
61
62 if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
63 The score value will be between +1 to -1
64 if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
65 """
66
67 indel = -1 # indel penalty
68
69 cdef int n = len(seq1)
70 cdef int m = len(seq2)
71
72 cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix
73 cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers
74
75 ##### INITIALIZE SCORING MATRIX (base case) #####
76
77 cdef int i
78 cdef int j
79 for i in range(1, n+1) :
80 s[i,0] = indel * i
81 for j in range(1, m+1):
82 s[0,j] = indel * j
83
84 ########## INITIALIZE TRACEBACK MATRIX ##########
85
86 # Tag first row by LEFT, indicating initial "-"s
87 ptr[0,1:] = LEFT
88
89 # Tag first column by UP, indicating initial "-"s
90 ptr[1:,0] = UP
91
92 #####################################################
93
94
95 cdef int p
96 cdef int q
97 diagonalRange=350
98 for i in range(1,n+1):
99 p=i-diagonalRange
100 q=i+diagonalRange
101 if(p<1):
102 p=1
103 if(q>m+1):
104 q=m+1
105 for j in range(p,q):
106 # match
107 myseq1=seq1[i-1]
108 myseq2=seq2[j-1]
109 if isinstance(myseq1,list):
110 myseq1=myseq1[0]
111 if isinstance(myseq2,list):
112 myseq2=myseq2[0]
113
114
115 if(myseq1== myseq2):
116 score=1
117 else:
118 score=-1
119
120 if len(myseq1)==0 or len(myseq2)==0:
121 score=0
122
123 #####For double alignment###
124 if isFast==False:
125 if len(myseq1)==0 or len(myseq2)==0:
126 score=0
127 else:
128 score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2)
129 ############################
130
131 s[i,j] = s[i-1,j-1]+ score
132
133
134 # indel penalty
135 if s[i-1,j] + indel > s[i,j] :
136 s[i,j] = s[i-1,j] + indel
137 ptr[i,j] = UP
138 # indel penalty
139 if s[i, j-1] + indel > s[i,j]:
140 s[i,j] = s[i, j-1] + indel
141 ptr[i,j] = LEFT
142
143 return s, ptr
144
145 def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) :
146 """
147 Function that traces back the best path to get alignment
148
149 """
150 #### TRACE BEST PATH TO GET ALIGNMENT ####
151 align1 = []
152 align2 = []
153 align1_gap = []
154 align2_gap = []
155 gap1=[]
156 gap2=[]
157
158 cdef int n,m
159 n, m = (len(seq1), len(seq2))
160 cdef int i,j,curr
161 i = n
162 j = m
163 curr = ptr[i, j]
164 while (i > 0 or j > 0):
165 ptr[i,j] += 3
166 if curr == DIAG :
167 align1.append(seq1[i-1])
168 align2.append(seq2[j-1])
169 align1_gap.append(seq1[i-1])
170 align2_gap.append(seq2[j-1])
171 i -= 1
172 j -= 1
173 elif curr == LEFT:
174 align1.append("*")
175 align2.append(seq2[j-1])
176 align1_gap.append("[GAP]")
177 align2_gap.append(seq2[j-1])
178 j -= 1
179 elif curr == UP:
180 align1.append(seq1[i-1])
181 align2.append("*")
182 align1_gap.append(seq1[i-1])
183 align2_gap.append("[GAP]")
184 i -= 1
185
186 curr = ptr[i,j]
187
188 align1.reverse()
189 align2.reverse()
190 align1_gap.reverse()
191 align2_gap.reverse()
192 #gaps
193 for index in range(len(align1_gap)):
194 if(align1_gap[index])=="[GAP]":
195 gap1.append(index)
196
197 for index in range(len(align2_gap)):
198 if(align2_gap[index])=="[GAP]":
199 gap2.append(index)
200
201
202 return align1, align2,gap1,gap2
203
204
205 def needleman_wunsch(self,seq1, seq2,isFast=True) :
206 """
207 Computes an optimal global alignment of two sequences using the Needleman-Wunsch
208 algorithm
209 returns the alignment and its score
210
211 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
212 # The score value will be between +1 to -1
213 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
214
215 # 'isFast=true' means that the differences between foo00 and foo000 is -1
216 # alignment are the sequences aligned
217 # finalValue is the last value of the matrix
218 # finalScore is the similarity of both sequences
219
220 usage:
221
222 seq1=["foo00","abc","123"]
223 seq2=["foo000","abc","1234"]
224
225 faa=FastAlignmentArray()
226 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
227 # finalScore=-0.333333343267
228 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
229 # finalScore=0.722222208977
230
231 """
232 s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast)
233 alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr)
234 cdef int maxlen=len(seq1)
235 if len(seq2)>len(seq1):
236 maxlen=len(seq2)
237 cdef float finalscore=s[len(seq1), len(seq2)]/maxlen
238 return alignment, s[len(seq1), len(seq2)],finalscore
239
240