annotate Alignment/FastAlignmentArrays.pyx @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents 1eb6054a1f84
children
rev   line source
victor@0 1 '''
victor@0 2 @organization: Lancaster University & University of Leeds
victor@0 3 @version: 1.0
victor@0 4 Created on 11/12/2014
victor@0 5
victor@0 6 @author: Victor Padilla
victor@0 7 @contact: v.padilla@lancaster.ac.uk
victor@0 8
victor@0 9 Contains an implementation of the Needleman Wunsch algorithm.
victor@0 10 for two dimension arrays. It is a Cython file and should be
victor@0 11 compiled using the file setupCython.py
victor@0 12
victor@0 13 python setupCython.py build
victor@0 14
victor@0 15 The output will be in the build folder
victor@0 16 MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR
victor@0 17
victor@0 18 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
victor@0 19 # The score value will be between +1 to -1
victor@0 20 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
victor@0 21
victor@0 22 # 'isFast=true' means that the differences between foo00 and foo000 is -1
victor@0 23 # alignment are the sequences aligned
victor@0 24 # finalValue is the last value of the matrix
victor@0 25 # finalScore is the similarity of both sequences
victor@0 26
victor@0 27 usage:
victor@0 28
victor@0 29 seq1=["foo00","abc","123"]
victor@0 30 seq2=["foo000","abc","1234"]
victor@0 31
victor@0 32 faa=FastAlignmentArray()
victor@0 33 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
victor@0 34 # finalScore=-0.333333343267
victor@0 35 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
victor@0 36 # finalScore=0.722222208977
victor@0 37
victor@0 38 '''
victor@0 39
victor@0 40 import numpy as np
victor@0 41 cimport numpy as np
victor@0 42
victor@0 43 import NWunsch
victor@0 44
victor@0 45 # -*- coding: utf-8 -*-
victor@0 46
victor@0 47 from cpython cimport bool
victor@0 48
victor@0 49 # the three directions you can go in the traceback:
victor@0 50 cdef int DIAG = 0
victor@0 51 cdef int UP = 1
victor@0 52 cdef int LEFT = 2
victor@0 53 cdef float score=0
victor@0 54
victor@0 55 class FastAlignmentArrays:
victor@0 56
victor@0 57 def __needleman_wunsch_matrix(self,seq1,seq2,isFast):
victor@0 58 """
victor@0 59 fill in the DP matrix according to the Needleman-Wunsch algorithm.
victor@0 60 Returns the matrix of scores and the matrix of pointers
victor@0 61
victor@0 62 if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
victor@0 63 The score value will be between +1 to -1
victor@0 64 if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
victor@0 65 """
victor@0 66
victor@0 67 indel = -1 # indel penalty
victor@0 68
victor@0 69 cdef int n = len(seq1)
victor@0 70 cdef int m = len(seq2)
victor@0 71
victor@0 72 cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix
victor@0 73 cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers
victor@0 74
victor@0 75 ##### INITIALIZE SCORING MATRIX (base case) #####
victor@0 76
victor@0 77 cdef int i
victor@0 78 cdef int j
victor@0 79 for i in range(1, n+1) :
victor@0 80 s[i,0] = indel * i
victor@0 81 for j in range(1, m+1):
victor@0 82 s[0,j] = indel * j
victor@0 83
victor@0 84 ########## INITIALIZE TRACEBACK MATRIX ##########
victor@0 85
victor@0 86 # Tag first row by LEFT, indicating initial "-"s
victor@0 87 ptr[0,1:] = LEFT
victor@0 88
victor@0 89 # Tag first column by UP, indicating initial "-"s
victor@0 90 ptr[1:,0] = UP
victor@0 91
victor@0 92 #####################################################
victor@0 93
victor@0 94
victor@0 95 cdef int p
victor@0 96 cdef int q
victor@0 97 diagonalRange=350
victor@0 98 for i in range(1,n+1):
victor@0 99 p=i-diagonalRange
victor@0 100 q=i+diagonalRange
victor@0 101 if(p<1):
victor@0 102 p=1
victor@0 103 if(q>m+1):
victor@0 104 q=m+1
victor@0 105 for j in range(p,q):
victor@0 106 # match
victor@0 107 myseq1=seq1[i-1]
victor@0 108 myseq2=seq2[j-1]
victor@0 109 if isinstance(myseq1,list):
victor@0 110 myseq1=myseq1[0]
victor@0 111 if isinstance(myseq2,list):
victor@0 112 myseq2=myseq2[0]
victor@0 113
victor@0 114
victor@0 115 if(myseq1== myseq2):
victor@0 116 score=1
victor@0 117 else:
victor@0 118 score=-1
victor@0 119
victor@0 120 if len(myseq1)==0 or len(myseq2)==0:
victor@0 121 score=0
victor@0 122
victor@0 123 #####For double alignment###
victor@0 124 if isFast==False:
victor@0 125 if len(myseq1)==0 or len(myseq2)==0:
victor@0 126 score=0
victor@0 127 else:
victor@0 128 score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2)
victor@0 129 ############################
victor@0 130
victor@0 131 s[i,j] = s[i-1,j-1]+ score
victor@0 132
victor@0 133
victor@0 134 # indel penalty
victor@0 135 if s[i-1,j] + indel > s[i,j] :
victor@0 136 s[i,j] = s[i-1,j] + indel
victor@0 137 ptr[i,j] = UP
victor@0 138 # indel penalty
victor@0 139 if s[i, j-1] + indel > s[i,j]:
victor@0 140 s[i,j] = s[i, j-1] + indel
victor@0 141 ptr[i,j] = LEFT
victor@0 142
victor@0 143 return s, ptr
victor@0 144
victor@0 145 def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) :
victor@0 146 """
victor@0 147 Function that traces back the best path to get alignment
victor@0 148
victor@0 149 """
victor@0 150 #### TRACE BEST PATH TO GET ALIGNMENT ####
victor@0 151 align1 = []
victor@0 152 align2 = []
victor@0 153 align1_gap = []
victor@0 154 align2_gap = []
victor@0 155 gap1=[]
victor@0 156 gap2=[]
victor@0 157
victor@0 158 cdef int n,m
victor@0 159 n, m = (len(seq1), len(seq2))
victor@0 160 cdef int i,j,curr
victor@0 161 i = n
victor@0 162 j = m
victor@0 163 curr = ptr[i, j]
victor@0 164 while (i > 0 or j > 0):
victor@0 165 ptr[i,j] += 3
victor@0 166 if curr == DIAG :
victor@0 167 align1.append(seq1[i-1])
victor@0 168 align2.append(seq2[j-1])
victor@0 169 align1_gap.append(seq1[i-1])
victor@0 170 align2_gap.append(seq2[j-1])
victor@0 171 i -= 1
victor@0 172 j -= 1
victor@0 173 elif curr == LEFT:
victor@0 174 align1.append("*")
victor@0 175 align2.append(seq2[j-1])
victor@0 176 align1_gap.append("[GAP]")
victor@0 177 align2_gap.append(seq2[j-1])
victor@0 178 j -= 1
victor@0 179 elif curr == UP:
victor@0 180 align1.append(seq1[i-1])
victor@0 181 align2.append("*")
victor@0 182 align1_gap.append(seq1[i-1])
victor@0 183 align2_gap.append("[GAP]")
victor@0 184 i -= 1
victor@0 185
victor@0 186 curr = ptr[i,j]
victor@0 187
victor@0 188 align1.reverse()
victor@0 189 align2.reverse()
victor@0 190 align1_gap.reverse()
victor@0 191 align2_gap.reverse()
victor@0 192 #gaps
victor@0 193 for index in range(len(align1_gap)):
victor@0 194 if(align1_gap[index])=="[GAP]":
victor@0 195 gap1.append(index)
victor@0 196
victor@0 197 for index in range(len(align2_gap)):
victor@0 198 if(align2_gap[index])=="[GAP]":
victor@0 199 gap2.append(index)
victor@0 200
victor@0 201
victor@0 202 return align1, align2,gap1,gap2
victor@0 203
victor@0 204
victor@0 205 def needleman_wunsch(self,seq1, seq2,isFast=True) :
victor@0 206 """
victor@0 207 Computes an optimal global alignment of two sequences using the Needleman-Wunsch
victor@0 208 algorithm
victor@0 209 returns the alignment and its score
victor@0 210
victor@0 211 # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C.
victor@0 212 # The score value will be between +1 to -1
victor@0 213 # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1)
victor@0 214
victor@0 215 # 'isFast=true' means that the differences between foo00 and foo000 is -1
victor@0 216 # alignment are the sequences aligned
victor@0 217 # finalValue is the last value of the matrix
victor@0 218 # finalScore is the similarity of both sequences
victor@0 219
victor@0 220 usage:
victor@0 221
victor@0 222 seq1=["foo00","abc","123"]
victor@0 223 seq2=["foo000","abc","1234"]
victor@0 224
victor@0 225 faa=FastAlignmentArray()
victor@0 226 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True)
victor@0 227 # finalScore=-0.333333343267
victor@0 228 alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False)
victor@0 229 # finalScore=0.722222208977
victor@0 230
victor@0 231 """
victor@0 232 s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast)
victor@0 233 alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr)
victor@0 234 cdef int maxlen=len(seq1)
victor@0 235 if len(seq2)>len(seq1):
victor@0 236 maxlen=len(seq2)
victor@0 237 cdef float finalscore=s[len(seq1), len(seq2)]/maxlen
victor@0 238 return alignment, s[len(seq1), len(seq2)],finalscore
victor@0 239
victor@0 240