Mercurial > hg > multiomr
diff Alignment/FastAlignmentArrays.pyx @ 0:1eb6054a1f84
First Commit
author | Victor Padilla <victor.padilla.mc@gmail.com> |
---|---|
date | Mon, 04 May 2015 22:47:33 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Alignment/FastAlignmentArrays.pyx Mon May 04 22:47:33 2015 +0200 @@ -0,0 +1,240 @@ +''' +@organization: Lancaster University & University of Leeds +@version: 1.0 +Created on 11/12/2014 + +@author: Victor Padilla +@contact: v.padilla@lancaster.ac.uk + +Contains an implementation of the Needleman Wunsch algorithm. +for two dimension arrays. It is a Cython file and should be +compiled using the file setupCython.py + +python setupCython.py build + +The output will be in the build folder +MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR + +# if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. +# The score value will be between +1 to -1 +# if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) + +# 'isFast=true' means that the differences between foo00 and foo000 is -1 +# alignment are the sequences aligned +# finalValue is the last value of the matrix +# finalScore is the similarity of both sequences + +usage: + +seq1=["foo00","abc","123"] +seq2=["foo000","abc","1234"] + +faa=FastAlignmentArray() +alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) +# finalScore=-0.333333343267 +alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) +# finalScore=0.722222208977 + +''' + +import numpy as np +cimport numpy as np + +import NWunsch + +# -*- coding: utf-8 -*- + +from cpython cimport bool + +# the three directions you can go in the traceback: +cdef int DIAG = 0 +cdef int UP = 1 +cdef int LEFT = 2 +cdef float score=0 + +class FastAlignmentArrays: + + def __needleman_wunsch_matrix(self,seq1,seq2,isFast): + """ + fill in the DP matrix according to the Needleman-Wunsch algorithm. + Returns the matrix of scores and the matrix of pointers + + if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. + The score value will be between +1 to -1 + if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) + """ + + indel = -1 # indel penalty + + cdef int n = len(seq1) + cdef int m = len(seq2) + + cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix + cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers + + ##### INITIALIZE SCORING MATRIX (base case) ##### + + cdef int i + cdef int j + for i in range(1, n+1) : + s[i,0] = indel * i + for j in range(1, m+1): + s[0,j] = indel * j + + ########## INITIALIZE TRACEBACK MATRIX ########## + + # Tag first row by LEFT, indicating initial "-"s + ptr[0,1:] = LEFT + + # Tag first column by UP, indicating initial "-"s + ptr[1:,0] = UP + + ##################################################### + + + cdef int p + cdef int q + diagonalRange=350 + for i in range(1,n+1): + p=i-diagonalRange + q=i+diagonalRange + if(p<1): + p=1 + if(q>m+1): + q=m+1 + for j in range(p,q): + # match + myseq1=seq1[i-1] + myseq2=seq2[j-1] + if isinstance(myseq1,list): + myseq1=myseq1[0] + if isinstance(myseq2,list): + myseq2=myseq2[0] + + + if(myseq1== myseq2): + score=1 + else: + score=-1 + + if len(myseq1)==0 or len(myseq2)==0: + score=0 + + #####For double alignment### + if isFast==False: + if len(myseq1)==0 or len(myseq2)==0: + score=0 + else: + score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2) + ############################ + + s[i,j] = s[i-1,j-1]+ score + + + # indel penalty + if s[i-1,j] + indel > s[i,j] : + s[i,j] = s[i-1,j] + indel + ptr[i,j] = UP + # indel penalty + if s[i, j-1] + indel > s[i,j]: + s[i,j] = s[i, j-1] + indel + ptr[i,j] = LEFT + + return s, ptr + + def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) : + """ + Function that traces back the best path to get alignment + + """ + #### TRACE BEST PATH TO GET ALIGNMENT #### + align1 = [] + align2 = [] + align1_gap = [] + align2_gap = [] + gap1=[] + gap2=[] + + cdef int n,m + n, m = (len(seq1), len(seq2)) + cdef int i,j,curr + i = n + j = m + curr = ptr[i, j] + while (i > 0 or j > 0): + ptr[i,j] += 3 + if curr == DIAG : + align1.append(seq1[i-1]) + align2.append(seq2[j-1]) + align1_gap.append(seq1[i-1]) + align2_gap.append(seq2[j-1]) + i -= 1 + j -= 1 + elif curr == LEFT: + align1.append("*") + align2.append(seq2[j-1]) + align1_gap.append("[GAP]") + align2_gap.append(seq2[j-1]) + j -= 1 + elif curr == UP: + align1.append(seq1[i-1]) + align2.append("*") + align1_gap.append(seq1[i-1]) + align2_gap.append("[GAP]") + i -= 1 + + curr = ptr[i,j] + + align1.reverse() + align2.reverse() + align1_gap.reverse() + align2_gap.reverse() + #gaps + for index in range(len(align1_gap)): + if(align1_gap[index])=="[GAP]": + gap1.append(index) + + for index in range(len(align2_gap)): + if(align2_gap[index])=="[GAP]": + gap2.append(index) + + + return align1, align2,gap1,gap2 + + + def needleman_wunsch(self,seq1, seq2,isFast=True) : + """ + Computes an optimal global alignment of two sequences using the Needleman-Wunsch + algorithm + returns the alignment and its score + + # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. + # The score value will be between +1 to -1 + # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) + + # 'isFast=true' means that the differences between foo00 and foo000 is -1 + # alignment are the sequences aligned + # finalValue is the last value of the matrix + # finalScore is the similarity of both sequences + + usage: + + seq1=["foo00","abc","123"] + seq2=["foo000","abc","1234"] + + faa=FastAlignmentArray() + alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) + # finalScore=-0.333333343267 + alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) + # finalScore=0.722222208977 + + """ + s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast) + alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr) + cdef int maxlen=len(seq1) + if len(seq2)>len(seq1): + maxlen=len(seq2) + cdef float finalscore=s[len(seq1), len(seq2)]/maxlen + return alignment, s[len(seq1), len(seq2)],finalscore + +