victor@0: ''' victor@0: @organization: Lancaster University & University of Leeds victor@0: @version: 1.0 victor@0: Created on 11/12/2014 victor@0: victor@0: @author: Victor Padilla victor@0: @contact: v.padilla@lancaster.ac.uk victor@0: victor@0: Contains an implementation of the Needleman Wunsch algorithm. victor@0: for two dimension arrays. It is a Cython file and should be victor@0: compiled using the file setupCython.py victor@0: victor@0: python setupCython.py build victor@0: victor@0: The output will be in the build folder victor@0: MultipleOMR\\build\\lib.win32-2.7\\MultipleOMR victor@0: victor@0: # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. victor@0: # The score value will be between +1 to -1 victor@0: # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) victor@0: victor@0: # 'isFast=true' means that the differences between foo00 and foo000 is -1 victor@0: # alignment are the sequences aligned victor@0: # finalValue is the last value of the matrix victor@0: # finalScore is the similarity of both sequences victor@0: victor@0: usage: victor@0: victor@0: seq1=["foo00","abc","123"] victor@0: seq2=["foo000","abc","1234"] victor@0: victor@0: faa=FastAlignmentArray() victor@0: alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) victor@0: # finalScore=-0.333333343267 victor@0: alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) victor@0: # finalScore=0.722222208977 victor@0: victor@0: ''' victor@0: victor@0: import numpy as np victor@0: cimport numpy as np victor@0: victor@0: import NWunsch victor@0: victor@0: # -*- coding: utf-8 -*- victor@0: victor@0: from cpython cimport bool victor@0: victor@0: # the three directions you can go in the traceback: victor@0: cdef int DIAG = 0 victor@0: cdef int UP = 1 victor@0: cdef int LEFT = 2 victor@0: cdef float score=0 victor@0: victor@0: class FastAlignmentArrays: victor@0: victor@0: def __needleman_wunsch_matrix(self,seq1,seq2,isFast): victor@0: """ victor@0: fill in the DP matrix according to the Needleman-Wunsch algorithm. victor@0: Returns the matrix of scores and the matrix of pointers victor@0: victor@0: if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. victor@0: The score value will be between +1 to -1 victor@0: if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) victor@0: """ victor@0: victor@0: indel = -1 # indel penalty victor@0: victor@0: cdef int n = len(seq1) victor@0: cdef int m = len(seq2) victor@0: victor@0: cdef np.ndarray s = np.zeros( (n+1, m+1) ) # DP matrix victor@0: cdef np.ndarray ptr = np.zeros( (n+1, m+1), dtype=int ) # matrix of pointers victor@0: victor@0: ##### INITIALIZE SCORING MATRIX (base case) ##### victor@0: victor@0: cdef int i victor@0: cdef int j victor@0: for i in range(1, n+1) : victor@0: s[i,0] = indel * i victor@0: for j in range(1, m+1): victor@0: s[0,j] = indel * j victor@0: victor@0: ########## INITIALIZE TRACEBACK MATRIX ########## victor@0: victor@0: # Tag first row by LEFT, indicating initial "-"s victor@0: ptr[0,1:] = LEFT victor@0: victor@0: # Tag first column by UP, indicating initial "-"s victor@0: ptr[1:,0] = UP victor@0: victor@0: ##################################################### victor@0: victor@0: victor@0: cdef int p victor@0: cdef int q victor@0: diagonalRange=350 victor@0: for i in range(1,n+1): victor@0: p=i-diagonalRange victor@0: q=i+diagonalRange victor@0: if(p<1): victor@0: p=1 victor@0: if(q>m+1): victor@0: q=m+1 victor@0: for j in range(p,q): victor@0: # match victor@0: myseq1=seq1[i-1] victor@0: myseq2=seq2[j-1] victor@0: if isinstance(myseq1,list): victor@0: myseq1=myseq1[0] victor@0: if isinstance(myseq2,list): victor@0: myseq2=myseq2[0] victor@0: victor@0: victor@0: if(myseq1== myseq2): victor@0: score=1 victor@0: else: victor@0: score=-1 victor@0: victor@0: if len(myseq1)==0 or len(myseq2)==0: victor@0: score=0 victor@0: victor@0: #####For double alignment### victor@0: if isFast==False: victor@0: if len(myseq1)==0 or len(myseq2)==0: victor@0: score=0 victor@0: else: victor@0: score=NWunsch.NWunsch_getSimilarity(myseq1,myseq2) victor@0: ############################ victor@0: victor@0: s[i,j] = s[i-1,j-1]+ score victor@0: victor@0: victor@0: # indel penalty victor@0: if s[i-1,j] + indel > s[i,j] : victor@0: s[i,j] = s[i-1,j] + indel victor@0: ptr[i,j] = UP victor@0: # indel penalty victor@0: if s[i, j-1] + indel > s[i,j]: victor@0: s[i,j] = s[i, j-1] + indel victor@0: ptr[i,j] = LEFT victor@0: victor@0: return s, ptr victor@0: victor@0: def __needleman_wunsch_trace(self,seq1, seq2,np.ndarray s, np.ndarray ptr) : victor@0: """ victor@0: Function that traces back the best path to get alignment victor@0: victor@0: """ victor@0: #### TRACE BEST PATH TO GET ALIGNMENT #### victor@0: align1 = [] victor@0: align2 = [] victor@0: align1_gap = [] victor@0: align2_gap = [] victor@0: gap1=[] victor@0: gap2=[] victor@0: victor@0: cdef int n,m victor@0: n, m = (len(seq1), len(seq2)) victor@0: cdef int i,j,curr victor@0: i = n victor@0: j = m victor@0: curr = ptr[i, j] victor@0: while (i > 0 or j > 0): victor@0: ptr[i,j] += 3 victor@0: if curr == DIAG : victor@0: align1.append(seq1[i-1]) victor@0: align2.append(seq2[j-1]) victor@0: align1_gap.append(seq1[i-1]) victor@0: align2_gap.append(seq2[j-1]) victor@0: i -= 1 victor@0: j -= 1 victor@0: elif curr == LEFT: victor@0: align1.append("*") victor@0: align2.append(seq2[j-1]) victor@0: align1_gap.append("[GAP]") victor@0: align2_gap.append(seq2[j-1]) victor@0: j -= 1 victor@0: elif curr == UP: victor@0: align1.append(seq1[i-1]) victor@0: align2.append("*") victor@0: align1_gap.append(seq1[i-1]) victor@0: align2_gap.append("[GAP]") victor@0: i -= 1 victor@0: victor@0: curr = ptr[i,j] victor@0: victor@0: align1.reverse() victor@0: align2.reverse() victor@0: align1_gap.reverse() victor@0: align2_gap.reverse() victor@0: #gaps victor@0: for index in range(len(align1_gap)): victor@0: if(align1_gap[index])=="[GAP]": victor@0: gap1.append(index) victor@0: victor@0: for index in range(len(align2_gap)): victor@0: if(align2_gap[index])=="[GAP]": victor@0: gap2.append(index) victor@0: victor@0: victor@0: return align1, align2,gap1,gap2 victor@0: victor@0: victor@0: def needleman_wunsch(self,seq1, seq2,isFast=True) : victor@0: """ victor@0: Computes an optimal global alignment of two sequences using the Needleman-Wunsch victor@0: algorithm victor@0: returns the alignment and its score victor@0: victor@0: # if isFast==False, calculate the difference between the strings using the Needlemann Wunsch in C. victor@0: # The score value will be between +1 to -1 victor@0: # if isFast==True, just compare the strings if they are equals (score=1) or different (score=-1) victor@0: victor@0: # 'isFast=true' means that the differences between foo00 and foo000 is -1 victor@0: # alignment are the sequences aligned victor@0: # finalValue is the last value of the matrix victor@0: # finalScore is the similarity of both sequences victor@0: victor@0: usage: victor@0: victor@0: seq1=["foo00","abc","123"] victor@0: seq2=["foo000","abc","1234"] victor@0: victor@0: faa=FastAlignmentArray() victor@0: alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=True) victor@0: # finalScore=-0.333333343267 victor@0: alignment,finalValue,finalScore=faa.needleman_wunsch(seq1, seq2,isFast=False) victor@0: # finalScore=0.722222208977 victor@0: victor@0: """ victor@0: s,ptr = self.__needleman_wunsch_matrix(seq1, seq2,isFast) victor@0: alignment = self.__needleman_wunsch_trace(seq1, seq2, s, ptr) victor@0: cdef int maxlen=len(seq1) victor@0: if len(seq2)>len(seq1): victor@0: maxlen=len(seq2) victor@0: cdef float finalscore=s[len(seq1), len(seq2)]/maxlen victor@0: return alignment, s[len(seq1), len(seq2)],finalscore victor@0: victor@0: