diff Process/PipelineAlignment.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Process/PipelineAlignment.py	Mon May 04 22:56:18 2015 +0200
@@ -0,0 +1,372 @@
+'''
+Created on 10/11/2014
+
+@organization: Lancaster University & University of Leeds
+@version: 1.0
+Created on 11/12/2014
+
+@author: Victor Padilla
+@contact: v.padilla@lancaster.ac.uk
+
+Functions related to the alignment
+and voting process
+'''
+from Alignment import FastAlignmentArrays
+from SymbolConversion import SymbolConversion
+from Functions import FilesFunctions
+import numpy as np
+from Clustering import Clustering
+import math
+from Alignment import NWunsch
+
+
+class PipelineAlignment:
+    def alignGround(self,OMRs,part):
+        '''
+        Returns one part and the ground aligned. The first array value of OMRs 
+        should be the ground and the second the omr to align
+        '''
+        sc=SymbolConversion()
+        OMRs_symbols=[]
+        omr_symbolsAlign=[]
+        for omr in OMRs:
+            omr_symbols=sc.filterOMR(omr,part)[1]
+            OMRs_symbols.append(omr_symbols)
+            omr_symbolsAlign.append([])
+        
+        faa=FastAlignmentArrays()
+        out=faa.needleman_wunsch(OMRs_symbols[0], OMRs_symbols[1])[0]
+
+             
+        return out
+    
+    def getDistances(self,OMRs_symbols):
+        '''
+        Returns the distance matrix from several omr
+        in symbols, using the first symbol only
+        [u'N:E-4_0.25', 0.25, '', 2.75, 3.0, None] y
+        [u'N:E-4_0.25', 0.33, '', 2.50, 2.75, None]
+        
+        are equals
+        
+        Returns a triangular matrix
+        [[ 0.          0.17647058  0.19141912]
+         [ 0.          0.          0.17647058]
+         [ 0.          0.          0.        ]]
+        
+        Uses the algorithm implemented in C for increasing the speed 
+        Alignment/C_Libraries/NWunsch
+        '''
+        ls=len(OMRs_symbols)
+        dimension= (ls,ls)
+        distances=np.zeros(dimension)   
+        for i in range(len(OMRs_symbols)):  
+            for j in range(i+1,len(OMRs_symbols)):
+                print i,j
+                align1=[]
+                align2=[]
+                for s in OMRs_symbols[i]:
+                    align1.append(s[0])
+                for s in OMRs_symbols[j]:
+                    align2.append(s[0])
+                #Algorithm implemented in C
+                if len(align1)==0 or len(align2)==0:
+                    score=0
+                else:
+                    print"-------------------------"  
+                    
+                    score=NWunsch.NWunsch_getSimilarity(align1,align2)
+                    print"-------------------------"  
+                if math.isnan(score):
+                    score=0
+                distances[i][j]=1-score      
+        return distances
+    
+    def getDistanceLength(self,OMRs_symbols):
+        '''
+        Similar to getDistance, but based on the length
+        of the omrs. Testing purposes
+        '''
+        ls=len(OMRs_symbols)
+        dimension= (ls,ls)
+        distances=np.zeros(dimension)       
+        for i in range(len(OMRs_symbols)):       
+            for j in range(i+1,len(OMRs_symbols)):
+                print i,j
+                len_i=len(OMRs_symbols[i])
+                len_j=len(OMRs_symbols[j])
+                maxLen=len_j
+                if len_i>=len_j:
+                    maxLen=len_i
+                
+                score=(len_i-len_j)*1.0/maxLen
+                if score<0:
+                    score=score*-1
+                distances[i][j]=score        
+        return distances
+    
+    def __getMinimum(self,distance):
+        '''
+        Returns the minimum value and the x,y position
+        in the distance matrix
+        '''
+        minim=1000
+        iMin=0
+        jMin=0
+        for i in range(len(distance[0])):         
+            for j in range(i+1,len(distance[0])):
+                dist=distance[i][j]
+                if isinstance(dist,list):
+                    dist=dist[0]
+                if dist<minim:
+                    minim=dist
+                    iMin=i
+                    jMin=j
+                    
+        return minim,iMin,jMin
+    
+    
+    def __recalculeDistances(self,distance,iMin,jMin):
+        '''
+        Removes the rows and the column in the distance matrix
+        and calculates the new matrix 
+        '''
+        for i in range(len(distance[0])):         
+            for j in range(i+1,len(distance[0])):       
+                if j==iMin:
+                    dist=distance[i][j]
+                    dist2=distance[i][jMin]
+                    distance[i][j]=(dist+dist2)/2
+        distance=np.delete(distance, jMin, 0)
+        distance=np.delete(distance, jMin, 1)     
+        return distance
+    
+    
+    def __getPairingReal(self,iMin,jMin,removedArray):
+        '''
+        Returns the real omr position in the original matrix 
+        based on the actual position and the elements removed
+        
+        usage:
+        self._getPairingReal(0,1,[1])
+        returns
+        0,2
+        '''
+        iMinReal=iMin
+        jMinReal=jMin
+        removedArray.sort()
+        for removedItem in removedArray:
+            if iMinReal>=removedItem:
+                iMinReal+=1
+            if jMinReal>=removedItem:
+                jMinReal+=1
+        return iMinReal,jMinReal
+    
+    def selectBetterOMRs(self,OMRs_symbols):
+        '''
+        Based on Philogenetic trees, this function
+        takes the best omrs based on the distances between them
+        '''
+        distanceSimple=self.getDistances(OMRs_symbols)  
+        clustering=Clustering()
+        distances=clustering.getCompleteMatrix(distanceSimple)  
+        species = []
+        for i in range(len(OMRs_symbols)):
+            species.append(i)
+        clu = clustering.make_clusters(species)    
+        tree = clustering.regroup(clu, distances)
+
+        #at least 3 leafs in the tree
+        maintree=tree
+        for i in range(3,len(OMRs_symbols)):
+            maintree=clustering.getBetterTree(tree,i)
+            if len(clustering.getLeafs(maintree))>=3:
+                break
+           
+            
+        betterOmrIds= clustering.getLeafs(maintree) 
+        
+        #Graphic representation
+        strTree=clustering.getStringTree(tree,tree.height,"")
+        print strTree
+        clustering.showTree(strTree)
+        strMainTree=clustering.getStringTree(maintree,maintree.height,"")
+        print strMainTree
+        clustering.showTree(strMainTree)
+
+        newOMRs=[]
+        for i in betterOmrIds:
+            newOMRs.append(OMRs_symbols[i])
+
+        return newOMRs,betterOmrIds
+    
+#     def alignNJ(self,idPart,fsOMRs,partOMRs):
+#         '''
+#         Main function for aligning the different OMRs
+#         
+#         Returns:
+#             omr_symbolsAligned. OMR array of symbols aligned (only the best)
+#             betterOmrIds. Id array of better OMRs (for writing the log file)
+#         
+#         usage:
+#             pa=PipelineAlignment()
+#             omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
+#         '''
+#         
+# 
+#         OMRs_symbols=[]
+#         sc=SymbolConversion()
+#         print "2---converting to symbols---"
+#         for omr in OMRs:
+#             if omr!=[]:
+#                 omr_symbols=sc.filterOMR(omr,idPart)[1]
+#                 OMRs_symbols.append(omr_symbols)
+#             else:
+#                 OMRs_symbols.append([])
+#             
+#         print "3---removing worst OMR---"  
+# #         betterOmrIds=[]
+#         OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
+# 
+#         print "4---calculating distances---"
+#         distances=self.getDistances(OMRs_symbols)
+#         
+#         print "5---aligning symbols---"
+#                
+#         omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
+#         
+#         return omr_symbolsAligned,betterOmrIds
+    
+    def alignNJ_files(self,idPart,fsOMRs_files,partOMRs_files):
+        '''
+        Main function for aligning the different OMRs
+        
+        Returns:
+            omr_symbolsAligned. OMR array of symbols aligned (only the best)
+            betterOmrIds. Id array of better OMRs (for writing the log file)
+        
+        usage:
+            pa=PipelineAlignment()
+            omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
+        '''
+        
+               
+        ff=FilesFunctions()
+        OMRs_files=partOMRs_files+fsOMRs_files
+        OMRs_symbols=[]
+        sc=SymbolConversion()
+        print "2---converting to symbols---"
+        for omr_file in OMRs_files:
+            omr=ff.getOMR(omr_file)
+            if omr!=[]:
+                omr_symbols=sc.filterOMR(omr,idPart)[1]
+                OMRs_symbols.append(omr_symbols)
+            else:
+                OMRs_symbols.append([])
+            
+        print "3---removing worst OMR---"  
+#         betterOmrIds=[]
+
+        
+        OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
+
+        print "4---calculating distances---"
+        distances=self.getDistances(OMRs_symbols)
+        
+        print "5---aligning symbols---"
+               
+        omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
+        
+        return omr_symbolsAligned,betterOmrIds
+    def __setVoidArray(self,length):
+        '''
+        private function to create a void array
+        '''
+        arrayOut=[]
+        for _ in range(length):
+            arrayOut.append([])
+
+        return arrayOut
+            
+    def setOMRSymbolsAligned(self,OMRs_symbols,distances):   
+        '''
+        returns the OMRs symbols aligned from OMR symbols and 
+        the distances matrix
+
+        Aligns the closest pair and drops its second omr from the matrix
+        until one omr is left; OMRs_symbols loses one item per pairing.
+        ''' 
+        pairings=[]
+        pairingsReal=[]
+        gapArray=[]
+        removedArray=[]
+        omr_symbolsAlign=self.__setVoidArray(len(OMRs_symbols))
+        faa=FastAlignmentArrays()
+        while len(distances[0])>1:
+            minim,iMin,jMin=self.__getMinimum(distances)
+            print distances,minim,iMin,jMin
+            pairings.append([iMin,jMin])
+            # indexes in the reduced matrix -> indexes of the original omrs
+            iMinReal,jMinReal=self.__getPairingReal(iMin,jMin,removedArray)
+            pairingsReal.append([iMinReal,jMinReal])
+            # an empty slot is filled with the omr's own symbols
+            if len(omr_symbolsAlign[iMinReal])==0:
+                omr_symbolsAlign[iMinReal]=OMRs_symbols[iMin]
+            if len(omr_symbolsAlign[jMinReal])==0:
+                omr_symbolsAlign[jMinReal]=OMRs_symbols[jMin]
+
+            # out[0],out[1] replace both sequences; out[2],out[3] are their gaps
+            out=faa.needleman_wunsch(omr_symbolsAlign[iMinReal], omr_symbolsAlign[jMinReal])[0]
+            omr_symbolsAlign[iMinReal]=out[0]
+            omr_symbolsAlign[jMinReal]=out[1]
+            gap1=out[2]
+            gap2=out[3]
+            gapArray.append([gap1,gap2])
+            # OMRs_symbols stays indexed like the reduced matrix
+            OMRs_symbols.pop(jMin)
+            removedArray.append(jMinReal)
+            distances=self.__recalculeDistances(distances,iMin,jMin)
+        # gaps of later pairings are passed on to omrs paired earlier
+        omr_symbolsAlign=self.__fillingGaps(omr_symbolsAlign,gapArray,pairingsReal)
+        return omr_symbolsAlign
+    
+    def __fillingGaps(self,omr_symbolsAlign,gapArray,pairingsReal):
+        '''
+        private function to complete gaps based on the gap matrix 
+        and the pairs stablished
+        '''
+        for p in range(len(pairingsReal)-1,0,-1):
+            for i in range(2):
+                for j in range(2):
+                    for t in range(1,p+1):
+                        if(pairingsReal[p][i]==pairingsReal[p-t][j]):
+                            if j==0:
+                                s=1
+                            if j==1:
+                                s=0
+                            omrIndex=pairingsReal[p-t][s]
+                            newGap=[]
+                            for gap in gapArray[p][i]:
+                                omr_symbolsAlign[omrIndex].insert(gap,"*")   
+                                newGap.append(gap)
+                            gapArray[p-t][s]=gapArray[p-t][s]+newGap
+                                
+
+        return omr_symbolsAlign
+
+
+
+                    
+                
+
+    
+    
+    
+    
+                    
+                
+             
+
+        
+        
+        
\ No newline at end of file