Mercurial > hg > multiomr
diff Process/PipelineAlignment.py @ 2:46fb79167a61 tip
Main Code
author | Victor Padilla <victor.padilla.mc@gmail.com> |
---|---|
date | Mon, 04 May 2015 22:56:18 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Process/PipelineAlignment.py Mon May 04 22:56:18 2015 +0200 @@ -0,0 +1,372 @@ +''' +Created on 10/11/2014 + +@organization: Lancaster University & University of Leeds +@version: 1.0 +Created on 11/12/2014 + +@author: Victor Padilla +@contact: v.padilla@lancaster.ac.uk + +Functions related to the alignment +and voting process +''' +from Alignment import FastAlignmentArrays +from SymbolConversion import SymbolConversion +from Functions import FilesFunctions +import numpy as np +from Clustering import Clustering +import math +from Alignment import NWunsch + + +class PipelineAlignment: + def alignGround(self,OMRs,part): + ''' + Returns one part and the ground aligned. The first array value of OMRs + should be the ground and the second the omr to align + ''' + sc=SymbolConversion() + OMRs_symbols=[] + omr_symbolsAlign=[] + for omr in OMRs: + omr_symbols=sc.filterOMR(omr,part)[1] + OMRs_symbols.append(omr_symbols) + omr_symbolsAlign.append([]) + + faa=FastAlignmentArrays() + out=faa.needleman_wunsch(OMRs_symbols[0], OMRs_symbols[1])[0] + + + return out + + def getDistances(self,OMRs_symbols): + ''' + Returns the distance matrix from several omr + in symbols, using the first symbol only + [u'N:E-4_0.25', 0.25, '', 2.75, 3.0, None] y + [u'N:E-4_0.25', 0.33, '', 2.50, 2.75, None] + + are equals + + Returns a triangular matrix + [[ 0. 0.17647058 0.19141912] + [ 0. 0. 0.17647058] + [ 0. 0. 0. ]] + + Uses the algorithm implemented in C for increasing the speed + Alignment/C_Libraries/NWunsch + ''' + ls=len(OMRs_symbols) + dimension= (ls,ls) + distances=np.zeros(dimension) + for i in range(len(OMRs_symbols)): + for j in range(i+1,len(OMRs_symbols)): + print i,j + align1=[] + align2=[] + for s in OMRs_symbols[i]: + align1.append(s[0]) + for s in OMRs_symbols[j]: + align2.append(s[0]) + #Algorithm implemented in C + if len(align1)==0 or len(align2)==0: + score=0 + else: + print"-------------------------" + + score=NWunsch.NWunsch_getSimilarity(align1,align2) + print"-------------------------" + if math.isnan(score): + score=0 + distances[i][j]=1-score + return distances + + def getDistanceLength(self,OMRs_symbols): + ''' + Similar to getDistance, but based on the length + of the omrs. Testing purposes + ''' + ls=len(OMRs_symbols) + dimension= (ls,ls) + distances=np.zeros(dimension) + for i in range(len(OMRs_symbols)): + for j in range(i+1,len(OMRs_symbols)): + print i,j + len_i=len(OMRs_symbols[i]) + len_j=len(OMRs_symbols[j]) + maxLen=len_j + if len_i>=len_j: + maxLen=len_i + + score=(len_i-len_j)*1.0/maxLen + if score<0: + score=score*-1 + distances[i][j]=score + return distances + + def __getMinimum(self,distance): + ''' + Returns the minimum value and the x,y position + in the distance matrix + ''' + minim=1000 + iMin=0 + jMin=0 + for i in range(len(distance[0])): + for j in range(i+1,len(distance[0])): + dist=distance[i][j] + if isinstance(dist,list): + dist=dist[0] + if dist<minim: + minim=dist + iMin=i + jMin=j + + return minim,iMin,jMin + + + def __recalculeDistances(self,distance,iMin,jMin): + ''' + Removes the rows and the column in the distance matrix + and calculates the new matrix + ''' + for i in range(len(distance[0])): + for j in range(i+1,len(distance[0])): + if j==iMin: + dist=distance[i][j] + dist2=distance[i][jMin] + distance[i][j]=(dist+dist2)/2 + distance=np.delete(distance, jMin, 0) + distance=np.delete(distance, jMin, 1) + return distance + + + def __getPairingReal(self,iMin,jMin,removedArray): + ''' + Returns the real omr position in the original matrix + based on the actual position and the elements removed + + usage: + self._getPairingReal(0,1,[1]) + returns + 0,2 + ''' + iMinReal=iMin + jMinReal=jMin + removedArray.sort() + for removedItem in removedArray: + if iMinReal>=removedItem: + iMinReal+=1 + if jMinReal>=removedItem: + jMinReal+=1 + return iMinReal,jMinReal + + def selectBetterOMRs(self,OMRs_symbols): + ''' + Based on Philogenetic trees, this function + takes the best omrs based on the distances between them + ''' + distanceSimple=self.getDistances(OMRs_symbols) + clustering=Clustering() + distances=clustering.getCompleteMatrix(distanceSimple) + species = [] + for i in range(len(OMRs_symbols)): + species.append(i) + clu = clustering.make_clusters(species) + tree = clustering.regroup(clu, distances) + + #at least 3 leafs in the tree + maintree=tree + for i in range(3,len(OMRs_symbols)): + maintree=clustering.getBetterTree(tree,i) + if len(clustering.getLeafs(maintree))>=3: + break + + + betterOmrIds= clustering.getLeafs(maintree) + + #Graphic representation + strTree=clustering.getStringTree(tree,tree.height,"") + print strTree + clustering.showTree(strTree) + strMainTree=clustering.getStringTree(maintree,maintree.height,"") + print strMainTree + clustering.showTree(strMainTree) + + newOMRs=[] + for i in betterOmrIds: + newOMRs.append(OMRs_symbols[i]) + + return newOMRs,betterOmrIds + +# def alignNJ(self,idPart,fsOMRs,partOMRs): +# ''' +# Main function for aligning the different OMRs +# +# Returns: +# omr_symbolsAligned. OMR array of symbols aligned (only the best) +# betterOmrIds. Id array of better OMRs (for writing the log file) +# +# usage: +# pa=PipelineAlignment() +# omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs) +# ''' +# +# +# OMRs_symbols=[] +# sc=SymbolConversion() +# print "2---converting to symbols---" +# for omr in OMRs: +# if omr!=[]: +# omr_symbols=sc.filterOMR(omr,idPart)[1] +# OMRs_symbols.append(omr_symbols) +# else: +# OMRs_symbols.append([]) +# +# print "3---removing worst OMR---" +# # betterOmrIds=[] +# OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols) +# +# print "4---calculating distances---" +# distances=self.getDistances(OMRs_symbols) +# +# print "5---aligning symbols---" +# +# omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances) +# +# return omr_symbolsAligned,betterOmrIds + + def alignNJ_files(self,idPart,fsOMRs_files,partOMRs_files): + ''' + Main function for aligning the different OMRs + + Returns: + omr_symbolsAligned. OMR array of symbols aligned (only the best) + betterOmrIds. Id array of better OMRs (for writing the log file) + + usage: + pa=PipelineAlignment() + omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs) + ''' + + + ff=FilesFunctions() + OMRs_files=partOMRs_files+fsOMRs_files + OMRs_symbols=[] + sc=SymbolConversion() + print "2---converting to symbols---" + for omr_file in OMRs_files: + omr=ff.getOMR(omr_file) + if omr!=[]: + omr_symbols=sc.filterOMR(omr,idPart)[1] + OMRs_symbols.append(omr_symbols) + else: + OMRs_symbols.append([]) + + print "3---removing worst OMR---" +# betterOmrIds=[] + + + OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols) + + print "4---calculating distances---" + distances=self.getDistances(OMRs_symbols) + + print "5---aligning symbols---" + + omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances) + + return omr_symbolsAligned,betterOmrIds + def __setVoidArray(self,length): + ''' + private function to create a void array + ''' + arrayOut=[] + for _ in range(length): + arrayOut.append([]) + + return arrayOut + + def setOMRSymbolsAligned(self,OMRs_symbols,distances): + ''' + returns the OMRs symbols aligned from OMR symbols and + the distances matrix + ''' + pairings=[] + pairingsReal=[] + gapArray=[] + removedArray=[] + omr_symbolsAlign=self.__setVoidArray(len(OMRs_symbols)) + faa=FastAlignmentArrays() + while len(distances[0])>1: + minim,iMin,jMin=self.__getMinimum(distances) + print distances,minim,iMin,jMin + pairings.append([iMin,jMin]) + iMinReal,jMinReal=self.__getPairingReal(iMin,jMin,removedArray) + + pairingsReal.append([iMinReal,jMinReal]) + if len(omr_symbolsAlign[iMinReal])==0: + omr_symbolsAlign[iMinReal]=OMRs_symbols[iMin] + if len(omr_symbolsAlign[jMinReal])==0: + omr_symbolsAlign[jMinReal]=OMRs_symbols[jMin] + + out=faa.needleman_wunsch(omr_symbolsAlign[iMinReal], omr_symbolsAlign[jMinReal])[0] + omr_symbolsAlign[iMinReal]=out[0] + omr_symbolsAlign[jMinReal]=out[1] + gap1=out[2] + gap2=out[3] + gapArray.append([gap1,gap2]) + + + OMRs_symbols.pop(jMin) + removedArray.append(jMinReal) + + distances=self.__recalculeDistances(distances,iMin,jMin) + + + omr_symbolsAlign=self.__fillingGaps(omr_symbolsAlign,gapArray,pairingsReal) + + + return omr_symbolsAlign + + def __fillingGaps(self,omr_symbolsAlign,gapArray,pairingsReal): + ''' + private function to complete gaps based on the gap matrix + and the pairs stablished + ''' + for p in range(len(pairingsReal)-1,0,-1): + for i in range(2): + for j in range(2): + for t in range(1,p+1): + if(pairingsReal[p][i]==pairingsReal[p-t][j]): + if j==0: + s=1 + if j==1: + s=0 + omrIndex=pairingsReal[p-t][s] + newGap=[] + for gap in gapArray[p][i]: + omr_symbolsAlign[omrIndex].insert(gap,"*") + newGap.append(gap) + gapArray[p-t][s]=gapArray[p-t][s]+newGap + + + return omr_symbolsAlign + + + + + + + + + + + + + + + + + \ No newline at end of file