annotate Process/PipelineAlignment.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
rev   line source
victor@2 1 '''
victor@2 2 Created on 10/11/2014
victor@2 3
victor@2 4 @organization: Lancaster University & University of Leeds
victor@2 5 @version: 1.0
victor@2 6 Created on 11/12/2014
victor@2 7
victor@2 8 @author: Victor Padilla
victor@2 9 @contact: v.padilla@lancaster.ac.uk
victor@2 10
victor@2 11 Functions related to the alignment
victor@2 12 and voting process
victor@2 13 '''
victor@2 14 from Alignment import FastAlignmentArrays
victor@2 15 from SymbolConversion import SymbolConversion
victor@2 16 from Functions import FilesFunctions
victor@2 17 import numpy as np
victor@2 18 from Clustering import Clustering
victor@2 19 import math
victor@2 20 from Alignment import NWunsch
victor@2 21
victor@2 22
victor@2 23 class PipelineAlignment:
victor@2 24 def alignGround(self,OMRs,part):
victor@2 25 '''
victor@2 26 Returns one part and the ground aligned. The first array value of OMRs
victor@2 27 should be the ground and the second the omr to align
victor@2 28 '''
victor@2 29 sc=SymbolConversion()
victor@2 30 OMRs_symbols=[]
victor@2 31 omr_symbolsAlign=[]
victor@2 32 for omr in OMRs:
victor@2 33 omr_symbols=sc.filterOMR(omr,part)[1]
victor@2 34 OMRs_symbols.append(omr_symbols)
victor@2 35 omr_symbolsAlign.append([])
victor@2 36
victor@2 37 faa=FastAlignmentArrays()
victor@2 38 out=faa.needleman_wunsch(OMRs_symbols[0], OMRs_symbols[1])[0]
victor@2 39
victor@2 40
victor@2 41 return out
victor@2 42
victor@2 43 def getDistances(self,OMRs_symbols):
victor@2 44 '''
victor@2 45 Returns the distance matrix from several omr
victor@2 46 in symbols, using the first symbol only
victor@2 47 [u'N:E-4_0.25', 0.25, '', 2.75, 3.0, None] y
victor@2 48 [u'N:E-4_0.25', 0.33, '', 2.50, 2.75, None]
victor@2 49
victor@2 50 are equals
victor@2 51
victor@2 52 Returns a triangular matrix
victor@2 53 [[ 0. 0.17647058 0.19141912]
victor@2 54 [ 0. 0. 0.17647058]
victor@2 55 [ 0. 0. 0. ]]
victor@2 56
victor@2 57 Uses the algorithm implemented in C for increasing the speed
victor@2 58 Alignment/C_Libraries/NWunsch
victor@2 59 '''
victor@2 60 ls=len(OMRs_symbols)
victor@2 61 dimension= (ls,ls)
victor@2 62 distances=np.zeros(dimension)
victor@2 63 for i in range(len(OMRs_symbols)):
victor@2 64 for j in range(i+1,len(OMRs_symbols)):
victor@2 65 print i,j
victor@2 66 align1=[]
victor@2 67 align2=[]
victor@2 68 for s in OMRs_symbols[i]:
victor@2 69 align1.append(s[0])
victor@2 70 for s in OMRs_symbols[j]:
victor@2 71 align2.append(s[0])
victor@2 72 #Algorithm implemented in C
victor@2 73 if len(align1)==0 or len(align2)==0:
victor@2 74 score=0
victor@2 75 else:
victor@2 76 print"-------------------------"
victor@2 77
victor@2 78 score=NWunsch.NWunsch_getSimilarity(align1,align2)
victor@2 79 print"-------------------------"
victor@2 80 if math.isnan(score):
victor@2 81 score=0
victor@2 82 distances[i][j]=1-score
victor@2 83 return distances
victor@2 84
victor@2 85 def getDistanceLength(self,OMRs_symbols):
victor@2 86 '''
victor@2 87 Similar to getDistance, but based on the length
victor@2 88 of the omrs. Testing purposes
victor@2 89 '''
victor@2 90 ls=len(OMRs_symbols)
victor@2 91 dimension= (ls,ls)
victor@2 92 distances=np.zeros(dimension)
victor@2 93 for i in range(len(OMRs_symbols)):
victor@2 94 for j in range(i+1,len(OMRs_symbols)):
victor@2 95 print i,j
victor@2 96 len_i=len(OMRs_symbols[i])
victor@2 97 len_j=len(OMRs_symbols[j])
victor@2 98 maxLen=len_j
victor@2 99 if len_i>=len_j:
victor@2 100 maxLen=len_i
victor@2 101
victor@2 102 score=(len_i-len_j)*1.0/maxLen
victor@2 103 if score<0:
victor@2 104 score=score*-1
victor@2 105 distances[i][j]=score
victor@2 106 return distances
victor@2 107
victor@2 108 def __getMinimum(self,distance):
victor@2 109 '''
victor@2 110 Returns the minimum value and the x,y position
victor@2 111 in the distance matrix
victor@2 112 '''
victor@2 113 minim=1000
victor@2 114 iMin=0
victor@2 115 jMin=0
victor@2 116 for i in range(len(distance[0])):
victor@2 117 for j in range(i+1,len(distance[0])):
victor@2 118 dist=distance[i][j]
victor@2 119 if isinstance(dist,list):
victor@2 120 dist=dist[0]
victor@2 121 if dist<minim:
victor@2 122 minim=dist
victor@2 123 iMin=i
victor@2 124 jMin=j
victor@2 125
victor@2 126 return minim,iMin,jMin
victor@2 127
victor@2 128
victor@2 129 def __recalculeDistances(self,distance,iMin,jMin):
victor@2 130 '''
victor@2 131 Removes the rows and the column in the distance matrix
victor@2 132 and calculates the new matrix
victor@2 133 '''
victor@2 134 for i in range(len(distance[0])):
victor@2 135 for j in range(i+1,len(distance[0])):
victor@2 136 if j==iMin:
victor@2 137 dist=distance[i][j]
victor@2 138 dist2=distance[i][jMin]
victor@2 139 distance[i][j]=(dist+dist2)/2
victor@2 140 distance=np.delete(distance, jMin, 0)
victor@2 141 distance=np.delete(distance, jMin, 1)
victor@2 142 return distance
victor@2 143
victor@2 144
victor@2 145 def __getPairingReal(self,iMin,jMin,removedArray):
victor@2 146 '''
victor@2 147 Returns the real omr position in the original matrix
victor@2 148 based on the actual position and the elements removed
victor@2 149
victor@2 150 usage:
victor@2 151 self._getPairingReal(0,1,[1])
victor@2 152 returns
victor@2 153 0,2
victor@2 154 '''
victor@2 155 iMinReal=iMin
victor@2 156 jMinReal=jMin
victor@2 157 removedArray.sort()
victor@2 158 for removedItem in removedArray:
victor@2 159 if iMinReal>=removedItem:
victor@2 160 iMinReal+=1
victor@2 161 if jMinReal>=removedItem:
victor@2 162 jMinReal+=1
victor@2 163 return iMinReal,jMinReal
victor@2 164
victor@2 165 def selectBetterOMRs(self,OMRs_symbols):
victor@2 166 '''
victor@2 167 Based on Philogenetic trees, this function
victor@2 168 takes the best omrs based on the distances between them
victor@2 169 '''
victor@2 170 distanceSimple=self.getDistances(OMRs_symbols)
victor@2 171 clustering=Clustering()
victor@2 172 distances=clustering.getCompleteMatrix(distanceSimple)
victor@2 173 species = []
victor@2 174 for i in range(len(OMRs_symbols)):
victor@2 175 species.append(i)
victor@2 176 clu = clustering.make_clusters(species)
victor@2 177 tree = clustering.regroup(clu, distances)
victor@2 178
victor@2 179 #at least 3 leafs in the tree
victor@2 180 maintree=tree
victor@2 181 for i in range(3,len(OMRs_symbols)):
victor@2 182 maintree=clustering.getBetterTree(tree,i)
victor@2 183 if len(clustering.getLeafs(maintree))>=3:
victor@2 184 break
victor@2 185
victor@2 186
victor@2 187 betterOmrIds= clustering.getLeafs(maintree)
victor@2 188
victor@2 189 #Graphic representation
victor@2 190 strTree=clustering.getStringTree(tree,tree.height,"")
victor@2 191 print strTree
victor@2 192 clustering.showTree(strTree)
victor@2 193 strMainTree=clustering.getStringTree(maintree,maintree.height,"")
victor@2 194 print strMainTree
victor@2 195 clustering.showTree(strMainTree)
victor@2 196
victor@2 197 newOMRs=[]
victor@2 198 for i in betterOmrIds:
victor@2 199 newOMRs.append(OMRs_symbols[i])
victor@2 200
victor@2 201 return newOMRs,betterOmrIds
victor@2 202
victor@2 203 # def alignNJ(self,idPart,fsOMRs,partOMRs):
victor@2 204 # '''
victor@2 205 # Main function for aligning the different OMRs
victor@2 206 #
victor@2 207 # Returns:
victor@2 208 # omr_symbolsAligned. OMR array of symbols aligned (only the best)
victor@2 209 # betterOmrIds. Id array of better OMRs (for writing the log file)
victor@2 210 #
victor@2 211 # usage:
victor@2 212 # pa=PipelineAlignment()
victor@2 213 # omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
victor@2 214 # '''
victor@2 215 #
victor@2 216 #
victor@2 217 # OMRs_symbols=[]
victor@2 218 # sc=SymbolConversion()
victor@2 219 # print "2---converting to symbols---"
victor@2 220 # for omr in OMRs:
victor@2 221 # if omr!=[]:
victor@2 222 # omr_symbols=sc.filterOMR(omr,idPart)[1]
victor@2 223 # OMRs_symbols.append(omr_symbols)
victor@2 224 # else:
victor@2 225 # OMRs_symbols.append([])
victor@2 226 #
victor@2 227 # print "3---removing worst OMR---"
victor@2 228 # # betterOmrIds=[]
victor@2 229 # OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
victor@2 230 #
victor@2 231 # print "4---calculating distances---"
victor@2 232 # distances=self.getDistances(OMRs_symbols)
victor@2 233 #
victor@2 234 # print "5---aligning symbols---"
victor@2 235 #
victor@2 236 # omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
victor@2 237 #
victor@2 238 # return omr_symbolsAligned,betterOmrIds
victor@2 239
victor@2 240 def alignNJ_files(self,idPart,fsOMRs_files,partOMRs_files):
victor@2 241 '''
victor@2 242 Main function for aligning the different OMRs
victor@2 243
victor@2 244 Returns:
victor@2 245 omr_symbolsAligned. OMR array of symbols aligned (only the best)
victor@2 246 betterOmrIds. Id array of better OMRs (for writing the log file)
victor@2 247
victor@2 248 usage:
victor@2 249 pa=PipelineAlignment()
victor@2 250 omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
victor@2 251 '''
victor@2 252
victor@2 253
victor@2 254 ff=FilesFunctions()
victor@2 255 OMRs_files=partOMRs_files+fsOMRs_files
victor@2 256 OMRs_symbols=[]
victor@2 257 sc=SymbolConversion()
victor@2 258 print "2---converting to symbols---"
victor@2 259 for omr_file in OMRs_files:
victor@2 260 omr=ff.getOMR(omr_file)
victor@2 261 if omr!=[]:
victor@2 262 omr_symbols=sc.filterOMR(omr,idPart)[1]
victor@2 263 OMRs_symbols.append(omr_symbols)
victor@2 264 else:
victor@2 265 OMRs_symbols.append([])
victor@2 266
victor@2 267 print "3---removing worst OMR---"
victor@2 268 # betterOmrIds=[]
victor@2 269
victor@2 270
victor@2 271 OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
victor@2 272
victor@2 273 print "4---calculating distances---"
victor@2 274 distances=self.getDistances(OMRs_symbols)
victor@2 275
victor@2 276 print "5---aligning symbols---"
victor@2 277
victor@2 278 omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
victor@2 279
victor@2 280 return omr_symbolsAligned,betterOmrIds
victor@2 281 def __setVoidArray(self,length):
victor@2 282 '''
victor@2 283 private function to create a void array
victor@2 284 '''
victor@2 285 arrayOut=[]
victor@2 286 for _ in range(length):
victor@2 287 arrayOut.append([])
victor@2 288
victor@2 289 return arrayOut
victor@2 290
victor@2 291 def setOMRSymbolsAligned(self,OMRs_symbols,distances):
victor@2 292 '''
victor@2 293 returns the OMRs symbols aligned from OMR symbols and
victor@2 294 the distances matrix
victor@2 295 '''
victor@2 296 pairings=[]
victor@2 297 pairingsReal=[]
victor@2 298 gapArray=[]
victor@2 299 removedArray=[]
victor@2 300 omr_symbolsAlign=self.__setVoidArray(len(OMRs_symbols))
victor@2 301 faa=FastAlignmentArrays()
victor@2 302 while len(distances[0])>1:
victor@2 303 minim,iMin,jMin=self.__getMinimum(distances)
victor@2 304 print distances,minim,iMin,jMin
victor@2 305 pairings.append([iMin,jMin])
victor@2 306 iMinReal,jMinReal=self.__getPairingReal(iMin,jMin,removedArray)
victor@2 307
victor@2 308 pairingsReal.append([iMinReal,jMinReal])
victor@2 309 if len(omr_symbolsAlign[iMinReal])==0:
victor@2 310 omr_symbolsAlign[iMinReal]=OMRs_symbols[iMin]
victor@2 311 if len(omr_symbolsAlign[jMinReal])==0:
victor@2 312 omr_symbolsAlign[jMinReal]=OMRs_symbols[jMin]
victor@2 313
victor@2 314 out=faa.needleman_wunsch(omr_symbolsAlign[iMinReal], omr_symbolsAlign[jMinReal])[0]
victor@2 315 omr_symbolsAlign[iMinReal]=out[0]
victor@2 316 omr_symbolsAlign[jMinReal]=out[1]
victor@2 317 gap1=out[2]
victor@2 318 gap2=out[3]
victor@2 319 gapArray.append([gap1,gap2])
victor@2 320
victor@2 321
victor@2 322 OMRs_symbols.pop(jMin)
victor@2 323 removedArray.append(jMinReal)
victor@2 324
victor@2 325 distances=self.__recalculeDistances(distances,iMin,jMin)
victor@2 326
victor@2 327
victor@2 328 omr_symbolsAlign=self.__fillingGaps(omr_symbolsAlign,gapArray,pairingsReal)
victor@2 329
victor@2 330
victor@2 331 return omr_symbolsAlign
victor@2 332
victor@2 333 def __fillingGaps(self,omr_symbolsAlign,gapArray,pairingsReal):
victor@2 334 '''
victor@2 335 private function to complete gaps based on the gap matrix
victor@2 336 and the pairs stablished
victor@2 337 '''
victor@2 338 for p in range(len(pairingsReal)-1,0,-1):
victor@2 339 for i in range(2):
victor@2 340 for j in range(2):
victor@2 341 for t in range(1,p+1):
victor@2 342 if(pairingsReal[p][i]==pairingsReal[p-t][j]):
victor@2 343 if j==0:
victor@2 344 s=1
victor@2 345 if j==1:
victor@2 346 s=0
victor@2 347 omrIndex=pairingsReal[p-t][s]
victor@2 348 newGap=[]
victor@2 349 for gap in gapArray[p][i]:
victor@2 350 omr_symbolsAlign[omrIndex].insert(gap,"*")
victor@2 351 newGap.append(gap)
victor@2 352 gapArray[p-t][s]=gapArray[p-t][s]+newGap
victor@2 353
victor@2 354
victor@2 355 return omr_symbolsAlign
victor@2 356
victor@2 357
victor@2 358
victor@2 359
victor@2 360
victor@2 361
victor@2 362
victor@2 363
victor@2 364
victor@2 365
victor@2 366
victor@2 367
victor@2 368
victor@2 369
victor@2 370
victor@2 371
victor@2 372