victor@2
|
1 '''
|
victor@2
|
2 Created on 10/11/2014
|
victor@2
|
3
|
victor@2
|
4 @organization: Lancaster University & University of Leeds
|
victor@2
|
5 @version: 1.0
|
victor@2
|
6 Created on 11/12/2014
|
victor@2
|
7
|
victor@2
|
8 @author: Victor Padilla
|
victor@2
|
9 @contact: v.padilla@lancaster.ac.uk
|
victor@2
|
10
|
victor@2
|
11 Functions related to the alignment
|
victor@2
|
12 and voting process
|
victor@2
|
13 '''
|
victor@2
|
14 from Alignment import FastAlignmentArrays
|
victor@2
|
15 from SymbolConversion import SymbolConversion
|
victor@2
|
16 from Functions import FilesFunctions
|
victor@2
|
17 import numpy as np
|
victor@2
|
18 from Clustering import Clustering
|
victor@2
|
19 import math
|
victor@2
|
20 from Alignment import NWunsch
|
victor@2
|
21
|
victor@2
|
22
|
victor@2
|
23 class PipelineAlignment:
|
victor@2
|
24 def alignGround(self,OMRs,part):
|
victor@2
|
25 '''
|
victor@2
|
26 Returns one part and the ground aligned. The first array value of OMRs
|
victor@2
|
27 should be the ground and the second the omr to align
|
victor@2
|
28 '''
|
victor@2
|
29 sc=SymbolConversion()
|
victor@2
|
30 OMRs_symbols=[]
|
victor@2
|
31 omr_symbolsAlign=[]
|
victor@2
|
32 for omr in OMRs:
|
victor@2
|
33 omr_symbols=sc.filterOMR(omr,part)[1]
|
victor@2
|
34 OMRs_symbols.append(omr_symbols)
|
victor@2
|
35 omr_symbolsAlign.append([])
|
victor@2
|
36
|
victor@2
|
37 faa=FastAlignmentArrays()
|
victor@2
|
38 out=faa.needleman_wunsch(OMRs_symbols[0], OMRs_symbols[1])[0]
|
victor@2
|
39
|
victor@2
|
40
|
victor@2
|
41 return out
|
victor@2
|
42
|
victor@2
|
43 def getDistances(self,OMRs_symbols):
|
victor@2
|
44 '''
|
victor@2
|
45 Returns the distance matrix from several omr
|
victor@2
|
46 in symbols, using the first symbol only
|
victor@2
|
47 [u'N:E-4_0.25', 0.25, '', 2.75, 3.0, None] y
|
victor@2
|
48 [u'N:E-4_0.25', 0.33, '', 2.50, 2.75, None]
|
victor@2
|
49
|
victor@2
|
50 are equals
|
victor@2
|
51
|
victor@2
|
52 Returns a triangular matrix
|
victor@2
|
53 [[ 0. 0.17647058 0.19141912]
|
victor@2
|
54 [ 0. 0. 0.17647058]
|
victor@2
|
55 [ 0. 0. 0. ]]
|
victor@2
|
56
|
victor@2
|
57 Uses the algorithm implemented in C for increasing the speed
|
victor@2
|
58 Alignment/C_Libraries/NWunsch
|
victor@2
|
59 '''
|
victor@2
|
60 ls=len(OMRs_symbols)
|
victor@2
|
61 dimension= (ls,ls)
|
victor@2
|
62 distances=np.zeros(dimension)
|
victor@2
|
63 for i in range(len(OMRs_symbols)):
|
victor@2
|
64 for j in range(i+1,len(OMRs_symbols)):
|
victor@2
|
65 print i,j
|
victor@2
|
66 align1=[]
|
victor@2
|
67 align2=[]
|
victor@2
|
68 for s in OMRs_symbols[i]:
|
victor@2
|
69 align1.append(s[0])
|
victor@2
|
70 for s in OMRs_symbols[j]:
|
victor@2
|
71 align2.append(s[0])
|
victor@2
|
72 #Algorithm implemented in C
|
victor@2
|
73 if len(align1)==0 or len(align2)==0:
|
victor@2
|
74 score=0
|
victor@2
|
75 else:
|
victor@2
|
76 print"-------------------------"
|
victor@2
|
77
|
victor@2
|
78 score=NWunsch.NWunsch_getSimilarity(align1,align2)
|
victor@2
|
79 print"-------------------------"
|
victor@2
|
80 if math.isnan(score):
|
victor@2
|
81 score=0
|
victor@2
|
82 distances[i][j]=1-score
|
victor@2
|
83 return distances
|
victor@2
|
84
|
victor@2
|
85 def getDistanceLength(self,OMRs_symbols):
|
victor@2
|
86 '''
|
victor@2
|
87 Similar to getDistance, but based on the length
|
victor@2
|
88 of the omrs. Testing purposes
|
victor@2
|
89 '''
|
victor@2
|
90 ls=len(OMRs_symbols)
|
victor@2
|
91 dimension= (ls,ls)
|
victor@2
|
92 distances=np.zeros(dimension)
|
victor@2
|
93 for i in range(len(OMRs_symbols)):
|
victor@2
|
94 for j in range(i+1,len(OMRs_symbols)):
|
victor@2
|
95 print i,j
|
victor@2
|
96 len_i=len(OMRs_symbols[i])
|
victor@2
|
97 len_j=len(OMRs_symbols[j])
|
victor@2
|
98 maxLen=len_j
|
victor@2
|
99 if len_i>=len_j:
|
victor@2
|
100 maxLen=len_i
|
victor@2
|
101
|
victor@2
|
102 score=(len_i-len_j)*1.0/maxLen
|
victor@2
|
103 if score<0:
|
victor@2
|
104 score=score*-1
|
victor@2
|
105 distances[i][j]=score
|
victor@2
|
106 return distances
|
victor@2
|
107
|
victor@2
|
108 def __getMinimum(self,distance):
|
victor@2
|
109 '''
|
victor@2
|
110 Returns the minimum value and the x,y position
|
victor@2
|
111 in the distance matrix
|
victor@2
|
112 '''
|
victor@2
|
113 minim=1000
|
victor@2
|
114 iMin=0
|
victor@2
|
115 jMin=0
|
victor@2
|
116 for i in range(len(distance[0])):
|
victor@2
|
117 for j in range(i+1,len(distance[0])):
|
victor@2
|
118 dist=distance[i][j]
|
victor@2
|
119 if isinstance(dist,list):
|
victor@2
|
120 dist=dist[0]
|
victor@2
|
121 if dist<minim:
|
victor@2
|
122 minim=dist
|
victor@2
|
123 iMin=i
|
victor@2
|
124 jMin=j
|
victor@2
|
125
|
victor@2
|
126 return minim,iMin,jMin
|
victor@2
|
127
|
victor@2
|
128
|
victor@2
|
129 def __recalculeDistances(self,distance,iMin,jMin):
|
victor@2
|
130 '''
|
victor@2
|
131 Removes the rows and the column in the distance matrix
|
victor@2
|
132 and calculates the new matrix
|
victor@2
|
133 '''
|
victor@2
|
134 for i in range(len(distance[0])):
|
victor@2
|
135 for j in range(i+1,len(distance[0])):
|
victor@2
|
136 if j==iMin:
|
victor@2
|
137 dist=distance[i][j]
|
victor@2
|
138 dist2=distance[i][jMin]
|
victor@2
|
139 distance[i][j]=(dist+dist2)/2
|
victor@2
|
140 distance=np.delete(distance, jMin, 0)
|
victor@2
|
141 distance=np.delete(distance, jMin, 1)
|
victor@2
|
142 return distance
|
victor@2
|
143
|
victor@2
|
144
|
victor@2
|
145 def __getPairingReal(self,iMin,jMin,removedArray):
|
victor@2
|
146 '''
|
victor@2
|
147 Returns the real omr position in the original matrix
|
victor@2
|
148 based on the actual position and the elements removed
|
victor@2
|
149
|
victor@2
|
150 usage:
|
victor@2
|
151 self._getPairingReal(0,1,[1])
|
victor@2
|
152 returns
|
victor@2
|
153 0,2
|
victor@2
|
154 '''
|
victor@2
|
155 iMinReal=iMin
|
victor@2
|
156 jMinReal=jMin
|
victor@2
|
157 removedArray.sort()
|
victor@2
|
158 for removedItem in removedArray:
|
victor@2
|
159 if iMinReal>=removedItem:
|
victor@2
|
160 iMinReal+=1
|
victor@2
|
161 if jMinReal>=removedItem:
|
victor@2
|
162 jMinReal+=1
|
victor@2
|
163 return iMinReal,jMinReal
|
victor@2
|
164
|
victor@2
|
165 def selectBetterOMRs(self,OMRs_symbols):
|
victor@2
|
166 '''
|
victor@2
|
167 Based on Philogenetic trees, this function
|
victor@2
|
168 takes the best omrs based on the distances between them
|
victor@2
|
169 '''
|
victor@2
|
170 distanceSimple=self.getDistances(OMRs_symbols)
|
victor@2
|
171 clustering=Clustering()
|
victor@2
|
172 distances=clustering.getCompleteMatrix(distanceSimple)
|
victor@2
|
173 species = []
|
victor@2
|
174 for i in range(len(OMRs_symbols)):
|
victor@2
|
175 species.append(i)
|
victor@2
|
176 clu = clustering.make_clusters(species)
|
victor@2
|
177 tree = clustering.regroup(clu, distances)
|
victor@2
|
178
|
victor@2
|
179 #at least 3 leafs in the tree
|
victor@2
|
180 maintree=tree
|
victor@2
|
181 for i in range(3,len(OMRs_symbols)):
|
victor@2
|
182 maintree=clustering.getBetterTree(tree,i)
|
victor@2
|
183 if len(clustering.getLeafs(maintree))>=3:
|
victor@2
|
184 break
|
victor@2
|
185
|
victor@2
|
186
|
victor@2
|
187 betterOmrIds= clustering.getLeafs(maintree)
|
victor@2
|
188
|
victor@2
|
189 #Graphic representation
|
victor@2
|
190 strTree=clustering.getStringTree(tree,tree.height,"")
|
victor@2
|
191 print strTree
|
victor@2
|
192 clustering.showTree(strTree)
|
victor@2
|
193 strMainTree=clustering.getStringTree(maintree,maintree.height,"")
|
victor@2
|
194 print strMainTree
|
victor@2
|
195 clustering.showTree(strMainTree)
|
victor@2
|
196
|
victor@2
|
197 newOMRs=[]
|
victor@2
|
198 for i in betterOmrIds:
|
victor@2
|
199 newOMRs.append(OMRs_symbols[i])
|
victor@2
|
200
|
victor@2
|
201 return newOMRs,betterOmrIds
|
victor@2
|
202
|
victor@2
|
203 # def alignNJ(self,idPart,fsOMRs,partOMRs):
|
victor@2
|
204 # '''
|
victor@2
|
205 # Main function for aligning the different OMRs
|
victor@2
|
206 #
|
victor@2
|
207 # Returns:
|
victor@2
|
208 # omr_symbolsAligned. OMR array of symbols aligned (only the best)
|
victor@2
|
209 # betterOmrIds. Id array of better OMRs (for writing the log file)
|
victor@2
|
210 #
|
victor@2
|
211 # usage:
|
victor@2
|
212 # pa=PipelineAlignment()
|
victor@2
|
213 # omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
|
victor@2
|
214 # '''
|
victor@2
|
215 #
|
victor@2
|
216 #
|
victor@2
|
217 # OMRs_symbols=[]
|
victor@2
|
218 # sc=SymbolConversion()
|
victor@2
|
219 # print "2---converting to symbols---"
|
victor@2
|
220 # for omr in OMRs:
|
victor@2
|
221 # if omr!=[]:
|
victor@2
|
222 # omr_symbols=sc.filterOMR(omr,idPart)[1]
|
victor@2
|
223 # OMRs_symbols.append(omr_symbols)
|
victor@2
|
224 # else:
|
victor@2
|
225 # OMRs_symbols.append([])
|
victor@2
|
226 #
|
victor@2
|
227 # print "3---removing worst OMR---"
|
victor@2
|
228 # # betterOmrIds=[]
|
victor@2
|
229 # OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
|
victor@2
|
230 #
|
victor@2
|
231 # print "4---calculating distances---"
|
victor@2
|
232 # distances=self.getDistances(OMRs_symbols)
|
victor@2
|
233 #
|
victor@2
|
234 # print "5---aligning symbols---"
|
victor@2
|
235 #
|
victor@2
|
236 # omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
|
victor@2
|
237 #
|
victor@2
|
238 # return omr_symbolsAligned,betterOmrIds
|
victor@2
|
239
|
victor@2
|
240 def alignNJ_files(self,idPart,fsOMRs_files,partOMRs_files):
|
victor@2
|
241 '''
|
victor@2
|
242 Main function for aligning the different OMRs
|
victor@2
|
243
|
victor@2
|
244 Returns:
|
victor@2
|
245 omr_symbolsAligned. OMR array of symbols aligned (only the best)
|
victor@2
|
246 betterOmrIds. Id array of better OMRs (for writing the log file)
|
victor@2
|
247
|
victor@2
|
248 usage:
|
victor@2
|
249 pa=PipelineAlignment()
|
victor@2
|
250 omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
|
victor@2
|
251 '''
|
victor@2
|
252
|
victor@2
|
253
|
victor@2
|
254 ff=FilesFunctions()
|
victor@2
|
255 OMRs_files=partOMRs_files+fsOMRs_files
|
victor@2
|
256 OMRs_symbols=[]
|
victor@2
|
257 sc=SymbolConversion()
|
victor@2
|
258 print "2---converting to symbols---"
|
victor@2
|
259 for omr_file in OMRs_files:
|
victor@2
|
260 omr=ff.getOMR(omr_file)
|
victor@2
|
261 if omr!=[]:
|
victor@2
|
262 omr_symbols=sc.filterOMR(omr,idPart)[1]
|
victor@2
|
263 OMRs_symbols.append(omr_symbols)
|
victor@2
|
264 else:
|
victor@2
|
265 OMRs_symbols.append([])
|
victor@2
|
266
|
victor@2
|
267 print "3---removing worst OMR---"
|
victor@2
|
268 # betterOmrIds=[]
|
victor@2
|
269
|
victor@2
|
270
|
victor@2
|
271 OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
|
victor@2
|
272
|
victor@2
|
273 print "4---calculating distances---"
|
victor@2
|
274 distances=self.getDistances(OMRs_symbols)
|
victor@2
|
275
|
victor@2
|
276 print "5---aligning symbols---"
|
victor@2
|
277
|
victor@2
|
278 omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
|
victor@2
|
279
|
victor@2
|
280 return omr_symbolsAligned,betterOmrIds
|
victor@2
|
281 def __setVoidArray(self,length):
|
victor@2
|
282 '''
|
victor@2
|
283 private function to create a void array
|
victor@2
|
284 '''
|
victor@2
|
285 arrayOut=[]
|
victor@2
|
286 for _ in range(length):
|
victor@2
|
287 arrayOut.append([])
|
victor@2
|
288
|
victor@2
|
289 return arrayOut
|
victor@2
|
290
|
victor@2
|
291 def setOMRSymbolsAligned(self,OMRs_symbols,distances):
|
victor@2
|
292 '''
|
victor@2
|
293 returns the OMRs symbols aligned from OMR symbols and
|
victor@2
|
294 the distances matrix
|
victor@2
|
295 '''
|
victor@2
|
296 pairings=[]
|
victor@2
|
297 pairingsReal=[]
|
victor@2
|
298 gapArray=[]
|
victor@2
|
299 removedArray=[]
|
victor@2
|
300 omr_symbolsAlign=self.__setVoidArray(len(OMRs_symbols))
|
victor@2
|
301 faa=FastAlignmentArrays()
|
victor@2
|
302 while len(distances[0])>1:
|
victor@2
|
303 minim,iMin,jMin=self.__getMinimum(distances)
|
victor@2
|
304 print distances,minim,iMin,jMin
|
victor@2
|
305 pairings.append([iMin,jMin])
|
victor@2
|
306 iMinReal,jMinReal=self.__getPairingReal(iMin,jMin,removedArray)
|
victor@2
|
307
|
victor@2
|
308 pairingsReal.append([iMinReal,jMinReal])
|
victor@2
|
309 if len(omr_symbolsAlign[iMinReal])==0:
|
victor@2
|
310 omr_symbolsAlign[iMinReal]=OMRs_symbols[iMin]
|
victor@2
|
311 if len(omr_symbolsAlign[jMinReal])==0:
|
victor@2
|
312 omr_symbolsAlign[jMinReal]=OMRs_symbols[jMin]
|
victor@2
|
313
|
victor@2
|
314 out=faa.needleman_wunsch(omr_symbolsAlign[iMinReal], omr_symbolsAlign[jMinReal])[0]
|
victor@2
|
315 omr_symbolsAlign[iMinReal]=out[0]
|
victor@2
|
316 omr_symbolsAlign[jMinReal]=out[1]
|
victor@2
|
317 gap1=out[2]
|
victor@2
|
318 gap2=out[3]
|
victor@2
|
319 gapArray.append([gap1,gap2])
|
victor@2
|
320
|
victor@2
|
321
|
victor@2
|
322 OMRs_symbols.pop(jMin)
|
victor@2
|
323 removedArray.append(jMinReal)
|
victor@2
|
324
|
victor@2
|
325 distances=self.__recalculeDistances(distances,iMin,jMin)
|
victor@2
|
326
|
victor@2
|
327
|
victor@2
|
328 omr_symbolsAlign=self.__fillingGaps(omr_symbolsAlign,gapArray,pairingsReal)
|
victor@2
|
329
|
victor@2
|
330
|
victor@2
|
331 return omr_symbolsAlign
|
victor@2
|
332
|
victor@2
|
333 def __fillingGaps(self,omr_symbolsAlign,gapArray,pairingsReal):
|
victor@2
|
334 '''
|
victor@2
|
335 private function to complete gaps based on the gap matrix
|
victor@2
|
336 and the pairs stablished
|
victor@2
|
337 '''
|
victor@2
|
338 for p in range(len(pairingsReal)-1,0,-1):
|
victor@2
|
339 for i in range(2):
|
victor@2
|
340 for j in range(2):
|
victor@2
|
341 for t in range(1,p+1):
|
victor@2
|
342 if(pairingsReal[p][i]==pairingsReal[p-t][j]):
|
victor@2
|
343 if j==0:
|
victor@2
|
344 s=1
|
victor@2
|
345 if j==1:
|
victor@2
|
346 s=0
|
victor@2
|
347 omrIndex=pairingsReal[p-t][s]
|
victor@2
|
348 newGap=[]
|
victor@2
|
349 for gap in gapArray[p][i]:
|
victor@2
|
350 omr_symbolsAlign[omrIndex].insert(gap,"*")
|
victor@2
|
351 newGap.append(gap)
|
victor@2
|
352 gapArray[p-t][s]=gapArray[p-t][s]+newGap
|
victor@2
|
353
|
victor@2
|
354
|
victor@2
|
355 return omr_symbolsAlign
|
victor@2
|
356
|
victor@2
|
357
|
victor@2
|
358
|
victor@2
|
359
|
victor@2
|
360
|
victor@2
|
361
|
victor@2
|
362
|
victor@2
|
363
|
victor@2
|
364
|
victor@2
|
365
|
victor@2
|
366
|
victor@2
|
367
|
victor@2
|
368
|
victor@2
|
369
|
victor@2
|
370
|
victor@2
|
371
|
victor@2
|
372 |