comparison Process/PipelineAlignment.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
comparison
equal deleted inserted replaced
1:0f7f611deca4 2:46fb79167a61
1 '''
2 Created on 10/11/2014
3
4 @organization: Lancaster University & University of Leeds
5 @version: 1.0
6 Created on 11/12/2014
7
8 @author: Victor Padilla
9 @contact: v.padilla@lancaster.ac.uk
10
11 Functions related to the alignment
12 and voting process
13 '''
14 from Alignment import FastAlignmentArrays
15 from SymbolConversion import SymbolConversion
16 from Functions import FilesFunctions
17 import numpy as np
18 from Clustering import Clustering
19 import math
20 from Alignment import NWunsch
21
22
23 class PipelineAlignment:
24 def alignGround(self,OMRs,part):
25 '''
26 Returns one part and the ground aligned. The first array value of OMRs
27 should be the ground and the second the omr to align
28 '''
29 sc=SymbolConversion()
30 OMRs_symbols=[]
31 omr_symbolsAlign=[]
32 for omr in OMRs:
33 omr_symbols=sc.filterOMR(omr,part)[1]
34 OMRs_symbols.append(omr_symbols)
35 omr_symbolsAlign.append([])
36
37 faa=FastAlignmentArrays()
38 out=faa.needleman_wunsch(OMRs_symbols[0], OMRs_symbols[1])[0]
39
40
41 return out
42
43 def getDistances(self,OMRs_symbols):
44 '''
45 Returns the distance matrix from several omr
46 in symbols, using the first symbol only
47 [u'N:E-4_0.25', 0.25, '', 2.75, 3.0, None] y
48 [u'N:E-4_0.25', 0.33, '', 2.50, 2.75, None]
49
50 are equals
51
52 Returns a triangular matrix
53 [[ 0. 0.17647058 0.19141912]
54 [ 0. 0. 0.17647058]
55 [ 0. 0. 0. ]]
56
57 Uses the algorithm implemented in C for increasing the speed
58 Alignment/C_Libraries/NWunsch
59 '''
60 ls=len(OMRs_symbols)
61 dimension= (ls,ls)
62 distances=np.zeros(dimension)
63 for i in range(len(OMRs_symbols)):
64 for j in range(i+1,len(OMRs_symbols)):
65 print i,j
66 align1=[]
67 align2=[]
68 for s in OMRs_symbols[i]:
69 align1.append(s[0])
70 for s in OMRs_symbols[j]:
71 align2.append(s[0])
72 #Algorithm implemented in C
73 if len(align1)==0 or len(align2)==0:
74 score=0
75 else:
76 print"-------------------------"
77
78 score=NWunsch.NWunsch_getSimilarity(align1,align2)
79 print"-------------------------"
80 if math.isnan(score):
81 score=0
82 distances[i][j]=1-score
83 return distances
84
85 def getDistanceLength(self,OMRs_symbols):
86 '''
87 Similar to getDistance, but based on the length
88 of the omrs. Testing purposes
89 '''
90 ls=len(OMRs_symbols)
91 dimension= (ls,ls)
92 distances=np.zeros(dimension)
93 for i in range(len(OMRs_symbols)):
94 for j in range(i+1,len(OMRs_symbols)):
95 print i,j
96 len_i=len(OMRs_symbols[i])
97 len_j=len(OMRs_symbols[j])
98 maxLen=len_j
99 if len_i>=len_j:
100 maxLen=len_i
101
102 score=(len_i-len_j)*1.0/maxLen
103 if score<0:
104 score=score*-1
105 distances[i][j]=score
106 return distances
107
108 def __getMinimum(self,distance):
109 '''
110 Returns the minimum value and the x,y position
111 in the distance matrix
112 '''
113 minim=1000
114 iMin=0
115 jMin=0
116 for i in range(len(distance[0])):
117 for j in range(i+1,len(distance[0])):
118 dist=distance[i][j]
119 if isinstance(dist,list):
120 dist=dist[0]
121 if dist<minim:
122 minim=dist
123 iMin=i
124 jMin=j
125
126 return minim,iMin,jMin
127
128
129 def __recalculeDistances(self,distance,iMin,jMin):
130 '''
131 Removes the rows and the column in the distance matrix
132 and calculates the new matrix
133 '''
134 for i in range(len(distance[0])):
135 for j in range(i+1,len(distance[0])):
136 if j==iMin:
137 dist=distance[i][j]
138 dist2=distance[i][jMin]
139 distance[i][j]=(dist+dist2)/2
140 distance=np.delete(distance, jMin, 0)
141 distance=np.delete(distance, jMin, 1)
142 return distance
143
144
145 def __getPairingReal(self,iMin,jMin,removedArray):
146 '''
147 Returns the real omr position in the original matrix
148 based on the actual position and the elements removed
149
150 usage:
151 self._getPairingReal(0,1,[1])
152 returns
153 0,2
154 '''
155 iMinReal=iMin
156 jMinReal=jMin
157 removedArray.sort()
158 for removedItem in removedArray:
159 if iMinReal>=removedItem:
160 iMinReal+=1
161 if jMinReal>=removedItem:
162 jMinReal+=1
163 return iMinReal,jMinReal
164
165 def selectBetterOMRs(self,OMRs_symbols):
166 '''
167 Based on Philogenetic trees, this function
168 takes the best omrs based on the distances between them
169 '''
170 distanceSimple=self.getDistances(OMRs_symbols)
171 clustering=Clustering()
172 distances=clustering.getCompleteMatrix(distanceSimple)
173 species = []
174 for i in range(len(OMRs_symbols)):
175 species.append(i)
176 clu = clustering.make_clusters(species)
177 tree = clustering.regroup(clu, distances)
178
179 #at least 3 leafs in the tree
180 maintree=tree
181 for i in range(3,len(OMRs_symbols)):
182 maintree=clustering.getBetterTree(tree,i)
183 if len(clustering.getLeafs(maintree))>=3:
184 break
185
186
187 betterOmrIds= clustering.getLeafs(maintree)
188
189 #Graphic representation
190 strTree=clustering.getStringTree(tree,tree.height,"")
191 print strTree
192 clustering.showTree(strTree)
193 strMainTree=clustering.getStringTree(maintree,maintree.height,"")
194 print strMainTree
195 clustering.showTree(strMainTree)
196
197 newOMRs=[]
198 for i in betterOmrIds:
199 newOMRs.append(OMRs_symbols[i])
200
201 return newOMRs,betterOmrIds
202
203 # def alignNJ(self,idPart,fsOMRs,partOMRs):
204 # '''
205 # Main function for aligning the different OMRs
206 #
207 # Returns:
208 # omr_symbolsAligned. OMR array of symbols aligned (only the best)
209 # betterOmrIds. Id array of better OMRs (for writing the log file)
210 #
211 # usage:
212 # pa=PipelineAlignment()
213 # omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
214 # '''
215 #
216 #
217 # OMRs_symbols=[]
218 # sc=SymbolConversion()
219 # print "2---converting to symbols---"
220 # for omr in OMRs:
221 # if omr!=[]:
222 # omr_symbols=sc.filterOMR(omr,idPart)[1]
223 # OMRs_symbols.append(omr_symbols)
224 # else:
225 # OMRs_symbols.append([])
226 #
227 # print "3---removing worst OMR---"
228 # # betterOmrIds=[]
229 # OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
230 #
231 # print "4---calculating distances---"
232 # distances=self.getDistances(OMRs_symbols)
233 #
234 # print "5---aligning symbols---"
235 #
236 # omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
237 #
238 # return omr_symbolsAligned,betterOmrIds
239
240 def alignNJ_files(self,idPart,fsOMRs_files,partOMRs_files):
241 '''
242 Main function for aligning the different OMRs
243
244 Returns:
245 omr_symbolsAligned. OMR array of symbols aligned (only the best)
246 betterOmrIds. Id array of better OMRs (for writing the log file)
247
248 usage:
249 pa=PipelineAlignment()
250 omr_symbolsAligned,betterOmrIds=pa.alignNJ(idPart,fsOMRs,partOMRs)
251 '''
252
253
254 ff=FilesFunctions()
255 OMRs_files=partOMRs_files+fsOMRs_files
256 OMRs_symbols=[]
257 sc=SymbolConversion()
258 print "2---converting to symbols---"
259 for omr_file in OMRs_files:
260 omr=ff.getOMR(omr_file)
261 if omr!=[]:
262 omr_symbols=sc.filterOMR(omr,idPart)[1]
263 OMRs_symbols.append(omr_symbols)
264 else:
265 OMRs_symbols.append([])
266
267 print "3---removing worst OMR---"
268 # betterOmrIds=[]
269
270
271 OMRs_symbols,betterOmrIds=self.selectBetterOMRs(OMRs_symbols)
272
273 print "4---calculating distances---"
274 distances=self.getDistances(OMRs_symbols)
275
276 print "5---aligning symbols---"
277
278 omr_symbolsAligned=self.setOMRSymbolsAligned(OMRs_symbols,distances)
279
280 return omr_symbolsAligned,betterOmrIds
281 def __setVoidArray(self,length):
282 '''
283 private function to create a void array
284 '''
285 arrayOut=[]
286 for _ in range(length):
287 arrayOut.append([])
288
289 return arrayOut
290
291 def setOMRSymbolsAligned(self,OMRs_symbols,distances):
292 '''
293 returns the OMRs symbols aligned from OMR symbols and
294 the distances matrix
295 '''
296 pairings=[]
297 pairingsReal=[]
298 gapArray=[]
299 removedArray=[]
300 omr_symbolsAlign=self.__setVoidArray(len(OMRs_symbols))
301 faa=FastAlignmentArrays()
302 while len(distances[0])>1:
303 minim,iMin,jMin=self.__getMinimum(distances)
304 print distances,minim,iMin,jMin
305 pairings.append([iMin,jMin])
306 iMinReal,jMinReal=self.__getPairingReal(iMin,jMin,removedArray)
307
308 pairingsReal.append([iMinReal,jMinReal])
309 if len(omr_symbolsAlign[iMinReal])==0:
310 omr_symbolsAlign[iMinReal]=OMRs_symbols[iMin]
311 if len(omr_symbolsAlign[jMinReal])==0:
312 omr_symbolsAlign[jMinReal]=OMRs_symbols[jMin]
313
314 out=faa.needleman_wunsch(omr_symbolsAlign[iMinReal], omr_symbolsAlign[jMinReal])[0]
315 omr_symbolsAlign[iMinReal]=out[0]
316 omr_symbolsAlign[jMinReal]=out[1]
317 gap1=out[2]
318 gap2=out[3]
319 gapArray.append([gap1,gap2])
320
321
322 OMRs_symbols.pop(jMin)
323 removedArray.append(jMinReal)
324
325 distances=self.__recalculeDistances(distances,iMin,jMin)
326
327
328 omr_symbolsAlign=self.__fillingGaps(omr_symbolsAlign,gapArray,pairingsReal)
329
330
331 return omr_symbolsAlign
332
333 def __fillingGaps(self,omr_symbolsAlign,gapArray,pairingsReal):
334 '''
335 private function to complete gaps based on the gap matrix
336 and the pairs stablished
337 '''
338 for p in range(len(pairingsReal)-1,0,-1):
339 for i in range(2):
340 for j in range(2):
341 for t in range(1,p+1):
342 if(pairingsReal[p][i]==pairingsReal[p-t][j]):
343 if j==0:
344 s=1
345 if j==1:
346 s=0
347 omrIndex=pairingsReal[p-t][s]
348 newGap=[]
349 for gap in gapArray[p][i]:
350 omr_symbolsAlign[omrIndex].insert(gap,"*")
351 newGap.append(gap)
352 gapArray[p-t][s]=gapArray[p-t][s]+newGap
353
354
355 return omr_symbolsAlign
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372