Mercurial > hg > multiomr
comparison Process/Clustering.py @ 2:46fb79167a61 tip
Main Code
author | Victor Padilla <victor.padilla.mc@gmail.com> |
---|---|
date | Mon, 04 May 2015 22:56:18 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:0f7f611deca4 | 2:46fb79167a61 |
---|---|
1 ''' | |
2 Created on 10/11/2014 | |
3 | |
4 @organization: Lancaster University & University of Leeds | |
5 @version: 1.0 | |
6 Created on 11/12/2014 | |
7 | |
8 @author: Victor Padilla | |
9 @contact: v.padilla@lancaster.ac.uk | |
10 | |
11 Class related to Phylogenetic trees | |
12 | |
13 ''' | |
14 import copy | |
15 import numpy | |
16 from Bio import Phylo | |
17 from cStringIO import StringIO | |
18 | |
19 class cluster: | |
20 ''' | |
21 cluster object for Clustering class | |
22 ''' | |
23 pass | |
24 | |
25 class Clustering: | |
26 ''' | |
27 This class allows to transform a triangular matrix | |
28 in a philogenetic tree cutting it at some point | |
29 | |
30 | |
31 usage: | |
32 ### The omr name array | |
33 species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ] | |
34 matr = [ [ 0., 6., 8., 1., 2. ,6. ], | |
35 [ 0., 0., 8., 6., 6. ,4. ], | |
36 [ 0., 0., 0., 8., 8. ,8. ], | |
37 [ 0., 0., 0., 0., 2. ,6. ], | |
38 [ 0., 0., 0., 0., 0. ,6. ], | |
39 [ 0., 0., 0., 0., 0. ,0. ], ] | |
40 | |
41 clustering=Clustering() | |
42 ### Transforms the triangular matrix in a complete matrix | |
43 matr= clustering.getCompleteMatrix(matr) | |
44 | |
45 ### establishes the clusters | |
46 clu = clustering.make_clusters(species) | |
47 | |
48 ### Regroup the cluster | |
49 tree = clustering.regroup(clu, matr) | |
50 | |
51 ### get the string in newick format | |
52 strTree=clustering.getStringTree(tree,tree.height,"") | |
53 | |
54 print(strTree) | |
55 clustering.showTree(strTree) | |
56 | |
57 ### Takes the main tree with 5 branches | |
58 maintree=clustering.getBetterTree(tree,5) | |
59 strTree=clustering.getStringTree(maintree,maintree.height,"") | |
60 print(strTree) | |
61 clustering.showTree(strTree) | |
62 | |
63 ''' | |
64 | |
65 def make_clusters(self,species): | |
66 ''' | |
67 Organises the cluster based on the species array | |
68 ''' | |
69 clusters = {} | |
70 Id = 1 | |
71 for s in species: | |
72 c = cluster() | |
73 c.id = Id | |
74 c.data = s | |
75 c.size = 1 | |
76 c.height = 0 | |
77 clusters[c.id] = c | |
78 Id = Id + 1 | |
79 return clusters | |
80 | |
81 def __find_min(self,clu, d): | |
82 ''' | |
83 finding the minimum distance | |
84 private function | |
85 ''' | |
86 mini = None | |
87 i_mini = 0 | |
88 j_mini = 0 | |
89 for i in clu: | |
90 for j in clu: | |
91 if j>i: | |
92 tmp = d[j -1 ][i -1 ] | |
93 if not mini: | |
94 mini = tmp | |
95 if tmp <= mini: | |
96 i_mini = i | |
97 j_mini = j | |
98 mini = tmp | |
99 return (i_mini, j_mini, mini) | |
100 | |
101 def regroup(self,clusters, dist): | |
102 ''' | |
103 organizing the cluster | |
104 ''' | |
105 i, j, dij = self.__find_min(clusters, dist) | |
106 ci = clusters[i] | |
107 cj = clusters[j] | |
108 # create new cluster | |
109 k = cluster() | |
110 k.id = max(clusters) + 1 | |
111 k.data = (ci, cj) | |
112 k.size = ci.size + cj.size | |
113 k.height = dij / 2. | |
114 # remove clusters | |
115 del clusters[i] | |
116 del clusters[j] | |
117 # compute new distance values and insert them | |
118 dist.append([]) | |
119 for l in range(0, k.id -1): | |
120 dist[k.id-1].append(0) | |
121 for l in clusters: | |
122 dil = dist[max(i, l) -1][min(i, l) -1] | |
123 djl = dist[max(j, l) -1][min(j, l) -1] | |
124 dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size) | |
125 dist[k.id -1][l-1] = dkl | |
126 # insert the new cluster | |
127 clusters[k.id] = k | |
128 | |
129 if len(clusters) == 1: | |
130 # we're through ! | |
131 return clusters.values()[0] | |
132 else: | |
133 return self.regroup(clusters, dist) | |
134 | |
135 def __pprint(self,tree, length): | |
136 ''' | |
137 print the tree in newick format | |
138 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names | |
139 ''' | |
140 if tree.size > 1: | |
141 # it's an internal node | |
142 print "(", | |
143 self.__pprint(tree.data[0], tree.height) | |
144 print ",", | |
145 self.__pprint(tree.data[1], tree.height) | |
146 print ("):%2.2f" % (length - tree.height)), | |
147 else : | |
148 # it's a leaf | |
149 print ("%s:%2.2f" % (tree.data, length)), | |
150 | |
151 def getStringTree(self,tree, length,strOut): | |
152 ''' | |
153 returns the string of the tree in newick format | |
154 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names | |
155 ''' | |
156 if tree.size > 1: | |
157 # it's an internal node | |
158 strOut+="(" | |
159 strOut=self.getStringTree(tree.data[0], tree.height,strOut) | |
160 strOut+="," | |
161 strOut=self.getStringTree(tree.data[1], tree.height,strOut) | |
162 strOut+="):"+str(length- tree.height) | |
163 else : | |
164 # it's a leaf | |
165 strOut+=str(tree.data)+":"+str(length) | |
166 return strOut | |
167 | |
168 | |
169 def createClusters(self,tree,length): | |
170 ''' | |
171 separates the tree at the point determined by length | |
172 ''' | |
173 clusters=[] | |
174 self.__separateClusters(clusters,tree,length) | |
175 self.clusters=self.__filterClusters(clusters,length) | |
176 return clusters | |
177 | |
178 def getLeafs(self,tree): | |
179 ''' | |
180 returns the number of leafs from a tree | |
181 ''' | |
182 leafs=[] | |
183 self.getTreeLeafs(leafs,tree,tree.height) | |
184 return leafs | |
185 | |
186 def __separateClusters(self,clusters,tree,length): | |
187 ''' | |
188 divides the main tree in multiples trees based on the length | |
189 ''' | |
190 if tree.size > 1: | |
191 if tree.height>=length: | |
192 clusters.append(tree.data[0]) | |
193 clusters.append(tree.data[1]) | |
194 else: | |
195 pass | |
196 self.__separateClusters(clusters,tree.data[0], length) | |
197 self.__separateClusters(clusters,tree.data[1], length) | |
198 | |
199 def __filterClusters(self,clusters,length): | |
200 ''' | |
201 removes trees higher than length | |
202 ''' | |
203 clustersOut=[] | |
204 for c in clusters: | |
205 if c.height<length: | |
206 clustersOut.append(c) | |
207 return clustersOut | |
208 | |
209 def getTreeLeafs(self,leafs,tree,length): | |
210 ''' | |
211 returns the number of leafs from a tree. Recursive function | |
212 ''' | |
213 if tree.size > 1: | |
214 # it's an internal node | |
215 self.getTreeLeafs(leafs,tree.data[0], tree.height) | |
216 self.getTreeLeafs(leafs,tree.data[1], tree.height) | |
217 else : | |
218 # it's a leaf | |
219 leafs.append(tree.data) | |
220 | |
221 def getBetterTree(self,tree,numberLeafs): | |
222 ''' | |
223 takes the smallest tree that has <= numberLeafs | |
224 ''' | |
225 while True: | |
226 leafs=self.getLeafs(tree) | |
227 if len(leafs)<=numberLeafs: | |
228 return tree | |
229 clusters=self.createClusters(tree,tree.height) | |
230 tree=self.getMainTree(clusters) | |
231 | |
232 | |
233 def getAverageDistance(self,matr): | |
234 ''' | |
235 returns the average distance from the matrix | |
236 ''' | |
237 sumat=0 | |
238 lengthMatrix=len(matr[0]) | |
239 length=lengthMatrix*lengthMatrix-lengthMatrix | |
240 for i in range(lengthMatrix): | |
241 for j in range(lengthMatrix): | |
242 sumat=sumat+matr[i][j] | |
243 return (sumat/length)/2 | |
244 | |
245 def getMaximumDistance(self,matr): | |
246 ''' | |
247 returns the maximum distance from the matrix | |
248 ''' | |
249 maxValue=0 | |
250 lengthMatrix=len(matr[0]) | |
251 for i in range(lengthMatrix): | |
252 for j in range(lengthMatrix): | |
253 if matr[i][j]>maxValue: | |
254 maxValue=matr[i][j] | |
255 return maxValue/2 | |
256 | |
257 def getMainTree(self,clusters): | |
258 ''' | |
259 Takes the tree with the higher number of leafs | |
260 ''' | |
261 maxim=0 | |
262 indexmax=0 | |
263 for i in range(len(clusters)): | |
264 leafs=self.getLeafs(clusters[i]) | |
265 if len(leafs)>maxim: | |
266 maxim=len(leafs) | |
267 indexmax=i | |
268 | |
269 return clusters[indexmax] | |
270 | |
271 def getCompleteMatrix(self,matr): | |
272 ''' | |
273 return the complete matrix from a | |
274 triangular matrix | |
275 ''' | |
276 newMatrix=copy.copy(matr) | |
277 if isinstance(matr,numpy.ndarray): | |
278 newMatrix=newMatrix.tolist() | |
279 lengthMatrix=len(matr[0]) | |
280 for i in range(lengthMatrix): | |
281 for j in range(i,lengthMatrix): | |
282 newMatrix[j][i]=matr[i][j] | |
283 return newMatrix | |
284 | |
285 def showTree(self,strTree): | |
286 ''' | |
287 Shows a graphical representation from a newick tree format | |
288 ''' | |
289 handle = StringIO(strTree) | |
290 tree = Phylo.read(handle, 'newick') | |
291 tree.ladderize() | |
292 # Phylo.draw(tree) | |
293 Phylo.draw_ascii(tree) | |
294 | |
295 |