comparison Process/Clustering.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
comparison
equal deleted inserted replaced
1:0f7f611deca4 2:46fb79167a61
1 '''
2 Created on 10/11/2014
3
4 @organization: Lancaster University & University of Leeds
5 @version: 1.0
6 Created on 11/12/2014
7
8 @author: Victor Padilla
9 @contact: v.padilla@lancaster.ac.uk
10
11 Class related to Phylogenetic trees
12
13 '''
14 import copy
15 import numpy
16 from Bio import Phylo
17 from cStringIO import StringIO
18
19 class cluster:
20 '''
21 cluster object for Clustering class
22 '''
23 pass
24
25 class Clustering:
26 '''
27 This class allows to transform a triangular matrix
28 in a philogenetic tree cutting it at some point
29
30
31 usage:
32 ### The omr name array
33 species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ]
34 matr = [ [ 0., 6., 8., 1., 2. ,6. ],
35 [ 0., 0., 8., 6., 6. ,4. ],
36 [ 0., 0., 0., 8., 8. ,8. ],
37 [ 0., 0., 0., 0., 2. ,6. ],
38 [ 0., 0., 0., 0., 0. ,6. ],
39 [ 0., 0., 0., 0., 0. ,0. ], ]
40
41 clustering=Clustering()
42 ### Transforms the triangular matrix in a complete matrix
43 matr= clustering.getCompleteMatrix(matr)
44
45 ### establishes the clusters
46 clu = clustering.make_clusters(species)
47
48 ### Regroup the cluster
49 tree = clustering.regroup(clu, matr)
50
51 ### get the string in newick format
52 strTree=clustering.getStringTree(tree,tree.height,"")
53
54 print(strTree)
55 clustering.showTree(strTree)
56
57 ### Takes the main tree with 5 branches
58 maintree=clustering.getBetterTree(tree,5)
59 strTree=clustering.getStringTree(maintree,maintree.height,"")
60 print(strTree)
61 clustering.showTree(strTree)
62
63 '''
64
65 def make_clusters(self,species):
66 '''
67 Organises the cluster based on the species array
68 '''
69 clusters = {}
70 Id = 1
71 for s in species:
72 c = cluster()
73 c.id = Id
74 c.data = s
75 c.size = 1
76 c.height = 0
77 clusters[c.id] = c
78 Id = Id + 1
79 return clusters
80
81 def __find_min(self,clu, d):
82 '''
83 finding the minimum distance
84 private function
85 '''
86 mini = None
87 i_mini = 0
88 j_mini = 0
89 for i in clu:
90 for j in clu:
91 if j>i:
92 tmp = d[j -1 ][i -1 ]
93 if not mini:
94 mini = tmp
95 if tmp <= mini:
96 i_mini = i
97 j_mini = j
98 mini = tmp
99 return (i_mini, j_mini, mini)
100
101 def regroup(self,clusters, dist):
102 '''
103 organizing the cluster
104 '''
105 i, j, dij = self.__find_min(clusters, dist)
106 ci = clusters[i]
107 cj = clusters[j]
108 # create new cluster
109 k = cluster()
110 k.id = max(clusters) + 1
111 k.data = (ci, cj)
112 k.size = ci.size + cj.size
113 k.height = dij / 2.
114 # remove clusters
115 del clusters[i]
116 del clusters[j]
117 # compute new distance values and insert them
118 dist.append([])
119 for l in range(0, k.id -1):
120 dist[k.id-1].append(0)
121 for l in clusters:
122 dil = dist[max(i, l) -1][min(i, l) -1]
123 djl = dist[max(j, l) -1][min(j, l) -1]
124 dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size)
125 dist[k.id -1][l-1] = dkl
126 # insert the new cluster
127 clusters[k.id] = k
128
129 if len(clusters) == 1:
130 # we're through !
131 return clusters.values()[0]
132 else:
133 return self.regroup(clusters, dist)
134
135 def __pprint(self,tree, length):
136 '''
137 print the tree in newick format
138 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
139 '''
140 if tree.size > 1:
141 # it's an internal node
142 print "(",
143 self.__pprint(tree.data[0], tree.height)
144 print ",",
145 self.__pprint(tree.data[1], tree.height)
146 print ("):%2.2f" % (length - tree.height)),
147 else :
148 # it's a leaf
149 print ("%s:%2.2f" % (tree.data, length)),
150
151 def getStringTree(self,tree, length,strOut):
152 '''
153 returns the string of the tree in newick format
154 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
155 '''
156 if tree.size > 1:
157 # it's an internal node
158 strOut+="("
159 strOut=self.getStringTree(tree.data[0], tree.height,strOut)
160 strOut+=","
161 strOut=self.getStringTree(tree.data[1], tree.height,strOut)
162 strOut+="):"+str(length- tree.height)
163 else :
164 # it's a leaf
165 strOut+=str(tree.data)+":"+str(length)
166 return strOut
167
168
169 def createClusters(self,tree,length):
170 '''
171 separates the tree at the point determined by length
172 '''
173 clusters=[]
174 self.__separateClusters(clusters,tree,length)
175 self.clusters=self.__filterClusters(clusters,length)
176 return clusters
177
178 def getLeafs(self,tree):
179 '''
180 returns the number of leafs from a tree
181 '''
182 leafs=[]
183 self.getTreeLeafs(leafs,tree,tree.height)
184 return leafs
185
186 def __separateClusters(self,clusters,tree,length):
187 '''
188 divides the main tree in multiples trees based on the length
189 '''
190 if tree.size > 1:
191 if tree.height>=length:
192 clusters.append(tree.data[0])
193 clusters.append(tree.data[1])
194 else:
195 pass
196 self.__separateClusters(clusters,tree.data[0], length)
197 self.__separateClusters(clusters,tree.data[1], length)
198
199 def __filterClusters(self,clusters,length):
200 '''
201 removes trees higher than length
202 '''
203 clustersOut=[]
204 for c in clusters:
205 if c.height<length:
206 clustersOut.append(c)
207 return clustersOut
208
209 def getTreeLeafs(self,leafs,tree,length):
210 '''
211 returns the number of leafs from a tree. Recursive function
212 '''
213 if tree.size > 1:
214 # it's an internal node
215 self.getTreeLeafs(leafs,tree.data[0], tree.height)
216 self.getTreeLeafs(leafs,tree.data[1], tree.height)
217 else :
218 # it's a leaf
219 leafs.append(tree.data)
220
221 def getBetterTree(self,tree,numberLeafs):
222 '''
223 takes the smallest tree that has <= numberLeafs
224 '''
225 while True:
226 leafs=self.getLeafs(tree)
227 if len(leafs)<=numberLeafs:
228 return tree
229 clusters=self.createClusters(tree,tree.height)
230 tree=self.getMainTree(clusters)
231
232
233 def getAverageDistance(self,matr):
234 '''
235 returns the average distance from the matrix
236 '''
237 sumat=0
238 lengthMatrix=len(matr[0])
239 length=lengthMatrix*lengthMatrix-lengthMatrix
240 for i in range(lengthMatrix):
241 for j in range(lengthMatrix):
242 sumat=sumat+matr[i][j]
243 return (sumat/length)/2
244
245 def getMaximumDistance(self,matr):
246 '''
247 returns the maximum distance from the matrix
248 '''
249 maxValue=0
250 lengthMatrix=len(matr[0])
251 for i in range(lengthMatrix):
252 for j in range(lengthMatrix):
253 if matr[i][j]>maxValue:
254 maxValue=matr[i][j]
255 return maxValue/2
256
257 def getMainTree(self,clusters):
258 '''
259 Takes the tree with the higher number of leafs
260 '''
261 maxim=0
262 indexmax=0
263 for i in range(len(clusters)):
264 leafs=self.getLeafs(clusters[i])
265 if len(leafs)>maxim:
266 maxim=len(leafs)
267 indexmax=i
268
269 return clusters[indexmax]
270
271 def getCompleteMatrix(self,matr):
272 '''
273 return the complete matrix from a
274 triangular matrix
275 '''
276 newMatrix=copy.copy(matr)
277 if isinstance(matr,numpy.ndarray):
278 newMatrix=newMatrix.tolist()
279 lengthMatrix=len(matr[0])
280 for i in range(lengthMatrix):
281 for j in range(i,lengthMatrix):
282 newMatrix[j][i]=matr[i][j]
283 return newMatrix
284
285 def showTree(self,strTree):
286 '''
287 Shows a graphical representation from a newick tree format
288 '''
289 handle = StringIO(strTree)
290 tree = Phylo.read(handle, 'newick')
291 tree.ladderize()
292 # Phylo.draw(tree)
293 Phylo.draw_ascii(tree)
294
295