victor@2
|
1 '''
|
victor@2
|
2 Created on 10/11/2014
|
victor@2
|
3
|
victor@2
|
4 @organization: Lancaster University & University of Leeds
|
victor@2
|
5 @version: 1.0
|
victor@2
|
6 Created on 11/12/2014
|
victor@2
|
7
|
victor@2
|
8 @author: Victor Padilla
|
victor@2
|
9 @contact: v.padilla@lancaster.ac.uk
|
victor@2
|
10
|
victor@2
|
11 Class related to Phylogenetic trees
|
victor@2
|
12
|
victor@2
|
13 '''
|
victor@2
|
14 import copy
|
victor@2
|
15 import numpy
|
victor@2
|
16 from Bio import Phylo
|
victor@2
|
17 from cStringIO import StringIO
|
victor@2
|
18
|
victor@2
|
19 class cluster:
|
victor@2
|
20 '''
|
victor@2
|
21 cluster object for Clustering class
|
victor@2
|
22 '''
|
victor@2
|
23 pass
|
victor@2
|
24
|
victor@2
|
25 class Clustering:
|
victor@2
|
26 '''
|
victor@2
|
27 This class allows to transform a triangular matrix
|
victor@2
|
28 in a philogenetic tree cutting it at some point
|
victor@2
|
29
|
victor@2
|
30
|
victor@2
|
31 usage:
|
victor@2
|
32 ### The omr name array
|
victor@2
|
33 species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ]
|
victor@2
|
34 matr = [ [ 0., 6., 8., 1., 2. ,6. ],
|
victor@2
|
35 [ 0., 0., 8., 6., 6. ,4. ],
|
victor@2
|
36 [ 0., 0., 0., 8., 8. ,8. ],
|
victor@2
|
37 [ 0., 0., 0., 0., 2. ,6. ],
|
victor@2
|
38 [ 0., 0., 0., 0., 0. ,6. ],
|
victor@2
|
39 [ 0., 0., 0., 0., 0. ,0. ], ]
|
victor@2
|
40
|
victor@2
|
41 clustering=Clustering()
|
victor@2
|
42 ### Transforms the triangular matrix in a complete matrix
|
victor@2
|
43 matr= clustering.getCompleteMatrix(matr)
|
victor@2
|
44
|
victor@2
|
45 ### establishes the clusters
|
victor@2
|
46 clu = clustering.make_clusters(species)
|
victor@2
|
47
|
victor@2
|
48 ### Regroup the cluster
|
victor@2
|
49 tree = clustering.regroup(clu, matr)
|
victor@2
|
50
|
victor@2
|
51 ### get the string in newick format
|
victor@2
|
52 strTree=clustering.getStringTree(tree,tree.height,"")
|
victor@2
|
53
|
victor@2
|
54 print(strTree)
|
victor@2
|
55 clustering.showTree(strTree)
|
victor@2
|
56
|
victor@2
|
57 ### Takes the main tree with 5 branches
|
victor@2
|
58 maintree=clustering.getBetterTree(tree,5)
|
victor@2
|
59 strTree=clustering.getStringTree(maintree,maintree.height,"")
|
victor@2
|
60 print(strTree)
|
victor@2
|
61 clustering.showTree(strTree)
|
victor@2
|
62
|
victor@2
|
63 '''
|
victor@2
|
64
|
victor@2
|
65 def make_clusters(self,species):
|
victor@2
|
66 '''
|
victor@2
|
67 Organises the cluster based on the species array
|
victor@2
|
68 '''
|
victor@2
|
69 clusters = {}
|
victor@2
|
70 Id = 1
|
victor@2
|
71 for s in species:
|
victor@2
|
72 c = cluster()
|
victor@2
|
73 c.id = Id
|
victor@2
|
74 c.data = s
|
victor@2
|
75 c.size = 1
|
victor@2
|
76 c.height = 0
|
victor@2
|
77 clusters[c.id] = c
|
victor@2
|
78 Id = Id + 1
|
victor@2
|
79 return clusters
|
victor@2
|
80
|
victor@2
|
81 def __find_min(self,clu, d):
|
victor@2
|
82 '''
|
victor@2
|
83 finding the minimum distance
|
victor@2
|
84 private function
|
victor@2
|
85 '''
|
victor@2
|
86 mini = None
|
victor@2
|
87 i_mini = 0
|
victor@2
|
88 j_mini = 0
|
victor@2
|
89 for i in clu:
|
victor@2
|
90 for j in clu:
|
victor@2
|
91 if j>i:
|
victor@2
|
92 tmp = d[j -1 ][i -1 ]
|
victor@2
|
93 if not mini:
|
victor@2
|
94 mini = tmp
|
victor@2
|
95 if tmp <= mini:
|
victor@2
|
96 i_mini = i
|
victor@2
|
97 j_mini = j
|
victor@2
|
98 mini = tmp
|
victor@2
|
99 return (i_mini, j_mini, mini)
|
victor@2
|
100
|
victor@2
|
101 def regroup(self,clusters, dist):
|
victor@2
|
102 '''
|
victor@2
|
103 organizing the cluster
|
victor@2
|
104 '''
|
victor@2
|
105 i, j, dij = self.__find_min(clusters, dist)
|
victor@2
|
106 ci = clusters[i]
|
victor@2
|
107 cj = clusters[j]
|
victor@2
|
108 # create new cluster
|
victor@2
|
109 k = cluster()
|
victor@2
|
110 k.id = max(clusters) + 1
|
victor@2
|
111 k.data = (ci, cj)
|
victor@2
|
112 k.size = ci.size + cj.size
|
victor@2
|
113 k.height = dij / 2.
|
victor@2
|
114 # remove clusters
|
victor@2
|
115 del clusters[i]
|
victor@2
|
116 del clusters[j]
|
victor@2
|
117 # compute new distance values and insert them
|
victor@2
|
118 dist.append([])
|
victor@2
|
119 for l in range(0, k.id -1):
|
victor@2
|
120 dist[k.id-1].append(0)
|
victor@2
|
121 for l in clusters:
|
victor@2
|
122 dil = dist[max(i, l) -1][min(i, l) -1]
|
victor@2
|
123 djl = dist[max(j, l) -1][min(j, l) -1]
|
victor@2
|
124 dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size)
|
victor@2
|
125 dist[k.id -1][l-1] = dkl
|
victor@2
|
126 # insert the new cluster
|
victor@2
|
127 clusters[k.id] = k
|
victor@2
|
128
|
victor@2
|
129 if len(clusters) == 1:
|
victor@2
|
130 # we're through !
|
victor@2
|
131 return clusters.values()[0]
|
victor@2
|
132 else:
|
victor@2
|
133 return self.regroup(clusters, dist)
|
victor@2
|
134
|
victor@2
|
135 def __pprint(self,tree, length):
|
victor@2
|
136 '''
|
victor@2
|
137 print the tree in newick format
|
victor@2
|
138 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
|
victor@2
|
139 '''
|
victor@2
|
140 if tree.size > 1:
|
victor@2
|
141 # it's an internal node
|
victor@2
|
142 print "(",
|
victor@2
|
143 self.__pprint(tree.data[0], tree.height)
|
victor@2
|
144 print ",",
|
victor@2
|
145 self.__pprint(tree.data[1], tree.height)
|
victor@2
|
146 print ("):%2.2f" % (length - tree.height)),
|
victor@2
|
147 else :
|
victor@2
|
148 # it's a leaf
|
victor@2
|
149 print ("%s:%2.2f" % (tree.data, length)),
|
victor@2
|
150
|
victor@2
|
151 def getStringTree(self,tree, length,strOut):
|
victor@2
|
152 '''
|
victor@2
|
153 returns the string of the tree in newick format
|
victor@2
|
154 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
|
victor@2
|
155 '''
|
victor@2
|
156 if tree.size > 1:
|
victor@2
|
157 # it's an internal node
|
victor@2
|
158 strOut+="("
|
victor@2
|
159 strOut=self.getStringTree(tree.data[0], tree.height,strOut)
|
victor@2
|
160 strOut+=","
|
victor@2
|
161 strOut=self.getStringTree(tree.data[1], tree.height,strOut)
|
victor@2
|
162 strOut+="):"+str(length- tree.height)
|
victor@2
|
163 else :
|
victor@2
|
164 # it's a leaf
|
victor@2
|
165 strOut+=str(tree.data)+":"+str(length)
|
victor@2
|
166 return strOut
|
victor@2
|
167
|
victor@2
|
168
|
victor@2
|
169 def createClusters(self,tree,length):
|
victor@2
|
170 '''
|
victor@2
|
171 separates the tree at the point determined by length
|
victor@2
|
172 '''
|
victor@2
|
173 clusters=[]
|
victor@2
|
174 self.__separateClusters(clusters,tree,length)
|
victor@2
|
175 self.clusters=self.__filterClusters(clusters,length)
|
victor@2
|
176 return clusters
|
victor@2
|
177
|
victor@2
|
178 def getLeafs(self,tree):
|
victor@2
|
179 '''
|
victor@2
|
180 returns the number of leafs from a tree
|
victor@2
|
181 '''
|
victor@2
|
182 leafs=[]
|
victor@2
|
183 self.getTreeLeafs(leafs,tree,tree.height)
|
victor@2
|
184 return leafs
|
victor@2
|
185
|
victor@2
|
186 def __separateClusters(self,clusters,tree,length):
|
victor@2
|
187 '''
|
victor@2
|
188 divides the main tree in multiples trees based on the length
|
victor@2
|
189 '''
|
victor@2
|
190 if tree.size > 1:
|
victor@2
|
191 if tree.height>=length:
|
victor@2
|
192 clusters.append(tree.data[0])
|
victor@2
|
193 clusters.append(tree.data[1])
|
victor@2
|
194 else:
|
victor@2
|
195 pass
|
victor@2
|
196 self.__separateClusters(clusters,tree.data[0], length)
|
victor@2
|
197 self.__separateClusters(clusters,tree.data[1], length)
|
victor@2
|
198
|
victor@2
|
199 def __filterClusters(self,clusters,length):
|
victor@2
|
200 '''
|
victor@2
|
201 removes trees higher than length
|
victor@2
|
202 '''
|
victor@2
|
203 clustersOut=[]
|
victor@2
|
204 for c in clusters:
|
victor@2
|
205 if c.height<length:
|
victor@2
|
206 clustersOut.append(c)
|
victor@2
|
207 return clustersOut
|
victor@2
|
208
|
victor@2
|
209 def getTreeLeafs(self,leafs,tree,length):
|
victor@2
|
210 '''
|
victor@2
|
211 returns the number of leafs from a tree. Recursive function
|
victor@2
|
212 '''
|
victor@2
|
213 if tree.size > 1:
|
victor@2
|
214 # it's an internal node
|
victor@2
|
215 self.getTreeLeafs(leafs,tree.data[0], tree.height)
|
victor@2
|
216 self.getTreeLeafs(leafs,tree.data[1], tree.height)
|
victor@2
|
217 else :
|
victor@2
|
218 # it's a leaf
|
victor@2
|
219 leafs.append(tree.data)
|
victor@2
|
220
|
victor@2
|
221 def getBetterTree(self,tree,numberLeafs):
|
victor@2
|
222 '''
|
victor@2
|
223 takes the smallest tree that has <= numberLeafs
|
victor@2
|
224 '''
|
victor@2
|
225 while True:
|
victor@2
|
226 leafs=self.getLeafs(tree)
|
victor@2
|
227 if len(leafs)<=numberLeafs:
|
victor@2
|
228 return tree
|
victor@2
|
229 clusters=self.createClusters(tree,tree.height)
|
victor@2
|
230 tree=self.getMainTree(clusters)
|
victor@2
|
231
|
victor@2
|
232
|
victor@2
|
233 def getAverageDistance(self,matr):
|
victor@2
|
234 '''
|
victor@2
|
235 returns the average distance from the matrix
|
victor@2
|
236 '''
|
victor@2
|
237 sumat=0
|
victor@2
|
238 lengthMatrix=len(matr[0])
|
victor@2
|
239 length=lengthMatrix*lengthMatrix-lengthMatrix
|
victor@2
|
240 for i in range(lengthMatrix):
|
victor@2
|
241 for j in range(lengthMatrix):
|
victor@2
|
242 sumat=sumat+matr[i][j]
|
victor@2
|
243 return (sumat/length)/2
|
victor@2
|
244
|
victor@2
|
245 def getMaximumDistance(self,matr):
|
victor@2
|
246 '''
|
victor@2
|
247 returns the maximum distance from the matrix
|
victor@2
|
248 '''
|
victor@2
|
249 maxValue=0
|
victor@2
|
250 lengthMatrix=len(matr[0])
|
victor@2
|
251 for i in range(lengthMatrix):
|
victor@2
|
252 for j in range(lengthMatrix):
|
victor@2
|
253 if matr[i][j]>maxValue:
|
victor@2
|
254 maxValue=matr[i][j]
|
victor@2
|
255 return maxValue/2
|
victor@2
|
256
|
victor@2
|
257 def getMainTree(self,clusters):
|
victor@2
|
258 '''
|
victor@2
|
259 Takes the tree with the higher number of leafs
|
victor@2
|
260 '''
|
victor@2
|
261 maxim=0
|
victor@2
|
262 indexmax=0
|
victor@2
|
263 for i in range(len(clusters)):
|
victor@2
|
264 leafs=self.getLeafs(clusters[i])
|
victor@2
|
265 if len(leafs)>maxim:
|
victor@2
|
266 maxim=len(leafs)
|
victor@2
|
267 indexmax=i
|
victor@2
|
268
|
victor@2
|
269 return clusters[indexmax]
|
victor@2
|
270
|
victor@2
|
271 def getCompleteMatrix(self,matr):
|
victor@2
|
272 '''
|
victor@2
|
273 return the complete matrix from a
|
victor@2
|
274 triangular matrix
|
victor@2
|
275 '''
|
victor@2
|
276 newMatrix=copy.copy(matr)
|
victor@2
|
277 if isinstance(matr,numpy.ndarray):
|
victor@2
|
278 newMatrix=newMatrix.tolist()
|
victor@2
|
279 lengthMatrix=len(matr[0])
|
victor@2
|
280 for i in range(lengthMatrix):
|
victor@2
|
281 for j in range(i,lengthMatrix):
|
victor@2
|
282 newMatrix[j][i]=matr[i][j]
|
victor@2
|
283 return newMatrix
|
victor@2
|
284
|
victor@2
|
285 def showTree(self,strTree):
|
victor@2
|
286 '''
|
victor@2
|
287 Shows a graphical representation from a newick tree format
|
victor@2
|
288 '''
|
victor@2
|
289 handle = StringIO(strTree)
|
victor@2
|
290 tree = Phylo.read(handle, 'newick')
|
victor@2
|
291 tree.ladderize()
|
victor@2
|
292 # Phylo.draw(tree)
|
victor@2
|
293 Phylo.draw_ascii(tree)
|
victor@2
|
294
|
victor@2
|
295
|