annotate Process/Clustering.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
rev   line source
victor@2 1 '''
victor@2 2 Created on 10/11/2014
victor@2 3
victor@2 4 @organization: Lancaster University & University of Leeds
victor@2 5 @version: 1.0
victor@2 6 Created on 11/12/2014
victor@2 7
victor@2 8 @author: Victor Padilla
victor@2 9 @contact: v.padilla@lancaster.ac.uk
victor@2 10
victor@2 11 Class related to Phylogenetic trees
victor@2 12
victor@2 13 '''
victor@2 14 import copy
victor@2 15 import numpy
victor@2 16 from Bio import Phylo
victor@2 17 from cStringIO import StringIO
victor@2 18
victor@2 19 class cluster:
victor@2 20 '''
victor@2 21 cluster object for Clustering class
victor@2 22 '''
victor@2 23 pass
victor@2 24
victor@2 25 class Clustering:
victor@2 26 '''
victor@2 27 This class allows to transform a triangular matrix
victor@2 28 in a philogenetic tree cutting it at some point
victor@2 29
victor@2 30
victor@2 31 usage:
victor@2 32 ### The omr name array
victor@2 33 species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ]
victor@2 34 matr = [ [ 0., 6., 8., 1., 2. ,6. ],
victor@2 35 [ 0., 0., 8., 6., 6. ,4. ],
victor@2 36 [ 0., 0., 0., 8., 8. ,8. ],
victor@2 37 [ 0., 0., 0., 0., 2. ,6. ],
victor@2 38 [ 0., 0., 0., 0., 0. ,6. ],
victor@2 39 [ 0., 0., 0., 0., 0. ,0. ], ]
victor@2 40
victor@2 41 clustering=Clustering()
victor@2 42 ### Transforms the triangular matrix in a complete matrix
victor@2 43 matr= clustering.getCompleteMatrix(matr)
victor@2 44
victor@2 45 ### establishes the clusters
victor@2 46 clu = clustering.make_clusters(species)
victor@2 47
victor@2 48 ### Regroup the cluster
victor@2 49 tree = clustering.regroup(clu, matr)
victor@2 50
victor@2 51 ### get the string in newick format
victor@2 52 strTree=clustering.getStringTree(tree,tree.height,"")
victor@2 53
victor@2 54 print(strTree)
victor@2 55 clustering.showTree(strTree)
victor@2 56
victor@2 57 ### Takes the main tree with 5 branches
victor@2 58 maintree=clustering.getBetterTree(tree,5)
victor@2 59 strTree=clustering.getStringTree(maintree,maintree.height,"")
victor@2 60 print(strTree)
victor@2 61 clustering.showTree(strTree)
victor@2 62
victor@2 63 '''
victor@2 64
victor@2 65 def make_clusters(self,species):
victor@2 66 '''
victor@2 67 Organises the cluster based on the species array
victor@2 68 '''
victor@2 69 clusters = {}
victor@2 70 Id = 1
victor@2 71 for s in species:
victor@2 72 c = cluster()
victor@2 73 c.id = Id
victor@2 74 c.data = s
victor@2 75 c.size = 1
victor@2 76 c.height = 0
victor@2 77 clusters[c.id] = c
victor@2 78 Id = Id + 1
victor@2 79 return clusters
victor@2 80
victor@2 81 def __find_min(self,clu, d):
victor@2 82 '''
victor@2 83 finding the minimum distance
victor@2 84 private function
victor@2 85 '''
victor@2 86 mini = None
victor@2 87 i_mini = 0
victor@2 88 j_mini = 0
victor@2 89 for i in clu:
victor@2 90 for j in clu:
victor@2 91 if j>i:
victor@2 92 tmp = d[j -1 ][i -1 ]
victor@2 93 if not mini:
victor@2 94 mini = tmp
victor@2 95 if tmp <= mini:
victor@2 96 i_mini = i
victor@2 97 j_mini = j
victor@2 98 mini = tmp
victor@2 99 return (i_mini, j_mini, mini)
victor@2 100
victor@2 101 def regroup(self,clusters, dist):
victor@2 102 '''
victor@2 103 organizing the cluster
victor@2 104 '''
victor@2 105 i, j, dij = self.__find_min(clusters, dist)
victor@2 106 ci = clusters[i]
victor@2 107 cj = clusters[j]
victor@2 108 # create new cluster
victor@2 109 k = cluster()
victor@2 110 k.id = max(clusters) + 1
victor@2 111 k.data = (ci, cj)
victor@2 112 k.size = ci.size + cj.size
victor@2 113 k.height = dij / 2.
victor@2 114 # remove clusters
victor@2 115 del clusters[i]
victor@2 116 del clusters[j]
victor@2 117 # compute new distance values and insert them
victor@2 118 dist.append([])
victor@2 119 for l in range(0, k.id -1):
victor@2 120 dist[k.id-1].append(0)
victor@2 121 for l in clusters:
victor@2 122 dil = dist[max(i, l) -1][min(i, l) -1]
victor@2 123 djl = dist[max(j, l) -1][min(j, l) -1]
victor@2 124 dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size)
victor@2 125 dist[k.id -1][l-1] = dkl
victor@2 126 # insert the new cluster
victor@2 127 clusters[k.id] = k
victor@2 128
victor@2 129 if len(clusters) == 1:
victor@2 130 # we're through !
victor@2 131 return clusters.values()[0]
victor@2 132 else:
victor@2 133 return self.regroup(clusters, dist)
victor@2 134
victor@2 135 def __pprint(self,tree, length):
victor@2 136 '''
victor@2 137 print the tree in newick format
victor@2 138 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
victor@2 139 '''
victor@2 140 if tree.size > 1:
victor@2 141 # it's an internal node
victor@2 142 print "(",
victor@2 143 self.__pprint(tree.data[0], tree.height)
victor@2 144 print ",",
victor@2 145 self.__pprint(tree.data[1], tree.height)
victor@2 146 print ("):%2.2f" % (length - tree.height)),
victor@2 147 else :
victor@2 148 # it's a leaf
victor@2 149 print ("%s:%2.2f" % (tree.data, length)),
victor@2 150
victor@2 151 def getStringTree(self,tree, length,strOut):
victor@2 152 '''
victor@2 153 returns the string of the tree in newick format
victor@2 154 (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names
victor@2 155 '''
victor@2 156 if tree.size > 1:
victor@2 157 # it's an internal node
victor@2 158 strOut+="("
victor@2 159 strOut=self.getStringTree(tree.data[0], tree.height,strOut)
victor@2 160 strOut+=","
victor@2 161 strOut=self.getStringTree(tree.data[1], tree.height,strOut)
victor@2 162 strOut+="):"+str(length- tree.height)
victor@2 163 else :
victor@2 164 # it's a leaf
victor@2 165 strOut+=str(tree.data)+":"+str(length)
victor@2 166 return strOut
victor@2 167
victor@2 168
victor@2 169 def createClusters(self,tree,length):
victor@2 170 '''
victor@2 171 separates the tree at the point determined by length
victor@2 172 '''
victor@2 173 clusters=[]
victor@2 174 self.__separateClusters(clusters,tree,length)
victor@2 175 self.clusters=self.__filterClusters(clusters,length)
victor@2 176 return clusters
victor@2 177
victor@2 178 def getLeafs(self,tree):
victor@2 179 '''
victor@2 180 returns the number of leafs from a tree
victor@2 181 '''
victor@2 182 leafs=[]
victor@2 183 self.getTreeLeafs(leafs,tree,tree.height)
victor@2 184 return leafs
victor@2 185
victor@2 186 def __separateClusters(self,clusters,tree,length):
victor@2 187 '''
victor@2 188 divides the main tree in multiples trees based on the length
victor@2 189 '''
victor@2 190 if tree.size > 1:
victor@2 191 if tree.height>=length:
victor@2 192 clusters.append(tree.data[0])
victor@2 193 clusters.append(tree.data[1])
victor@2 194 else:
victor@2 195 pass
victor@2 196 self.__separateClusters(clusters,tree.data[0], length)
victor@2 197 self.__separateClusters(clusters,tree.data[1], length)
victor@2 198
victor@2 199 def __filterClusters(self,clusters,length):
victor@2 200 '''
victor@2 201 removes trees higher than length
victor@2 202 '''
victor@2 203 clustersOut=[]
victor@2 204 for c in clusters:
victor@2 205 if c.height<length:
victor@2 206 clustersOut.append(c)
victor@2 207 return clustersOut
victor@2 208
victor@2 209 def getTreeLeafs(self,leafs,tree,length):
victor@2 210 '''
victor@2 211 returns the number of leafs from a tree. Recursive function
victor@2 212 '''
victor@2 213 if tree.size > 1:
victor@2 214 # it's an internal node
victor@2 215 self.getTreeLeafs(leafs,tree.data[0], tree.height)
victor@2 216 self.getTreeLeafs(leafs,tree.data[1], tree.height)
victor@2 217 else :
victor@2 218 # it's a leaf
victor@2 219 leafs.append(tree.data)
victor@2 220
victor@2 221 def getBetterTree(self,tree,numberLeafs):
victor@2 222 '''
victor@2 223 takes the smallest tree that has <= numberLeafs
victor@2 224 '''
victor@2 225 while True:
victor@2 226 leafs=self.getLeafs(tree)
victor@2 227 if len(leafs)<=numberLeafs:
victor@2 228 return tree
victor@2 229 clusters=self.createClusters(tree,tree.height)
victor@2 230 tree=self.getMainTree(clusters)
victor@2 231
victor@2 232
victor@2 233 def getAverageDistance(self,matr):
victor@2 234 '''
victor@2 235 returns the average distance from the matrix
victor@2 236 '''
victor@2 237 sumat=0
victor@2 238 lengthMatrix=len(matr[0])
victor@2 239 length=lengthMatrix*lengthMatrix-lengthMatrix
victor@2 240 for i in range(lengthMatrix):
victor@2 241 for j in range(lengthMatrix):
victor@2 242 sumat=sumat+matr[i][j]
victor@2 243 return (sumat/length)/2
victor@2 244
victor@2 245 def getMaximumDistance(self,matr):
victor@2 246 '''
victor@2 247 returns the maximum distance from the matrix
victor@2 248 '''
victor@2 249 maxValue=0
victor@2 250 lengthMatrix=len(matr[0])
victor@2 251 for i in range(lengthMatrix):
victor@2 252 for j in range(lengthMatrix):
victor@2 253 if matr[i][j]>maxValue:
victor@2 254 maxValue=matr[i][j]
victor@2 255 return maxValue/2
victor@2 256
victor@2 257 def getMainTree(self,clusters):
victor@2 258 '''
victor@2 259 Takes the tree with the higher number of leafs
victor@2 260 '''
victor@2 261 maxim=0
victor@2 262 indexmax=0
victor@2 263 for i in range(len(clusters)):
victor@2 264 leafs=self.getLeafs(clusters[i])
victor@2 265 if len(leafs)>maxim:
victor@2 266 maxim=len(leafs)
victor@2 267 indexmax=i
victor@2 268
victor@2 269 return clusters[indexmax]
victor@2 270
victor@2 271 def getCompleteMatrix(self,matr):
victor@2 272 '''
victor@2 273 return the complete matrix from a
victor@2 274 triangular matrix
victor@2 275 '''
victor@2 276 newMatrix=copy.copy(matr)
victor@2 277 if isinstance(matr,numpy.ndarray):
victor@2 278 newMatrix=newMatrix.tolist()
victor@2 279 lengthMatrix=len(matr[0])
victor@2 280 for i in range(lengthMatrix):
victor@2 281 for j in range(i,lengthMatrix):
victor@2 282 newMatrix[j][i]=matr[i][j]
victor@2 283 return newMatrix
victor@2 284
victor@2 285 def showTree(self,strTree):
victor@2 286 '''
victor@2 287 Shows a graphical representation from a newick tree format
victor@2 288 '''
victor@2 289 handle = StringIO(strTree)
victor@2 290 tree = Phylo.read(handle, 'newick')
victor@2 291 tree.ladderize()
victor@2 292 # Phylo.draw(tree)
victor@2 293 Phylo.draw_ascii(tree)
victor@2 294
victor@2 295