Mercurial > hg > multiomr
view Process/Clustering.py @ 2:46fb79167a61 tip
Main Code
author | Victor Padilla <victor.padilla.mc@gmail.com> |
---|---|
date | Mon, 04 May 2015 22:56:18 +0200 |
parents | |
children |
line wrap: on
line source
''' Created on 10/11/2014 @organization: Lancaster University & University of Leeds @version: 1.0 Created on 11/12/2014 @author: Victor Padilla @contact: v.padilla@lancaster.ac.uk Class related to Phylogenetic trees ''' import copy import numpy from Bio import Phylo from cStringIO import StringIO class cluster: ''' cluster object for Clustering class ''' pass class Clustering: ''' This class allows to transform a triangular matrix in a philogenetic tree cutting it at some point usage: ### The omr name array species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ] matr = [ [ 0., 6., 8., 1., 2. ,6. ], [ 0., 0., 8., 6., 6. ,4. ], [ 0., 0., 0., 8., 8. ,8. ], [ 0., 0., 0., 0., 2. ,6. ], [ 0., 0., 0., 0., 0. ,6. ], [ 0., 0., 0., 0., 0. ,0. ], ] clustering=Clustering() ### Transforms the triangular matrix in a complete matrix matr= clustering.getCompleteMatrix(matr) ### establishes the clusters clu = clustering.make_clusters(species) ### Regroup the cluster tree = clustering.regroup(clu, matr) ### get the string in newick format strTree=clustering.getStringTree(tree,tree.height,"") print(strTree) clustering.showTree(strTree) ### Takes the main tree with 5 branches maintree=clustering.getBetterTree(tree,5) strTree=clustering.getStringTree(maintree,maintree.height,"") print(strTree) clustering.showTree(strTree) ''' def make_clusters(self,species): ''' Organises the cluster based on the species array ''' clusters = {} Id = 1 for s in species: c = cluster() c.id = Id c.data = s c.size = 1 c.height = 0 clusters[c.id] = c Id = Id + 1 return clusters def __find_min(self,clu, d): ''' finding the minimum distance private function ''' mini = None i_mini = 0 j_mini = 0 for i in clu: for j in clu: if j>i: tmp = d[j -1 ][i -1 ] if not mini: mini = tmp if tmp <= mini: i_mini = i j_mini = j mini = tmp return (i_mini, j_mini, mini) def regroup(self,clusters, dist): ''' organizing the cluster ''' i, j, dij = self.__find_min(clusters, dist) ci = clusters[i] cj = clusters[j] # create new cluster k = cluster() k.id = max(clusters) + 1 k.data = (ci, cj) k.size = ci.size + cj.size k.height = dij / 2. # remove clusters del clusters[i] del clusters[j] # compute new distance values and insert them dist.append([]) for l in range(0, k.id -1): dist[k.id-1].append(0) for l in clusters: dil = dist[max(i, l) -1][min(i, l) -1] djl = dist[max(j, l) -1][min(j, l) -1] dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size) dist[k.id -1][l-1] = dkl # insert the new cluster clusters[k.id] = k if len(clusters) == 1: # we're through ! return clusters.values()[0] else: return self.regroup(clusters, dist) def __pprint(self,tree, length): ''' print the tree in newick format (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names ''' if tree.size > 1: # it's an internal node print "(", self.__pprint(tree.data[0], tree.height) print ",", self.__pprint(tree.data[1], tree.height) print ("):%2.2f" % (length - tree.height)), else : # it's a leaf print ("%s:%2.2f" % (tree.data, length)), def getStringTree(self,tree, length,strOut): ''' returns the string of the tree in newick format (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names ''' if tree.size > 1: # it's an internal node strOut+="(" strOut=self.getStringTree(tree.data[0], tree.height,strOut) strOut+="," strOut=self.getStringTree(tree.data[1], tree.height,strOut) strOut+="):"+str(length- tree.height) else : # it's a leaf strOut+=str(tree.data)+":"+str(length) return strOut def createClusters(self,tree,length): ''' separates the tree at the point determined by length ''' clusters=[] self.__separateClusters(clusters,tree,length) self.clusters=self.__filterClusters(clusters,length) return clusters def getLeafs(self,tree): ''' returns the number of leafs from a tree ''' leafs=[] self.getTreeLeafs(leafs,tree,tree.height) return leafs def __separateClusters(self,clusters,tree,length): ''' divides the main tree in multiples trees based on the length ''' if tree.size > 1: if tree.height>=length: clusters.append(tree.data[0]) clusters.append(tree.data[1]) else: pass self.__separateClusters(clusters,tree.data[0], length) self.__separateClusters(clusters,tree.data[1], length) def __filterClusters(self,clusters,length): ''' removes trees higher than length ''' clustersOut=[] for c in clusters: if c.height<length: clustersOut.append(c) return clustersOut def getTreeLeafs(self,leafs,tree,length): ''' returns the number of leafs from a tree. Recursive function ''' if tree.size > 1: # it's an internal node self.getTreeLeafs(leafs,tree.data[0], tree.height) self.getTreeLeafs(leafs,tree.data[1], tree.height) else : # it's a leaf leafs.append(tree.data) def getBetterTree(self,tree,numberLeafs): ''' takes the smallest tree that has <= numberLeafs ''' while True: leafs=self.getLeafs(tree) if len(leafs)<=numberLeafs: return tree clusters=self.createClusters(tree,tree.height) tree=self.getMainTree(clusters) def getAverageDistance(self,matr): ''' returns the average distance from the matrix ''' sumat=0 lengthMatrix=len(matr[0]) length=lengthMatrix*lengthMatrix-lengthMatrix for i in range(lengthMatrix): for j in range(lengthMatrix): sumat=sumat+matr[i][j] return (sumat/length)/2 def getMaximumDistance(self,matr): ''' returns the maximum distance from the matrix ''' maxValue=0 lengthMatrix=len(matr[0]) for i in range(lengthMatrix): for j in range(lengthMatrix): if matr[i][j]>maxValue: maxValue=matr[i][j] return maxValue/2 def getMainTree(self,clusters): ''' Takes the tree with the higher number of leafs ''' maxim=0 indexmax=0 for i in range(len(clusters)): leafs=self.getLeafs(clusters[i]) if len(leafs)>maxim: maxim=len(leafs) indexmax=i return clusters[indexmax] def getCompleteMatrix(self,matr): ''' return the complete matrix from a triangular matrix ''' newMatrix=copy.copy(matr) if isinstance(matr,numpy.ndarray): newMatrix=newMatrix.tolist() lengthMatrix=len(matr[0]) for i in range(lengthMatrix): for j in range(i,lengthMatrix): newMatrix[j][i]=matr[i][j] return newMatrix def showTree(self,strTree): ''' Shows a graphical representation from a newick tree format ''' handle = StringIO(strTree) tree = Phylo.read(handle, 'newick') tree.ladderize() # Phylo.draw(tree) Phylo.draw_ascii(tree)