victor@2: ''' victor@2: Created on 10/11/2014 victor@2: victor@2: @organization: Lancaster University & University of Leeds victor@2: @version: 1.0 victor@2: Created on 11/12/2014 victor@2: victor@2: @author: Victor Padilla victor@2: @contact: v.padilla@lancaster.ac.uk victor@2: victor@2: Class related to Phylogenetic trees victor@2: victor@2: ''' victor@2: import copy victor@2: import numpy victor@2: from Bio import Phylo victor@2: from cStringIO import StringIO victor@2: victor@2: class cluster: victor@2: ''' victor@2: cluster object for Clustering class victor@2: ''' victor@2: pass victor@2: victor@2: class Clustering: victor@2: ''' victor@2: This class allows to transform a triangular matrix victor@2: in a philogenetic tree cutting it at some point victor@2: victor@2: victor@2: usage: victor@2: ### The omr name array victor@2: species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ] victor@2: matr = [ [ 0., 6., 8., 1., 2. ,6. ], victor@2: [ 0., 0., 8., 6., 6. ,4. ], victor@2: [ 0., 0., 0., 8., 8. ,8. ], victor@2: [ 0., 0., 0., 0., 2. ,6. ], victor@2: [ 0., 0., 0., 0., 0. ,6. ], victor@2: [ 0., 0., 0., 0., 0. ,0. ], ] victor@2: victor@2: clustering=Clustering() victor@2: ### Transforms the triangular matrix in a complete matrix victor@2: matr= clustering.getCompleteMatrix(matr) victor@2: victor@2: ### establishes the clusters victor@2: clu = clustering.make_clusters(species) victor@2: victor@2: ### Regroup the cluster victor@2: tree = clustering.regroup(clu, matr) victor@2: victor@2: ### get the string in newick format victor@2: strTree=clustering.getStringTree(tree,tree.height,"") victor@2: victor@2: print(strTree) victor@2: clustering.showTree(strTree) victor@2: victor@2: ### Takes the main tree with 5 branches victor@2: maintree=clustering.getBetterTree(tree,5) victor@2: strTree=clustering.getStringTree(maintree,maintree.height,"") victor@2: print(strTree) victor@2: clustering.showTree(strTree) victor@2: victor@2: ''' victor@2: victor@2: def make_clusters(self,species): victor@2: ''' victor@2: Organises the cluster based on the species array victor@2: ''' victor@2: clusters = {} victor@2: Id = 1 victor@2: for s in species: victor@2: c = cluster() victor@2: c.id = Id victor@2: c.data = s victor@2: c.size = 1 victor@2: c.height = 0 victor@2: clusters[c.id] = c victor@2: Id = Id + 1 victor@2: return clusters victor@2: victor@2: def __find_min(self,clu, d): victor@2: ''' victor@2: finding the minimum distance victor@2: private function victor@2: ''' victor@2: mini = None victor@2: i_mini = 0 victor@2: j_mini = 0 victor@2: for i in clu: victor@2: for j in clu: victor@2: if j>i: victor@2: tmp = d[j -1 ][i -1 ] victor@2: if not mini: victor@2: mini = tmp victor@2: if tmp <= mini: victor@2: i_mini = i victor@2: j_mini = j victor@2: mini = tmp victor@2: return (i_mini, j_mini, mini) victor@2: victor@2: def regroup(self,clusters, dist): victor@2: ''' victor@2: organizing the cluster victor@2: ''' victor@2: i, j, dij = self.__find_min(clusters, dist) victor@2: ci = clusters[i] victor@2: cj = clusters[j] victor@2: # create new cluster victor@2: k = cluster() victor@2: k.id = max(clusters) + 1 victor@2: k.data = (ci, cj) victor@2: k.size = ci.size + cj.size victor@2: k.height = dij / 2. victor@2: # remove clusters victor@2: del clusters[i] victor@2: del clusters[j] victor@2: # compute new distance values and insert them victor@2: dist.append([]) victor@2: for l in range(0, k.id -1): victor@2: dist[k.id-1].append(0) victor@2: for l in clusters: victor@2: dil = dist[max(i, l) -1][min(i, l) -1] victor@2: djl = dist[max(j, l) -1][min(j, l) -1] victor@2: dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size) victor@2: dist[k.id -1][l-1] = dkl victor@2: # insert the new cluster victor@2: clusters[k.id] = k victor@2: victor@2: if len(clusters) == 1: victor@2: # we're through ! victor@2: return clusters.values()[0] victor@2: else: victor@2: return self.regroup(clusters, dist) victor@2: victor@2: def __pprint(self,tree, length): victor@2: ''' victor@2: print the tree in newick format victor@2: (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names victor@2: ''' victor@2: if tree.size > 1: victor@2: # it's an internal node victor@2: print "(", victor@2: self.__pprint(tree.data[0], tree.height) victor@2: print ",", victor@2: self.__pprint(tree.data[1], tree.height) victor@2: print ("):%2.2f" % (length - tree.height)), victor@2: else : victor@2: # it's a leaf victor@2: print ("%s:%2.2f" % (tree.data, length)), victor@2: victor@2: def getStringTree(self,tree, length,strOut): victor@2: ''' victor@2: returns the string of the tree in newick format victor@2: (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names victor@2: ''' victor@2: if tree.size > 1: victor@2: # it's an internal node victor@2: strOut+="(" victor@2: strOut=self.getStringTree(tree.data[0], tree.height,strOut) victor@2: strOut+="," victor@2: strOut=self.getStringTree(tree.data[1], tree.height,strOut) victor@2: strOut+="):"+str(length- tree.height) victor@2: else : victor@2: # it's a leaf victor@2: strOut+=str(tree.data)+":"+str(length) victor@2: return strOut victor@2: victor@2: victor@2: def createClusters(self,tree,length): victor@2: ''' victor@2: separates the tree at the point determined by length victor@2: ''' victor@2: clusters=[] victor@2: self.__separateClusters(clusters,tree,length) victor@2: self.clusters=self.__filterClusters(clusters,length) victor@2: return clusters victor@2: victor@2: def getLeafs(self,tree): victor@2: ''' victor@2: returns the number of leafs from a tree victor@2: ''' victor@2: leafs=[] victor@2: self.getTreeLeafs(leafs,tree,tree.height) victor@2: return leafs victor@2: victor@2: def __separateClusters(self,clusters,tree,length): victor@2: ''' victor@2: divides the main tree in multiples trees based on the length victor@2: ''' victor@2: if tree.size > 1: victor@2: if tree.height>=length: victor@2: clusters.append(tree.data[0]) victor@2: clusters.append(tree.data[1]) victor@2: else: victor@2: pass victor@2: self.__separateClusters(clusters,tree.data[0], length) victor@2: self.__separateClusters(clusters,tree.data[1], length) victor@2: victor@2: def __filterClusters(self,clusters,length): victor@2: ''' victor@2: removes trees higher than length victor@2: ''' victor@2: clustersOut=[] victor@2: for c in clusters: victor@2: if c.height 1: victor@2: # it's an internal node victor@2: self.getTreeLeafs(leafs,tree.data[0], tree.height) victor@2: self.getTreeLeafs(leafs,tree.data[1], tree.height) victor@2: else : victor@2: # it's a leaf victor@2: leafs.append(tree.data) victor@2: victor@2: def getBetterTree(self,tree,numberLeafs): victor@2: ''' victor@2: takes the smallest tree that has <= numberLeafs victor@2: ''' victor@2: while True: victor@2: leafs=self.getLeafs(tree) victor@2: if len(leafs)<=numberLeafs: victor@2: return tree victor@2: clusters=self.createClusters(tree,tree.height) victor@2: tree=self.getMainTree(clusters) victor@2: victor@2: victor@2: def getAverageDistance(self,matr): victor@2: ''' victor@2: returns the average distance from the matrix victor@2: ''' victor@2: sumat=0 victor@2: lengthMatrix=len(matr[0]) victor@2: length=lengthMatrix*lengthMatrix-lengthMatrix victor@2: for i in range(lengthMatrix): victor@2: for j in range(lengthMatrix): victor@2: sumat=sumat+matr[i][j] victor@2: return (sumat/length)/2 victor@2: victor@2: def getMaximumDistance(self,matr): victor@2: ''' victor@2: returns the maximum distance from the matrix victor@2: ''' victor@2: maxValue=0 victor@2: lengthMatrix=len(matr[0]) victor@2: for i in range(lengthMatrix): victor@2: for j in range(lengthMatrix): victor@2: if matr[i][j]>maxValue: victor@2: maxValue=matr[i][j] victor@2: return maxValue/2 victor@2: victor@2: def getMainTree(self,clusters): victor@2: ''' victor@2: Takes the tree with the higher number of leafs victor@2: ''' victor@2: maxim=0 victor@2: indexmax=0 victor@2: for i in range(len(clusters)): victor@2: leafs=self.getLeafs(clusters[i]) victor@2: if len(leafs)>maxim: victor@2: maxim=len(leafs) victor@2: indexmax=i victor@2: victor@2: return clusters[indexmax] victor@2: victor@2: def getCompleteMatrix(self,matr): victor@2: ''' victor@2: return the complete matrix from a victor@2: triangular matrix victor@2: ''' victor@2: newMatrix=copy.copy(matr) victor@2: if isinstance(matr,numpy.ndarray): victor@2: newMatrix=newMatrix.tolist() victor@2: lengthMatrix=len(matr[0]) victor@2: for i in range(lengthMatrix): victor@2: for j in range(i,lengthMatrix): victor@2: newMatrix[j][i]=matr[i][j] victor@2: return newMatrix victor@2: victor@2: def showTree(self,strTree): victor@2: ''' victor@2: Shows a graphical representation from a newick tree format victor@2: ''' victor@2: handle = StringIO(strTree) victor@2: tree = Phylo.read(handle, 'newick') victor@2: tree.ladderize() victor@2: # Phylo.draw(tree) victor@2: Phylo.draw_ascii(tree) victor@2: victor@2: