diff Process/Clustering.py @ 2:46fb79167a61 tip

Main Code
author Victor Padilla <victor.padilla.mc@gmail.com>
date Mon, 04 May 2015 22:56:18 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Process/Clustering.py	Mon May 04 22:56:18 2015 +0200
@@ -0,0 +1,295 @@
+'''
+Created on 10/11/2014
+
+@organization: Lancaster University & University of Leeds
+@version: 1.0
+Created on 11/12/2014
+
+@author: Victor Padilla
+@contact: v.padilla@lancaster.ac.uk
+
+Class related to Phylogenetic trees
+
+'''
+import copy
+import numpy
+from Bio import Phylo
+from cStringIO import StringIO
+
+class cluster:
+    '''
+    cluster object for Clustering class
+    '''
+    pass
+    
+class Clustering:
+    '''
+    This class allows to transform a triangular matrix
+    in a philogenetic tree cutting it at some point 
+    
+    
+    usage:
+    ###  The omr name array
+    species = [ "omr1", "omr2", "omr3", "omr4", "omr5","omr6" ]
+    matr = [ [ 0., 6., 8., 1., 2. ,6. ],
+             [ 0., 0., 8., 6., 6. ,4. ],
+             [ 0., 0., 0., 8., 8. ,8. ],
+             [ 0., 0., 0., 0., 2. ,6. ],
+             [ 0., 0., 0., 0., 0. ,6. ],
+             [ 0., 0., 0., 0., 0. ,0. ], ]
+    
+    clustering=Clustering()
+    ### Transforms the triangular matrix in a complete matrix
+    matr= clustering.getCompleteMatrix(matr)
+    
+    ### establishes the clusters
+    clu = clustering.make_clusters(species)  
+    
+    ### Regroup the cluster 
+    tree = clustering.regroup(clu, matr)
+    
+    ### get the string in newick format
+    strTree=clustering.getStringTree(tree,tree.height,"")
+    
+    print(strTree)
+    clustering.showTree(strTree)
+    
+    ### Takes the main tree with 5 branches
+    maintree=clustering.getBetterTree(tree,5)
+    strTree=clustering.getStringTree(maintree,maintree.height,"")
+    print(strTree)
+    clustering.showTree(strTree)
+        
+    '''
+    
+    def make_clusters(self,species):
+        '''
+        Organises the cluster based on the species array
+        '''
+        clusters = {}
+        Id = 1
+        for s in species:
+            c = cluster()
+            c.id = Id
+            c.data = s
+            c.size = 1
+            c.height = 0
+            clusters[c.id] = c
+            Id = Id + 1
+        return clusters
+    
+    def __find_min(self,clu, d):
+        '''
+        finding the minimum distance
+        private function
+        '''
+        mini = None
+        i_mini = 0
+        j_mini = 0
+        for i in clu:
+            for j in clu:
+                if j>i:
+                    tmp = d[j -1 ][i -1 ]
+                    if not mini:
+                        mini = tmp
+                    if tmp <= mini:
+                        i_mini = i
+                        j_mini = j
+                        mini = tmp
+        return (i_mini, j_mini, mini)
+    
+    def regroup(self,clusters, dist):
+        '''
+        organizing the cluster
+        '''
+        i, j, dij = self.__find_min(clusters, dist)
+        ci = clusters[i]
+        cj = clusters[j]
+        # create new cluster
+        k = cluster()
+        k.id = max(clusters) + 1
+        k.data = (ci, cj)
+        k.size = ci.size + cj.size
+        k.height = dij / 2.
+        # remove clusters
+        del clusters[i]
+        del clusters[j]
+        # compute new distance values and insert them
+        dist.append([])
+        for l in range(0, k.id -1):
+            dist[k.id-1].append(0)
+        for l in clusters:
+            dil = dist[max(i, l) -1][min(i, l) -1]
+            djl = dist[max(j, l) -1][min(j, l) -1]
+            dkl = (dil * ci.size + djl * cj.size) / float (ci.size + cj.size)
+            dist[k.id -1][l-1] = dkl
+        # insert the new cluster
+        clusters[k.id] = k
+    
+        if len(clusters) == 1:
+            # we're through !
+            return clusters.values()[0]
+        else:
+            return self.regroup(clusters, dist)
+    
+    def __pprint(self,tree, length):
+        '''
+        print the tree in newick format 
+        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);       distances and leaf names
+        '''
+        if tree.size > 1:
+            # it's an internal node
+            print "(",
+            self.__pprint(tree.data[0], tree.height)
+            print ",",
+            self.__pprint(tree.data[1], tree.height)
+            print ("):%2.2f" % (length - tree.height)),
+        else :
+            # it's a leaf
+            print ("%s:%2.2f" % (tree.data, length)),
+    
+    def getStringTree(self,tree, length,strOut):
+        '''
+        returns the string of the tree in newick format
+        (A:0.1,B:0.2,(C:0.3,D:0.4):0.5);       distances and leaf names
+        '''
+        if tree.size > 1:
+            # it's an internal node
+            strOut+="("
+            strOut=self.getStringTree(tree.data[0], tree.height,strOut)
+            strOut+=","
+            strOut=self.getStringTree(tree.data[1], tree.height,strOut)
+            strOut+="):"+str(length- tree.height)
+        else :
+            # it's a leaf
+            strOut+=str(tree.data)+":"+str(length)
+        return strOut
+    
+    
+    def createClusters(self,tree,length):
+        '''
+        separates the tree at the point determined by length
+        '''
+        clusters=[]
+        self.__separateClusters(clusters,tree,length)
+        self.clusters=self.__filterClusters(clusters,length)
+        return clusters
+    
+    def getLeafs(self,tree):
+        '''
+        returns the number of leafs from a tree
+        '''
+        leafs=[]
+        self.getTreeLeafs(leafs,tree,tree.height)
+        return leafs
+    
+    def __separateClusters(self,clusters,tree,length):
+        '''
+        divides the main tree in multiples trees based on the length
+        '''
+        if tree.size > 1:
+            if tree.height>=length:
+                clusters.append(tree.data[0])
+                clusters.append(tree.data[1])
+            else:
+                pass
+            self.__separateClusters(clusters,tree.data[0], length) 
+            self.__separateClusters(clusters,tree.data[1], length)
+        
+    def __filterClusters(self,clusters,length): 
+        '''
+        removes trees higher than length
+        ''' 
+        clustersOut=[]  
+        for c in clusters:
+            if c.height<length:
+                clustersOut.append(c)
+        return clustersOut
+    
+    def getTreeLeafs(self,leafs,tree,length):
+        '''
+        returns the number of leafs from a tree. Recursive function
+        '''
+        if tree.size > 1:
+            # it's an internal node
+            self.getTreeLeafs(leafs,tree.data[0], tree.height)
+            self.getTreeLeafs(leafs,tree.data[1], tree.height)
+        else :
+            # it's a leaf
+            leafs.append(tree.data)
+            
+    def getBetterTree(self,tree,numberLeafs):
+        '''
+        takes the smallest tree that has <= numberLeafs
+        '''
+        while True:
+            leafs=self.getLeafs(tree)
+            if len(leafs)<=numberLeafs:
+                return tree
+            clusters=self.createClusters(tree,tree.height)
+            tree=self.getMainTree(clusters)
+    
+    
+    def getAverageDistance(self,matr):
+        '''
+        returns the average distance from the matrix
+        '''
+        sumat=0
+        lengthMatrix=len(matr[0])
+        length=lengthMatrix*lengthMatrix-lengthMatrix
+        for i in range(lengthMatrix):
+            for j in range(lengthMatrix):
+                    sumat=sumat+matr[i][j]
+        return (sumat/length)/2
+    
+    def getMaximumDistance(self,matr):
+        '''
+        returns the maximum distance from the matrix
+        '''
+        maxValue=0
+        lengthMatrix=len(matr[0])
+        for i in range(lengthMatrix):
+            for j in range(lengthMatrix):
+                    if matr[i][j]>maxValue:
+                        maxValue=matr[i][j]
+        return maxValue/2
+    
+    def getMainTree(self,clusters):
+        '''
+        Takes the tree with the higher number of leafs
+        '''
+        maxim=0
+        indexmax=0
+        for i in range(len(clusters)):
+            leafs=self.getLeafs(clusters[i])
+            if len(leafs)>maxim:
+                maxim=len(leafs)
+                indexmax=i
+                
+        return clusters[indexmax]
+            
+    def getCompleteMatrix(self,matr):
+        '''
+        return the complete matrix from a 
+        triangular matrix
+        '''
+        newMatrix=copy.copy(matr)
+        if isinstance(matr,numpy.ndarray):
+            newMatrix=newMatrix.tolist()
+        lengthMatrix=len(matr[0])
+        for i in range(lengthMatrix):
+            for j in range(i,lengthMatrix): 
+                newMatrix[j][i]=matr[i][j]
+        return newMatrix
+    
+    def showTree(self,strTree): 
+        '''
+        Shows a graphical representation from a newick tree format
+        '''  
+        handle = StringIO(strTree)
+        tree = Phylo.read(handle, 'newick')
+        tree.ladderize()   
+    #     Phylo.draw(tree) 
+        Phylo.draw_ascii(tree) 
+         
+