view pymf/sivm_cur.py @ 0:26838b1f560f

initial commit of a segmenter project
author mi tian
date Thu, 02 Apr 2015 18:09:27 +0100
parents
children
line wrap: on
line source
#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010. 
# Licensed under the GNU General Public License (GPL). 
# http://www.gnu.org/licenses/gpl.txt
"""  
PyMF Simplex Volume Maximization for CUR [1]

    SIVMCUR: class for SiVM-CUR

[1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 
Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 
Conf. on Information and Knowledge Management. ACM. 2010.
"""


import numpy as np
import scipy
from sivm import SIVM
from cur import CUR

__all__ = ["SIVM_CUR"]

class SIVM_CUR(CUR):
    '''
    SIVM_CUR(data, num_bases=4, dist_measure='l2')
    
    Simplex Volume based CUR Decomposition. Factorize a data matrix into three 
    matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the
    rows and columns using SIVM, i.e. it tries to maximize the volume of the
    enclosed simplex.
    
    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank: int, optional 
        Number of rows to sample from data.
        4 (default)crank
    crank: int, optional
        Number of columns to sample from data.
        4 (default)   
    dist_measure: string, optional
        The distance measure for finding the next best candidate that 
        maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2']
        'l2' (default)
    
    Attributes
    ----------
        U,S,V : submatrices s.t. data = USV        
    
    Example
    -------
    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2)    
    >>> sivmcur_mdl.factorize()
    '''
    
    def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'):
        CUR.__init__(self, data, k=k, rrank=rrank, crank=rrank)
        self._dist_measure = dist_measure
        self.init = init

    def sample(self, A, c):
        # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of
        # the ordinary simplex would be optimized) 
        sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure,  
                            init=self.init)                        
        sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1,
                           compute_h=False, compute_err=False)
        
        return sivm_mdl.select    
            
            
    def factorize(self):
        """ Factorize s.t. CUR = data
            
            Updated Values
            --------------
            .C : updated values for C.
            .U : updated values for U.
            .R : updated values for R.          
        """            
        # sample row and column indices that maximize the volume of the submatrix
        self._rid = self.sample(self.data.transpose(), self._rrank)
        self._cid = self.sample(self.data, self._crank)
                        
        self._rcnt = np.ones(len(self._rid))
        self._ccnt = np.ones(len(self._cid))    
                                    
        self.computeUCR()


if __name__ == "__main__":
    import doctest  
    doctest.testmod()