annotate pymf/dist.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF several distance functions
mi@0 8
mi@0 9 kl_divergence(): KL Divergence
mi@0 10 l1_distance(): L1 distance
mi@0 11 l2_distance(): L2 distance
mi@0 12 cosine_distance(): Cosine distance
mi@0 13 pdist(): Pairwise distance computation
mi@0 14 vq(): Vector quantization
mi@0 15
mi@0 16 """
mi@0 17
mi@0 18
mi@0 19 import numpy as np
mi@0 20 import scipy.sparse
mi@0 21
mi@0 22 __all__ = ["abs_cosine_distance", "kl_divergence", "l1_distance", "l2_distance",
mi@0 23 "weighted_abs_cosine_distance","cosine_distance","vq", "pdist"]
mi@0 24
mi@0 25 def kl_divergence(d, vec):
mi@0 26 b = vec*(1/d)
mi@0 27 b = np.where(b>0, np.log(b),0)
mi@0 28 b = vec * b
mi@0 29 b = np.sum(b - vec + d, axis=0).reshape((-1))
mi@0 30 return b
mi@0 31
mi@0 32 def l1_distance(d, vec):
mi@0 33 ret_val = np.sum(np.abs(d - vec), axis=0)
mi@0 34 ret_val = ret_val.reshape((-1))
mi@0 35 return ret_val
mi@0 36
mi@0 37 def sparse_l2_distance(d, vec):
mi@0 38 # compute the norm of d
mi@0 39 nd = (d.multiply(d)).sum(axis=0)
mi@0 40 nv = (vec.multiply(vec)).sum(axis=0)
mi@0 41 ret_val = nd + nv - 2.0*(d.T * vec).T
mi@0 42 return np.sqrt(ret_val)
mi@0 43
mi@0 44 def approx_l2_distance(d, vec):
mi@0 45 # Use random projections to approximate the conventional l2 distance
mi@0 46 k = np.round(np.log(d.shape[0]))
mi@0 47 #k = d.shape[0]
mi@0 48 R = np.random.randn(k, d.shape[0])
mi@0 49 R = R / np.sqrt((R**2).sum(axis=0))
mi@0 50 A = np.dot(R,d)
mi@0 51 B = np.dot(R, vec)
mi@0 52 ret_val = np.sum( (A - B)**2, axis=0)
mi@0 53 ret_val = np.sqrt(R.shape[1]/R.shape[0]) * np.sqrt(ret_val)
mi@0 54 ret_val = ret_val.reshape((-1))
mi@0 55 return ret_val
mi@0 56
mi@0 57 def l2_distance(d, vec):
mi@0 58 if scipy.sparse.issparse(d):
mi@0 59 ret_val = sparse_l2_distance(d, vec)
mi@0 60 else:
mi@0 61 ret_val = np.sqrt(((d[:,:] - vec)**2).sum(axis=0))
mi@0 62
mi@0 63 return ret_val.reshape((-1))
mi@0 64
mi@0 65 def l2_distance_new(d,vec):
mi@0 66 # compute the norm of d
mi@0 67 nd = (d**2).sum(axis=0)
mi@0 68 nv = (vec**2).sum(axis=0)
mi@0 69 ret_val = nd + nv - 2.0*np.dot(d.T,vec.reshape((-1,1))).T
mi@0 70
mi@0 71 return np.sqrt(ret_val)
mi@0 72
mi@0 73 def cosine_distance(d, vec):
mi@0 74 tmp = np.dot(np.transpose(d), vec)
mi@0 75 a = np.sqrt(np.sum(d**2, axis=0))
mi@0 76 b = np.sqrt(np.sum(vec**2))
mi@0 77 k = (a*b).reshape(-1) + (10**-9)
mi@0 78
mi@0 79 # compute distance
mi@0 80 ret_val = 1.0 - tmp/k
mi@0 81
mi@0 82 return ret_val.reshape((-1))
mi@0 83
mi@0 84 def abs_cosine_distance(d, vec, weighted=False):
mi@0 85 if scipy.sparse.issparse(d):
mi@0 86 tmp = np.array((d.T * vec).todense(), dtype=np.float32).reshape(-1)
mi@0 87 a = np.sqrt(np.array(d.multiply(d).sum(axis=0), dtype=np.float32).reshape(-1))
mi@0 88 b = np.sqrt(np.array(vec.multiply(vec).sum(axis=0), dtype=np.float32).reshape(-1))
mi@0 89 else:
mi@0 90 tmp = np.dot(np.transpose(d), vec).reshape(-1)
mi@0 91 a = np.sqrt(np.sum(d**2, axis=0)).reshape(-1)
mi@0 92 b = np.sqrt(np.sum(vec**2)).reshape(-1)
mi@0 93
mi@0 94 k = (a*b).reshape(-1) + 10**-9
mi@0 95
mi@0 96 # compute distance
mi@0 97 ret_val = 1.0 - np.abs(tmp/k)
mi@0 98
mi@0 99 if weighted:
mi@0 100 ret_val = ret_val * a
mi@0 101 return ret_val.reshape((-1))
mi@0 102
mi@0 103 def weighted_abs_cosine_distance(d, vec):
mi@0 104 ret_val = abs_cosine_distance(d, vec, weighted=True)
mi@0 105 return ret_val
mi@0 106
mi@0 107 def pdist(A, B, metric='l2' ):
mi@0 108 # compute pairwise distance between a data matrix A (d x n) and B (d x m).
mi@0 109 # Returns a distance matrix d (n x m).
mi@0 110 d = np.zeros((A.shape[1], B.shape[1]))
mi@0 111 if A.shape[1] <= B.shape[1]:
mi@0 112 for aidx in xrange(A.shape[1]):
mi@0 113 if metric == 'l2':
mi@0 114 d[aidx:aidx+1,:] = l2_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
mi@0 115 if metric == 'l1':
mi@0 116 d[aidx:aidx+1,:] = l1_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
mi@0 117 else:
mi@0 118 for bidx in xrange(B.shape[1]):
mi@0 119 if metric == 'l2':
mi@0 120 d[:, bidx:bidx+1] = l2_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
mi@0 121 if metric == 'l1':
mi@0 122 d[:, bidx:bidx+1] = l1_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
mi@0 123
mi@0 124 return d
mi@0 125
mi@0 126 def vq(A, B, metric='l2'):
mi@0 127 # assigns data samples in B to cluster centers A and
mi@0 128 # returns an index list [assume n column vectors, d x n]
mi@0 129 assigned = np.argmin(pdist(A,B, metric=metric), axis=0)
mi@0 130 return assigned