mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF several distance functions
|
mi@0
|
8
|
mi@0
|
9 kl_divergence(): KL Divergence
|
mi@0
|
10 l1_distance(): L1 distance
|
mi@0
|
11 l2_distance(): L2 distance
|
mi@0
|
12 cosine_distance(): Cosine distance
|
mi@0
|
13 pdist(): Pairwise distance computation
|
mi@0
|
14 vq(): Vector quantization
|
mi@0
|
15
|
mi@0
|
16 """
|
mi@0
|
17
|
mi@0
|
18
|
mi@0
|
19 import numpy as np
|
mi@0
|
20 import scipy.sparse
|
mi@0
|
21
|
mi@0
|
22 __all__ = ["abs_cosine_distance", "kl_divergence", "l1_distance", "l2_distance",
|
mi@0
|
23 "weighted_abs_cosine_distance","cosine_distance","vq", "pdist"]
|
mi@0
|
24
|
mi@0
|
25 def kl_divergence(d, vec):
|
mi@0
|
26 b = vec*(1/d)
|
mi@0
|
27 b = np.where(b>0, np.log(b),0)
|
mi@0
|
28 b = vec * b
|
mi@0
|
29 b = np.sum(b - vec + d, axis=0).reshape((-1))
|
mi@0
|
30 return b
|
mi@0
|
31
|
mi@0
|
32 def l1_distance(d, vec):
|
mi@0
|
33 ret_val = np.sum(np.abs(d - vec), axis=0)
|
mi@0
|
34 ret_val = ret_val.reshape((-1))
|
mi@0
|
35 return ret_val
|
mi@0
|
36
|
mi@0
|
37 def sparse_l2_distance(d, vec):
|
mi@0
|
38 # compute the norm of d
|
mi@0
|
39 nd = (d.multiply(d)).sum(axis=0)
|
mi@0
|
40 nv = (vec.multiply(vec)).sum(axis=0)
|
mi@0
|
41 ret_val = nd + nv - 2.0*(d.T * vec).T
|
mi@0
|
42 return np.sqrt(ret_val)
|
mi@0
|
43
|
mi@0
|
44 def approx_l2_distance(d, vec):
|
mi@0
|
45 # Use random projections to approximate the conventional l2 distance
|
mi@0
|
46 k = np.round(np.log(d.shape[0]))
|
mi@0
|
47 #k = d.shape[0]
|
mi@0
|
48 R = np.random.randn(k, d.shape[0])
|
mi@0
|
49 R = R / np.sqrt((R**2).sum(axis=0))
|
mi@0
|
50 A = np.dot(R,d)
|
mi@0
|
51 B = np.dot(R, vec)
|
mi@0
|
52 ret_val = np.sum( (A - B)**2, axis=0)
|
mi@0
|
53 ret_val = np.sqrt(R.shape[1]/R.shape[0]) * np.sqrt(ret_val)
|
mi@0
|
54 ret_val = ret_val.reshape((-1))
|
mi@0
|
55 return ret_val
|
mi@0
|
56
|
mi@0
|
57 def l2_distance(d, vec):
|
mi@0
|
58 if scipy.sparse.issparse(d):
|
mi@0
|
59 ret_val = sparse_l2_distance(d, vec)
|
mi@0
|
60 else:
|
mi@0
|
61 ret_val = np.sqrt(((d[:,:] - vec)**2).sum(axis=0))
|
mi@0
|
62
|
mi@0
|
63 return ret_val.reshape((-1))
|
mi@0
|
64
|
mi@0
|
65 def l2_distance_new(d,vec):
|
mi@0
|
66 # compute the norm of d
|
mi@0
|
67 nd = (d**2).sum(axis=0)
|
mi@0
|
68 nv = (vec**2).sum(axis=0)
|
mi@0
|
69 ret_val = nd + nv - 2.0*np.dot(d.T,vec.reshape((-1,1))).T
|
mi@0
|
70
|
mi@0
|
71 return np.sqrt(ret_val)
|
mi@0
|
72
|
mi@0
|
73 def cosine_distance(d, vec):
|
mi@0
|
74 tmp = np.dot(np.transpose(d), vec)
|
mi@0
|
75 a = np.sqrt(np.sum(d**2, axis=0))
|
mi@0
|
76 b = np.sqrt(np.sum(vec**2))
|
mi@0
|
77 k = (a*b).reshape(-1) + (10**-9)
|
mi@0
|
78
|
mi@0
|
79 # compute distance
|
mi@0
|
80 ret_val = 1.0 - tmp/k
|
mi@0
|
81
|
mi@0
|
82 return ret_val.reshape((-1))
|
mi@0
|
83
|
mi@0
|
84 def abs_cosine_distance(d, vec, weighted=False):
|
mi@0
|
85 if scipy.sparse.issparse(d):
|
mi@0
|
86 tmp = np.array((d.T * vec).todense(), dtype=np.float32).reshape(-1)
|
mi@0
|
87 a = np.sqrt(np.array(d.multiply(d).sum(axis=0), dtype=np.float32).reshape(-1))
|
mi@0
|
88 b = np.sqrt(np.array(vec.multiply(vec).sum(axis=0), dtype=np.float32).reshape(-1))
|
mi@0
|
89 else:
|
mi@0
|
90 tmp = np.dot(np.transpose(d), vec).reshape(-1)
|
mi@0
|
91 a = np.sqrt(np.sum(d**2, axis=0)).reshape(-1)
|
mi@0
|
92 b = np.sqrt(np.sum(vec**2)).reshape(-1)
|
mi@0
|
93
|
mi@0
|
94 k = (a*b).reshape(-1) + 10**-9
|
mi@0
|
95
|
mi@0
|
96 # compute distance
|
mi@0
|
97 ret_val = 1.0 - np.abs(tmp/k)
|
mi@0
|
98
|
mi@0
|
99 if weighted:
|
mi@0
|
100 ret_val = ret_val * a
|
mi@0
|
101 return ret_val.reshape((-1))
|
mi@0
|
102
|
mi@0
|
103 def weighted_abs_cosine_distance(d, vec):
|
mi@0
|
104 ret_val = abs_cosine_distance(d, vec, weighted=True)
|
mi@0
|
105 return ret_val
|
mi@0
|
106
|
mi@0
|
107 def pdist(A, B, metric='l2' ):
|
mi@0
|
108 # compute pairwise distance between a data matrix A (d x n) and B (d x m).
|
mi@0
|
109 # Returns a distance matrix d (n x m).
|
mi@0
|
110 d = np.zeros((A.shape[1], B.shape[1]))
|
mi@0
|
111 if A.shape[1] <= B.shape[1]:
|
mi@0
|
112 for aidx in xrange(A.shape[1]):
|
mi@0
|
113 if metric == 'l2':
|
mi@0
|
114 d[aidx:aidx+1,:] = l2_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
|
mi@0
|
115 if metric == 'l1':
|
mi@0
|
116 d[aidx:aidx+1,:] = l1_distance(B[:,:], A[:,aidx:aidx+1]).reshape((1,-1))
|
mi@0
|
117 else:
|
mi@0
|
118 for bidx in xrange(B.shape[1]):
|
mi@0
|
119 if metric == 'l2':
|
mi@0
|
120 d[:, bidx:bidx+1] = l2_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
|
mi@0
|
121 if metric == 'l1':
|
mi@0
|
122 d[:, bidx:bidx+1] = l1_distance(A[:,:], B[:,bidx:bidx+1]).reshape((-1,1))
|
mi@0
|
123
|
mi@0
|
124 return d
|
mi@0
|
125
|
mi@0
|
126 def vq(A, B, metric='l2'):
|
mi@0
|
127 # assigns data samples in B to cluster centers A and
|
mi@0
|
128 # returns an index list [assume n column vectors, d x n]
|
mi@0
|
129 assigned = np.argmin(pdist(A,B, metric=metric), axis=0)
|
mi@0
|
130 return assigned
|