#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Non-negative Matrix Factorization.

NMF: Class for Non-negative Matrix Factorization

[1] Lee, D. D. and Seung, H. S. (1999), Learning the Parts of Objects by Non-negative
Matrix Factorization, Nature 401(6755), 788-799.
"""
mi@0
|
14
|
mi@0
|
15
|
mi@0
|
16 import numpy as np
|
mi@0
|
17 import logging
|
mi@0
|
18 import logging.config
|
mi@0
|
19 import scipy.sparse
|
mi@0
|
20
|
mi@0
|
21 __all__ = ["NMF"]
|
mi@0
|
22
|
mi@0
|
class NMF():
    """
    NMF(data, num_bases=4)

    Non-negative Matrix Factorization. Factorize a data matrix into two matrices
    s.t. F = | data - W*H | is minimal. H, and W are restricted to non-negative
    data. Uses the classical multiplicative update rule.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying NMF to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> nmf_mdl = NMF(data, num_bases=2)
    >>> nmf_mdl.factorize(niter=10)

    The basis vectors are now stored in nmf_mdl.W, the coefficients in nmf_mdl.H.
    To compute coefficients for an existing set of basis vectors simply copy W
    to nmf_mdl.W, and set compute_w to False:

    >>> data = np.array([[1.5], [1.2]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> nmf_mdl = NMF(data, num_bases=2)
    >>> nmf_mdl.W = W
    >>> nmf_mdl.factorize(niter=20, compute_w=False)

    The result is a set of coefficients nmf_mdl.H, s.t. data = W * nmf_mdl.H.
    """

    # some small value: threshold on the per-sample change of the Frobenius
    # error below which the iteration is considered converged (see converged())
    _EPS = 10**-8

    def __init__(self, data, num_bases=4):
        """ Store the data matrix and factorization rank.

        W and H are NOT created here; they are lazily initialized inside
        factorize() unless the caller assigns them beforehand.
        """
        def setup_logging():
            # create (or reuse) the shared "pymf" logger
            self._logger = logging.getLogger("pymf")

            # add ch to logger -- but only once, so constructing several
            # models does not attach duplicate console handlers
            if len(self._logger.handlers) < 1:
                # create console handler and set level to debug
                ch = logging.StreamHandler()
                ch.setLevel(logging.DEBUG)
                # create formatter
                formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

                # add formatter to ch
                ch.setFormatter(formatter)

                self._logger.addHandler(ch)

        setup_logging()

        # set variables
        self.data = data
        self._num_bases = num_bases

        # remember the data dimensions for the lazy W/H initialization
        (self._data_dimension, self._num_samples) = self.data.shape

    def frobenius_norm(self):
        """ Frobenius norm (||data - WH||) of a data matrix and a low rank
        approximation given by WH

        Returns:
            frobenius norm: F = ||data - WH||, or the sentinel value -123456
            when W/H do not exist yet or the data matrix is sparse (kept for
            backward compatibility with existing callers).
        """

        # check if W and H exist
        if hasattr(self, 'H') and hasattr(self, 'W') and not scipy.sparse.issparse(self.data):
            err = np.sqrt(np.sum((self.data[:,:] - np.dot(self.W, self.H))**2))
        else:
            err = -123456

        return err

    def init_w(self):
        """ Initialize W with uniform random values in [0, 1). """
        self.W = np.random.random((self._data_dimension, self._num_bases))

    def init_h(self):
        """ Initialize H with uniform random values in [0, 1). """
        self.H = np.random.random((self._num_bases, self._num_samples))

    def update_h(self):
        """ Multiplicative update of H: H <- H * (W^T data) / (W^T W H). """
        # pre init H2 (necessary for storing matrices on disk); the small
        # additive constant guards against division by zero
        H2 = np.dot(np.dot(self.W.T, self.W), self.H) + 10**-9
        self.H *= np.dot(self.W.T, self.data[:,:])
        self.H /= H2

    def update_w(self):
        """ Multiplicative update of W: W <- W * (data H^T) / (W H H^T). """
        # pre init W2 (necessary for storing matrices on disk); the small
        # additive constant guards against division by zero
        W2 = np.dot(np.dot(self.W, self.H), self.H.T) + 10**-9
        self.W *= np.dot(self.data[:,:], self.H.T)
        self.W /= W2

    def converged(self, i):
        """ Return True when the per-sample change of the Frobenius error
        between iterations i-1 and i dropped below _EPS.
        """
        derr = np.abs(self.ferr[i] - self.ferr[i-1])/self._num_samples
        if derr < self._EPS:
            return True
        else:
            return False

    def factorize(self, niter=1, show_progress=False,
                  compute_w=True, compute_h=True, compute_err=True):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        niter : int
            number of iterations.
        show_progress : bool
            print some extra information to stdout.
        compute_h : bool
            iteratively update values for H.
        compute_w : bool
            iteratively update values for W.
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH| for each iteration.
        """

        if show_progress:
            self._logger.setLevel(logging.INFO)
        else:
            self._logger.setLevel(logging.ERROR)

        # create W and H if they don't already exist
        # -> any custom initialization to W,H should be done before
        if not hasattr(self, 'W'):
            self.init_w()

        if not hasattr(self, 'H'):
            self.init_h()

        if compute_err:
            self.ferr = np.zeros(niter)

        # NOTE: was xrange, which raises NameError on Python 3; range is
        # equivalent here and works on both interpreter lines
        for i in range(niter):
            if compute_w:
                self.update_w()

            if compute_h:
                self.update_h()

            if compute_err:
                self.ferr[i] = self.frobenius_norm()
                self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) +
                                  ' FN:' + str(self.ferr[i]))
            else:
                self._logger.info('Iteration ' + str(i+1) + '/' + str(niter))

            # check if the err is not changing anymore
            if i > 1 and compute_err:
                if self.converged(i):
                    # adjust the error measure (drop the unused tail of ferr)
                    self.ferr = self.ferr[:i]
                    break
|
mi@0
|
203
|
mi@0
|
204 if __name__ == "__main__":
|
mi@0
|
205 import doctest
|
mi@0
|
206 doctest.testmod()
|