comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_norm_variable.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
function [x,sNorm] = som_norm_variable(x, method, operation)

%SOM_NORM_VARIABLE Normalize or denormalize a scalar variable.
%
% [x,sNorm] = som_norm_variable(x, method, operation)
%
%   xnew       = som_norm_variable(x,'var','do');
%   [dummy,sN] = som_norm_variable(x,'log','init');
%   [xnew,sN]  = som_norm_variable(x,sN,'do');
%   xorig      = som_norm_variable(xnew,sN,'undo');
%
%  Input and output arguments:
%   x         (vector) a set of values of a scalar variable for
%                      which the (de)normalization is performed.
%                      The processed values are returned.
%   method    (string) identifier for a normalization method: 'var',
%                      'range', 'log', 'logistic', 'histD', or 'histC'.
%                      A normalization struct with default values is created.
%             (struct) normalization struct, or an array of such
%             (cellstr) first string gives normalization operation, and the
%                      second gives denormalization operation, with x
%                      representing the variable, for example:
%                      {'x+2','x-2'}, or {'exp(-x)','-log(x)'} or {'round(x)'}.
%                      Note that in the last case, no denorm operation is
%                      defined.
%   operation (string) the operation to be performed: 'init', 'do' or 'undo'
%
%   sNorm     (struct) updated normalization struct/struct array
%
% For more help, try 'type som_norm_variable' or check out online documentation.
% See also SOM_NORMALIZE, SOM_DENORMALIZE.

%%%%%%%%%%%%% DETAILED DESCRIPTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% som_norm_variable
%
% PURPOSE
%
% Initialize, apply and undo normalizations on a given vector of
% scalar values.
%
% SYNTAX
%
%  xnew = som_norm_variable(x,method,operation)
%  xnew = som_norm_variable(x,sNorm,operation)
%  [xnew,sNorm] = som_norm_variable(...)
%
% DESCRIPTION
%
% This function is used to initialize, apply and undo normalizations
% on scalar variables. It is the low-level function that upper-level
% functions SOM_NORMALIZE and SOM_DENORMALIZE utilize to actually (un)do
% the normalizations.
%
% Normalizations are typically performed to control the variance of
% vector components. If some vector components have variance which is
% significantly higher than the variance of other components, those
% components will dominate the map organization. Normalization of
% the variance of vector components (method 'var') is used to prevent
% that. In addition to variance normalization, other methods have
% been implemented as well (see list below).
%
% Usually normalizations convert the variable values so that they no
% longer make any sense: the values are still ordered, but their range
% may have changed so radically that interpreting the numbers in the
% original context is very hard. For this reason all implemented methods
% are (more or less) revertible. The normalizations are monotonic
% and information is saved so that they can be undone. Also, the saved
% information makes it possible to apply the EXACTLY SAME normalization
% to another set of values. The normalization information is determined
% with 'init' operation, while 'do' and 'undo' operations are used to
% apply or revert the normalization.
%
% The normalization information is saved in a normalization struct,
% which is returned as the second argument of this function. Note that
% normalization operations may be stacked. In this case, normalization
% structs are positioned in a struct array. When applied, the array is
% gone through from start to end, and when undone, in reverse order.
%
%    method  description
%
%    'var'   Variance normalization. A linear transformation which
%            scales the values such that their variance=1. This is
%            convenient way to use Mahalanobis distance measure without
%            actually changing the distance calculation procedure.
%
%    'range' Normalization of range of values. A linear transformation
%            which scales the values between [0,1].
%
%    'log'   Logarithmic normalization. In many cases the values of
%            a vector component are exponentially distributed. This
%            normalization is a good way to get more resolution to
%            (the low end of) that vector component. What this
%            actually does is a non-linear transformation:
%               x_new = log(x_old - m + 1)
%            where m=min(x_old) and log is the natural logarithm.
%            Applying the transformation to a value which is lower
%            than m-1 will give problems, as the result is then complex.
%            If the minimum for values is known a priori,
%            it might be a good idea to initialize the normalization with
%               [dummy,sN] = som_norm_variable(minimum,'log','init');
%            and normalize only after this:
%               x_new = som_norm_variable(x,sN,'do');
%
%    'logistic' or softmax normalization. This normalization ensures
%            that all values in the future, too, are within the range
%            [0,1]. The transformation is more-or-less linear in the
%            middle range (around mean value), and has a smooth
%            nonlinearity at both ends which ensures that all values
%            are within the range. The data is first scaled as in
%            variance normalization:
%               x_scaled = (x_old - mean(x_old))/std(x_old)
%            and then transformed with the logistic function
%               x_new = 1/(1+exp(-x_scaled))
%
%    'histD' Discrete histogram equalization. Non-linear. Orders the
%            values and replaces each value by its ordinal number.
%            Finally, scales the values such that they are between [0,1].
%            Useful for both discrete and continuous variables, but as
%            the saved normalization information consists of all
%            unique values of the initialization data set, it may use
%            considerable amounts of memory. If the variable can get
%            more than a few values (say, 20), it might be better to
%            use 'histC' method below. Another important note is that
%            this method is not exactly revertible if it is applied
%            to values which are not part of the original value set.
%
%    'histC' Continuous histogram equalization. Actually, a partially
%            linear transformation which tries to do something like
%            histogram equalization. The value range is divided to
%            a number of bins such that the number of values in each
%            bin is (almost) the same. The values are transformed
%            linearly in each bin. For example, values in bin number 3
%            are scaled between [3,4[. Finally, all values are scaled
%            between [0,1]. The number of bins is the square root
%            of the number of unique values in the initialization set,
%            rounded up. The resulting histogram equalization is not
%            as good as the one that 'histD' makes, but the benefit
%            is that it is exactly revertible - even outside the
%            original value range (although the results may be funny).
%
%    'eval'  With this method, freeform normalization operations can be
%            specified. The parameter field contains strings to be
%            evaluated with 'eval' function, with variable name 'x'
%            representing the variable itself. The first string is
%            the normalization operation, and the second is a
%            denormalization operation. If the denormalization operation
%            is empty, it is ignored.
%
% INPUT ARGUMENTS
%
%   x         (vector) The scalar values to which the normalization
%                      operation is applied.
%
%   method             The normalization specification.
%             (string) Identifier for a normalization method: 'var',
%                      'range', 'log', 'logistic', 'histD' or 'histC'.
%                      Corresponding default normalization struct is created.
%             (struct) normalization struct
%             (struct array) of normalization structs, applied to
%                      x one after the other
%             (cellstr) of length 1 or 2:
%                      first string gives normalization operation, and
%                      the second gives denormalization operation, with x
%                      representing the variable, for example:
%                      {'x+2','x-2'}, or {'exp(-x)','-log(x)'} or {'round(x)'}.
%                      Note that in the last case, no denorm operation is
%                      defined.
%
%            note: if the method is given as struct(s), it is
%                  applied (done or undone, as specified by operation)
%                  regardless of what the value of '.status' field
%                  is in the struct(s). Only if the status is
%                  'uninit', the undoing operation is halted.
%                  Anyhow, the '.status' fields in the returned
%                  normalization struct(s) is set to appropriate value.
%
%   operation (string) The operation to perform: 'init' to initialize
%                      the normalization struct, 'do' to perform the
%                      normalization, 'undo' to undo the normalization,
%                      if possible. If operation 'do' is given, but the
%                      normalization struct has not yet been initialized,
%                      it is initialized using the given data (x).
%
% OUTPUT ARGUMENTS
%
%   x        (vector) Appropriately processed values.
%
%   sNorm    (struct) Updated normalization struct/struct array. If any,
%                     the '.status' and '.params' fields are updated.
%
% EXAMPLES
%
%  To initialize and apply a normalization on a set of scalar values:
%
%    [x_new,sN] = som_norm_variable(x_old,'var','do');
%
%  To just initialize, use:
%
%    [dummy,sN] = som_norm_variable(x_old,'var','init');
%
%  To undo the normalization(s):
%
%    x_orig = som_norm_variable(x_new,sN,'undo');
%
%  Typically, normalizations of data structs/sets are handled using
%  functions SOM_NORMALIZE and SOM_DENORMALIZE. However, when only the
%  values of a single variable are of interest, SOM_NORM_VARIABLE may
%  be useful. For example, assume one wants to apply the normalization
%  done on a component (i) of a data struct (sD) to a new set of values
%  (x) of that component. With SOM_NORM_VARIABLE this can be done with:
%
%    x_new = som_norm_variable(x,sD.comp_norm{i},'do');
%
%  Now, as the normalizations in sD.comp_norm{i} have already been
%  initialized with the original data set (presumably sD.data),
%  the EXACTLY SAME normalization(s) can be applied to the new values.
%  The same thing can be done with SOM_NORMALIZE function, too:
%
%    x_new = som_normalize(x,sD.comp_norm{i});
%
%  Or, if the new data set were in variable D - a matrix of same
%  dimension as the original data set:
%
%    D_new = som_normalize(D,sD,i);
%
% SEE ALSO
%
%  som_normalize    Add/apply/redo normalizations for a data struct/set.
%  som_denormalize  Undo normalizations of a data struct/set.

% Copyright (c) 1998-2000 by the SOM toolbox programming team.
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 151199 170400 150500

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% check arguments

error(nargchk(3, 3, nargin));  % check no. of input arguments is correct

% method: resolve the specification into a normalization struct (array)
sNorm = [];
if ischar(method)
  if any(strcmp(method,{'var','range','log','logistic','histD','histC'})),
    % recognized identifier: build a default normalization struct
    sNorm = som_set('som_norm','method',method);
  else
    % any other string is treated as a freeform ('eval') operation
    method = cellstr(method);
  end
end
if iscell(method),
  if length(method)==1 & isstruct(method{1}),
    sNorm = method{1};
  else
    % a missing or empty denormalization operation defaults to identity
    if length(method)==1 | isempty(method{2}), method{2} = 'x'; end
    sNorm = som_set('som_norm','method','eval','params',method);
  end
elseif ~ischar(method),
  % struct / struct array given directly. A recognized method string must
  % NOT reach this branch: it would replace the struct created above with
  % the string itself.
  sNorm = method;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% stacked normalizations are applied first-to-last and undone last-to-first
order = [1:length(sNorm)];
if length(order)>1 & strcmp(operation,'undo'), order = order(end:-1:1); end

for i=order,

  % initialize
  if strcmp(operation,'init') | ...
     (strcmp(operation,'do') & strcmp(sNorm(i).status,'uninit')),

    % case method = 'hist': choose the continuous variant when there are
    % more than 20 unique finite values, the discrete one otherwise
    if strcmp(sNorm(i).method,'hist'),
      inds = find(~isnan(x) & ~isinf(x));
      if length(unique(x(inds)))>20, sNorm(i).method = 'histC';
      else sNorm(i).method = 'histD'; end
    end

    switch(sNorm(i).method),
    case 'var',      params = norm_variance_init(x);
    case 'range',    params = norm_scale01_init(x);
    case 'log',      params = norm_log_init(x);
    case 'logistic', params = norm_logistic_init(x);
    case 'histD',    params = norm_histeqD_init(x);
    case 'histC',    params = norm_histeqC_init(x);
    case 'eval',     params = sNorm(i).params;
    otherwise,
      error(['Unrecognized method: ' sNorm(i).method]);
    end
    sNorm(i).params = params;
    sNorm(i).status = 'undone';
  end

  % do / undo
  if strcmp(operation,'do'),
    switch(sNorm(i).method),
    case 'var',      x = norm_scale_do(x,sNorm(i).params);
    case 'range',    x = norm_scale_do(x,sNorm(i).params);
    case 'log',      x = norm_log_do(x,sNorm(i).params);
    case 'logistic', x = norm_logistic_do(x,sNorm(i).params);
    case 'histD',    x = norm_histeqD_do(x,sNorm(i).params);
    case 'histC',    x = norm_histeqC_do(x,sNorm(i).params);
    case 'eval',     x = norm_eval_do(x,sNorm(i).params);
    otherwise,
      error(['Unrecognized method: ' sNorm(i).method]);
    end
    sNorm(i).status = 'done';

  elseif strcmp(operation,'undo'),

    % an uninitialized struct has no parameters to undo with: stop here
    if strcmp(sNorm(i).status,'uninit'),
      warning('Could not undo: uninitialized normalization struct.')
      break;
    end
    switch(sNorm(i).method),
    case 'var',      x = norm_scale_undo(x,sNorm(i).params);
    case 'range',    x = norm_scale_undo(x,sNorm(i).params);
    case 'log',      x = norm_log_undo(x,sNorm(i).params);
    case 'logistic', x = norm_logistic_undo(x,sNorm(i).params);
    case 'histD',    x = norm_histeqD_undo(x,sNorm(i).params);
    case 'histC',    x = norm_histeqC_undo(x,sNorm(i).params);
    case 'eval',     x = norm_eval_undo(x,sNorm(i).params);
    otherwise,
      error(['Unrecognized method: ' sNorm(i).method]);
    end
    sNorm(i).status = 'undone';

  elseif ~strcmp(operation,'init'),

    error(['Unrecognized operation: ' operation])

  end
end

return;
337
338 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
339 %% subfunctions
340
341 % linear scaling
342
function p = norm_variance_init(x)
% NORM_VARIANCE_INIT  Parameters for variance normalization.
%   p = [m, s] where m is the mean and s the standard deviation of the
%   finite (non-NaN, non-Inf) entries of x.  A zero standard deviation is
%   replaced by 1 so the scaling step never divides by zero.
  usable = x(isfinite(x));            % isfinite is false for NaN and +-Inf
  p = [mean(usable), std(usable)];
  if p(2) == 0
    p(2) = 1;
  end
%end of norm_variance_init
348
function p = norm_scale01_init(x)
% NORM_SCALE01_INIT  Parameters for range normalization.
%   p = [lo, span] where lo is the smallest finite entry of x and span is
%   the distance to the largest finite entry.  When all finite entries are
%   equal, span is set to 1.
  usable = x(isfinite(x));            % drops NaN and +-Inf
  lo = min(usable);
  hi = max(usable);
  if lo == hi
    p = [lo, 1];
  else
    p = [lo, hi-lo];
  end
%end of norm_scale01_init
355
function x = norm_scale_do(x,p)
% NORM_SCALE_DO  Apply a linear scaling: subtract p(1), then divide by p(2).
  offset = p(1);
  scale  = p(2);
  x = (x - offset) / scale;
% end of norm_scale_do
359
function x = norm_scale_undo(x,p)
% NORM_SCALE_UNDO  Revert a linear scaling: multiply by p(2), then add p(1).
  offset = p(1);
  scale  = p(2);
  x = x * scale + offset;
% end of norm_scale_undo
363
364 % logarithm
365
function p = norm_log_init(x)
% NORM_LOG_INIT  Parameter for logarithmic normalization.
%   p is the smallest finite (non-NaN, non-Inf) entry of x.
  usable = x(isfinite(x));
  p = min(usable);
% end of norm_log_init
370
function x = norm_log_do(x,p)
% NORM_LOG_DO  Logarithmic normalization: natural log of (x - p + 1).
%   Entries below p-1 give a negative argument to log and therefore a
%   complex result; this is not checked here.
  shifted = x - p;
  x = log(shifted + 1);
% end of norm_log_do
375
function x = norm_log_undo(x,p)
% NORM_LOG_UNDO  Revert logarithmic normalization: exp(x) - 1 + p.
  expanded = exp(x);
  x = expanded - 1 + p;
% end of norm_log_undo
379
380 % logistic
381
function p = norm_logistic_init(x)
% NORM_LOGISTIC_INIT  Parameters for logistic (softmax) normalization.
%   p = [m, s]: mean and standard deviation of the finite entries of x,
%   with a zero standard deviation replaced by 1.
  usable = x(isfinite(x));            % drops NaN and +-Inf
  p = [mean(usable), std(usable)];
  if p(2) == 0
    p(2) = 1;
  end
% end of norm_logistic_init
387
function x = norm_logistic_do(x,p)
% NORM_LOGISTIC_DO  Logistic normalization.
%   The values are first scaled with mean p(1) and deviation p(2), then
%   passed through the logistic function 1/(1+exp(-z)).
  z = (x - p(1)) / p(2);
  x = 1 ./ (1 + exp(-z));
% end of norm_logistic_do
392
function x = norm_logistic_undo(x,p)
% NORM_LOGISTIC_UNDO  Revert logistic normalization.
%   Applies the logit log(x/(1-x)), then rescales with deviation p(2)
%   and mean p(1).
  z = log(x ./ (1 - x));
  x = z * p(2) + p(1);
% end of norm_logistic_undo
397
398 % histogram equalization for discrete values
399
function p = norm_histeqD_init(x)
% NORM_HISTEQD_INIT  Parameters for discrete histogram equalization.
%   p holds the unique finite (non-NaN, non-Inf) values of x, as returned
%   by UNIQUE (sorted ascending).
  usable = x(isfinite(x));
  p = unique(usable);
% end of norm_histeqD_init
404
function x = norm_histeqD_do(x,p)
% NORM_HISTEQD_DO  Apply discrete histogram equalization.
%   x  values to normalize; NaN and +-Inf entries are not binned
%   p  bin values (unique values of the initialization data)
%
%   Each finite entry is replaced by the index of its nearest bin value;
%   an entry that lies above its nearest bin value is moved to the next
%   bin, unless it is already in the last one.  The indices are then
%   scaled so that the first bin maps to 0 and the last bin to 1.
  bins = length(p);
  inds = find(~isnan(x) & ~isinf(x));
  % Force a row of indices so the loop visits one element at a time
  % whether x is a row or a column vector.
  for i = inds(:)',
    [dummy ind] = min(abs(x(i) - p));
    % data item closer to the left-hand bin wall is indexed after RH wall
    if x(i) > p(ind) & ind < bins,
      x(i) = ind + 1;
    else
      x(i) = ind;
    end
  end
  if bins > 1,
    x = (x-1)/(bins-1); % normalization between [0,1]
  else
    % Single bin: (x-1)/(bins-1) would be 0/0 = NaN.  Map the finite
    % entries to 0 instead, which NORM_HISTEQD_UNDO maps back to p(1).
    x(inds) = 0;
  end
% end of norm_histeqD_do
419
function x = norm_histeqD_undo(x,p)
% NORM_HISTEQD_UNDO  Revert discrete histogram equalization.
%   The normalized values are scaled back to bin indices, rounded to the
%   nearest integer, and every finite index is replaced by the
%   corresponding entry of p.  NaN and +-Inf entries are left as they are.
  nBins = length(p);
  x = round(x*(nBins-1) + 1);
  valid = isfinite(x);
  x(valid) = p(x(valid));
% end of norm_histeqD_undo
426
427 % histogram equalization with partially linear functions
428
function p = norm_histeqC_init(x)
% NORM_HISTEQC_INIT  Parameters for continuous histogram equalization.
%   p is a vector of bin limits, starting at the smallest and ending at
%   the largest finite value of x.  The intended number of limits is
%   ceil(sqrt(number of unique finite values)), with a minimum of two.
%   Interior limits are placed halfway between neighbouring unique values,
%   at the points where the running sample count of the current bin
%   reaches the target bin size.
%
%   If the data runs out before every interior limit has been placed
%   (e.g. when the largest value holds most of the samples), the limits
%   that were never placed are dropped, so p is always strictly increasing.

  % investigate x
  inds = find(~isnan(x) & ~isinf(x));
  samples = length(inds);
  xs = unique(x(inds));
  mi = xs(1);
  ma = xs(end);
  % decide number of limits
  lims = ceil(sqrt(length(xs))); % 2->2,100->10,1000->32,10000->100
  % decide limits
  if lims==1,
    % a single unique value: use a unit-width bin starting at it
    p = [mi, mi+1];
  elseif lims==2,
    p = [mi, ma];
  else
    p = zeros(lims,1);
    p(1) = mi;
    p(end) = ma;
    binsize = zeros(lims-1,1); b = 1; avebinsize = samples/(lims-1);
    for i=1:(length(xs)-1),
      binsize(b) = binsize(b) + sum(x==xs(i));
      if binsize(b) >= avebinsize,
        % current bin is full: close it halfway to the next unique value
        b = b + 1;
        p(b) = (xs(i)+xs(i+1))/2;
      end
      if b==(lims-1),
        % last bin takes all the remaining samples
        binsize(b) = samples-sum(binsize); break;
      else
        avebinsize = (samples-sum(binsize))/(lims-1-b);
      end
    end
    % The loop can finish without reaching the last bin; p(b+1:end-1)
    % would then still hold the zeros from preallocation and break the
    % ordering of the limits.  Keep only the limits actually assigned.
    if b < lims-1,
      p = [p(1:b); ma];
    end
  end
% end of norm_histeqC_init
463
function x = norm_histeqC_do(x,p)
% NORM_HISTEQC_DO  Apply continuous histogram equalization.
%   p holds the bin limits.  A value inside bin k (p(k) < x <= p(k+1)) is
%   mapped linearly onto (k-1, k].  Finite values at or below the first
%   limit, or above the last one, are extrapolated with the width of the
%   first or last bin.  Finally everything is divided by the number of
%   bins, so the first limit maps to 0 and the last to 1.  Entries that
%   match none of the cases (e.g. NaN) are only divided by the bin count.
  nLims  = length(p);
  mapped = x;

  % finite values at or below the first limit
  width = p(2) - p(1);
  sel = (x <= p(1)) & isfinite(x);
  if any(sel(:))
    mapped(sel) = 0 - (p(1) - x(sel))/width;
  end

  % finite values above the last limit
  width = p(end) - p(end-1);
  sel = (x > p(end)) & isfinite(x);
  if any(sel(:))
    mapped(sel) = nLims-1 + (x(sel) - p(end))/width;
  end

  % values inside the bins
  for k = 1:(nLims-1)
    lower = p(k);
    width = p(k+1) - lower;
    sel = (x > lower) & (x <= p(k+1));
    if any(sel(:))
      mapped(sel) = k-1 + (x(sel) - lower)/width;
    end
  end

  % scale so that minimum and maximum correspond to 0 and 1
  x = mapped/(nLims-1);
% end of norm_histeqC_do
484
function x = norm_histeqC_undo(x,p)
% NORM_HISTEQC_UNDO  Revert continuous histogram equalization.
%   The normalized values are multiplied by the number of bins, after
%   which a value in (k-1, k] is mapped linearly back onto bin k of the
%   limits p.  Finite values at or below 0, or above the bin count, are
%   extrapolated with the width of the first or last bin.  Entries that
%   match none of the cases (e.g. NaN) are returned as they were given.
  restored = x;                 % taken before scaling, as a pass-through
  nLims  = length(p);

  % scale so that 0 and 1 correspond to minimum and maximum
  scaled = x*(nLims-1);

  % finite values at or below the first limit
  width = p(2) - p(1);
  sel = (scaled <= 0) & isfinite(scaled);
  if any(sel(:))
    restored(sel) = scaled(sel)*width + p(1);
  end

  % finite values above the last limit
  width = p(end) - p(end-1);
  sel = (scaled > nLims-1) & isfinite(scaled);
  if any(sel(:))
    restored(sel) = (scaled(sel) - (nLims-1))*width + p(end);
  end

  % values inside the bins
  for k = 1:(nLims-1)
    lower = p(k);
    width = p(k+1) - lower;
    sel = (scaled > k-1) & (scaled <= k);
    if any(sel(:))
      restored(sel) = (scaled(sel) - (k-1))*width + lower;
    end
  end
  x = restored;
% end of norm_histeqC_undo
507
508 % eval
509
function p = norm_eval_init(method)
% NORM_EVAL_INIT  Parameters of an 'eval' normalization.
%   The operation strings passed in as 'method' are themselves the
%   parameters, so they are handed back unchanged.
  p = method;
%end of norm_eval_init
513
function x = norm_eval_do(x,p)
% NORM_EVAL_DO  Apply a freeform normalization.
%   Evaluates the string p{1}, in which the variable name 'x' refers to
%   the input values.  The result replaces x only when both the result
%   and the input have exactly one column and at least one row; otherwise
%   x is returned unchanged.
  candidate = eval(p{1});       % the string is evaluated with x in scope
  resultOk = size(candidate,1)>=1 & size(candidate,2)==1;
  inputOk  = size(x,1)>=1 & size(x,2)==1;
  if resultOk & inputOk
    x = candidate;
  end
%end of norm_eval_do
521
function x = norm_eval_undo(x,p)
% NORM_EVAL_UNDO  Revert a freeform normalization.
%   Evaluates the string p{2}, in which the variable name 'x' refers to
%   the input values.  The result replaces x only when both the result
%   and the input have exactly one column and at least one row; otherwise
%   x is returned unchanged.
  candidate = eval(p{2});       % the string is evaluated with x in scope
  resultOk = size(candidate,1)>=1 & size(candidate,2)==1;
  inputOk  = size(x,1)>=1 & size(x,2)==1;
  if resultOk & inputOk
    x = candidate;
  end
%end of norm_eval_undo
529
530 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
531
532
533