annotate toolboxes/MIRtoolbox1.3.2/somtoolbox/knn.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
wolffd@0 1 function [C,P]=knn(d, Cp, K)
wolffd@0 2
wolffd@0 3 %KNN K-Nearest Neighbor classifier using an arbitrary distance matrix
wolffd@0 4 %
wolffd@0 5 % [C,P]=knn(d, Cp, [K])
wolffd@0 6 %
wolffd@0 7 % Input and output arguments ([]'s are optional):
wolffd@0 8 % d (matrix) of size NxP: This is a precalculated dissimilarity (distance matrix).
wolffd@0 9 % P is the number of prototype vectors and N is the number of data vectors
wolffd@0 10 % That is, d(i,j) is the distance between data item i and prototype j.
wolffd@0 11 % Cp (vector) of size Px1 that contains integer class labels. Cp(j) is the class of
wolffd@0 12 % jth prototype.
wolffd@0 13 % [K] (scalar) the maximum K in K-NN classifier, default is 1
wolffd@0 14 % C (matrix) of size NxK: integers indicating the class
wolffd@0 15 % decision for data items according to the K-NN rule for each K.
wolffd@0 16 % C(i,K) is the classification for data item i using the K-NN rule
wolffd@0 17 % P (matrix) of size NxkxK: the relative amount of prototypes of
wolffd@0 18 % each class among the K closest prototypes for each classifiee.
wolffd@0 19 % That is, P(i,j,K) is the relative amount of prototypes of class j
wolffd@0 20 % among K nearest prototypes for data item i.
wolffd@0 21 %
wolffd@0 22 % If there is a tie between representatives of two or more classes
wolffd@0 23 % among the K closest neighbors to the classifiee, the class i selected randomly
wolffd@0 24 % among these candidates.
wolffd@0 25 %
wolffd@0 26 % IMPORTANT If K>1 this function uses 'sort' which is considerably slower than
wolffd@0 27 % 'max' which is used for K=1. If K>1 the knn always calculates
wolffd@0 28 % results for all K-NN models from 1-NN up to K-NN.
wolffd@0 29 %
wolffd@0 30 % EXAMPLE 1
wolffd@0 31 %
wolffd@0 32 % sP; % a SOM Toolbox data struct containing labeled prototype vectors
wolffd@0 33 % [Cp,label]=som_label2num(sP); % get integer class labels for prototype vectors
wolffd@0 34 % sD; % a SOM Toolbox data struct containing vectors to be classified
wolffd@0 35 % d=som_eucdist2(sD,sP); % calculate euclidean distance matrix
wolffd@0 36 % class=knn(d,Cp,10); % classify using 1,2,...,10-rules
wolffd@0 37 % class(:,5); % includes results for 5NN
wolffd@0 38 % label(class(:,5)) % original class labels for 5NN
wolffd@0 39 %
wolffd@0 40 % EXAMPLE 2 (leave-one-out-crossvalidate KNN for selection of proper K)
wolffd@0 41 %
wolffd@0 42 % P; % a data matrix of prototype vectors (rows)
wolffd@0 43 % Cp; % column vector of integer class labels for vectors in P
wolffd@0 44 % d=som_eucdist2(P,P); % calculate euclidean distance matrix PxP
wolffd@0 45 % d(eye(size(d))==1)=NaN; % set self-dissimilarity to NaN:
wolffd@0 46 % % this drops the prototype itself away from its neighborhood
wolffd@0 47 % % leave-one-out-crossvalidation (LOOCV)
wolffd@0 48 % class=knn(d,Cp,size(P,1)); % classify using all possible K
wolffd@0 49 % % calculate and plot LOOC-validated errors for all K
wolffd@0 50 % failratep = ...
wolffd@0 51 % 100*sum((class~=repmat(Cp,1,size(P,1))))./size(P,1); plot(1:size(P,1),failratep)
wolffd@0 52
wolffd@0 53 % See also SOM_LABEL2NUM, SOM_EUCDIST2, PDIST.
wolffd@0 54 %
wolffd@0 55 % Contributed to SOM Toolbox 2.0, October 29th, 2000 by Johan Himberg
wolffd@0 56 % Copyright (c) by Johan Himberg
wolffd@0 57 % http://www.cis.hut.fi/projects/somtoolbox/
wolffd@0 58
wolffd@0 59 % Version 2.0beta Johan 291000
wolffd@0 60
wolffd@0 61 %% Init %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 62
wolffd@0 63 % Check K
wolffd@0 64 if nargin<3 | isempty(K),
wolffd@0 65 K=1;
wolffd@0 66 end
wolffd@0 67
wolffd@0 68 if ~vis_valuetype(K,{'1x1'})
wolffd@0 69 error('Value for K must be a scalar');
wolffd@0 70 end
wolffd@0 71
wolffd@0 72 % Check that dist is a matrix
wolffd@0 73 if ~vis_valuetype(d,{'nxm'}),
wolffd@0 74 error('Distance matrix not valid.')
wolffd@0 75 end
wolffd@0 76
wolffd@0 77 [N_data N_proto]=size(d);
wolffd@0 78
wolffd@0 79 % Check class label vector: must be numerical and of integers
wolffd@0 80 if ~vis_valuetype(Cp,{[N_proto 1]});
wolffd@0 81 error(['Class vector is invalid: has to be a N-of-data_rows x 1' ...
wolffd@0 82 ' vector of integers']);
wolffd@0 83 elseif sum(fix(Cp)-Cp)~=0
wolffd@0 84 error('Class labels in vector ''Cp'' must be integers.');
wolffd@0 85 end
wolffd@0 86
wolffd@0 87 if size(d,2) ~= length(Cp),
wolffd@0 88 error('Distance matrix and prototype class vector dimensions do not match.');
wolffd@0 89 end
wolffd@0 90
wolffd@0 91 % Check if the classes are given as labels (no class input arg.)
wolffd@0 92 % if they are take them from prototype struct
wolffd@0 93
wolffd@0 94 % Find all class labels
wolffd@0 95 ClassIndex=unique(Cp);
wolffd@0 96 N_class=length(ClassIndex); % number of different classes
wolffd@0 97
wolffd@0 98
wolffd@0 99 %%%% Classification %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 100
wolffd@0 101 if K==1, % sort distances only if K>1
wolffd@0 102
wolffd@0 103 % 1NN
wolffd@0 104 % Select the closest prototype
wolffd@0 105 [tmp,proto_index]=min(d,[],2);
wolffd@0 106 C=Cp(proto_index);
wolffd@0 107
wolffd@0 108 else
wolffd@0 109
wolffd@0 110 % Sort the prototypes for each classifiee according to distance
wolffd@0 111 [tmp, proto_index]=sort(d');
wolffd@0 112
wolffd@0 113 %% Select up to K closest prototypes
wolffd@0 114 proto_index=proto_index(1:K,:);
wolffd@0 115 knn_class=Cp(proto_index);
wolffd@0 116 for i=1:N_class,
wolffd@0 117 classcounter(:,:,i)=cumsum(knn_class==ClassIndex(i));
wolffd@0 118 end
wolffd@0 119
wolffd@0 120 %% Vote between classes of K neighbors
wolffd@0 121 [winner,vote_index]=max(classcounter,[],3);
wolffd@0 122
wolffd@0 123 %%% Handle ties
wolffd@0 124
wolffd@0 125 % Set index to classes that got as much votes as winner
wolffd@0 126
wolffd@0 127 equal_to_winner=(repmat(winner,[1 1 N_class])==classcounter);
wolffd@0 128
wolffd@0 129 % set index to ties
wolffd@0 130 [tie_indexi,tie_indexj]=find(sum(equal_to_winner,3)>1); % drop the winner from counter
wolffd@0 131
wolffd@0 132 % Go through tie cases and reset vote_index randomly to one
wolffd@0 133 % of them
wolffd@0 134
wolffd@0 135 for i=1:length(tie_indexi),
wolffd@0 136 tie_class_index=find(squeeze(equal_to_winner(tie_indexi(i),tie_indexj(i),:)));
wolffd@0 137 fortuna=randperm(length(tie_class_index));
wolffd@0 138 vote_index(tie_indexi(i),tie_indexj(i))=tie_class_index(fortuna(1));
wolffd@0 139 end
wolffd@0 140
wolffd@0 141 C=ClassIndex(vote_index)';
wolffd@0 142 end
wolffd@0 143
wolffd@0 144 %% Build output %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0 145
wolffd@0 146 % Relative amount of classes in K neighbors for each classifiee
wolffd@0 147
wolffd@0 148 if K==1,
wolffd@0 149 P=zeros(N_data,N_class);
wolffd@0 150 if nargout>1,
wolffd@0 151 for i=1:N_data,
wolffd@0 152 P(i,ClassIndex==C(i))=1;
wolffd@0 153 end
wolffd@0 154 end
wolffd@0 155 else
wolffd@0 156 P=shiftdim(classcounter,1)./repmat(shiftdim(1:K,-1), [N_data N_class 1]);
wolffd@0 157 end
wolffd@0 158