wolffd@0
|
1 function [C,P]=knn(d, Cp, K)
|
wolffd@0
|
2
|
wolffd@0
|
3 %KNN K-Nearest Neighbor classifier using an arbitrary distance matrix
|
wolffd@0
|
4 %
|
wolffd@0
|
5 % [C,P]=knn(d, Cp, [K])
|
wolffd@0
|
6 %
|
wolffd@0
|
7 % Input and output arguments ([]'s are optional):
|
wolffd@0
|
8 % d (matrix) of size NxP: This is a precalculated dissimilarity (distance matrix).
|
wolffd@0
|
9 % P is the number of prototype vectors and N is the number of data vectors
|
wolffd@0
|
10 % That is, d(i,j) is the distance between data item i and prototype j.
|
wolffd@0
|
11 % Cp (vector) of size Px1 that contains integer class labels. Cp(j) is the class of
|
wolffd@0
|
12 % jth prototype.
|
wolffd@0
|
13 % [K] (scalar) the maximum K in K-NN classifier, default is 1
|
wolffd@0
|
14 % C (matrix) of size NxK: integers indicating the class
|
wolffd@0
|
15 % decision for data items according to the K-NN rule for each K.
|
wolffd@0
|
16 % C(i,K) is the classification for data item i using the K-NN rule
|
wolffd@0
|
17 % P (matrix) of size NxkxK: the relative amount of prototypes of
|
wolffd@0
|
18 % each class among the K closest prototypes for each classifiee.
|
wolffd@0
|
19 % That is, P(i,j,K) is the relative amount of prototypes of class j
|
wolffd@0
|
20 % among K nearest prototypes for data item i.
|
wolffd@0
|
21 %
|
wolffd@0
|
22 % If there is a tie between representatives of two or more classes
|
wolffd@0
|
23 % among the K closest neighbors to the classifiee, the class i selected randomly
|
wolffd@0
|
24 % among these candidates.
|
wolffd@0
|
25 %
|
wolffd@0
|
26 % IMPORTANT If K>1 this function uses 'sort' which is considerably slower than
|
wolffd@0
|
27 % 'max' which is used for K=1. If K>1 the knn always calculates
|
wolffd@0
|
28 % results for all K-NN models from 1-NN up to K-NN.
|
wolffd@0
|
29 %
|
wolffd@0
|
30 % EXAMPLE 1
|
wolffd@0
|
31 %
|
wolffd@0
|
32 % sP; % a SOM Toolbox data struct containing labeled prototype vectors
|
wolffd@0
|
33 % [Cp,label]=som_label2num(sP); % get integer class labels for prototype vectors
|
wolffd@0
|
34 % sD; % a SOM Toolbox data struct containing vectors to be classified
|
wolffd@0
|
35 % d=som_eucdist2(sD,sP); % calculate euclidean distance matrix
|
wolffd@0
|
36 % class=knn(d,Cp,10); % classify using 1,2,...,10-rules
|
wolffd@0
|
37 % class(:,5); % includes results for 5NN
|
wolffd@0
|
38 % label(class(:,5)) % original class labels for 5NN
|
wolffd@0
|
39 %
|
wolffd@0
|
40 % EXAMPLE 2 (leave-one-out-crossvalidate KNN for selection of proper K)
|
wolffd@0
|
41 %
|
wolffd@0
|
42 % P; % a data matrix of prototype vectors (rows)
|
wolffd@0
|
43 % Cp; % column vector of integer class labels for vectors in P
|
wolffd@0
|
44 % d=som_eucdist2(P,P); % calculate euclidean distance matrix PxP
|
wolffd@0
|
45 % d(eye(size(d))==1)=NaN; % set self-dissimilarity to NaN:
|
wolffd@0
|
46 % % this drops the prototype itself away from its neighborhood
|
wolffd@0
|
47 % % leave-one-out-crossvalidation (LOOCV)
|
wolffd@0
|
48 % class=knn(d,Cp,size(P,1)); % classify using all possible K
|
wolffd@0
|
49 % % calculate and plot LOOC-validated errors for all K
|
wolffd@0
|
50 % failratep = ...
|
wolffd@0
|
51 % 100*sum((class~=repmat(Cp,1,size(P,1))))./size(P,1); plot(1:size(P,1),failratep)
|
wolffd@0
|
52
|
wolffd@0
|
53 % See also SOM_LABEL2NUM, SOM_EUCDIST2, PDIST.
|
wolffd@0
|
54 %
|
wolffd@0
|
55 % Contributed to SOM Toolbox 2.0, October 29th, 2000 by Johan Himberg
|
wolffd@0
|
56 % Copyright (c) by Johan Himberg
|
wolffd@0
|
57 % http://www.cis.hut.fi/projects/somtoolbox/
|
wolffd@0
|
58
|
wolffd@0
|
59 % Version 2.0beta Johan 291000
|
wolffd@0
|
60
|
wolffd@0
|
61 %% Init %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
62
|
wolffd@0
|
63 % Check K
|
wolffd@0
|
64 if nargin<3 | isempty(K),
|
wolffd@0
|
65 K=1;
|
wolffd@0
|
66 end
|
wolffd@0
|
67
|
wolffd@0
|
68 if ~vis_valuetype(K,{'1x1'})
|
wolffd@0
|
69 error('Value for K must be a scalar');
|
wolffd@0
|
70 end
|
wolffd@0
|
71
|
wolffd@0
|
72 % Check that dist is a matrix
|
wolffd@0
|
73 if ~vis_valuetype(d,{'nxm'}),
|
wolffd@0
|
74 error('Distance matrix not valid.')
|
wolffd@0
|
75 end
|
wolffd@0
|
76
|
wolffd@0
|
77 [N_data N_proto]=size(d);
|
wolffd@0
|
78
|
wolffd@0
|
79 % Check class label vector: must be numerical and of integers
|
wolffd@0
|
80 if ~vis_valuetype(Cp,{[N_proto 1]});
|
wolffd@0
|
81 error(['Class vector is invalid: has to be a N-of-data_rows x 1' ...
|
wolffd@0
|
82 ' vector of integers']);
|
wolffd@0
|
83 elseif sum(fix(Cp)-Cp)~=0
|
wolffd@0
|
84 error('Class labels in vector ''Cp'' must be integers.');
|
wolffd@0
|
85 end
|
wolffd@0
|
86
|
wolffd@0
|
87 if size(d,2) ~= length(Cp),
|
wolffd@0
|
88 error('Distance matrix and prototype class vector dimensions do not match.');
|
wolffd@0
|
89 end
|
wolffd@0
|
90
|
wolffd@0
|
91 % Check if the classes are given as labels (no class input arg.)
|
wolffd@0
|
92 % if they are take them from prototype struct
|
wolffd@0
|
93
|
wolffd@0
|
94 % Find all class labels
|
wolffd@0
|
95 ClassIndex=unique(Cp);
|
wolffd@0
|
96 N_class=length(ClassIndex); % number of different classes
|
wolffd@0
|
97
|
wolffd@0
|
98
|
wolffd@0
|
99 %%%% Classification %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
100
|
wolffd@0
|
101 if K==1, % sort distances only if K>1
|
wolffd@0
|
102
|
wolffd@0
|
103 % 1NN
|
wolffd@0
|
104 % Select the closest prototype
|
wolffd@0
|
105 [tmp,proto_index]=min(d,[],2);
|
wolffd@0
|
106 C=Cp(proto_index);
|
wolffd@0
|
107
|
wolffd@0
|
108 else
|
wolffd@0
|
109
|
wolffd@0
|
110 % Sort the prototypes for each classifiee according to distance
|
wolffd@0
|
111 [tmp, proto_index]=sort(d');
|
wolffd@0
|
112
|
wolffd@0
|
113 %% Select up to K closest prototypes
|
wolffd@0
|
114 proto_index=proto_index(1:K,:);
|
wolffd@0
|
115 knn_class=Cp(proto_index);
|
wolffd@0
|
116 for i=1:N_class,
|
wolffd@0
|
117 classcounter(:,:,i)=cumsum(knn_class==ClassIndex(i));
|
wolffd@0
|
118 end
|
wolffd@0
|
119
|
wolffd@0
|
120 %% Vote between classes of K neighbors
|
wolffd@0
|
121 [winner,vote_index]=max(classcounter,[],3);
|
wolffd@0
|
122
|
wolffd@0
|
123 %%% Handle ties
|
wolffd@0
|
124
|
wolffd@0
|
125 % Set index to classes that got as much votes as winner
|
wolffd@0
|
126
|
wolffd@0
|
127 equal_to_winner=(repmat(winner,[1 1 N_class])==classcounter);
|
wolffd@0
|
128
|
wolffd@0
|
129 % set index to ties
|
wolffd@0
|
130 [tie_indexi,tie_indexj]=find(sum(equal_to_winner,3)>1); % drop the winner from counter
|
wolffd@0
|
131
|
wolffd@0
|
132 % Go through tie cases and reset vote_index randomly to one
|
wolffd@0
|
133 % of them
|
wolffd@0
|
134
|
wolffd@0
|
135 for i=1:length(tie_indexi),
|
wolffd@0
|
136 tie_class_index=find(squeeze(equal_to_winner(tie_indexi(i),tie_indexj(i),:)));
|
wolffd@0
|
137 fortuna=randperm(length(tie_class_index));
|
wolffd@0
|
138 vote_index(tie_indexi(i),tie_indexj(i))=tie_class_index(fortuna(1));
|
wolffd@0
|
139 end
|
wolffd@0
|
140
|
wolffd@0
|
141 C=ClassIndex(vote_index)';
|
wolffd@0
|
142 end
|
wolffd@0
|
143
|
wolffd@0
|
144 %% Build output %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
wolffd@0
|
145
|
wolffd@0
|
146 % Relative amount of classes in K neighbors for each classifiee
|
wolffd@0
|
147
|
wolffd@0
|
148 if K==1,
|
wolffd@0
|
149 P=zeros(N_data,N_class);
|
wolffd@0
|
150 if nargout>1,
|
wolffd@0
|
151 for i=1:N_data,
|
wolffd@0
|
152 P(i,ClassIndex==C(i))=1;
|
wolffd@0
|
153 end
|
wolffd@0
|
154 end
|
wolffd@0
|
155 else
|
wolffd@0
|
156 P=shiftdim(classcounter,1)./repmat(shiftdim(1:K,-1), [N_data N_class 1]);
|
wolffd@0
|
157 end
|
wolffd@0
|
158
|