function sData = som_read_data(filename, varargin)

%SOM_READ_DATA Read data from an ascii file in SOM_PAK format.
%
% sD = som_read_data(filename, dim, [missing])
% sD = som_read_data(filename, [missing])
%
%  sD = som_read_data('system.data');
%  sD = som_read_data('system.data',10);
%  sD = som_read_data('system.data','*');
%  sD = som_read_data('system.data',10,'*');
%
%  Input and output arguments ([]'s are optional):
%   filename    (string) input file name
%   dim         (scalar) input space dimension
%   [missing]   (string) string which indicates a missing component
%                        value, 'NaN' by default
%
%   sD          (struct) data struct
%
% Reads data from an ascii file. The file must be in SOM_PAK format,
% except that it may lack the input space dimension from the first
% line.
%
% For more help, try 'type som_read_data' or check out online documentation.
% See also SOM_WRITE_DATA, SOM_READ_COD, SOM_WRITE_COD, SOM_DATA_STRUCT.

%%%%%%%%%%%%% DETAILED DESCRIPTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% som_read_data
%
% PURPOSE
%
% Reads data from an ascii file in SOM_PAK format.
%
% SYNTAX
%
%  sD = som_read_data(filename)
%  sD = som_read_data(..., dim)
%  sD = som_read_data(..., 'missing')
%  sD = som_read_data(..., dim, 'missing')
%
% DESCRIPTION
%
% This function is offered for compatibility with SOM_PAK, a SOM software
% package in C. It reads data from a file in SOM_PAK format.
%
% The SOM_PAK data file format is as follows. The first line must
% contain the input space dimension and nothing else. The following
% lines are comment lines, empty lines or data lines. Unlike programs
% in SOM_PAK, this function can also determine the input dimension
% from the first data lines, if the input space dimension line is
% missing. Note that the SOM_PAK format is not fully supported: data
% vector 'weight' and 'fixed' properties are ignored (they are treated
% as labels).
%
% Each data line contains one data vector and its labels. From the beginning
% of the line, first are values of the vector components separated by
% whitespaces, then labels also separated by whitespaces. If there are
% missing values in the vector, the missing value marker needs to be
% specified as the last input argument ('NaN' by default). The missing
% values are stored as NaNs in the data struct.
%
% Comment lines start with '#'. Comment lines as well as empty lines are
% ignored, except for comment lines that start with '#n' or '#l'. Such a
% line should contain the names of the vector components ('#n') or the
% label names ('#l'), separated by whitespaces.
%
% NOTE: The maximum value Matlab is able to deal with (realmax)
% should not appear in the input file. This is because function sscanf is
% not able to read NaNs: the NaNs are in the read phase converted to value
% realmax.
%
% REQUIRED INPUT ARGUMENTS
%
%  filename (string) input filename
%
% OPTIONAL INPUT ARGUMENTS
%
%  dim      (scalar) input space dimension; when it is omitted (or
%                    empty) the dimension is determined from the file
%  missing  (string) string used to denote missing components (NaNs);
%                    default is 'NaN'
%
% OUTPUT ARGUMENTS
%
%  sD       (struct) the resulting data struct
%
% EXAMPLES
%
% The basic usage is:
%  sD = som_read_data('system.data');
%
% If you know the input space dimension beforehand, and the file does
% not contain it on the first line, it helps if you specify it as the
% second argument:
%  sD = som_read_data('system.data',9);
%
% If the missing components in the data are marked with some other
% characters than with 'NaN', you can specify it with the last argument:
%  sD = som_read_data('system.data',9,'*')
%  sD = som_read_data('system.data','NaN')
%
% Here's an example data file:
%
% 5
% #n one two three four five
% #l ID
% 10 2 3 4 5 1stline label
% 0.4 0.3 0.2 0.5 0.1 2ndline label1 label2
% # comment line: missing components are indicated by 'x':s
% 1 x 1 x 1 3rdline missing_components
% x 1 2 2 2
% x x x x x 5thline emptyline
%
% SEE ALSO
%
%  som_write_data   Writes data structs/matrices to a file in SOM_PAK format.
%  som_read_cod     Read a map from a file in SOM_PAK format.
%  som_write_cod    Writes data struct into a file in SOM_PAK format.
%  som_data_struct  Creates data structs.

% Copyright (c) 1997-2000 by the SOM toolbox programming team.
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 1.0beta ecco 221097
% Version 2.0beta ecco 060899, juuso 151199

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% check arguments

% explicit checks: same messages as the deprecated nargchk would give
if nargin < 1, error('Not enough input arguments.'); end
if nargin > 3, error('Too many input arguments.'); end

dont_care = 'NaN';   % default don't care string
dim       = [];      % empty means: determine the dimension from the file

% process input arguments

if nargin == 2
  if ischar(varargin{1})
    dont_care = varargin{1};
  else
    dim = varargin{1};
  end
elseif nargin == 3
  dim       = varargin{1};
  dont_care = varargin{2};
end

% validate the arguments before the file is opened, so that a bad call
% cannot leave an open file handle behind
if ~ischar(dont_care)
  error('The missing value marker must be a string.');
end
if ~isempty(dim)
  check_dimension(dim);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% read data

% open input file

fid = fopen(filename);
if fid < 0
  error(['Cannot open ' filename]);
end

% Everything that touches the open file runs inside try/catch: whatever
% goes wrong while parsing, the file is closed before the error is passed on.
try
  sData = read_open_file(fid, filename, dim, dont_care);
catch readError
  fclose(fid);
  rethrow(readError);
end

% close input file
if fclose(fid) < 0
  error(['Cannot close file ' filename]);
else
  fprintf(2, '\rdata read ok         \n');
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% subfunctions

function sData = read_open_file(fid, filename, dim, dont_care)
%READ_OPEN_FILE Parse an already opened SOM_PAK data file into a data struct.
%
%  fid        file identifier of the open input file
%  filename   (string) used as the name of the data struct
%  dim        (scalar) input space dimension, or [] to determine it
%             from the file
%  dont_care  (string) marker of a missing component value

comment_start   = '#';    % the char a SOM_PAK command line starts with
comp_name_line  = '#n';   % string denoting a special command line,
                          % which contains names of each component
label_name_line = '#l';   % string denoting a special command line,
                          % which contains names of each label
block_size      = 1000;   % block size used in file read

% sscanf is not able to read NaNs, so every missing value marker is
% replaced by the decimal expansion of realmax before scanning
kludge = num2str(realmax, 100);

% if the data dimension is not specified, find out what it is
if isempty(dim)
  dim = detect_dimension(fid, dont_care, kludge);
  check_dimension(dim);
end

sData       = som_data_struct(zeros(1, dim), 'name', filename);
lnum        = 0;                       % data vector counter
data_temp   = zeros(block_size, dim);
labs_temp   = cell(block_size, 1);
comp_names  = sData.comp_names;
label_names = sData.label_names;

% dim numbers followed by '%[^ \t]'. The last specifier does not skip
% whitespace, so it fails on the blank after the dim'th number and stops
% the scan there; n then points at the start of the labels.
form = [repmat('%g',[1 dim-1]) '%g%[^ \t]'];

limit = block_size;
while 1
  raw = fgetl(fid);                    % read next line
  if ~ischar(raw), break, end          % is this the end of file?

  % all missing vectors are replaced by value realmax because
  % sscanf is not able to read NaNs
  li = strrep(raw, dont_care, kludge);
  [data, c, err, n] = sscanf(li, form);

  if c < dim   % if there were less numbers than dim on the input file line
    if c == 0
      if strncmp(li, comp_name_line, 2)         % component name line?
        names_text = strrep(li(3:end), kludge, dont_care);
        [comp_names, i] = parse_names(names_text, comp_names);
        if i ~= dim
          error(['Illegal number of component names: ' num2str(i) ...
                 ' (dimension is ' num2str(dim) ')']);
        end
      elseif strncmp(li, label_name_line, 2)    % label name line?
        names_text = strrep(li(3:end), kludge, dont_care);
        label_names = parse_names(names_text, label_names);
      elseif ~strncmp(li, comment_start, 1)     % not a comment, is it error?
        [s, c] = sscanf(li, '%s%[^ \t]');
        if c
          % show the line as it is in the file, not the realmax-substituted one
          error(['Invalid vector on input file data line ' ...
                 num2str(lnum+1) ': [' deblank(raw) ']']);
        end
      end
    else
      error(['Only ' num2str(c) ' vector components on input file data line ' ...
             num2str(lnum+1) ' (dimension is ' num2str(dim) ')']);
    end

  else

    % A number glued to label text (e.g. '3abc') makes sscanf return the
    % character codes as extra values; report that instead of failing
    % with an assignment size mismatch below.
    if length(data) ~= dim
      error(['Invalid vector on input file data line ' ...
             num2str(lnum+1) ': [' deblank(raw) ']']);
    end

    lnum = lnum + 1;                   % this was a line containing data vector
    data_temp(lnum, 1:dim) = data';    % add data to struct

    if lnum == limit                   % reserve more memory if necessary
      data_temp(lnum+1:lnum+block_size, 1:dim) = zeros(block_size, dim);
      nl = size(labs_temp, 2);
      labs_temp(lnum+1:lnum+block_size, 1:nl) = cell(block_size, nl);
      limit = limit + block_size;
    end

    % read labels

    if n < length(li)
      % restore the missing value marker inside the label part
      li = strrep(li(n:end), kludge, dont_care);
      i = 0; n = 1; c = 1;
      while c
        [s, c, e, n_new] = sscanf(li(n:end), '%s%[^ \t]');
        if c
          i = i + 1;
          labs_temp{lnum, i} = s;
          n = n + n_new - 1;
        end
      end
    end
  end
end

% set values
data_temp(data_temp == realmax) = NaN; % turn the realmax markers into NaNs
sData.data        = data_temp(1:lnum,:);
sData.labels      = labs_temp(1:lnum,:);
sData.comp_names  = comp_names;
sData.label_names = label_names;

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function dim = detect_dimension(fid, dont_care, kludge)
%DETECT_DIMENSION Determine the input space dimension from the file.
%
% Reads the first two lines that start with a number. Afterwards the file
% position is either just after the dimension line (when the file has
% one) or back where it was on entry (when the first numeric line is
% already a data vector).

fpos1 = ftell(fid);                    % read first non-comment line
[line1, l1, c1] = next_numeric_line(fid, dont_care, kludge);
if c1 == 0
  error('Cannot determine data dimension: no numeric lines in the input file.');
end

fpos2 = ftell(fid);                    % read second non-comment line
[line2, l2, c2] = next_numeric_line(fid, dont_care, kludge);
only_one_line = (c2 == 0);             % end of file before a second one

if (c1 == 1 && c2 ~= 1) || (c1 == 1 && c2 == 1 && l1 == 1)
  % the first line holds a single number: it is the dimension line
  dim = l1;
  fseek(fid, fpos2, 'bof');
elseif c1 == c2 || only_one_line
  % no dimension line: use the number of values on the data line(s)
  dim = c1;
  fseek(fid, fpos1, 'bof');
  warning on  % as before: make sure the following warning is displayed
  warning(['Automatically determined data dimension is ' ...
           num2str(dim) '. Is it correct?']);
else
  error(['Invalid header line: ' line1]);
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [txt, vals, count] = next_numeric_line(fid, dont_care, kludge)
%NEXT_NUMERIC_LINE Read lines until one starts with at least one number.
%
%  txt    the line, with missing value markers replaced by kludge
%  vals   the leading numbers of the line
%  count  how many numbers were read; 0 means the end of file was reached

txt = ''; vals = []; count = 0;
while count == 0
  raw = fgetl(fid);
  if ~ischar(raw), return; end         % fgetl returns -1 at the end of file
  txt = strrep(raw, dont_care, kludge);
  [vals, count] = sscanf(txt, '%f ');
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [names, count] = parse_names(txt, names)
%PARSE_NAMES Store the whitespace separated words of txt into names{1..count}.

count = 0; c = 1;
while c
  % '%s' reads one word; '%[^ \t]' fails on the following blank and ends
  % the scan, so n is the index where the rest of the text starts
  [s, c, e, n] = sscanf(txt, '%s%[^ \t]');
  if ~isempty(s)
    count = count + 1;
    names{count} = s;
    txt = txt(n:end);
  end
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function check_dimension(dim)
%CHECK_DIMENSION Raise an error unless dim is a positive integer scalar.

if ~isnumeric(dim) || numel(dim) ~= 1
  error('Illegal data dimension: the dimension must be a numeric scalar.');
end
if dim < 1 || dim ~= round(dim) || ~isfinite(dim)
  error(['Illegal data dimension: ' num2str(dim)]);
end

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%