Mercurial > hg > camir-aes2014
comparison toolboxes/MIRtoolbox1.3.2/somtoolbox/som_read_data.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
function sData = som_read_data(filename, varargin)

%SOM_READ_DATA Read data from an ascii file in SOM_PAK format.
%
% sD = som_read_data(filename, [dim], [missing])
%
%  sD = som_read_data('system.data');
%  sD = som_read_data('system.data',10);
%  sD = som_read_data('system.data','*');
%  sD = som_read_data('system.data',10,'*');
%
%  Input and output arguments ([]'s are optional):
%   filename    (string) input file name
%   [dim]       (scalar) input space dimension; determined from the
%                        file if omitted or empty
%   [missing]   (string) string which indicates a missing component
%                        value, 'NaN' by default
%
%   sD          (struct) data struct
%
% Reads data from an ascii file. The file must be in SOM_PAK format,
% except that it may lack the input space dimension from the first
% line.
%
% For more help, try 'type som_read_data' or check out online documentation.
% See also  SOM_WRITE_DATA, SOM_READ_COD, SOM_WRITE_COD, SOM_DATA_STRUCT.

%%%%%%%%%%%%% DETAILED DESCRIPTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% som_read_data
%
% PURPOSE
%
% Reads data from an ascii file in SOM_PAK format.
%
% SYNTAX
%
%  sD = som_read_data(filename)
%  sD = som_read_data(..., dim)
%  sD = som_read_data(..., 'missing')
%  sD = som_read_data(..., dim, 'missing')
%
% DESCRIPTION
%
% This function is offered for compatibility with SOM_PAK, a SOM software
% package in C. It reads data from a file in SOM_PAK format.
%
% The SOM_PAK data file format is as follows. The first line must
% contain the input space dimension and nothing else. The following
% lines are comment lines, empty lines or data lines. Unlike programs
% in SOM_PAK, this function can also determine the input dimension
% from the first data lines, if the input space dimension line is
% missing. Note that the SOM_PAK format is not fully supported: data
% vector 'weight' and 'fixed' properties are ignored (they are treated
% as labels).
%
% Each data line contains one data vector and its labels. From the beginning
% of the line, first are values of the vector components separated by
% whitespaces, then labels also separated by whitespaces. If there are
% missing values in the vector, the missing value marker needs to be
% specified as the last input argument ('NaN' by default). The missing
% values are stored as NaNs in the data struct.
%
% Comment lines start with '#'. Comment lines as well as empty lines are
% ignored, except for comment lines that start with '#n' or '#l'. Such a
% line should contain the names of the vector components ('#n') or the
% label names ('#l'), separated by whitespaces.
%
% NOTE: The largest finite value Matlab is able to deal with (realmax)
% should not appear in the input file. This is because function sscanf is
% not able to read NaNs: in the read phase the missing value markers are
% first converted to realmax, and every realmax is turned into NaN at
% the end.
%
% REQUIRED INPUT ARGUMENTS
%
%  filename    (string) input filename
%
% OPTIONAL INPUT ARGUMENTS
%
%  dim         (scalar) input space dimension
%  missing     (string) string used to denote missing components (NaNs);
%                       default is 'NaN'
%
% OUTPUT ARGUMENTS
%
%  sD          (struct) the resulting data struct
%
% EXAMPLES
%
% The basic usage is:
%  sD = som_read_data('system.data');
%
% If you know the input space dimension beforehand, and the file does
% not contain it on the first line, it helps if you specify it as the
% second argument:
%  sD = som_read_data('system.data',9);
%
% If the missing components in the data are marked with some other
% characters than with 'NaN', you can specify it with the last argument:
%  sD = som_read_data('system.data',9,'*')
%  sD = som_read_data('system.data','NaN')
%
% Here's an example data file:
%
% 5
% #n one two three four five
% #l ID
% 10 2 3 4 5 1stline label
% 0.4 0.3 0.2 0.5 0.1 2ndline label1 label2
% # comment line: missing components are indicated by 'x':s
% 1 x 1 x 1 3rdline missing_components
% x 1 2 2 2
% x x x x x 5thline emptyline
%
% SEE ALSO
%
%  som_write_data   Writes data structs/matrices to a file in SOM_PAK format.
%  som_read_cod     Read a map from a file in SOM_PAK format.
%  som_write_cod    Writes data struct into a file in SOM_PAK format.
%  som_data_struct  Creates data structs.

% Copyright (c) 1997-2000 by the SOM toolbox programming team.
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 1.0beta ecco 221097
% Version 2.0beta ecco 060899, juuso 151199

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% check arguments

error(nargchk(1, 3, nargin))  % check no. of input args is correct

dont_care  = 'NaN';                 % default don't care string
block_size = 1000;                  % block size used in file read
kludge     = num2str(realmax, 100); % stand-in for missing values in sscanf

% Process the optional arguments BEFORE the file is opened, so that an
% invalid argument cannot leave an open file handle behind.

dim = [];
if nargin == 2
  if ischar(varargin{1})
    dont_care = varargin{1};
  else
    dim = varargin{1};
  end
elseif nargin == 3
  dim       = varargin{1};
  dont_care = varargin{2};
end

if ~ischar(dont_care)
  error('Missing value marker must be a string.');
end

% an omitted (or empty) dimension is determined from the file itself
autodetect = isempty(dim);
if ~autodetect
  if ~isnumeric(dim) | prod(size(dim)) ~= 1
    error('Data dimension must be a numeric scalar.');
  end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% read data

fid = fopen(filename);
if fid < 0
  error(['Cannot open ' filename]);
end

% Everything that reads from the file is guarded: on any error the file
% is closed before the error is passed on to the caller.
% (rethrow(lasterror) is used instead of 'catch ME' to stay compatible
% with the old Matlab versions this toolbox was written for.)
try
  if autodetect
    dim = detect_dimension(fid, dont_care, kludge);
  end

  % check the dimension is valid
  if dim < 1 | dim ~= round(dim)
    error(['Illegal data dimension: ' num2str(dim)]);
  end

  sData = read_vectors(fid, filename, dim, dont_care, kludge, block_size);
catch
  fclose(fid);
  rethrow(lasterror);
end

% close input file
if fclose(fid) < 0, error(['Cannot close file ' filename]);
else fprintf(2, '\rdata read ok \n'); end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% subfunctions

function dim = detect_dimension(fid, dont_care, kludge)

% DETECT_DIMENSION Find the input space dimension from the file start.
%
% Looks at the first two lines that begin with a number. If the first
% one holds a single number it is taken as the SOM_PAK header line;
% otherwise the number of values on the lines is used as the dimension.
% On return the file position is set so that reading can continue with
% the first line that has not been consumed as a header.

[fpos1, line1, l1, c1] = next_numeric_line(fid, dont_care, kludge);
if c1 == 0
  error('Cannot determine data dimension: no numeric lines in input file.');
end
% c2 == 0 here means the file ended before a second numeric line
[fpos2, line2, l2, c2] = next_numeric_line(fid, dont_care, kludge);

has_header = 0;
if c1 == 1
  if c2 ~= 1 | l1 == 1, has_header = 1; end
end

if has_header
  dim = l1;
  fseek(fid, fpos2, -1);   % continue right after the header line
elseif c1 == c2 | c2 == 0
  dim = c1;
  fseek(fid, fpos1, -1);   % no header: re-read from the start
  % NOTE: the global warning state is deliberately left untouched
  warning(['Automatically determined data dimension is ' ...
           num2str(dim) '. Is it correct?']);
else
  error(['Invalid header line: ' line1]);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [fpos, li, vals, count] = next_numeric_line(fid, dont_care, kludge)

% NEXT_NUMERIC_LINE Read lines until one starts with a number.
%
%  fpos   file position before the first line read by this call
%  li     the line found (missing value markers replaced by kludge)
%  vals   numbers read from the beginning of the line
%  count  number of values in vals; 0 if end of file was reached first

fpos = ftell(fid);
li = ''; vals = []; count = 0;
while count == 0
  raw = fgetl(fid);
  if ~ischar(raw), break, end    % fgetl returns -1 at end of file
  li = strrep(raw, dont_care, kludge);
  [vals, count] = sscanf(li, '%f ');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function sData = read_vectors(fid, filename, dim, dont_care, kludge, block_size)

% READ_VECTORS Read data vectors, labels and name lines from the file.
%
% Reads from the current file position to the end of file and returns
% a data struct created with som_data_struct and filled with the data,
% labels, component names and label names found.

comment_start   = '#';   % the char a SOM_PAK comment line starts with
comp_name_line  = '#n';  % special comment line with component names
label_name_line = '#l';  % special comment line with label names

sData       = som_data_struct(zeros(1, dim), 'name', filename);
lnum        = 0;                                 % data vector counter
data_temp   = zeros(block_size, dim);
labs_temp   = cell(block_size, 1);
comp_names  = sData.comp_names;
label_names = sData.label_names;
% the trailing %[^ \t] can not match at whitespace, which stops sscanf
% from re-applying the format to the rest of the line
form        = [repmat('%g',[1 dim-1]) '%g%[^ \t]'];
limit       = block_size;

while 1
  li = fgetl(fid);                     % read next line
  if ~ischar(li), break, end           % is this the end of file?

  % all missing components are replaced by value realmax because
  % sscanf is not able to read NaNs
  li = strrep(li, dont_care, kludge);
  [data, c, err, n] = sscanf(li, form);

  if c == 0                            % no numbers at the line start

    if strncmp(li, comp_name_line, 2)             % component name line?
      words = split_words(strrep(li(3:end), kludge, dont_care));
      for k = 1:length(words), comp_names{k} = words{k}; end
      if length(words) ~= dim
        error(['Illegal number of component names: ' ...
               num2str(length(words)) ...
               ' (dimension is ' num2str(dim) ')']);
      end
    elseif strncmp(li, label_name_line, 2)        % label name line?
      words = split_words(strrep(li(3:end), kludge, dont_care));
      for k = 1:length(words), label_names{k} = words{k}; end
    elseif ~strncmp(li, comment_start, 1)         % not a comment
      % a non-empty line that is neither data nor comment is an error
      [s, c] = sscanf(li, '%s%[^ \t]');
      if c
        error(['Invalid vector on input file data line ' ...
               num2str(lnum+1) ': [' deblank(li) ']']);
      end
    end

  elseif c < dim                       % fewer numbers than dim on the line

    error(['Only ' num2str(c) ' vector components on input file data line ' ...
           num2str(lnum+1) ' (dimension is ' num2str(dim) ')']);

  else                                 % a line containing a data vector

    % extra characters glued to the last component end up in 'data'
    if length(data) ~= dim
      error(['Invalid vector on input file data line ' ...
             num2str(lnum+1) ': [' deblank(li) ']']);
    end

    lnum = lnum + 1;
    data_temp(lnum, 1:dim) = data';

    if lnum == limit                   % reserve more memory if necessary
      data_temp(lnum+1:lnum+block_size, 1:dim) = zeros(block_size, dim);
      nl = size(labs_temp, 2);
      labs_temp(lnum+1:lnum+block_size, 1:nl) = cell(block_size, nl);
      limit = limit + block_size;
    end

    % read labels: whatever follows the vector components on the line
    if n < length(li)
      words = split_words(strrep(li(n:end), kludge, dont_care));
      for k = 1:length(words), labs_temp{lnum, k} = words{k}; end
    end

  end
end

% set values
data_temp(data_temp == realmax) = NaN;  % restore the missing values
sData.data        = data_temp(1:lnum,:);
sData.labels      = labs_temp(1:lnum,:);
sData.comp_names  = comp_names;
sData.label_names = label_names;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function words = split_words(str)

% SPLIT_WORDS Split a string into whitespace separated words.
%
% Returns a cell array of strings; empty if str contains no words.

words = {}; c = 1;
while c
  % %s reads one word; %[^ \t] then fails, so sscanf stops after it
  [s, c, e, n] = sscanf(str, '%s%[^ \t]');
  if ~isempty(s)
    words{end+1} = s;
    str = str(n:end);
  end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%