comparison core/magnatagatune/AnnotDB.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
% ANNOTDB  Database of annotations (e.g. tags or genres) attached to owners.
%
%   An AnnotDB stores which annotation strings belong to which owner ids
%   (e.g. clip ids) as a sparse numowners x numannots matrix. Matrix
%   entries are either binary flags or scores; an entry > 0 means "the
%   owner carries this annotation".
%
%   The class is a handle class: add_pair modifies the object in place.
%
%   NOTE: annotation lookups rely on the external helper strcellfind,
%   which is not defined in this file. It is presumably returning the
%   index/indices of the lexicon entries matching a string and [] when
%   there is no match - TODO confirm against its implementation.

classdef AnnotDB < handle

    % public (read-only) properties
    properties (SetAccess = private)

        lexicon = {};   % cell array of annotation strings; position = annot id
    end

    properties (Hidden, Access = private)

        annotsdb;       % a numowners x numannots sparse binary / prob matrix
        annots_oid;     % owner id for each row of annotsdb

        binary = 0;     % indicator whether the db contains binary or scored
                        % annots. NOTE: never set anywhere in this class, so
                        % the database is always treated as scored.
    end

    methods

        % ---
        % simple constructor
        % ---
        function db = AnnotDB(lexicon, annots, ids)
            % db = AnnotDB()
            %   creates an empty database.
            %
            % db = AnnotDB(lexicon)
            %   creates a database with the given lexicon (cell array of
            %   annotation strings) and no owners.
            %
            % db = AnnotDB(lexicon, annots, ids)
            %   matrix mode: annots is an owners x numel(lexicon) matrix
            %   (binary or scored), ids holds the owner id for each row.
            %
            % db = AnnotDB('clips_by_annot', annots, ids)
            %   annots is a cell array of annotation strings; ids{i} (or
            %   ids(i) if ids is not a cell) lists the owner id(s)
            %   carrying annots{i}. Annotations are stored lower-cased.
            %
            % db = AnnotDB('annots_by_clip', ...)
            %   NOT IMPLEMENTED: returns an empty database.

            if nargin < 1
                return;
            end

            if ischar(lexicon)

                if strcmp(lexicon, 'clips_by_annot')

                    % ---
                    % preset the lexicon and the owner id index
                    % ---
                    db.lexicon = unique(lower(annots));

                    if iscell(ids)
                        db.annots_oid = unique([ids{:}]);
                    else
                        db.annots_oid = unique(ids);
                    end

                    db.annotsdb = sparse(numel(db.annots_oid), ...
                        numel(db.lexicon));

                    % for all annotations
                    for i = 1:numel(annots)

                        % the lexicon was built from the lower-cased
                        % strings, so the lookup has to use the same form;
                        % otherwise mixed-case annotations would be added
                        % a second time as new lexicon entries.
                        annot = lower(annots{i});

                        if iscell(ids)
                            % a set of owner ids for this annotation
                            for j = 1:numel(ids{i})
                                db.add_pair(ids{i}(j), annot);
                            end
                        else
                            % single owner id for this annotation
                            db.add_pair(ids(i), annot);
                        end
                    end

                elseif strcmp(lexicon, 'annots_by_clip')

                    % TODO: not implemented - the database stays empty
                end

            else
                % ---
                % matrix mode: lexicon plus optional annotation matrix
                % ---
                db.lexicon = lexicon;

                if nargin >= 2

                    if nargin < 3
                        error('AnnotDB:missingIds', ...
                            'owner ids are required when an annotation matrix is given');
                    end

                    db.annotsdb = sparse(annots);
                    db.annots_oid = ids;
                else
                    db.annotsdb = sparse(0, numel(db.lexicon));
                end
            end
        end

        % ---
        % retrieve annot-substructure for given clip ids,
        % collecting std = [or = all] ,[and = common]
        % annots for these
        % ---
        function new_db = subset(db, ownerids, mode)
            % new_db = subset(db, ownerids, {'and', ['or']})
            %
            % Returns a new AnnotDB containing the annotations of the given
            % owner ids. The complete lexicon is copied so annotation ids
            % stay identical between db and new_db.
            %
            % mode 'or' (default): all annotations of all given owners.
            % mode 'and': NOT IMPLEMENTED - currently returns a database
            %             with the full lexicon but without any owners.

            if nargin < 3
                mode = 'or';
            end

            % copying the whole lexicon keeps the annot id index stable
            new_db = AnnotDB(db.lexicon);

            switch lower(mode)
                case 'and'

                    % ---
                    % TODO: implement this and
                    % improve speed below
                    % ---
                case 'or'

                    % successively fill with the annots of each owner
                    for i = 1:numel(ownerids)

                        [annot, score] = db.annots(ownerids(i));
                        for j = 1:numel(annot)

                            new_db.add_pair(ownerids(i), annot{j}, score(j));
                        end
                    end
                otherwise
                    error 'illegal owner id combination mode. possibly forgot brackets';
            end
        end

        % retrieve annot-substructure for complement
        % of given clip ids
        function [new_db] = exclude(db, ownerids)
            % new_db = exclude(db, ownerids) returns the subset of db made
            % of all owners NOT listed in ownerids.

            % get complement of owner ids
            ownerids = setdiff(db.annots_oid, ownerids);

            new_db = subset(db, ownerids);
        end

        % ---
        % retrieve clip by annot.
        % if multiple annots are given, the clips
        % containing all of them (logical and) are
        % returned
        % ---
        function oids = owner(db, annotstr, mode)
            % oids = owner(db, annotstr, {['and'], 'or'})
            %
            % annotstr: annotation string or cell array of strings.
            % Returns the ids of the owners carrying all ('and') or any
            % ('or') of the given annotations.

            if nargin < 3
                mode = 'and';
            end

            if ~iscell(annotstr)
                annotstr = {annotstr};
            end

            annotid = [];
            all_known = true;
            for i = 1:numel(annotstr)

                found = strcellfind(db.lexicon, annotstr{i});
                if isempty(found)
                    all_known = false;
                end
                annotid = [annotid found]; %#ok<AGROW>
            end

            % an annotation missing from the lexicon is carried by no
            % owner, so a logical AND including it cannot match anything.
            % Without this guard the unknown annotation would silently be
            % dropped from the query.
            if ~all_known && strcmpi(mode, 'and')
                oids = [];
                return
            end

            oids = owner_for_annotid(db, annotid, mode);
        end

        % retrieve owner ids by annotation id
        function ownerids = owner_for_annotid(db, annotid, mode)
            % ownerids = owner_for_annotid(db, annotid, {['and'], 'or'})
            %
            % annotid: vector of annotation ids (lexicon positions).
            % Returns [] for an empty annotid.

            if isempty(annotid)
                ownerids = [];
                return
            end
            if nargin < 3
                mode = 'and';
            end

            % An annotation counts as present when its entry is > 0. This
            % matches annotids_for_owner / stats_count and keeps the 'and'
            % mode correct for scored (non-binary) data, where the sum of
            % the scores says nothing about the number of annotations.
            switch lower(mode)
                case 'or'
                    % search for all appearing owners
                    candidates = any(db.annotsdb(:, annotid) > 0, 2);

                case 'and'
                    % search for the common owners
                    candidates = all(db.annotsdb(:, annotid) > 0, 2);

                otherwise
                    error 'illegal tag combination mode';
            end

            % get positions in database
            pos = find(candidates);

            % return owner ids
            ownerids = db.annots_oid(pos);
        end

        % retrieve annotid by clip
        function [aid, score] = annotids_for_owner(db, ownerid, mode)
            % [aid, score] = annotids_for_owner(db, ownerid, {'and', ['or']})
            %
            % Single owner id: returns the ids and scores of its
            % annotations, sorted by descending score.
            % Several owner ids: returns annotation counts over the
            % subset of these owners (see subset and stats_count), sorted
            % by descending count and cut off after the last count > 0.

            % single query case
            if numel(ownerid) == 1

                pos = owner_pos(db, ownerid);

                % get positions in database
                aid = find(db.annotsdb(pos, :) > 0);

                score = db.annotsdb(pos, aid);

                % sort ids for output
                if ~db.binary

                    [score, idx] = sort(score, 'descend');
                    aid = aid(idx);
                end
            else
                if nargin < 3
                    mode = 'or';
                end

                % ---
                % the query contained multiple ids
                %
                % we dont return the single results but
                % the statistics for this subset of clips
                % ---
                new_db = db.subset(ownerid, mode);
                [unused_labels, score, aid] = new_db.stats_count(); %#ok<ASGLU>

                % cut off at score > 0 to abandon unused tags
                u = find(score > 0, 1, 'last');
                score = score(1:u);
                aid = aid(1:u);
            end
        end

        % retrieve annotation by clip
        function [out, score, aid] = annots(db, ownerid)
            % [out, score, aid] = annots(db, ownerid) returns annotation
            % strings, scores and annotation ids for the given owner id(s);
            % see annotids_for_owner for the ordering.

            [aid, score] = db.annotids_for_owner(ownerid);

            out = db.get_annot_name(aid);
        end

        % retrieve annot name given a annot id
        function out = get_annot_name(db, annotid)
            % out = get_annot_name(db, annotid) returns a cell array with
            % the lexicon string for each annotation id.

            out = {};
            for i = 1:numel(annotid)

                out{i} = db.lexicon{annotid(i)}; %#ok<AGROW>
            end
        end

        % return annotation id for annotation string
        function aid = get_annot_id(db, annotstr)
            % aid = get_annot_id(db, annotstr)
            %
            % annotstr: a string, or a cell array of strings.
            % For a single string the raw lookup result is returned (may be
            % empty). For a cell array a 1 x numel(annotstr) vector is
            % returned; every string must match exactly one lexicon entry,
            % otherwise an error is raised.

            if ~iscell(annotstr)

                % expensive search within annot list
                aid = strcellfind(db.lexicon, annotstr);
            else

                % preallocate so an empty cell input yields an empty result
                % instead of an undefined output
                aid = zeros(1, numel(annotstr));

                % search separately for each annot
                for i = 1:numel(annotstr)

                    found = strcellfind(db.lexicon, annotstr{i});
                    if numel(found) ~= 1
                        error('AnnotDB:annotLookup', ...
                            'annotation "%s" does not match exactly one lexicon entry', ...
                            annotstr{i});
                    end
                    aid(i) = found;
                end
            end
        end

        % ---
        % return statistics on saved annotations.
        % = returns the sum of the scores and
        % sorted lexicon
        % ---
        function [labels, score, annotids] = stats(db)
            % [labels, score, annotids] = stats(db) sums the scores of each
            % annotation over all owners and returns them in descending
            % order together with the matching labels and annotation ids.

            score = full(sum(db.annotsdb, 1));
            [score, annotids] = sort(score, 'descend');

            % prepare labels
            labels = db.lexicon(annotids);
        end

        % ---
        % return statistics on saved annotations.
        % = returns the number of annotations and
        % sorted lexicon
        % ---
        function [labels, score, annotids] = stats_count(db)
            % [labels, score, annotids] = stats_count(db) counts, for each
            % annotation, the owners with an entry > 0 and returns the
            % counts in descending order together with the matching labels
            % and annotation ids.

            score = full(sum(db.annotsdb > 0, 1));
            [score, annotids] = sort(score, 'descend');

            % prepare labels
            labels = db.lexicon(annotids);
        end

        % this is a stub for a tag cloud-like output
        function [out] = annots_cloud(db, ownerid)
            % out = annots_cloud(db)
            % out = annots_cloud(db, ownerid)
            %
            % Returns a string in which every annotation label (blanks
            % replaced by '-') is repeated proportionally to its summed
            % score, each occurrence followed by '; '. If ownerid is given,
            % only the subset of these owners is used.

            % ---
            % TODO: actually output tag-cloud
            % this output is aimed at input into a web interface
            % we successfully used http://www.wordle.net/
            % ---

            if nargin > 1
                db2 = db.subset(ownerid);
            else
                db2 = db;
            end

            [labels, score, annotids] = stats(db2);

            % ---
            % Note: for performance reasons the scores are rescaled
            % so that the largest one becomes 100 repetitions
            % ---
            score = ceil((score ./ max(score)) * 100);

            out = '';
            for i = 1:numel(annotids)

                % repeat the tag according to score; new entries are
                % prepended, so the highest scored label ends up last
                annot = strrep(labels{i}, ' ', '-');
                for j = 1:score(i)
                    out = sprintf('%s; %s', annot, out);
                end
            end
        end

        function out = size(db)
            % returns the size of this db, i.e. the number of lexicon
            % entries. NOTE: this overloads the builtin size for AnnotDB
            % objects.

            out = numel(db.lexicon);
        end

        function add_pair(db, ownerid, annot, score)
            % add_pair(db, ownerid, annot)
            % add_pair(db, ownerid, annot, score)
            %
            % Stores score (default 1) for the given owner / annotation
            % pair. Unknown annotations are appended to the lexicon and
            % unknown owner ids get a new database row.

            if nargin < 4
                score = 1;
            end

            aid = strcellfind(db.lexicon, annot);

            % create new position for annotation if necessary
            if isempty(aid)

                aid = numel(db.lexicon) + 1;

                % add to lexicon
                db.lexicon = {db.lexicon{:}, annot};

                % extend annotation matrix by one column
                db.annotsdb = [db.annotsdb, ...
                    sparse(size(db.annotsdb, 1), 1)];
            end

            % create new position for owner if necessary
            pos = owner_pos(db, ownerid);
            if isempty(pos)

                pos = numel(db.annots_oid) + 1;

                % add to owner ids; end+1 indexing works for row and
                % column id vectors alike
                db.annots_oid(end + 1) = ownerid;

                % extend annotation matrix by one row
                db.annotsdb = [db.annotsdb; ...
                    sparse(1, size(db.annotsdb, 2))];
            end

            % save data to database
            db.annotsdb(pos, aid) = score;
        end

    end


    methods (Hidden)

        function pos = owner_pos(db, ownerid)
            % pos = owner_pos(db, ownerid) returns the database row(s)
            % holding the given owner id, or an empty result if the id is
            % unknown.

            pos = find(db.annots_oid == ownerid);
        end
    end

end