annotate core/magnatagatune/AnnotDB.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
% AnnotDB  Annotation database: links owner ids (e.g. clip ids) to the
% entries of a lexicon of annotation strings (e.g. tags or genres).
%
% The links are kept in a sparse owners x annotations matrix whose
% entries are the annotation scores (1 for plain binary annotations).
%
% NOTE(review): relies on the external helper strcellfind(cellarray, str),
% which is not defined in this file. It is assumed to return the index of
% str within cellarray, or an empty result if str is not contained
% (add_pair depends on the empty result) -- confirm against the helper.

classdef AnnotDB < handle

    % public (read-only) properties
    properties (SetAccess = private)

        % cell array of annotation strings; the position of a string in
        % this list is its annotation id (= column in annotsdb)
        lexicon = {};
    end

    properties (Hidden, Access = private)

        annotsdb;    % a numowners x numannots sparse binary / prob matrix
        annots_oid;  % owner id belonging to each row of annotsdb

        % indicator whether the db contains binary or scored annots.
        % NOTE: never set to a nonzero value within this class, so the
        % scored code path (sorting by score) is always taken.
        binary = 0;
    end

    methods

        % ---
        % constructor
        % ---
        function db = AnnotDB(lexicon, annots, ids)
            % db = AnnotDB()
            %   empty database.
            %
            % db = AnnotDB(lexicon)
            %   lexicon: cell array of annotation strings; creates a
            %   database with this lexicon and no owners.
            %
            % db = AnnotDB(lexicon, annots)
            % db = AnnotDB(lexicon, annots, ids)
            %   annots: owners x numel(lexicon) matrix of scores
            %   ids:    owner id for each row of annots; defaults to the
            %           row numbers 1:size(annots,1) if omitted
            %
            % db = AnnotDB('clips_by_annot', annots, ids)
            %   annots: cell array of annotation strings (stored in
            %           lower case)
            %   ids:    either a cell array holding, for each annotation,
            %           the vector of owner ids carrying it, or a plain
            %           vector with one owner id per annotation
            %
            % db = AnnotDB('annots_by_clip', ...)
            %   not implemented: returns an empty database.

            if nargin < 1
                return
            end

            if ischar(lexicon)

                if strcmp(lexicon, 'clips_by_annot')

                    % ---
                    % the lexicon is kept in lower case, so the pairs
                    % below have to be added in lower case as well to
                    % hit the preset lexicon entries
                    % ---
                    annots = lower(annots);
                    db.lexicon = unique(annots);

                    % preset the owner id list
                    if iscell(ids)
                        % flatten each id set to a row so that sets
                        % given as column vectors can be joined, too
                        id_rows = cellfun(@(v) reshape(v, 1, []), ids, ...
                            'UniformOutput', false);
                        db.annots_oid = unique([id_rows{:}]);
                    else
                        db.annots_oid = unique(ids);
                    end

                    db.annotsdb = sparse(numel(db.annots_oid), ...
                        numel(db.lexicon));

                    % for all annotations
                    for i = 1:numel(annots)

                        if iscell(ids)
                            % a set of owner ids for this annotation
                            for j = 1:numel(ids{i})
                                db.add_pair(ids{i}(j), annots{i});
                            end
                        else
                            % single owner id for this annotation
                            db.add_pair(ids(i), annots{i});
                        end
                    end

                elseif strcmp(lexicon, 'annots_by_clip')
                    % TODO: not implemented, the database stays empty
                end

            else
                % ---
                % matrix case: lexicon and score matrix given directly
                % ---
                db.lexicon = lexicon;

                if nargin >= 2
                    db.annotsdb = sparse(annots);

                    if nargin >= 3
                        db.annots_oid = ids;
                    else
                        % no ids supplied: number the owners by row
                        db.annots_oid = 1:size(db.annotsdb, 1);
                    end
                else
                    db.annotsdb = sparse(0, numel(db.lexicon));
                end
            end
        end

        % ---
        % retrieve annot-substructure for given owner ids
        % ---
        function new_db = subset(db, ownerids, mode)
            % new_db = subset(db, ownerids, {'and', ['or']})
            %
            % 'or' (default): new_db contains all annotations of the
            %                 given owners.
            % 'and':          NOT IMPLEMENTED, new_db only receives the
            %                 lexicon and no annotations.
            %
            % The whole lexicon is copied, so annotation ids are the same
            % in db and new_db.

            if nargin < 3
                mode = 'or';
            end

            new_db = AnnotDB(db.lexicon);

            switch lower(mode)
                case 'and'

                    % ---
                    % TODO: implement this and
                    % improve speed below
                    % ---
                case 'or'

                    % successively fill with the annots of each owner
                    for i = 1:numel(ownerids)

                        [annot, score] = annots(db, ownerids(i));
                        for j = 1:numel(annot)

                            new_db.add_pair(ownerids(i), annot{j}, score(j));
                        end
                    end
                otherwise
                    error('illegal owner id combination mode. possibly forgot brackets');
            end
        end

        % ---
        % retrieve annot-substructure for the complement
        % of the given owner ids
        % ---
        function [new_db] = exclude(db, ownerids)
            % new_db = exclude(db, ownerids)

            % get complement of owner ids
            ownerids = setdiff(db.annots_oid, ownerids);

            new_db = subset(db, ownerids);
        end

        % ---
        % retrieve owner ids by annotation string(s)
        % ---
        function oids = owner(db, annotstr, mode)
            % oids = owner(db, annotstr, {['and'], 'or'})
            %
            % annotstr: annotation string or cell array of strings
            % 'and' (default): owners carrying all of the annotations;
            %                  empty if one of them is not in the lexicon
            % 'or':            owners carrying any of the annotations

            if nargin < 3
                mode = 'and';
            end

            if ~iscell(annotstr)
                annotstr = {annotstr};
            end

            annotid = [];
            for i = 1:numel(annotstr)

                found = strcellfind(db.lexicon, annotstr{i});

                % an annotation nobody can have rules out every owner
                % in 'and' mode; it must not just be skipped
                if isempty(found) && strcmpi(mode, 'and')
                    oids = [];
                    return
                end

                annotid = [annotid found]; %#ok<AGROW>
            end

            oids = owner_for_annotid(db, annotid, mode);
        end

        % ---
        % retrieve owner ids by annotation id(s)
        % ---
        function ownerids = owner_for_annotid(db, annotid, mode)
            % ownerids = owner_for_annotid(db, annotid, {['and'], 'or'})
            %
            % An owner carries an annotation if its score is > 0.
            % Returns [] for an empty annotid.

            if isempty(annotid)
                ownerids = [];
                return
            end
            if nargin < 3
                mode = 'and';
            end

            switch lower(mode)
                case 'or'
                    % owners carrying at least one of the annots
                    candidates = any(db.annotsdb(:, annotid) > 0, 2);

                case 'and'
                    % ---
                    % owners carrying every one of the annots. The
                    % presence (> 0) is tested per annotation; summing
                    % the raw scores only works for 0/1 entries
                    % ---
                    candidates = all(db.annotsdb(:, annotid) > 0, 2);

                otherwise
                    error('illegal tag combination mode');
            end

            % return owner ids at the matching database positions
            ownerids = db.annots_oid(find(candidates));
        end

        % ---
        % retrieve annotation ids by owner
        % ---
        function [aid, score] = annotids_for_owner(db, ownerid, mode)
            % [aid, score] = annotids_for_owner(db, ownerid, mode)
            %
            % single ownerid:   ids and scores of its annotations,
            %                   sorted by descending score
            % several ownerids: annotation ids used within this set of
            %                   owners; score is the number of owners per
            %                   annotation (descending). mode is passed
            %                   on to subset() and defaults to 'or'.

            if numel(ownerid) == 1

                pos = owner_pos(db, ownerid);

                % annotations with a positive score for this owner
                aid = find(db.annotsdb(pos, :) > 0);
                score = db.annotsdb(pos, aid);

                % sort ids for output
                if ~db.binary

                    [score, idx] = sort(score, 'descend');
                    aid = aid(idx);
                end
            else
                if nargin < 3
                    mode = 'or';
                end

                % ---
                % the query contained multiple ids:
                % we dont return the single results but
                % the statistics for this subset of owners
                % ---
                new_db = db.subset(ownerid, mode);
                [unused_labels, score, aid] = new_db.stats_count(); %#ok<ASGLU>

                % cut off at score > 0 to abandon unused annots
                u = find(score > 0, 1, 'last');
                score = score(1:u);
                aid = aid(1:u);
            end
        end

        % ---
        % retrieve annotation strings by owner
        % ---
        function [out, score, aid] = annots(db, ownerid)
            % [out, score, aid] = annots(db, ownerid)
            % same as annotids_for_owner, additionally returning the
            % annotation strings as a cell array in out

            [aid, score] = db.annotids_for_owner(ownerid);

            out = db.get_annot_name(aid);
        end

        % ---
        % retrieve annotation strings for given annotation ids
        % ---
        function out = get_annot_name(db, annotid)
            % out = get_annot_name(db, annotid)
            % returns a row cell array, {} for an empty annotid

            out = {};
            if ~isempty(annotid)
                out = reshape(db.lexicon(annotid), 1, []);
            end
        end

        % ---
        % return annotation id(s) for annotation string(s)
        % ---
        function aid = get_annot_id(db, annotstr)
            % aid = get_annot_id(db, annotstr)
            %
            % annotstr: a string, or a cell array of strings.
            % For a single string the result of the lexicon search is
            % returned as is. For a cell array a row vector with one id
            % per string is returned; an error is raised if a string
            % does not match exactly one lexicon entry.

            if ~iscell(annotstr)

                % expensive search within annot list
                aid = strcellfind(db.lexicon, annotstr);
            else

                % search separately for each annot
                aid = zeros(1, numel(annotstr));
                for i = 1:numel(annotstr)

                    found = strcellfind(db.lexicon, annotstr{i});
                    if numel(found) ~= 1
                        error('AnnotDB:get_annot_id', ...
                            'annotation "%s" has no unique id in the lexicon', ...
                            annotstr{i});
                    end
                    aid(i) = found;
                end
            end
        end

        % ---
        % return statistics on saved annotations:
        % the sum of the scores per annotation, and
        % the lexicon sorted accordingly (descending)
        % ---
        function [labels, score, annotids] = stats(db)
            % [labels, score, annotids] = stats(db)

            score = full(sum(db.annotsdb, 1));
            [score, annotids] = sort(score, 'descend');

            % prepare labels
            labels = db.lexicon(annotids);
        end

        % ---
        % return statistics on saved annotations:
        % the number of owners per annotation, and
        % the lexicon sorted accordingly (descending)
        % ---
        function [labels, score, annotids] = stats_count(db)
            % [labels, score, annotids] = stats_count(db)

            score = full(sum(db.annotsdb > 0, 1));
            [score, annotids] = sort(score, 'descend');

            % prepare labels
            labels = db.lexicon(annotids);
        end

        % ---
        % this is a stub for a tag cloud-like output
        % ---
        function [out] = annots_cloud(db, ownerid)
            % out = annots_cloud(db)
            % out = annots_cloud(db, ownerid)
            %
            % Returns a string in which each annotation is repeated
            % according to its score sum (see stats), restricted to the
            % given owners if ownerid is supplied. Blanks inside an
            % annotation are replaced by '-', repetitions are separated
            % by '; '. Returns '' if there is no positive score.
            %
            % TODO: actually output tag-cloud
            % this output is aimed at input into a web interface
            % we successfully used http://www.wordle.net/

            if nargin > 1
                db2 = db.subset(ownerid);
            else
                db2 = db;
            end

            [labels, score, annotids] = stats(db2);

            % nothing to show: avoids the division by zero below
            out = '';
            top = max(score);
            if isempty(top) || top <= 0
                return
            end

            % ---
            % Note: for performance issues we compress the
            % repetition counts to a maximum value of 100
            % ---
            score = ceil((score ./ top) * 100);

            % ---
            % repeat each annot according to its score. Pieces are
            % stored back to front, as the annots with the lowest
            % scores come first in the output
            % ---
            pieces = cell(1, numel(annotids));
            for i = 1:numel(annotids)

                annot = strrep(labels{i}, ' ', '-');
                pieces{numel(annotids) - i + 1} = ...
                    repmat([annot '; '], 1, score(i));
            end
            out = [pieces{:}];
        end

        % ---
        % NOTE: this overloads the builtin size() for AnnotDB objects
        % ---
        function out = size(db)
            % returns the size of this db, i.e. the number of
            % entries in the lexicon

            out = numel(db.lexicon);
        end

        % ---
        % add or overwrite a single owner / annotation pair
        % ---
        function add_pair(db, ownerid, annot, score)
            % add_pair(db, ownerid, annot)
            % add_pair(db, ownerid, annot, score)
            %
            % Stores score (default 1) for the given pair. Unknown
            % annotations are appended to the lexicon, unknown owners
            % to the owner list.

            if nargin < 4
                score = 1;
            end

            aid = strcellfind(db.lexicon, annot);

            % create new position for annotation if necessary
            if isempty(aid)

                aid = numel(db.lexicon) + 1;

                % add to lexicon
                db.lexicon{aid} = annot;

                % enhance annotation matrix by a column
                db.annotsdb = [db.annotsdb, ...
                    sparse(size(db.annotsdb, 1), 1)];
            end

            % create new position for owner if necessary
            pos = owner_pos(db, ownerid);
            if isempty(pos)

                pos = numel(db.annots_oid) + 1;

                % ---
                % add to owner ids. Appending by index works for row
                % and column shaped id lists alike
                % ---
                db.annots_oid(pos) = ownerid;

                % enhance annotation matrix by a row
                db.annotsdb = [db.annotsdb; ...
                    sparse(1, size(db.annotsdb, 2))];
            end

            % save data to database
            db.annotsdb(pos, aid) = score;
        end

    end


    methods (Hidden)

        function pos = owner_pos(db, ownerid)
            % returns the database position (row of annotsdb) for an
            % owner id, empty if the owner is unknown

            pos = find(db.annots_oid == ownerid);
        end
    end

end