view core/magnatagatune/AnnotDB.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
line wrap: on
line source
% AnnotDB is the basic container for all genre and tag information:
% it manages the whole vocabulary (lexicon) and the owner-to-annotation scores.

classdef AnnotDB < handle
   % AnnotDB  Annotation database mapping owner (clip) ids to annotations.
   %
   % The database keeps a lexicon of annotation strings and a sparse
   % (owners x annotations) matrix holding a score for every
   % owner / annotation pair. An entry > 0 means "owner carries this
   % annotation"; for binary data the score is simply 1.
   %
   % AnnotDB is a handle class: add_pair modifies the object in place.
   %
   % NOTE: string lookups rely on the external helper strcellfind, which
   % is expected to return an empty result when a string is not found.
    
   % public (read-only) properties
   properties (SetAccess = private)
       lexicon = {}; % cell array of annotation strings; position = annotation id
      
   end
   
   properties(Hidden, Access = private)

       annotsdb; % a numowners x numannots sparse binary / prob matrix
       annots_oid; % ownerid to pos in annots conversion
       
       binary = 0; % indicator whether the db contains binary or scored annots
   end
   
   methods
       
       % ---
       % simple constructor
       % ---
       function db = AnnotDB(lexicon, annots, ids)
           % db = AnnotDB()
           %      creates an empty database.
           %
           % db = AnnotDB(lexicon)
           %      creates a database with the given lexicon (cell array of
           %      strings) and no owners.
           %
           % db = AnnotDB(lexicon, annots, ids)
           %      annots: owners x numel(lexicon) binary / score matrix
           %      ids:    owner id for each row of annots. If omitted, the
           %              row indices 1:size(annots,1) are used.
           %
           % db = AnnotDB('clips_by_annot', annots, ids)
           %      annots: cell array of annotation strings
           %      ids:    cell array with one vector of owner ids per
           %              annotation, or a plain vector holding one owner
           %              id per annotation.
           %      Annotations are stored lower-cased.
           %
           % db = AnnotDB('annots_by_clip', ...)
           %      is not implemented yet and leaves the database empty.
           
           if nargin >= 1

               % ---
               % NOTE: two ways of supplying the annots are allowed:
               % 1. clip ids for each lexical element
               % 2. binary matrix
               % ---
               if ischar(lexicon)
                   
                   if strcmp(lexicon, 'clips_by_annot')
                       
                       % ---
                       %  preset the lexicon and owner ids
                       % ---
                       db.lexicon = unique(lower(annots));
                       
                       if iscell(ids) 
                           db.annots_oid = unique([ids{:}]);
                       else
                           db.annots_oid = unique(ids);
                       end
                       
                       db.annotsdb = sparse(numel(db.annots_oid),...
                           numel( db.lexicon));
                       
                       % for all annotations
                       for i = 1:numel(annots)
                           
                           % the lexicon was preset in lower case, so the
                           % pair has to be added in lower case as well.
                           % Otherwise a mixed-case tag could miss its
                           % column and be appended as a duplicate entry.
                           annot = lower(annots{i});
                           
                           % is this a cell of id sets or one id per annot?
                           if iscell(ids)
                               for j = 1:numel(ids{i})

                                   db.add_pair(ids{i}(j), annot);
                               end
                           else
                               % single index case
                               db.add_pair(ids(i), annot);
                           end
                       end

                   elseif strcmp(lexicon, 'annots_by_clip')
                       
                       % TODO: not implemented, database stays empty
                   end
                   
               else
                   % this is the binary / score matrix case
                   db.lexicon = lexicon;
                   
                   if nargin >= 2

                       db.annotsdb = sparse(annots);
                       
                       if nargin >= 3
                           db.annots_oid = ids;
                       else
                           % no owner ids supplied: fall back to the row
                           % index instead of reading the undefined "ids"
                           db.annots_oid = 1:size(annots, 1);
                       end
                   else
                       db.annotsdb = sparse(0, numel(db.lexicon));
                   end
               end
           end
       end
       
       % ---
       % retrieve annot-substructure for given clip ids, 
       % collecting std = [or = all] ,[and = common]
       % annots for these
       % ---
       function new_db = subset(db, ownerids, mode)   
       % new_db = subset(db, ownerids, {'and', ['or']}) 
       %
       % Returns a new AnnotDB that shares the complete lexicon of db (so
       % annotation ids stay valid) and contains the annotations of the
       % given owners.
       %
       % mode 'or' (default): all annotations of the given owners.
       % mode 'and':          NOT IMPLEMENTED - an empty database with the
       %                      full lexicon is returned.
       
           if nargin < 3 
               mode = 'or';
           end
       
           % ---
           % create new DB
           % we make sure the tag id index keeps 
           % the same for subsets by copying the whole 
           % lexicon
           % ---
           new_db = AnnotDB(db.lexicon);
           
           switch lower(mode)
               case 'and'

               % ---
               % TODO: implement this and 
               % improve speed below
               % ---
               case 'or'
                   
                   % successively fill with given annots
                   for i = 1:numel(ownerids)

                       % ---
                       % we retrieve annots for each clip
                       % and add them to the new database
                       % ---
                       [annot, score] = annots(db, ownerids(i));
                       for j = 1:numel(annot)

                           new_db.add_pair(ownerids(i), annot{j}, score(j));
                       end   
                   end
              otherwise
                   error('illegal owner id combination mode. possibly forgot brackets');
           end       
       end
       
       % retrieve annot-substructure for complement
       % of given clip ids
       function [new_db] = exclude(db, ownerids)
       % new_db = exclude(db, ownerids)
       %
       % Returns the subset of db made of all owners NOT listed in ownerids.
           
           % get complement of clip ids
           ownerids = setdiff(db.annots_oid, ownerids);
           
           new_db = subset(db, ownerids);
       end
       
       % ---
       % retrieve clip by annot.
       % if multiple annots are given, the clips 
       % containing all of them (logical and) are 
       % returned
       % ---
       function oids = owner(db, annotstr, mode)       
       % oids = owner(db, annotstr, {['and'], 'or'})
       %
       % annotstr: annotation string or cell array of annotation strings
       % oids:     ids of the owners carrying all ('and') or any ('or') of
       %           the given annotations. Empty if nothing matches.
           
           if nargin < 3 
               mode = 'and';
           end
           
           if ~iscell(annotstr)
               annotstr = {annotstr};
           end
           
           annotid = [];
           for i = 1:numel(annotstr)
               
               found = strcellfind(db.lexicon, annotstr{i});
               
               % an annotation missing from the lexicon is carried by no
               % owner, so an 'and' query cannot match anything. Silently
               % dropping it would return owners matching only the rest.
               if isempty(found) && strcmpi(mode, 'and')
                   oids = [];
                   return
               end
               
               annotid = [annotid found];
           end
           
           oids = owner_for_annotid(db, annotid, mode);
       end
       
       % retrieve owner ids by annotation id
       function ownerids = owner_for_annotid(db, annotid, mode)
       % ownerids = owner_for_annotid(db, annotid, {['and'], 'or'})     
       %
       % annotid:  vector of annotation ids (positions in the lexicon)
       % ownerids: ids of the owners having a score > 0 for all ('and')
       %           or at least one ('or') of the given annotations.
       %           Empty if annotid is empty.
          
           if isempty(annotid)
               ownerids = [];
               return
           end
           if nargin < 3 
               mode = 'and';
           end
 
           % an owner carries an annotation iff its score is > 0. Testing
           % the entries (instead of summing the scores) keeps this
           % correct for scored / probabilistic annotations as well.
           hits = db.annotsdb(:, annotid) > 0;
           
           switch lower(mode)
               case 'or'
                   % search for all appearing owners
                   candidates = any(hits, 2);
                   
               case 'and'
                   % search for the common owners
                   candidates = all(hits, 2);
                   
               otherwise
                   error('illegal tag combination mode');
           end

           % get positions in database
           pos = find(candidates);
           
           % return owner ids
           ownerids = db.annots_oid(pos);
       end
       
       % retrieve annotid by clip
       function [aid, score] = annotids_for_owner(db, ownerid, mode)
       % [aid, score] = annotids_for_owner(db, ownerid, {'and', ['or']})
       %
       % Single owner id: returns the ids and scores of its annotations,
       % sorted by descending score unless the db is flagged as binary.
       %
       % Several owner ids: returns the annotation count statistics
       % (see stats_count) of subset(db, ownerid, mode), cut off after
       % the last annotation that is used at least once.
           
           % single query case
           if numel(ownerid) == 1
               
               pos = owner_pos(db, ownerid);

               % get positions in database
               aid = find(db.annotsdb(pos, :) > 0);

               score = db.annotsdb(pos, aid);

               % sort ids for output
               if ~db.binary

                  [score, idx] = sort(score, 'descend');
                  aid = aid(idx);
               end
           else
               if nargin < 3 
                   mode = 'or';
               end
               
               % ---
               % the query contained multiple ids
               %
               % we dont return the single results but 
               % the statistics for this subset of clips
               % ---
               new_db = db.subset(ownerid, mode);
               
               % the labels are not needed here
               [unused_labels, score, aid] = new_db.stats_count(); %#ok<ASGLU>
               
               % cut off at score > 0 to abandon unused tags
               u = find(score > 0,1,'last');
               score = score(1:u);
               aid = aid(1:u);
           end
       end
        
       % retrieve annotation by clip
       function [out, score, aid] = annots(db, ownerid)
       % [out, score, aid] = annots(db, ownerid)
       %
       % Same as annotids_for_owner, additionally returning the
       % annotation strings (out) belonging to the annotation ids (aid).
           
           [aid, score] = db.annotids_for_owner( ownerid);
           
           out = db.get_annot_name(aid);
       end
       
      
       % retrieve annot name given a annot id
       function out = get_annot_name(db, annotid)
       % out = get_annot_name(db, annotid)
       %
       % Returns a 1 x numel(annotid) cell array with the lexicon strings
       % for the given annotation ids ({} for an empty query).
           
           out = {};
           
           % filling from the back allocates the cell array only once
           for i = numel(annotid):-1:1
               
               out{i} = db.lexicon{annotid(i)};
           end
       end
       
       % return annotation id for annotation string
       function aid = get_annot_id(db, annotstr)
       % aid = get_annot_id(db, annotstr)
       %
       % annotstr: annotation string or cell array of annotation strings
       % aid:      result of the lexicon lookup; one id per string for
       %           cell input, [] for an empty cell.
           
           if ~iscell(annotstr)
               
               % expensive search within annot list
               aid = strcellfind(db.lexicon, annotstr);
               
           elseif isempty(annotstr)
               
               % nothing to look up; keep the output defined
               aid = [];
           else
               
               % search separately for each annot 
               for i = 1:numel(annotstr)
                   aid(i) = strcellfind(db.lexicon, annotstr{i});
               end
           end
       end 
       
       % ---
       % return statistics on saved annotations.
       % = returns the sum of the scores and 
       % sorted lexicon
       % ---
       function [labels, score, annotids] = stats(db)
       % [labels, score, annotids] = stats(db)
       %
       % score:    per-annotation sum of scores over all owners, descending
       % annotids: annotation ids in the same order
       % labels:   lexicon strings in the same order
           
           score = full(sum(db.annotsdb, 1));
           [score, annotids] = sort(score,'descend');
           
           % prepare labels
           labels = db.lexicon(annotids);
       end
       
       % ---
       % return statistics on saved annotations.
       % = returns the number of annotations and 
       % sorted lexicon
       % ---
       function [labels, score, annotids] = stats_count(db)
       % [labels, score, annotids] = stats_count(db)
       %
       % score:    per-annotation number of owners with score > 0, descending
       % annotids: annotation ids in the same order
       % labels:   lexicon strings in the same order
           
           score = full(sum(db.annotsdb > 0, 1));
           [score, annotids] = sort(score,'descend');
           
           % prepare labels
           labels = db.lexicon(annotids);
       end
       
       % this is a stub for a tag cloud-like output
       function [out] = annots_cloud(db, ownerid)
       % out = annots_cloud(db)
       % out = annots_cloud(db, ownerid)
       %
       % Returns a string in which every annotation (blanks replaced by
       % '-') is repeated proportionally to its summed score, each
       % repetition followed by '; '. The top-scored annotation is
       % repeated 100 times and comes last in the string.
       % If ownerid is given, only these owners are taken into account.
       % Returns '' if there is no annotation with a positive score.
           
           % ---
           % TODO: actually output tag-cloud
           % this output is aimed at input into a web interface
           % we successfully used http://www.wordle.net/
           % ---
       
           if nargin > 1
               db2 = db.subset(ownerid);
           else
               db2 = db;
           end

           [labels, score, annotids] = stats(db2);
           
           out = '';
           
           % without a positive maximum the normalisation below would
           % divide by zero (or by an empty value) 
           top = max(score);
           if isempty(top) || ~(top > 0)
               return
           end
           
           % ---
           % Note: for performance issues we compress this data
           % to a maximum value of 100
           % ---
           reps = ceil((score ./ top) * 100);
           
           % later annotations end up in front of earlier ones, so the
           % pieces are stored in reverse order and joined once
           num = numel(annotids);
           parts = cell(1, num);
           for i = 1:num
               
               % repeat the tag according to score
               annot = strrep(labels{i},' ','-');
               parts{num - i + 1} = repmat([annot '; '], 1, max(reps(i), 0));
           end
           out = [parts{:}];
       end
       
       
       function out = size(db)
           % returns the size of this db, i.e. the number of lexicon entries
           
           out = numel(db.lexicon);
       end

       function add_pair(db, ownerid, annot, score)
       % add_pair(db, ownerid, annot)
       % add_pair(db, ownerid, annot, score)
       %
       % Stores score (default 1) for the given owner / annotation pair.
       % Unknown annotations are appended to the lexicon and unknown
       % owners are appended to the owner list; the annotation matrix
       % grows accordingly. An existing entry is overwritten.

            if nargin < 4 
                score = 1;
            end
            
            aid = strcellfind(db.lexicon, annot);     
            
            % create new position for annotation if necessary
            if isempty(aid) 
                
                aid = numel(db.lexicon) + 1;
                
                % add to lexicon
                db.lexicon = {db.lexicon{:}, annot};
                
                % enhance annotation matrix by one empty column
                db.annotsdb = [db.annotsdb, ...
                    sparse(size(db.annotsdb,1), 1)];
            end
            
            
            % create new position for clip if necessary
            pos = owner_pos(db, ownerid);
            if isempty(pos) 
                
                pos = numel(db.annots_oid) +1;
                
                % add to oid. Indexed assignment appends correctly for
                % row as well as column shaped id vectors, whereas
                % horizontal concatenation fails for a column vector.
                db.annots_oid(pos) = ownerid;
                
                % enhance annotation matrix by one empty row
                db.annotsdb = [db.annotsdb; ...
                    sparse(1, size(db.annotsdb, 2))];
            end
            
            % save data to database
            db.annotsdb(pos, aid) = score;
       end 
       
   end
   
   
   methods(Hidden)
       
       function pos = owner_pos(db, ownerid)
       % pos = owner_pos(db, ownerid)
       %
       % returns the database position (row of the annotation matrix)
       % for the given owner id, or an empty result if it is unknown
       
            pos = find(db.annots_oid == ownerid);
       end
   end
   
end