function [data, dset_origin] = collect_all_mirex_annotations(base_directory, dsets, algos)
% function [data dset_origin] = collect_all_mirex_annotations(base_directory, dsets, algos)
%
% GET ALL THE DATA!
% This function collects annotations from all years of MIREX evaluation.
% Annotation data (onsets and labels) all go in a single structure array,
% together with the location of the file each annotation was read from.
%
% BASE_DIRECTORY should be the "mirex_path" specified in "get_mirex_estimates.rb",
% or whatever directory contains all the downloaded MIREX data. For example:
% "/Users/me/Desktop/MIREX_data"
%
% DSETS (optional) should contain the names of the datasets, as a cell array of
% strings (a single string is also accepted). If omitted or empty, the default
% value is all of them:
%   {'mrx09','mrx10_1','mrx10_2','sal'}
% Keep the DSETS in a consistent order across your work, because the index of the
% dataset is important for some of the other functions.
%
% ALGOS (optional) should contain the name of at least one algorithm, so that the
% data can be accessed correctly. (Only the first algo is used, since the
% annotation is the same for each.) If omitted or empty, the default value is
% {'KSP1'}.
%
% The output DATA structure array contains the following fields:
%
% DATA(i).TIM = onset times of annotation (sorted in ascending order)
% DATA(i).LAB = labels of sections
% DATA(i).FILE = file from which the above information derives
% DATA(i).DSET = numerical index of the dataset (index into DSETS)
%
% The optional output DSET_ORIGIN is a column vector with DSET_ORIGIN(i) equal to
% DATA(i).DSET.
%
% Files that cannot be opened or read are reported on the command line and
% skipped; they never abort the collection. Annotations that load but are empty
% are reported (file name only) and left out of DATA.
%
% Dependencies:
%   - load_annotation.m

% Fill in the documented defaults for any argument that was omitted or left empty.
if nargin < 2 || isempty(dsets)
    dsets = {'mrx09','mrx10_1','mrx10_2','sal'};
end
if nargin < 3 || isempty(algos)
    algos = {'KSP1'};
end
% Accept a bare string as shorthand for a one-element cell array.
if ischar(dsets)
    dsets = {dsets};
end
if ischar(algos)
    algos = {algos};
end

% The song names and the ground truth are identical for every algorithm, so only
% the first ALGO is ever consulted.
algo = algos{1};

% SONG_NAMES{k} will contain the names of all the individual song files from
% dataset k. It is pre-filled with empty lists so that a dataset whose CSV file
% cannot be read simply contributes no songs.
song_names = cell(1, length(dsets));

% Use one CSV file per dataset to discover the names of all the songs.
fprintf('About to open some CSV files to extract the names of the songs in MIREX. If you see lots of errors, please ensure that the files exist in the correct location.\n')
for k = 1:length(dsets)
    song_names{k} = {};
    csv_file = fullfile(base_directory, dsets{k}, algo, 'per_track_results.csv');
    fid = fopen(csv_file);
    if fid == -1
        fprintf('Error opening or reading the following CSV file:\n   %s\n', csv_file);
        continue
    end
    try
        % Read the first two comma-separated columns and discard the rest of each
        % line. The song names are in the second column; the first row is
        % skipped (treated as a header).
        columns = textscan(fid, '%s%s%*[^\n]', 'Delimiter', ',');
        song_names{k} = columns{2}(2:end);
    catch
        fprintf('Error opening or reading the following CSV file:\n   %s\n', csv_file);
    end
    % Always release the file handle, even when parsing failed.
    fclose(fid);
end

fprintf('OK, done with that.\n\n')

% MRXTRUTH will contain one entry for every song in the MIREX evaluation whose
% ground truth could be loaded: its onset times, labels, source file and dataset index.
mrxtruth = struct('tim', {}, 'lab', {}, 'file', {}, 'dset', {});

% For every dataset, look through all of its song names and load the annotation.
fprintf('About to load all the ground truth files published by MIREX. If you see lots of errors, please ensure that the files exist in the correct location, and that the function ''load_annotation'' exists.\n')
for k = 1:length(dsets)
    dset = dsets{k};
    for i = 1:length(song_names{k})
        % FYI: GT stands for 'ground truth', in contrast to PRED for 'prediction'.
        gt = fullfile(base_directory, dset, algo, strcat(song_names{k}{i}, '_gt.txt'));
        try
            % Load into temporaries first so that a failure never leaves a
            % half-initialised record behind in MRXTRUTH.
            [tim, lab] = load_annotation(gt, 'two_column');
        catch
            fprintf('Error opening the following ground truth file:\n   %s\n', gt);
            continue
        end
        if isempty(tim)
            % Sometimes the annotation might be empty! This can be bad news.
            % If this happens, print out the name of the offending file and do
            % not add it to the structure of annotations. The name is printed
            % through '%s' so that '%' or '\' in the path are shown literally.
            fprintf('%s\n', gt);
            continue
        end
        n = numel(mrxtruth) + 1;
        mrxtruth(n).tim = tim;
        mrxtruth(n).lab = lab;
        mrxtruth(n).file = gt;
        mrxtruth(n).dset = k;
    end
end

fprintf('OK, done with that.\n\n')

% It can be useful to have a separate vector pointing to the index of the dataset.
% This is an optional output of the function.
data = mrxtruth;
dset_origin = zeros(length(data), 1);
for i = 1:length(data)
    dset_origin(i) = data(i).dset;
end

% Did this actually happen? That some of the onset times are in the incorrect order?
% Why yes, it did. It happens probably due to some floating point error...
% What you would see is two boundaries a tiny distance apart, where the later one
% appeared first, like: "145.0000468     silence; 145.0000000     end".
% In such cases, a perfectly acceptable fix is to just re-sort the times. They should
% be in sorted order anyway! (Only the times are re-sorted; the labels keep their
% original order.)
for i = 1:length(data)
    data(i).tim = sort(data(i).tim);
end