annotate collect_all_public_annotations.m @ 2:624231da830b

Removed name from comments. Updated readme extensively. Renamed 2 files without significant changes. Added EP data as a bonus.
author Jordan Smith <jordan.smith@eecs.qmul.ac.uk>
date Fri, 20 Sep 2013 17:05:34 +0100
parents 818a4b5f3384
children 92b5a46bc67b
rev   line source
function [publictruth dset_origin] = collect_all_public_annotations(base_directory)
% function [publictruth dset_origin] = collect_all_public_annotations(base_directory)
%
% GET ALL THE DATA!
% This function collects annotations from many public repositories of structural analyses.
% Annotation data (onsets and labels) all go in a single structure array, including file
% locations. Refer to the README file to see what these repositories are and how to
% download them.
%
% BASE_DIRECTORY should be the "mirex_path" specified in "get_mirex_estimates.rb",
% or whatever directory contains all the downloaded MIREX data. For example:
% "/Users/me/Desktop/MIREX_data"
% The annotations are looked for in the "public_data" subdirectory of BASE_DIRECTORY.
%
% Before running this script, you must have downloaded the original repositories
% to the "mirex_path" directory, and unzipped them. If you did that, then this script
% should be able to find and interpret all the annotations. (Except for those in .xml
% format, which should be pre-processed. Again, refer to the README.)
% A repository that cannot be found is reported on screen and skipped.
%
% The output PUBLICTRUTH structure array contains the following fields for the ith song:
%
%     PUBLICTRUTH(i).TIM = onset times of annotation
%     PUBLICTRUTH(i).LAB = labels of sections
%     PUBLICTRUTH(i).FILE = file from which the above information derives
%
% The output DSET_ORIGIN has one row per song, aligned with PUBLICTRUTH:
%
%     DSET_ORIGIN(i,1) = numerical index of the main dataset
%                        (1 = RWC, 2 = QM/Isophonics, 3 = EP, 4 = IRISA, 5 = TUT, 6 = SALAMI)
%     DSET_ORIGIN(i,2) = numerical index of the subset within that dataset
%                        (e.g., within QM: 'CaroleKing', 'Queen', etc.), or 1 if there
%                        is only one subset.
%
% Dependencies:
%   - load_annotation.m

public_dir = fullfile(base_directory,'public_data');

% Assemble lists of all the directories where the data live. This section is very hacky!!!

% RWC
rwc_dirs = {fullfile(public_dir,'AIST.RWC-MDB-C-2001.CHORUS'), ...
            fullfile(public_dir,'AIST.RWC-MDB-G-2001.CHORUS'), ...
            fullfile(public_dir,'AIST.RWC-MDB-J-2001.CHORUS'), ...
            fullfile(public_dir,'AIST.RWC-MDB-P-2001.CHORUS')};

% QM, i.e., Isophonics data from Queen Mary
qm_dirs = {fullfile(public_dir,'Carole%20King%20Annotations'), ...
           fullfile(public_dir,'Michael%20Jackson%20Annotations'), ...
           fullfile(public_dir,'Queen%20Annotations'), ...
           fullfile(public_dir,'The%20Beatles%20Annotations'), ...
           fullfile(public_dir,'Zweieck%20Annotations')};

% EP, i.e., data released by Ewald Peiszer
ep_dir = fullfile(public_dir,'ep_groundtruth_txt/groundtruth');
% Or, you could download the original data, and convert the XML files to LAB files using
% the included Ruby script xml2lab.rb.
% ep_dir = fullfile(public_dir,'ep_groundtruth/groundtruth');

% IRISA
irisa_dirs = {fullfile(public_dir,'IRISA.RWC-MDB-P-2001.BLOCKS'), ...
              fullfile(public_dir,'IRISA.RWC-MDB-P-2012.SEMLAB_v003_reduced'), ...
              fullfile(public_dir,'IRISA.RWC-MDB-P-2012.SEMLAB_v003_full')};

% TUT Beatles: every subdirectory of public_data/TUT is treated as one subset.
tut_dirs = {};
[tut_status tutfiles] = fileattrib(fullfile(public_dir,'TUT','*'));
if tut_status && isstruct(tutfiles),
    for i=1:length(tutfiles),
        if tutfiles(i).directory==1,
            tut_dirs{end+1} = tutfiles(i).Name;
        end
    end
else
    % When nothing matches, FILEATTRIB returns a message instead of a struct array.
    fprintf('No files found matching the following pattern (skipping):\n   %s\n',fullfile(public_dir,'TUT','*'));
end

% UPF Beatles
% NOTE(review): this list is assembled for reference only; no UPF annotations are loaded
% below. The album folder names come from the original hard-coded (machine-specific) list
% and are now expressed relative to PUBLIC_DIR. Verify them against the downloaded archive
% before adding a loader.
upf_albums = {'01_-_Please_please_me_1963', '02_-_With_The_Beatles_1963', ...
              '03_-_A_hard_days_night_1964', '04_-_Beatles_for_sale_1964', ...
              '05_-_Help_1965', '06_-_Rubber_Soul', '07_-_Revolver', ...
              '08_-_Sgt._Pepper''s_Lonely_Hearts_Club_Band', '09_-_Magical_Mystery_Tour', ...
              '10_-_The_Beatles (White Album) CD1', '10_-_The_Beatles (White Album) CD2', ...
              '11_-_Abbey_Road', '12_-_Let_it_Be'};
upf_dirs = cell(1,length(upf_albums));
for i=1:length(upf_albums),
    upf_dirs{i} = fullfile(public_dir,upf_albums{i});
end

% SALAMI
salami_dir = fullfile(public_dir,'SALAMI_data_v1.2/data');

% Start from empty containers. PUBLICTRUTH is created as an empty struct array with its
% three fields so that it can be grown one song at a time.
dset_origin = [];
publictruth = struct('tim',{},'lab',{},'file',{});

% NB: The dataset indices (1 to 6) below are HARD-CODED.

% Load RWC data
% NB: many messages will be printed here because many of the RWC files are empty.
% (Unlike the other datasets, no group-read permission check is applied to RWC files.)
for i=1:length(rwc_dirs),
    [publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
        strcat(rwc_dirs{i},'/*'), 'lab', [1 i], false, '');
end

% Load Isophonics data (only the .lab files inside each "seglab" folder)
for i=1:length(qm_dirs),
    [publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
        fullfile(qm_dirs{i},'seglab','*'), 'lab', [2 i], true, '.lab');
end

% Load EP data
[publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
    strcat(ep_dir,'/*.txt'), 'two_column', [3 1], true, '');

% Load IRISA data
for i=1:length(irisa_dirs),
    [publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
        strcat(irisa_dirs{i},'/*.lab'), 'lab', [4 i], true, '');
end

% Load TUT data
for i=1:length(tut_dirs),
    [publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
        strcat(tut_dirs{i},'/*.lab'), 'lab', [5 i], true, '');
end

% Load SALAMI data (only the files whose names end in "uppercase.txt").
% The name of EACH file is tested; the earlier code always tested the third file in the
% listing, so a single file decided whether everything or nothing was loaded.
[publictruth dset_origin] = append_annotations(publictruth, dset_origin, ...
    strcat(salami_dir,'/*'), 'two_column', [6 1], true, 'uppercase.txt');

% Would you believe that in some of the annotations, two times are in the wrong order? It is simply appalling.
% We fix this here.
% NOTE(review): only the times are sorted; the labels are left in their original order.
for i=1:length(publictruth),
    sorted_tim = sort(publictruth(i).tim);
    if ~isequal(publictruth(i).tim,sorted_tim),
        publictruth(i).tim = sorted_tim;
        fprintf('Fixed order of time points in this file:%s\n',publictruth(i).file)
    end
end


function [publictruth dset_origin] = append_annotations(publictruth, dset_origin, pattern, annotation_format, dset_id, require_group_read, required_suffix)
% Load every annotation file matching PATTERN and append it to PUBLICTRUTH / DSET_ORIGIN.
%
%   PATTERN            = wildcard path handed to FILEATTRIB
%   ANNOTATION_FORMAT  = format string handed to LOAD_ANNOTATION ('lab' or 'two_column')
%   DSET_ID            = 1x2 row [dataset subset] appended to DSET_ORIGIN for each song
%   REQUIRE_GROUP_READ = if true, skip files whose GroupRead attribute is not 1
%   REQUIRED_SUFFIX    = if non-empty, skip files whose full name does not end with it
%
% Directories are always skipped. Files that cannot be read, or that yield no onset
% times, are reported on screen and skipped, so one bad file never stops the collection.

[status all_files] = fileattrib(pattern);
if ~status || ~isstruct(all_files),
    % When nothing matches, FILEATTRIB returns a message instead of a struct array.
    fprintf('No files found matching the following pattern (skipping):\n   %s\n',pattern);
    return
end

for j=1:length(all_files),
    file_name = all_files(j).Name;
    if all_files(j).directory ~= 0,
        continue
    end
    if require_group_read && ~isequal(all_files(j).GroupRead,1),
        continue
    end
    if ~name_ends_with(file_name,required_suffix),
        continue
    end
    % Read into temporaries first, so that a failed or empty read leaves the outputs untouched.
    try
        [tim lab] = load_annotation(file_name,annotation_format);
    catch
        fprintf('Error opening or reading the following file. (It might be empty, or not a song file.)\n   %s\n',file_name);
        continue
    end
    if isempty(tim),
        fprintf('The following file appears to be empty:\n   %s\n',file_name);
        continue
    end
    publictruth(end+1).tim = tim;
    publictruth(end).lab = lab;
    publictruth(end).file = file_name;
    dset_origin = [dset_origin; dset_id];
end


function tf = name_ends_with(file_name, suffix)
% True if SUFFIX is empty or FILE_NAME ends with SUFFIX. Names shorter than SUFFIX
% simply give false instead of causing an indexing error.

n = length(suffix);
tf = (n == 0) || (length(file_name) >= n && strcmp(file_name(end-n+1:end),suffix));