Mercurial > hg > mirex-meta-analysis
changeset 4:92b5a46bc67b
Made a bunch of progressive updates to several files. Focused on adding comments that will print to screen to keep the user apprised of progress.
author | Jordan Smith <jordan.smith@eecs.qmul.ac.uk> |
---|---|
date | Sat, 07 Dec 2013 18:18:50 +0000 |
parents | c5c97558fb2f |
children | 8d896eec680e |
files | 1-get_mirex_estimates.rb collect_all_mirex_algo_output_data.m collect_all_mirex_annotations.m collect_all_mirex_results.m collect_all_public_annotations.m compile_datacubes.m do_correlation_analyses.m match_mirex_to_public_data.m match_mirex_to_public_data_macro.m readme.txt |
diffstat | 10 files changed, 407 insertions(+), 114 deletions(-) [+] |
line wrap: on
line diff
--- a/1-get_mirex_estimates.rb Fri Sep 20 17:12:46 2013 +0100 +++ b/1-get_mirex_estimates.rb Sat Dec 07 18:18:50 2013 +0000 @@ -1,7 +1,7 @@ require "CSV" require "open-uri" # require "simplexml" -mirex_path = "/Users/me/Desktop/MIREX_data" # EDIT THIS TO BE YOUR OWN DESIRED PATH. +mirex_path = "/Users/jordan/Desktop/MIREX_data" # EDIT THIS TO BE YOUR OWN DESIRED PATH. # IT WILL NEED TO HOLD ROUGHLY 70 MB OF DATA. def url_download(uri, filename=".") @@ -48,6 +48,8 @@ # Define list of algorithms and datasets: algos = ["SP1", "SMGA2", "MHRAF1", "SMGA1", "SBV1", "KSP2", "OYZS1", "KSP3", "KSP1"] datasets = ["mrx09", "mrx10_1", "mrx10_2", "sal"] +year = "2012" +puts "Thanks for starting the script! Stay tuned for periodic updates." # Create appropriate directory tree and download CSV files: puts("Downloading CSV files...\n") @@ -61,11 +63,16 @@ Dir.mkdir(algo_path) unless File.directory?(algo_path) # Download the CSV file to this directory: algocsvpath = File.join(mirex_path,dset,algo,"per_track_results.csv") - csv_path = File.join("http://nema.lis.illinois.edu/nema_out/mirex2012/results/struct",dset,algo,"per_track_results.csv") + csv_path = File.join("http://nema.lis.illinois.edu/nema_out/mirex",year,"/results/struct",dset,algo,"per_track_results.csv") url_download(csv_path, algocsvpath) end end +puts "..done with that." + +puts "Now we will download all the files output by each algorithm. This could take a while depending on your connection." +puts "Since this script points to " + datasets.length.to_s + " datasets and " + algos.length.to_s + " algorithms, you should expect to wait however long it takes between each of the next lines to appear, times " + (datasets.length*algos.length).to_s + "." + # Read each CSV file and download all the json files it points to: datasets.each do |dset| algos.each do |algo| @@ -77,7 +84,7 @@ # For each line in the spreadsheet, extract the songid and download the corresponding json document. 
csv_data.each do |line| song_id = line[1] - url = "http://nema.lis.illinois.edu/nema_out/mirex2012/results/struct/" + dset + "/" + algo.downcase + "segments" + song_id.delete("_") + ".js" + url = "http://nema.lis.illinois.edu/nema_out/mirex" + year + "/results/struct/" + dset + "/" + algo.downcase + "segments" + song_id.delete("_") + ".js" download_path = File.join(download_folder,song_id + ".js") # download_path = download_folder + "/" + song_id + ".js" url_download(url, download_path) @@ -86,12 +93,19 @@ puts("Done with " + dset + " dataset!\n") end +puts "..done with that." + +puts "Now, a much faster step: turning all the json files you downloaded into simpler text files." # Scan for all the json files, and convert each one into two text files, one for the algorithm output, one for the annotation: all_json_files = Dir.glob(File.join(mirex_path,"*","*","*.js")) all_json_files.each do |file| convert_file(file) + puts file end +puts "..done with that." + +puts "Now, PART 2 of the script: we download all the zip files (from various websites) that contain the public collections of ground truth files. This will only take a couple minutes, depending on connection speed (it's about 4 MB total)." # # # # PART 2: GET (AND CONVERT) THE ANNOTATION DATA PUBLISHED BY OTHERS @@ -108,4 +122,8 @@ end # # # # NOW, PLEASE EXIT THE SCRIPT, AND UNZIP ALL THOSE PACKAGES. -# # # # WHEN YOU'RE DONE, GO ONTO THE PARENT MATLAB FILE TO RUN THE ANALYSES. \ No newline at end of file +# # # # WHEN YOU'RE DONE, GO ONTO THE PARENT MATLAB FILE TO RUN THE ANALYSES. +puts "..done with that.\n\n" +puts "Script appears to have ended successfully. All files were downloaded and saved to " + public_data_path +"." +puts "To continue please unpack all zip files, start MATLAB, and run 2-generate_smith2013_ismir.m. You can read more on README." +puts "Important: be sure that the zip files unpack into the correct file structure. Again, see README for details." \ No newline at end of file
--- a/collect_all_mirex_algo_output_data.m Fri Sep 20 17:12:46 2013 +0100 +++ b/collect_all_mirex_algo_output_data.m Sat Dec 07 18:18:50 2013 +0000 @@ -53,6 +53,7 @@ fprintf('Error opening or reading the following CSV file:\n %s\n',csv_files{i}); end end +fprintf('OK, done with that.\n\n') fprintf('About to go through all the algorithm outputs and load all the predicted song descriptions. If you see lots of errors, please ensure that the files exist in the correct location.\n') @@ -73,4 +74,5 @@ end end end -end \ No newline at end of file +end +fprintf('OK, done with that.\n\n')
--- a/collect_all_mirex_annotations.m Fri Sep 20 17:12:46 2013 +0100 +++ b/collect_all_mirex_annotations.m Sat Dec 07 18:18:50 2013 +0000 @@ -57,6 +57,8 @@ end end +fprintf('OK, done with that.\n\n') + % For every dataset (DSET), look through all the names (YEAR(k).NAMES), and load the annotation. fprintf('About to load all the ground truth files published by MIREX. If you see lots of errors, please ensure that the files exist in the correct location, and that the function ''load_annotation'' exists.\n') for k=1:length(dsets), @@ -82,6 +84,8 @@ end end +fprintf('OK, done with that.\n\n') + % It can be useful to have a separate structure pointing to the index of the dataset. % This is an optional output of the function.
--- a/collect_all_mirex_results.m Fri Sep 20 17:12:46 2013 +0100 +++ b/collect_all_mirex_results.m Sat Dec 07 18:18:50 2013 +0000 @@ -51,4 +51,5 @@ data(k).algo(j).results(:,i-2) = data_tmp{i}; end end -end \ No newline at end of file +end +fprintf('Oh by the way, I just collected all the results spreadsheets into a data structure. That was fast.\n\n') \ No newline at end of file
--- a/collect_all_public_annotations.m Fri Sep 20 17:12:46 2013 +0100 +++ b/collect_all_public_annotations.m Sat Dec 07 18:18:50 2013 +0000 @@ -29,6 +29,9 @@ public_dir = fullfile(base_directory,'public_data'); +fprintf('OK, now we will be collecting all the public annotations into a data structure. You SHOULD expect a number of errors here, because some of the source annotations you just downloaded are actually empty. Namely, a bunch of the AIST ones: \nAIST.RWC-MDB-C-2001.CHORUS/RM-C025_A.CHORUS.TXT\nAIST.RWC-MDB-C-2001.CHORUS/RM-C025_D.CHORUS.TXT\nAIST.RWC-MDB-G-2001.CHORUS/RM-G040.CHORUS.TXT\nAIST.RWC-MDB-G-2001.CHORUS/RM-G042.CHORUS.TXT\nEtc... a whole bunch.\n') +fprintf('It might also tell you it cannot read some README files. Do not worry about this. Finally, you will also see the script notify you that it has fixed some of the points in some Beatles annotations. This is because sometimes the numbers in the file are actually out of order. You can look up one of the songs to see an example. It is not really a big issue.\n\n') + % Assemble lists of all the directories where the data live. This section is very hacky!!! % RWC @@ -202,3 +205,5 @@ fprintf('Fixed order of time points in this file:%s\n',publictruth(i).file) end end + +fprintf('Phew, OK! If you are worried about these errors, be sure to read the comments printed just before it.\n\n') \ No newline at end of file
--- a/compile_datacubes.m Fri Sep 20 17:12:46 2013 +0100 +++ b/compile_datacubes.m Sat Dec 07 18:18:50 2013 +0000 @@ -96,7 +96,7 @@ % It can be nice to see a progress meter... It took me about 30 seconds to compute 100 songs, and there are ~1500 songs. if mod(i,100)==0, toc - fprintf('Getting there. We have done %i songs so far.\n',i) + fprintf('Getting there. We have done %i out of %i songs so far.\n',i,size(datacube,1)) end end fprintf('Done!\nJust tidying up now.......')
--- a/do_correlation_analyses.m Fri Sep 20 17:12:46 2013 +0100 +++ b/do_correlation_analyses.m Sat Dec 07 18:18:50 2013 +0000 @@ -12,72 +12,200 @@ % is that we select the songs, metrics and algorithms to compare, and then choose % whether to take the median across all songs or across all algorithms. +fprintf('We are making Figure 1a now.\n') + [asig pval a a_] = do_correlation(megadatacube, lab_measures, indexing_info(1).manual_set, [1:9],... 0, 0, 1, 0, indexing_info(1).labels, 0.05); saveas(gcf,'./plots/fig1a.jpg') +fprintf('We are making Figure 1b now.\n') + [asig pval a a_] = do_correlation(megadatacube, lab_measures, indexing_info(1).manual_set, [1:9],... 0, 1, 0, 0, indexing_info(1).labels, 0.05); saveas(gcf,'./plots/fig1b.jpg') +fprintf('We are making Figure 2a now. (This one usually takes a while.)\n') + [asig pval a a_] = do_correlation(megadatacube, seg_measures, indexing_info(2).manual_set, [1:9],... 0, 0, 1, 0, indexing_info(2).labels, 0.05); saveas(gcf,'./plots/fig2a.jpg') +fprintf('We are making Figure 2b now.)\n') + [asig pval a a_] = do_correlation(megadatacube, seg_measures, indexing_info(2).manual_set, [1:9],... 0, 1, 0, 0, indexing_info(2).labels, 0.05); saveas(gcf,'./plots/fig2b.jpg') +fprintf('We are making Figure 3 now.\n') + [asig pval a a_] = do_correlation_fig3_only(megadatacube, lab_measures, [indexing_info(1).manual_set indexing_info(2).manual_set], [1:9], 0, 1, 0, 0, indexing_info(2).all_labels([indexing_info(1).manual_set indexing_info(2).manual_set]), 1, indexing_info(3).manual_set, indexing_info(3).labels); saveas(gcf,'./plots/fig3.jpg') + +% Now we are done making figures. The following sequences of commands generate output to validate some of the statements in the article. + + + +% Section 3.1: "Does this indicate that the algorithms are better at boundary precision than recall? In fact, the opposite is the case: average bp6 bp.5 was simply consistently worse for most algorithms." 
+% For all algos: +mean(median(megadatacube(:,indexing_info(2).manual_set([3 4 7 8]),:),3),1) +% For each algo: +mean(megadatacube(:,indexing_info(2).manual_set([3 4 7 8]),:),1) +% Recall (the second pair of values) surpass precision (the first pair of values) for most of the algorithm runs. There are two exceptions: algorithms 4 (R a little less than P) and 5 (P much better than R). + + +% Are the trends qualitatively similar across datasets? (Section 3.1: "...the findings of this section were consistent across the datasets, albeit with some variation in significance levels.") +% % % Fig 1a +% All the datasets: +figure,[asig pval a a_] = do_correlation(megadatacube, lab_measures, indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); +% Isophonics et al.: +figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); +% RWC (AIST): +figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); +% SALAMI: +figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); +% % % Fig 1b +% All the datasets: +figure, [asig pval a a_] = do_correlation(megadatacube, lab_measures, indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); +% Isophonics et al.: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); +% RWC (AIST): +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); +% SALAMI: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(1).manual_set, 
[1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); +% % % Fig 2a +% All the datasets: +figure, [asig pval a a_] = do_correlation(megadatacube, seg_measures, indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); +% Isophonics et al.: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); +% RWC (INRIA): +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,2), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); +% RWC (AIST): +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); +% SALAMI: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); +% % % Fig 2b +% All the datasets: +figure, [asig pval a a_] = do_correlation(megadatacube, seg_measures, indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); +% Isophonics et al.: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); +% RWC (INRIA): +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,2), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); +% RWC (AIST): +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); +% SALAMI: +figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); + + +% Section 3.2: "While the middle half of the values of nsa [number of segments in annotation] ranges from 7 
and 13 segments, the middle values for nse [number of segments for estimated description] for most algorithms range from 11 to 20 segments. The two exceptions are MHRAF and OYZS [algorithms 4 and 5], for which both msle and nse match the distributions seen in the annotations." + +% Index 17 gives the number of segments in the annotation; 21 gives the number of segments in the estimated description of the algorithm. +% Boxplot shows general trend of overestimating number of segments. +H = boxplot(megadatacube(:,[17 21],:)) +% Take the middle half of the data for annotated and estimated segments. Look at the range. + +tmp = sort(megadatacube(:,17,:)); +tmp = sort(tmp(:)); +tmp(round(length(tmp)/4)), tmp(3*round(length(tmp)/4)) +% The middle half of the annotated descriptions have 7 to 13 segments. + +tmp2 = sort(megadatacube(:,21,:)); +[tmp2(round(length(tmp2)/4),:,:), tmp2(round(length(tmp2)*3/4),:,:)] +% Setting aside algorithms 4 and 5, the others all have middle ranges of roughly 11 to 24. +tmp2 = sort(tmp2(:)); +tmp2(round(length(tmp2)/4)), tmp2(3*round(length(tmp2)/4)) +% Averaging the other algorithms together, the middle range is exactly 10 to 20. + + + + + do blah % % % % % % % % % % % % The rest of this is still under construction, so I have inserted an error in the previous line to halt the script. -% Are the trends qualitatively similar across datasets? 
-% Fig 1a -figure,[asig pval a a_] = do_correlation(megadatacube, lab_measures, indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); -figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); -figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); -figure,[asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(1).manual_set, [1:9], -1, 0, 1, -1, indexing_info(1).labels, 1); -% Fig 1b -figure, [asig pval a a_] = do_correlation(megadatacube, lab_measures, sind_manual1, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(1).manual_set, [1:9], -1, 1, 0, -1, indexing_info(1).labels, 1); -% Fig 2a -figure, [asig pval a a_] = do_correlation(megadatacube, seg_measures, sind_manual2, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,2), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = 
do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(2).manual_set, [1:9], -1, 0, 1, -1, indexing_info(2).labels, 1); -% Fig 2b -figure, [asig pval a a_] = do_correlation(megadatacube, seg_measures, sind_manual2, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,1), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,2), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,3), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); -figure, [asig pval a a_] = do_correlation(megadatacube, ismember(mirex_dset_origin,4), indexing_info(2).manual_set, [1:9], -1, 1, 0, -1, indexing_info(2).labels, 1); +% % % % % % % % % % % ENd OF REAL WORK AREA % % % % % % % % % % % % % -% "Does this indicate that the algorithms are better at boundary precision than recall? In fact, the opposite is the case: average bp6 bp.5 was simply consistently worse for most algorithms." -% For all algos: -mean(median(megadatacube(:,sind_manual2,:),3),1) -% For each algo: -mean(megadatacube(:,sind_manual2,:),1) +% Look at best 10 and worst 10 songs in each dataset, according to PW_F metric. +% Average results across algorithms for this one. 
+unique_algorithms = [3 4 5 6 7]; +tmp = datacube; +tmp(:,:,3) = mean(tmp(:,:,[1:3,9]),3); +tmp(:,:,7) = mean(tmp(:,:,7:8),3); +tmp = mean(tmp(mirex_dset_origin==1,:,unique_algorithms),3); +[tmp1 order] = sortrows(tmp,-3); +order1 = lab_measures(order); +pub_songids = mir2pub(order); +values = tmp1((pub_songids>0),3); +filenames = {}; +for i=1:length(pub_songids), + if pub_songids(i)>0, + filenames{end+1} = public_truth(pub_songids(i)).file; + end +end -H = boxplot(megadatacube(:,[17 21],:)) +mirid = pub2mir(336); +make_structure_image(mirid, miranns, MD, mirdset, X, MR) +saveas(gcf,'./plots/MJ_dont_care.jpg') +make_structure_image(121, miranns, MD, mirdset, X, MR) +saveas(gcf,'./plots/play_the_game.jpg') -tmp = sort(megadatacube(:,17,:)); -tmp2 = sort(megadatacube(:,21,:)); -tmp2(round(length(tmp2)/4),:,:), tmp2(round(length(tmp2)*3/4),:,:) +% Plot difficulty by album: -tmp2 = sort(tmp2(:)); -tmp2(round(length(tmp2)/4)), tmp2(3*round(length(tmp2)/4)) +genres = {}; +subgenres = {}; +issalami = zeros(length(filenames),1); +for i=1:length(filenames), + file = filenames{i}; + if strfind(file,'SALAMI_data'), + issalami(i)=1; + salami_id = file(79:85); + salami_id = salami_id(1:strfind(salami_id,'/')-1); + salami_row = find(aaux.metadata{1}==str2num(salami_id)); + genres{end+1} = cell2mat(aaux.metadata{15}(salami_row)); + subgenres{end+1} = cell2mat(aaux.metadata{16}(salami_row)); + end +end +gs = grp2idx(genres); +subgs = grp2idx(subgenres); +boxplot(values(find(issalami)),transpose(genres)) +axis([0.5 5.5 0 1]) +saveas(gcf,'salami_breakdown.png') +boxplot(values(find(issalami)),transpose(subgenres),'colors',cmap(round(gs*63/6),:),'orientation','horizontal') -% % % % % % % % % % % ENd OF REAL WORK AREA % % % % % % % % % % % % % +[tmp1 tmp2] = hist(subgs,max(subgs)-1); +tmp1 = find(tmp1>5); % do these subgenres only +tmp1 = ismember(subgs,tmp1); +tmp2 = find(issalami); 
+boxplot(values(tmp2(tmp1)),transpose(subgenres(tmp1)),'colors',cmap(round(gs(tmp1)*63/6),:),'orientation','horizontal') + + + + + +% Look at scatter plots so that we can qualitatively attribute the correlations to things (e.g., low-precision variance). +tmpcube = mean(datacube,3); +for i=1:4, + for j=i+1:5, + subplot(5,5,i+(j-1)*5) + scatter(tmpcube(:,i),tmpcube(:,j),'x') + end +end + + + + + + + + + + + + clf,imagesc(a.*(abs(a)>.7)) @@ -194,74 +322,6 @@ % BoxPlot of the number of segments in each algorithm output boxplot(reshape(newcube(:,7,:),[length(newcube),9,1])) -% Look at best 10 and worst 10 songs in each dataset, according to PW_F metric. -% Average results across algorithms for this one. -unique_algorithms = [3 4 5 6 7]; -tmp = datacube; -tmp(:,:,3) = mean(tmp(:,:,[1:3,9]),3); -tmp(:,:,7) = mean(tmp(:,:,7:8),3); -tmp = mean(tmp(lab_measures,:,unique_algorithms),3); -[tmp1 order] = sortrows(tmp,-3); -order1 = lab_measures(order); -pub_songids = X.mir2pub(order1); -values = tmp1((pub_songids>0),3); -filenames = {}; -for i=1:length(pub_songids), - if pub_songids(i)>0, - filenames{end+1} = X.pubanns(pub_songids(i)).file; - end -end - -mirid = pub2mir(336); -make_structure_image(mirid, miranns, MD, mirdset, X, MR) -saveas(gcf,'./plots/MJ_dont_care.jpg') -make_structure_image(121, miranns, MD, mirdset, X, MR) -saveas(gcf,'./plots/play_the_game.jpg') - -% Plot difficulty by album: - - -genres = {}; -subgenres = {}; -issalami = zeros(length(filenames),1); -for i=1:length(filenames), - file = filenames{i}; - if strfind(file,'SALAMI_data'), - issalami(i)=1; - salami_id = file(79:85); - salami_id = salami_id(1:strfind(salami_id,'/')-1); - salami_row = find(aaux.metadata{1}==str2num(salami_id)); - genres{end+1} = cell2mat(aaux.metadata{15}(salami_row)); - subgenres{end+1} = cell2mat(aaux.metadata{16}(salami_row)); - end -end -gs = grp2idx(genres); -subgs = grp2idx(subgenres); -boxplot(values(find(issalami)),transpose(genres)) -axis([0.5 5.5 0 1]) 
-saveas(gcf,'salami_breakdown.png') -boxplot(values(find(issalami)),transpose(subgenres),'colors',cmap(round(gs*63/6),:),'orientation','horizontal') - -[tmp1 tmp2] = hist(subgs,max(subgs)-1); -tmp1 = find(tmp1>5); % do these subgenres only -tmp1 = ismember(subgs,tmp1); -tmp2 = find(issalami); -boxplot(values(tmp2(tmp1)),transpose(subgenres(tmp1)),'colors',cmap(round(gs(tmp1)*63/6),:),'orientation','horizontal') - - - - - -% Look at scatter plots so that we can qualitatively attribute the correlations to things (e.g., low-precision variance). -tmpcube = mean(datacube,3); -for i=1:4, - for j=i+1:5, - subplot(5,5,i+(j-1)*5) - scatter(tmpcube(:,i),tmpcube(:,j),'x') - end -end - -
--- a/match_mirex_to_public_data.m Fri Sep 20 17:12:46 2013 +0100 +++ b/match_mirex_to_public_data.m Sat Dec 07 18:18:50 2013 +0000 @@ -51,9 +51,12 @@ rel(4).rel_mir = find(mirex_dset_origin==4); rel(4).rel_pub = find(public_dset_origin(:,1)==6); -metrics = [2 2 2 2]; +% The metric is the boundary f-measure. The quality threshold is the minimum value of this metric that we consider to indicate a match. 0.99 is really high! quality_threshes = [.99 0.99 0.99 0.99]; +fprintf('OK, we are going to look through each dataset 3 times, each time with a different length threshold. This is because the matching algorithm is slow and brute-force, and we want to speed it up.\n') +fprintf('The first look, we consider every song within 5 seconds of the same length as the target song, and compare the structures.\n') +fprintf('The second and third passes consider deviations of 10 and 15 seconds, respectively. But we ignore songs that have already been matched, which speeds things up, see?\n') for K=1:4, rel_mir = rel(K).rel_mir; rel_pub = rel(K).rel_pub; @@ -68,21 +71,27 @@ % Run the follow script, optionally several times with increasing values of length_thresh to search more widely. % (We reduce the search space each time, so using a longer threshold becomes more and more feasible on later interations.) + fprintf('Looking at dataset %i. First pass.\n',K) length_thresh = 5; [mir2pub pub2mir pwf] = match_mirex_to_public_data_macro(mir2pub, pub2mir, pwf, mirex_truth, public_truth, rel_mir, rel_pub, length_thresh, quality_thresh); + fprintf('Looking at dataset %i. Second pass.\n',K) length_thresh = 10; [mir2pub pub2mir pwf] = match_mirex_to_public_data_macro(mir2pub, pub2mir, pwf, mirex_truth, public_truth, rel_mir, rel_pub, length_thresh, quality_thresh); + fprintf('Looking at dataset %i. 
Third pass.\n',K) length_thresh = 15; [mir2pub pub2mir pwf] = match_mirex_to_public_data_macro(mir2pub, pub2mir, pwf, mirex_truth, public_truth, rel_mir, rel_pub, length_thresh, quality_thresh); % The variable P will contain the quality of the matches between all the songs tested. P(K).pwf = pwf; end +fprintf('\nOK, done matching! Phew.\n') % That was a lot of searching... We do not want to do it twice! Save the output. +fprintf('Saving the output to ./match_mirex_to_public_data_results so that you do not have to repeat this step again.\n\n') save('./match_mirex_to_public_data_results','pub2mir','mir2pub','P'); +fprintf('Here is the first thing reported in the article: a table of how many matches you obtained.\n\n') % % Bonus work for Table 2: % How many MIREX songs did I find a match for in each category? fprintf('MIREX dataset......number of pieces.....number identified\n\n')
--- a/match_mirex_to_public_data_macro.m Fri Sep 20 17:12:46 2013 +0100 +++ b/match_mirex_to_public_data_macro.m Sat Dec 07 18:18:50 2013 +0000 @@ -4,7 +4,8 @@ unmatched_mirdata = find(max(transpose(pwf))<.99); unmatched_pubdata = find(max(pwf)<.99); % This is how many more songs we have to match. -length(unmatched_mirdata) + +fprintf('FYI: there are %i songs in this dataset that have not yet been matched.\n',length(unmatched_mirdata)) if ~isempty(unmatched_mirdata) & ~isempty(unmatched_pubdata), for i=row(unmatched_mirdata), @@ -15,7 +16,7 @@ pwf(i,j) = res(1); end end - toc + % toc end end
--- a/readme.txt Fri Sep 20 17:12:46 2013 +0100 +++ b/readme.txt Sat Dec 07 18:18:50 2013 +0000 @@ -20,13 +20,206 @@ 4. Run the Ruby script "1-get_mirex_estimates.rb" and wait a while for all the data to download. 5. Unzip all the folders that you obtained. - Note: in this version, one of the repositories, the Ewald Peiszer repository, is included already as a zip file ("ep_groundtruth_txt.zip"). Please move this to + Note: in this version, one of the repositories, the Ewald Peiszer repository, is included already as a zip file ("ep_groundtruth_txt.zip"). If you set "./mirex_data" as the download path in Step 3, then just unzip it here. Otherwise, move it to wherever the rest of the zips are. + Note: due to inconsistencies in how different zipping programs handle things, the folder structure upon unzipping may be inconsistent. Please look at the Ground Truth Directory map below and make sure your files unzip in the same way. If they don't, you'll have to move things around until the structure matches. 6. Run the Matlab script "2-generate_smith2013_ismir" and wait for all the data to be assembled, and for the figures to be generated. They will appear in "./plots". This repository includes what those pictures *should* look like. Hopefully you overwrite them with exact replicas. 7. You're done! Hey, that wasn't so bad. +===== Known issues ===== + +1. Bug in how CSV files are parsed in Ruby v.1.9.3. Seems to work fine using an older version: try 1.8.7. 
+ + +===== Ground Truth Directory map ===== + +When your ground truth is all downloaded and unzipped, it should look like this: + +* +|-- AIST.RWC-MDB-C-2001.CHORUS +|-- AIST.RWC-MDB-G-2001.CHORUS +|-- AIST.RWC-MDB-J-2001.CHORUS +|-- AIST.RWC-MDB-P-2001.CHORUS +|-- Carole%20King%20Annotations +| |-- all +| | |-- Carole King +| | | |-- Tapestry +| |-- chordlab +| | |-- Carole King +| | | |-- Tapestry +| |-- keylab +| | |-- Carole King +| | | |-- Tapestry +| |-- seglab +| | |-- Carole King +| | | |-- Tapestry +|-- ep_groundtruth +| |-- groundtruth +|-- ep_groundtruth_txt +| |-- groundtruth +|-- IRISA.RWC-MDB-P-2001.BLOCKS +|-- IRISA.RWC-MDB-P-2012.SEMLAB_v003_full +|-- IRISA.RWC-MDB-P-2012.SEMLAB_v003_reduced +|-- Michael%20Jackson%20Annotations +| |-- all +| | |-- Michael Jackson +| | | |-- Essential Michael Jackson [Disc 1] +| | | |-- Essential Michael Jackson [Disc 2] +| |-- seglab +| | |-- Michael Jackson +| | | |-- Essential Michael Jackson [Disc 1] +| | | |-- Essential Michael Jackson [Disc 2] +|-- Queen%20Annotations +| |-- all +| | |-- Queen +| | | |-- Greatest Hits I +| | | |-- Greatest Hits II +| | | |-- Greatest Hits III +| |-- chordlab +| | |-- Queen +| | | |-- Greatest Hits I +| | | |-- Greatest Hits II +| |-- keylab +| | |-- Queen +| | | |-- Greatest Hits I +| | | |-- Greatest Hits II +| |-- seglab +| | |-- Queen +| | | |-- Greatest Hits I +| | | |-- Greatest Hits II +| | | |-- Greatest Hits III +|-- SALAMI_data_v1.2 +| |-- data +| | |-- 2 +| | | |-- parsed +| | |-- 4 +| | | |-- parsed +| | |-- 6 +| | | |-- parsed +| | |-- 8 +| | | |-- parsed +| | . +| | . +| | . +| | |-- 1648 +| | | |-- parsed +| | |-- 1650 +| | | |-- parsed +| | |-- 1652 +| | | |-- parsed +| | |-- 1654 +| | | |-- parsed +|-- The%20Beatles%20Annotations +| |-- all +| | |-- The Beatles +| | | |-- 01_-_Please_Please_Me +| | | |-- 02_-_With_the_Beatles +| | | |-- 03_-_A_Hard_Day's_Night +| | | |-- 04_-_Beatles_for_Sale +| | | |-- 05_-_Help! 
+| | | |-- 06_-_Rubber_Soul +| | | |-- 07_-_Revolver +| | | |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| | | |-- 09_-_Magical_Mystery_Tour +| | | |-- 10CD1_-_The_Beatles +| | | |-- 10CD2_-_The_Beatles +| | | |-- 11_-_Abbey_Road +| | | |-- 12_-_Let_It_Be +| |-- beat +| | |-- The Beatles +| | | |-- 01_-_Please_Please_Me +| | | |-- 02_-_With_the_Beatles +| | | |-- 03_-_A_Hard_Day's_Night +| | | |-- 04_-_Beatles_for_Sale +| | | |-- 05_-_Help! +| | | |-- 06_-_Rubber_Soul +| | | |-- 07_-_Revolver +| | | |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| | | |-- 09_-_Magical_Mystery_Tour +| | | |-- 10CD1_-_The_Beatles +| | | |-- 10CD2_-_The_Beatles +| | | |-- 11_-_Abbey_Road +| | | |-- 12_-_Let_It_Be +| |-- chordlab +| | |-- The Beatles +| | | |-- 01_-_Please_Please_Me +| | | |-- 02_-_With_the_Beatles +| | | |-- 03_-_A_Hard_Day's_Night +| | | |-- 04_-_Beatles_for_Sale +| | | |-- 05_-_Help! +| | | |-- 06_-_Rubber_Soul +| | | |-- 07_-_Revolver +| | | |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| | | |-- 09_-_Magical_Mystery_Tour +| | | |-- 10CD1_-_The_Beatles +| | | |-- 10CD2_-_The_Beatles +| | | |-- 11_-_Abbey_Road +| | | |-- 12_-_Let_It_Be +| |-- keylab +| | |-- The Beatles +| | | |-- 01_-_Please_Please_Me +| | | |-- 02_-_With_the_Beatles +| | | |-- 03_-_A_Hard_Day's_Night +| | | |-- 04_-_Beatles_for_Sale +| | | |-- 05_-_Help! +| | | |-- 06_-_Rubber_Soul +| | | |-- 07_-_Revolver +| | | |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| | | |-- 09_-_Magical_Mystery_Tour +| | | |-- 10CD1_-_The_Beatles +| | | |-- 10CD2_-_The_Beatles +| | | |-- 11_-_Abbey_Road +| | | |-- 12_-_Let_It_Be +| |-- seglab +| | |-- The Beatles +| | | |-- 01_-_Please_Please_Me +| | | |-- 02_-_With_the_Beatles +| | | |-- 03_-_A_Hard_Day's_Night +| | | |-- 04_-_Beatles_for_Sale +| | | |-- 05_-_Help! 
+| | | |-- 06_-_Rubber_Soul +| | | |-- 07_-_Revolver +| | | |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| | | |-- 09_-_Magical_Mystery_Tour +| | | |-- 10CD1_-_The_Beatles +| | | |-- 10CD2_-_The_Beatles +| | | |-- 11_-_Abbey_Road +| | | |-- 12_-_Let_It_Be +|-- TUT +| |-- 01_-_Please_please_me_1963 +| |-- 02_-_With_The_Beatles_1963 +| |-- 03_-_A_hard_days_night_1964 +| |-- 04_-_Beatles_for_sale_1964 +| |-- 05_-_Help_1965 +| |-- 06_-_Rubber_Soul.bak.bak +| |-- 07_-_Revolver +| |-- 08_-_Sgt._Pepper's_Lonely_Hearts_Club_Band +| |-- 09_-_Magical_Mystery_Tour +| |-- 10_-_The_Beatles_(White_Album)_CD1 +| |-- 10_-_The_Beatles_(White_Album)_CD2 +| |-- 11_-_Abbey_Road +| |-- 12_-_Let_it_Be +| |-- LICENSE +| |-- README +|-- Zweieck%20Annotations + |-- all + | |-- Zweieck + | | |-- Zwielicht + |-- beat + | |-- Zweieck + | | |-- Zwielicht + |-- chordlab + | |-- Zweieck + | | |-- Zwielicht + |-- keylab + | |-- Zweieck + | | |-- Zwielicht + |-- seglab + |-- Zweieck + |-- Zwielicht + ===== The MIT License (MIT) =====