function [pub2mir, mir2pub, P] = match_mirex_to_public_data(mirex_truth, public_truth, mirex_dset_origin, public_dset_origin, mir2pub_relevance)
% function [pub2mir, mir2pub, P] = match_mirex_to_public_data(mirex_truth,
%     public_truth, mirex_dset_origin, public_dset_origin, mir2pub_relevance)
%
% This function looks through the collections of MIREX and PUBLIC annotations
% and attempts to find matches between them, i.e., possible public annotations
% that could be the same as the MIREX annotations.
%
% MIREX_TRUTH and PUBLIC_TRUTH are Nx1 and Mx1 structures where N and M are the number
% of MIREX and PUBLIC annotations available, respectively. Each element contains fields
% TIM, LAB and DSET which give the time points and labels of the annotation, as well as
% the index of the dataset.
%
% MIREX_DSET_ORIGIN and PUBLIC_DSET_ORIGIN contain the same information as the DSET
% field, but in an array. Only the first column of PUBLIC_DSET_ORIGIN is read.
%
% The output vectors PUB2MIR and MIR2PUB work in the following way. If the nth MIREX and
% the mth PUBLIC annotations are found to match, then PUB2MIR(m) = n and MIR2PUB(n) = m.
% Both are initialised to zero, so an entry of 0 means that no match was recorded.
%
% The output P is a 1x4 structure array. P(K).PWF is the matrix returned by
% MATCH_MIREX_TO_PUBLIC_DATA_MACRO for the Kth MIREX dataset, with one row per MIREX
% annotation of that dataset and one column per candidate PUBLIC annotation.
%
% MIR2PUB_RELEVANCE is meant to be a simple Px2 array where each row contains (1) the
% index of a MIREX dataset and (2) the index of a PUBLIC dataset that are hypothesized to
% contain some of the same songs, in order to cut down on the number of datasets that are
% searched for matches.
%
% NOTE(review): MIR2PUB_RELEVANCE is currently NOT read anywhere in this function. The
% pairing of MIREX datasets with PUBLIC datasets is hard-coded below, and it is a subset
% of the example given here. Confirm whether the argument should drive the search before
% relying on it.
%
% For example, if the default values are kept in all the other scripts, then the correct
% relevance matches are:
%
% mir2pub_relevance = [1 2; 1 3; 1 5; 2 1; 2 4; 3 1; 3 4; 4 1; 4 2; 4 3; 4 4; 4 5; 4 6];
%
% That was based on the following assumptions:
% public_dset_origin: 1 = RWC [AIST], 2 = Isophonics, 3 = EP, 4 = IRISA [Euro and RWC], 5 = TUT, 6 = SALAMI
% mirex_dset_origin: 1 = 09 [Isophonics, Beatles, EP], 2 = 10a [RWC, boundaries only], 3 = 10b [RWC AIST], 4 = 12 [salami]
%
% Side effects: progress messages and two summary tables are printed to the command
% window, and PUB2MIR, MIR2PUB and P are saved to ./match_mirex_to_public_data_results.

pub2mir = zeros(length(public_truth),1);
mir2pub = zeros(length(mirex_truth),1);

% Look through all the MIREX annotations. For each one, look at the public annotations available.
% When you find a song which has the same length (within some tolerance), compare the structures and save the output.

% We shall do this dataset by dataset. For each MIREX dataset we first select the
% relevant annotations from the MIREX and public domains (see the dataset key in the header).
rel(1).rel_mir = find(mirex_dset_origin==1);
rel(1).rel_pub = find(public_dset_origin(:,1)==2 | public_dset_origin(:,1)==3 | public_dset_origin(:,1)==5);
rel(2).rel_mir = find(mirex_dset_origin==2);
rel(2).rel_pub = find(public_dset_origin(:,1)==4);
rel(3).rel_mir = find(mirex_dset_origin==3);
rel(3).rel_pub = find(public_dset_origin(:,1)==1);
rel(4).rel_mir = find(mirex_dset_origin==4);
rel(4).rel_pub = find(public_dset_origin(:,1)==6);
n_mirex_dsets = numel(rel);
n_public_dsets = 6;

% The metric is the boundary f-measure. The quality threshold is the minimum value of this
% metric that we consider to indicate a match. 0.99 is really high! (One entry per MIREX dataset.)
quality_threshes = [.99 0.99 0.99 0.99];

% Each dataset is searched several times with an increasing tolerance (in seconds) on the
% difference in song length, so that the widest search happens last.
length_threshes = [5 10 15];
pass_names = {'First', 'Second', 'Third'};

% The variable P will contain the quality of the matches between all the songs tested.
P = struct('pwf', cell(1, n_mirex_dsets));

fprintf('OK, we are going to look through each dataset 3 times, each time with a different length threshold. This is because the matching algorithm is slow and brute-force, and we want to speed it up.\n')
fprintf('The first look, we consider every song within 5 seconds of the same length as the target song, and compare the structures.\n')
fprintf('The second and third passes consider deviations of 10 and 15 seconds, respectively. But we ignore songs that have already been matched, which speeds things up, see?\n')
for K = 1:n_mirex_dsets
    rel_mir = rel(K).rel_mir;
    rel_pub = rel(K).rel_pub;
    quality_thresh = quality_threshes(K);

    % We make a matrix to hold the match between all the songs of this dataset pairing.
    pwf = zeros(length(rel_mir),length(rel_pub));

    % Run the matching routine several times with increasing values of the length threshold
    % to search more widely. The running results (mir2pub, pub2mir, pwf) are fed back in on
    % every pass; presumably that is how already-matched songs get skipped on later
    % iterations -- see match_mirex_to_public_data_macro to confirm.
    for pass = 1:numel(length_threshes)
        fprintf('Looking at dataset %i. %s pass.\n', K, pass_names{pass})
        [mir2pub, pub2mir, pwf] = match_mirex_to_public_data_macro(mir2pub, pub2mir, pwf, mirex_truth, public_truth, rel_mir, rel_pub, length_threshes(pass), quality_thresh);
    end

    P(K).pwf = pwf;
end
fprintf('\nOK, done matching! Phew.\n')

% That was a lot of searching... We do not want to do it twice! Save the output.
fprintf('Saving the output to ./match_mirex_to_public_data_results so that you do not have to repeat this step again.\n\n')
save('./match_mirex_to_public_data_results','pub2mir','mir2pub','P');


fprintf('Here is the first thing reported in the article: a table of how many matches you obtained.\n\n')
% Bonus work for Table 2:
% How many MIREX songs did I find a match for in each category?
fprintf('MIREX dataset......number of pieces.....number identified\n\n')
for K = 1:n_mirex_dsets
    in_dset = find(mirex_dset_origin==K);
    % This is the number of MIREX songs identified with public annotations.
    n_identified = sum(mir2pub(in_dset)>0);
    fprintf('Dataset %i .. %i .. %i\n', K, length(in_dset), n_identified)
end

% Also, how many public annotations did I find a match for?
% (Printed as a labelled table rather than as six anonymous "ans = ..." dumps.)
fprintf('\nPublic dataset......number of pieces.....number found in MIREX\n\n')
for K = 1:n_public_dsets
    in_dset = find(public_dset_origin(:,1)==K);
    % This is the number of public songs that occurred in MIREX.
    n_found = sum(pub2mir(in_dset)>0);
    fprintf('Dataset %i .. %i .. %i\n', K, length(in_dset), n_found)
end

% The commented-out snippets below are kept as manual sanity checks.
%
% mir2pub(find(mirex_dset_origin==2))
%
% % Confirm that the songs we are matching are actually the same:
% mir_id = find(pub2mir,1); % find the first matching public song
% pub_id = mir2pub(mir_id);
% mirex_truth(mir_id).tim
% public_truth(pub_id).tim
% % Are they the same? If not, something is going wrong!
%
% % How to identify a MIREX song based on its public match:
% % Thankfully, we retained the filenames of the public data.
% mir_id = find(pub2mir,1); % find the first matching public song
% pub_id = mir2pub(mir_id);
% mirex_truth(mir_id).file
% public_truth(pub_id).file