matthiasm@8
|
1 function [chroma, t, salience, tuning, URIstring] = chromagenerator(filename, kammerton, nBin)
|
matthiasm@8
|
2
|
matthiasm@8
|
3 %% make URI string
|
matthiasm@8
|
4 URIstring = strcat('chromagenerator-', num2str(kammerton), '-',num2str(nBin));
|
matthiasm@8
|
5
|
matthiasm@8
|
6 %% parameters
|
matthiasm@8
|
7 % set the bins per semitone (i.e. binsperoctave/12)
|
matthiasm@8
|
8 % kammerton = 440;
|
matthiasm@8
|
9 % nBin = 7;
|
matthiasm@8
|
10 midbin = ceil(nBin/2);
|
matthiasm@8
|
11
|
matthiasm@8
|
12 % minimum and maximum MIDI bin, should be n octaves apart.
|
matthiasm@8
|
13 midimin = 21; % 69 has 440 Hz
|
matthiasm@8
|
14 % midimin = 45; % 69 has 440 Hz
|
matthiasm@8
|
15 midimax = 92;
|
matthiasm@8
|
16 % midimax = 68;
|
matthiasm@8
|
17 noterange = midimax - midimin + 1;
|
matthiasm@8
|
18
|
matthiasm@8
|
19 % midi note values including the fractions "between" actual MIDI notes
|
matthiasm@8
|
20 midibins = midimin-(midbin-1)/nBin:(1/nBin):midimax+(midbin-1)/nBin;
|
matthiasm@8
|
21 % the same in frequencies
|
matthiasm@8
|
22 fbins = kammerton * 2.^((midibins-69)/12);
|
matthiasm@8
|
23 nFbin = length(fbins);
|
matthiasm@8
|
24
|
matthiasm@8
|
25 % the sampling frequency everything is (down)sampled to
|
matthiasm@8
|
26 used_fs = 11025;
|
matthiasm@8
|
27
|
matthiasm@8
|
28 % intuitive setting of how long the frame is (quite short here!)
|
matthiasm@8
|
29 fracofsecond = 4;
|
matthiasm@8
|
30
|
matthiasm@8
|
31 hopspersecond = 20;
|
matthiasm@8
|
32
|
matthiasm@8
|
33 % calculate a convenient FFT length
|
matthiasm@8
|
34 nFFT = 2^nextpow2(used_fs/fracofsecond);
|
matthiasm@8
|
35 % hopsize in samples
|
matthiasm@8
|
36 hopsize = used_fs/hopspersecond/nFFT;
|
matthiasm@8
|
37
|
matthiasm@8
|
38 % "instrument" means a particular spectral shape
|
matthiasm@8
|
39 used_instruments = 1:2;
|
matthiasm@8
|
40 nInstrument = length(used_instruments);
|
matthiasm@8
|
41
|
matthiasm@8
|
42 %% generate dictionary
|
matthiasm@8
|
43 % (complex) tones for all fundamental frequencies are generated. then their
|
matthiasm@8
|
44 % amplitude spectrum is calculated and normalised according to the L2 norm.
|
matthiasm@8
|
45
|
matthiasm@8
|
46 % preliminary t for dictionary generation
|
matthiasm@8
|
47 t = (1:nFFT)/used_fs;
|
matthiasm@8
|
48 % initialise note dictionary
|
matthiasm@8
|
49 A = zeros(nFbin,nFFT/2,nInstrument);
|
matthiasm@8
|
50
|
matthiasm@8
|
51 % loop over all instruments
|
matthiasm@8
|
52 for iInstrument = used_instruments
|
matthiasm@8
|
53 % loop over all fundamental frequencies
|
matthiasm@8
|
54 for iFbin = 1:nFbin;
|
matthiasm@8
|
55 f0 = fbins(iFbin);
|
matthiasm@8
|
56 switch iInstrument
|
matthiasm@8
|
57 case 1
|
matthiasm@8
|
58 wave = sum([sin(t*2*pi*f0*1); ...
|
matthiasm@8
|
59 0.9^1 * sin(t*2*pi*f0*2); ...
|
matthiasm@8
|
60 0.9^2 * sin(t*2*pi*f0*3); ...
|
matthiasm@8
|
61 0.9^3 * sin(t*2*pi*f0*4); ...
|
matthiasm@8
|
62 ...0.9^4 * sin(t*2*pi*f0*5); ...
|
matthiasm@8
|
63 ]);
|
matthiasm@8
|
64 case 2
|
matthiasm@8
|
65 wave = sin(t*2*pi*f0*1);
|
matthiasm@8
|
66 end
|
matthiasm@8
|
67 % window wave and fft
|
matthiasm@8
|
68 wave = hamming(nFFT)' .* wave;
|
matthiasm@8
|
69 fullfft = fft(wave);
|
matthiasm@8
|
70 % calculate amplitude spectrum from fft
|
matthiasm@8
|
71 A(iFbin,:,iInstrument) = abs(fullfft(1:round(nFFT/2)));
|
matthiasm@8
|
72 end
|
matthiasm@8
|
73 % normalise (L2 norm)
|
matthiasm@8
|
74 A(:,:,iInstrument) = qnormalise(A(:,:,iInstrument),2,2);
|
matthiasm@8
|
75 end
|
matthiasm@8
|
76
|
matthiasm@8
|
77 %% get the fft frames from a wave file
|
matthiasm@8
|
78 %this is probably ending up not being used in a vamp plugin
|
matthiasm@8
|
79 % fprintf(1,'%s\n',filename)
|
matthiasm@8
|
80 % [audiosize,fs] = wavread(filename,'size');
|
matthiasm@8
|
81 % chunk_sec = 20;
|
matthiasm@8
|
82 % nChunk = ceil(audiosize(1)/fs/chunk_sec);
|
matthiasm@8
|
83 % songframes = [];
|
matthiasm@8
|
84 % start = 1;
|
matthiasm@8
|
85 % % extract fft from chunks (for less memory consumption)
|
matthiasm@8
|
86 % for iChunk = 1:nChunk
|
matthiasm@8
|
87 % samplemin = (iChunk-1) * chunk_sec * fs + 1;
|
matthiasm@8
|
88 % samplemax = min(audiosize(1), (iChunk * chunk_sec + 1) * fs);
|
matthiasm@8
|
89 % audiodata0 = wavread(filename,[samplemin samplemax]);
|
matthiasm@8
|
90 % audiodata = resample(mean(audiodata0,2),used_fs,fs,20);
|
matthiasm@8
|
91 % songframes0 = myframefft(audiodata,nFFT,hopsize,'hamming');
|
matthiasm@8
|
92 % if size(songframes0,2)>=start
|
matthiasm@8
|
93 % songframes = [songframes(:,1:end-start) abs(songframes0(1:round(nFFT/2),start:end))];
|
matthiasm@8
|
94 % end
|
matthiasm@8
|
95 % start = round(hopspersecond/2)+1;
|
matthiasm@8
|
96 % end
|
matthiasm@8
|
97 % clear songframes0
|
matthiasm@8
|
98 % clear audiodata
|
matthiasm@8
|
99 % clear audiodata0
|
matthiasm@8
|
100 % t = (0:T-1)./hopspersecond;
|
matthiasm@8
|
101
|
matthiasm@8
|
102 %%
|
matthiasm@8
|
103 [songframes,f,t]=chunkspectrogram(filename,4096,4096*7/8,4096,inf);
|
matthiasm@8
|
104 hopspersecond = 1/(t(2)-t(1));
|
matthiasm@8
|
105 songframes = abs(songframes);
|
matthiasm@8
|
106 %%
|
matthiasm@8
|
107 T = size(songframes,2);
|
matthiasm@8
|
108
|
matthiasm@8
|
109 fprintf(1, '%d different notes, %d frequency bins, %d time frames\n', noterange,nFFT/2, T)
|
matthiasm@8
|
110
|
matthiasm@8
|
111 notespectrum0 = zeros(nFbin, T, nInstrument);
|
matthiasm@8
|
112 songframes = qnormalise(songframes,2,1);
|
matthiasm@8
|
113 songframes(isnan(songframes)) = 0;
|
matthiasm@8
|
114
|
matthiasm@8
|
115 %% calculate cosine distance
|
matthiasm@8
|
116 for iInstrument = 1:nInstrument
|
matthiasm@8
|
117 for iTone = 1:nFbin
|
matthiasm@8
|
118 notespectrum0(iTone,:,iInstrument) = A(iTone,:,iInstrument) * songframes;
|
matthiasm@8
|
119 end
|
matthiasm@8
|
120 end
|
matthiasm@8
|
121 %% get only the bins with "more than 1 st. deviation"
|
matthiasm@8
|
122 notespectrum = zeros(size(notespectrum0));
|
matthiasm@8
|
123 wind_size = 6 * nBin;
|
matthiasm@8
|
124 wind = hamming(wind_size)/sum(hamming(wind_size));
|
matthiasm@8
|
125 stdthresh = 1;
|
matthiasm@8
|
126 for iInstrument = used_instruments
|
matthiasm@8
|
127 % calculate running mean
|
matthiasm@8
|
128 m = conv2(notespectrum0(:,:,iInstrument),wind(:),'same');
|
matthiasm@8
|
129 % calculate running standard deviation
|
matthiasm@8
|
130 stdev = sqrt(conv2((notespectrum0(:,:,iInstrument)-m).^2,wind(:),'same'));
|
matthiasm@8
|
131 % take only those tones that are above 1 standard deviation from the
|
matthiasm@8
|
132 % mean
|
matthiasm@8
|
133 notespectrum(:,:,iInstrument) = (notespectrum0(:,:,iInstrument)-m)./stdev;
|
matthiasm@8
|
134 temp = notespectrum(:,:,iInstrument);
|
matthiasm@8
|
135 temp(temp<stdthresh) = 0;
|
matthiasm@8
|
136 notespectrum(:,:,iInstrument) = temp;
|
matthiasm@8
|
137 end
|
matthiasm@8
|
138 % notes that are one were not "good enough" (not greater than 1 st.
|
matthiasm@8
|
139 % deviation reater than the mean) and are hence set to 0.
|
matthiasm@8
|
140 notespectrum(notespectrum<=stdthresh) = 0;
|
matthiasm@8
|
141 % we require that the notes be on in *both* "instruments". hence the final
|
matthiasm@8
|
142 % note spectrum note_s is their product, which works a bit like a minimum
|
matthiasm@8
|
143 note_s = notespectrum(:,:,1) .* notespectrum(:,:,2);
|
matthiasm@8
|
144
|
matthiasm@8
|
145 note_s(1:nBin*6,:) = 0;
|
matthiasm@8
|
146 note_s(isnan(note_s)) = 0;
|
matthiasm@8
|
147 %% tuning
|
matthiasm@8
|
148 % I wrap the note spectrum to one semitone, kind of generating the note
|
matthiasm@8
|
149 % spectrum modulo one semitone. one gets a matrix of size nBin x T. from
|
matthiasm@8
|
150 % that matrix, the tuning can be calculated by extracting the angle of the
|
matthiasm@8
|
151 % sum of all columns, the sum of all columns being a nBin-dimensional
|
matthiasm@8
|
152 % vector.
|
matthiasm@8
|
153 th = tuner(note_s(36*nBin+(1:12*nBin),:), nBin)
|
matthiasm@8
|
154 % th = tuner(notespectrum0(36*nBin+(1:12*nBin),:,2), nBin)
|
matthiasm@8
|
155 tuning = kammerton*2^((th/2/pi)/12)
|
matthiasm@8
|
156 % tune the spectrum by interpolating the bins such that now the middle bin
|
matthiasm@8
|
157 % of every semitone corresponds to the right frequency in equal
|
matthiasm@8
|
158 % temperament
|
matthiasm@8
|
159 tuned_s = zeros(size(note_s));
|
matthiasm@8
|
160 for iFrame = 1:T
|
matthiasm@8
|
161 tuned_s(:,iFrame) = interp1(1:nFbin, note_s(:,iFrame), (1:nFbin) + th / (2 * pi) * nBin,[],0);
|
matthiasm@8
|
162 end
|
matthiasm@8
|
163
|
matthiasm@8
|
164 %% reduce to 1 bin per semitone
|
matthiasm@8
|
165 % just sum the nBin bins that correspond to one semitone.
|
matthiasm@8
|
166 wrapped_tuned_s = reshape(tuned_s,[nBin,noterange*T]);
|
matthiasm@8
|
167 % if you want to emphasize the middle bin, use a fancy window, otherwise
|
matthiasm@8
|
168 % just use the rectangular window
|
matthiasm@8
|
169 wei = rectwin(nBin)';
|
matthiasm@8
|
170 salience.reduced = reshape(wei * wrapped_tuned_s,noterange,T);
|
matthiasm@8
|
171 % median filter in time direction, if you like
|
matthiasm@8
|
172 salience.reduced = medfilt1(salience.reduced,floor(hopspersecond/5)-1,[],2);
|
matthiasm@8
|
173
|
matthiasm@8
|
174 %% make chromagram
|
matthiasm@8
|
175 % actually the edge of the note spectrum is a bit noisy (due to the
|
matthiasm@8
|
176 % convolution three parts above. that's why I start some bins from the
|
matthiasm@8
|
177 % lower edge (i.e. midimin_used). maybe one could do this properly
|
matthiasm@8
|
178 % sometime.
|
matthiasm@8
|
179 midimin_used = midimin+6;
|
matthiasm@8
|
180 % uebergang is where bass and treble chroma overlap equally
|
matthiasm@8
|
181 uebergang = 50;
|
matthiasm@8
|
182
|
matthiasm@8
|
183 % some simple linear functions to generate the treble and bass profiles
|
matthiasm@8
|
184 leftfunc = zeros(2,noterange);
|
matthiasm@8
|
185 rightfunc = zeros(2,noterange);
|
matthiasm@8
|
186 leftfunc(1,:) = 2/12*((midimin:midimax)-midimin_used);
|
matthiasm@8
|
187 leftfunc(2,:) = 1/12*((midimin:midimax)-uebergang) + 0.5;
|
matthiasm@8
|
188 rightfunc(1,:) = -1/12*((midimin:midimax)-uebergang) + 0.5;
|
matthiasm@8
|
189 rightfunc(2,:) = -(.5)/12*((midimin:midimax)-midimax);
|
matthiasm@8
|
190
|
matthiasm@8
|
191 % initialise stuff
|
matthiasm@8
|
192 salience.bass = zeros(size(salience.reduced));
|
matthiasm@8
|
193 salience.treble = zeros(size(salience.reduced));
|
matthiasm@8
|
194 profiles = zeros(noterange,3);
|
matthiasm@8
|
195
|
matthiasm@8
|
196 % bass and treble profiles
|
matthiasm@8
|
197 for i = 1:2
|
matthiasm@8
|
198 profiles(:,i) = max(0,min(1,min(leftfunc(i,:),rightfunc(i,:))))';
|
matthiasm@8
|
199 end
|
matthiasm@8
|
200 % wide profile (i.e. both bass and treble)
|
matthiasm@8
|
201 profiles(:,3) = min(1,sum(profiles(:,1:2),2));
|
matthiasm@8
|
202
|
matthiasm@8
|
203 % calculate specific saliences (windowed in f0 direction)
|
matthiasm@8
|
204 salience.bass = salience.reduced .* profiles(:,ones(1,T));
|
matthiasm@8
|
205 salience.treble = salience.reduced .* profiles(:,2 * ones(1,T));
|
matthiasm@8
|
206 salience.wide = salience.reduced .* profiles(:,3 * ones(1,T));
|
matthiasm@8
|
207
|
matthiasm@8
|
208 % calculate chromagrams
|
matthiasm@8
|
209 chroma.bass = squeeze(max(reshape(salience.bass,[12,6,T]),[],2));
|
matthiasm@8
|
210 chroma.treble = squeeze(max(reshape(salience.treble,[12,6,T]),[],2));
|
matthiasm@8
|
211 chroma.wide = squeeze(max(reshape(salience.wide,[12,6,T]),[],2));
|
matthiasm@8
|
212
|