annotate _misc/featureextraction/chromagenerator.m @ 9:4ea6619cb3f5 tip

removed log files
author matthiasm
date Fri, 11 Apr 2014 15:55:11 +0100
parents b5b38998ef3b
children
rev   line source
matthiasm@8 1 function [chroma, t, salience, tuning, URIstring] = chromagenerator(filename, kammerton, nBin)
matthiasm@8 2
matthiasm@8 3 %% make URI string
matthiasm@8 4 URIstring = strcat('chromagenerator-', num2str(kammerton), '-',num2str(nBin));
matthiasm@8 5
matthiasm@8 6 %% parameters
matthiasm@8 7 % set the bins per semitone (i.e. binsperoctave/12)
matthiasm@8 8 % kammerton = 440;
matthiasm@8 9 % nBin = 7;
matthiasm@8 10 midbin = ceil(nBin/2);
matthiasm@8 11
matthiasm@8 12 % minimum and maximum MIDI bin, should be n octaves apart.
matthiasm@8 13 midimin = 21; % 69 has 440 Hz
matthiasm@8 14 % midimin = 45; % 69 has 440 Hz
matthiasm@8 15 midimax = 92;
matthiasm@8 16 % midimax = 68;
matthiasm@8 17 noterange = midimax - midimin + 1;
matthiasm@8 18
matthiasm@8 19 % midi note values including the fractions "between" actual MIDI notes
matthiasm@8 20 midibins = midimin-(midbin-1)/nBin:(1/nBin):midimax+(midbin-1)/nBin;
matthiasm@8 21 % the same in frequencies
matthiasm@8 22 fbins = kammerton * 2.^((midibins-69)/12);
matthiasm@8 23 nFbin = length(fbins);
matthiasm@8 24
matthiasm@8 25 % the sampling frequency everything is (down)sampled to
matthiasm@8 26 used_fs = 11025;
matthiasm@8 27
matthiasm@8 28 % intuitive setting of how long the frame is (quite short here!)
matthiasm@8 29 fracofsecond = 4;
matthiasm@8 30
matthiasm@8 31 hopspersecond = 20;
matthiasm@8 32
matthiasm@8 33 % calculate a convenient FFT length
matthiasm@8 34 nFFT = 2^nextpow2(used_fs/fracofsecond);
matthiasm@8 35 % hopsize in samples
matthiasm@8 36 hopsize = used_fs/hopspersecond/nFFT;
matthiasm@8 37
matthiasm@8 38 % "instrument" means a particular spectral shape
matthiasm@8 39 used_instruments = 1:2;
matthiasm@8 40 nInstrument = length(used_instruments);
matthiasm@8 41
matthiasm@8 42 %% generate dictionary
matthiasm@8 43 % (complex) tones for all fundamental frequencies are generated. then their
matthiasm@8 44 % amplitude spectrum is calculated and normalised according to the L2 norm.
matthiasm@8 45
matthiasm@8 46 % preliminary t for dictionary generation
matthiasm@8 47 t = (1:nFFT)/used_fs;
matthiasm@8 48 % initialise note dictionary
matthiasm@8 49 A = zeros(nFbin,nFFT/2,nInstrument);
matthiasm@8 50
matthiasm@8 51 % loop over all instruments
matthiasm@8 52 for iInstrument = used_instruments
matthiasm@8 53 % loop over all fundamental frequencies
matthiasm@8 54 for iFbin = 1:nFbin;
matthiasm@8 55 f0 = fbins(iFbin);
matthiasm@8 56 switch iInstrument
matthiasm@8 57 case 1
matthiasm@8 58 wave = sum([sin(t*2*pi*f0*1); ...
matthiasm@8 59 0.9^1 * sin(t*2*pi*f0*2); ...
matthiasm@8 60 0.9^2 * sin(t*2*pi*f0*3); ...
matthiasm@8 61 0.9^3 * sin(t*2*pi*f0*4); ...
matthiasm@8 62 ...0.9^4 * sin(t*2*pi*f0*5); ...
matthiasm@8 63 ]);
matthiasm@8 64 case 2
matthiasm@8 65 wave = sin(t*2*pi*f0*1);
matthiasm@8 66 end
matthiasm@8 67 % window wave and fft
matthiasm@8 68 wave = hamming(nFFT)' .* wave;
matthiasm@8 69 fullfft = fft(wave);
matthiasm@8 70 % calculate amplitude spectrum from fft
matthiasm@8 71 A(iFbin,:,iInstrument) = abs(fullfft(1:round(nFFT/2)));
matthiasm@8 72 end
matthiasm@8 73 % normalise (L2 norm)
matthiasm@8 74 A(:,:,iInstrument) = qnormalise(A(:,:,iInstrument),2,2);
matthiasm@8 75 end
matthiasm@8 76
matthiasm@8 77 %% get the fft frames from a wave file
matthiasm@8 78 %this is probably ending up not being used in a vamp plugin
matthiasm@8 79 % fprintf(1,'%s\n',filename)
matthiasm@8 80 % [audiosize,fs] = wavread(filename,'size');
matthiasm@8 81 % chunk_sec = 20;
matthiasm@8 82 % nChunk = ceil(audiosize(1)/fs/chunk_sec);
matthiasm@8 83 % songframes = [];
matthiasm@8 84 % start = 1;
matthiasm@8 85 % % extract fft from chunks (for less memory consumption)
matthiasm@8 86 % for iChunk = 1:nChunk
matthiasm@8 87 % samplemin = (iChunk-1) * chunk_sec * fs + 1;
matthiasm@8 88 % samplemax = min(audiosize(1), (iChunk * chunk_sec + 1) * fs);
matthiasm@8 89 % audiodata0 = wavread(filename,[samplemin samplemax]);
matthiasm@8 90 % audiodata = resample(mean(audiodata0,2),used_fs,fs,20);
matthiasm@8 91 % songframes0 = myframefft(audiodata,nFFT,hopsize,'hamming');
matthiasm@8 92 % if size(songframes0,2)>=start
matthiasm@8 93 % songframes = [songframes(:,1:end-start) abs(songframes0(1:round(nFFT/2),start:end))];
matthiasm@8 94 % end
matthiasm@8 95 % start = round(hopspersecond/2)+1;
matthiasm@8 96 % end
matthiasm@8 97 % clear songframes0
matthiasm@8 98 % clear audiodata
matthiasm@8 99 % clear audiodata0
matthiasm@8 100 % t = (0:T-1)./hopspersecond;
matthiasm@8 101
matthiasm@8 102 %%
matthiasm@8 103 [songframes,f,t]=chunkspectrogram(filename,4096,4096*7/8,4096,inf);
matthiasm@8 104 hopspersecond = 1/(t(2)-t(1));
matthiasm@8 105 songframes = abs(songframes);
matthiasm@8 106 %%
matthiasm@8 107 T = size(songframes,2);
matthiasm@8 108
matthiasm@8 109 fprintf(1, '%d different notes, %d frequency bins, %d time frames\n', noterange,nFFT/2, T)
matthiasm@8 110
matthiasm@8 111 notespectrum0 = zeros(nFbin, T, nInstrument);
matthiasm@8 112 songframes = qnormalise(songframes,2,1);
matthiasm@8 113 songframes(isnan(songframes)) = 0;
matthiasm@8 114
matthiasm@8 115 %% calculate cosine distance
matthiasm@8 116 for iInstrument = 1:nInstrument
matthiasm@8 117 for iTone = 1:nFbin
matthiasm@8 118 notespectrum0(iTone,:,iInstrument) = A(iTone,:,iInstrument) * songframes;
matthiasm@8 119 end
matthiasm@8 120 end
matthiasm@8 121 %% get only the bins with "more than 1 st. deviation"
matthiasm@8 122 notespectrum = zeros(size(notespectrum0));
matthiasm@8 123 wind_size = 6 * nBin;
matthiasm@8 124 wind = hamming(wind_size)/sum(hamming(wind_size));
matthiasm@8 125 stdthresh = 1;
matthiasm@8 126 for iInstrument = used_instruments
matthiasm@8 127 % calculate running mean
matthiasm@8 128 m = conv2(notespectrum0(:,:,iInstrument),wind(:),'same');
matthiasm@8 129 % calculate running standard deviation
matthiasm@8 130 stdev = sqrt(conv2((notespectrum0(:,:,iInstrument)-m).^2,wind(:),'same'));
matthiasm@8 131 % take only those tones that are above 1 standard deviation from the
matthiasm@8 132 % mean
matthiasm@8 133 notespectrum(:,:,iInstrument) = (notespectrum0(:,:,iInstrument)-m)./stdev;
matthiasm@8 134 temp = notespectrum(:,:,iInstrument);
matthiasm@8 135 temp(temp<stdthresh) = 0;
matthiasm@8 136 notespectrum(:,:,iInstrument) = temp;
matthiasm@8 137 end
matthiasm@8 138 % notes that are one were not "good enough" (not greater than 1 st.
matthiasm@8 139 % deviation reater than the mean) and are hence set to 0.
matthiasm@8 140 notespectrum(notespectrum<=stdthresh) = 0;
matthiasm@8 141 % we require that the notes be on in *both* "instruments". hence the final
matthiasm@8 142 % note spectrum note_s is their product, which works a bit like a minimum
matthiasm@8 143 note_s = notespectrum(:,:,1) .* notespectrum(:,:,2);
matthiasm@8 144
matthiasm@8 145 note_s(1:nBin*6,:) = 0;
matthiasm@8 146 note_s(isnan(note_s)) = 0;
matthiasm@8 147 %% tuning
matthiasm@8 148 % I wrap the note spectrum to one semitone, kind of generating the note
matthiasm@8 149 % spectrum modulo one semitone. one gets a matrix of size nBin x T. from
matthiasm@8 150 % that matrix, the tuning can be calculated by extracting the angle of the
matthiasm@8 151 % sum of all columns, the sum of all columns being a nBin-dimensional
matthiasm@8 152 % vector.
matthiasm@8 153 th = tuner(note_s(36*nBin+(1:12*nBin),:), nBin)
matthiasm@8 154 % th = tuner(notespectrum0(36*nBin+(1:12*nBin),:,2), nBin)
matthiasm@8 155 tuning = kammerton*2^((th/2/pi)/12)
matthiasm@8 156 % tune the spectrum by interpolating the bins such that now the middle bin
matthiasm@8 157 % of every semitone corresponds to the right frequency in equal
matthiasm@8 158 % temperament
matthiasm@8 159 tuned_s = zeros(size(note_s));
matthiasm@8 160 for iFrame = 1:T
matthiasm@8 161 tuned_s(:,iFrame) = interp1(1:nFbin, note_s(:,iFrame), (1:nFbin) + th / (2 * pi) * nBin,[],0);
matthiasm@8 162 end
matthiasm@8 163
matthiasm@8 164 %% reduce to 1 bin per semitone
matthiasm@8 165 % just sum the nBin bins that correspond to one semitone.
matthiasm@8 166 wrapped_tuned_s = reshape(tuned_s,[nBin,noterange*T]);
matthiasm@8 167 % if you want to emphasize the middle bin, use a fancy window, otherwise
matthiasm@8 168 % just use the rectangular window
matthiasm@8 169 wei = rectwin(nBin)';
matthiasm@8 170 salience.reduced = reshape(wei * wrapped_tuned_s,noterange,T);
matthiasm@8 171 % median filter in time direction, if you like
matthiasm@8 172 salience.reduced = medfilt1(salience.reduced,floor(hopspersecond/5)-1,[],2);
matthiasm@8 173
matthiasm@8 174 %% make chromagram
matthiasm@8 175 % actually the edge of the note spectrum is a bit noisy (due to the
matthiasm@8 176 % convolution three parts above. that's why I start some bins from the
matthiasm@8 177 % lower edge (i.e. midimin_used). maybe one could do this properly
matthiasm@8 178 % sometime.
matthiasm@8 179 midimin_used = midimin+6;
matthiasm@8 180 % uebergang is where bass and treble chroma overlap equally
matthiasm@8 181 uebergang = 50;
matthiasm@8 182
matthiasm@8 183 % some simple linear functions to generate the treble and bass profiles
matthiasm@8 184 leftfunc = zeros(2,noterange);
matthiasm@8 185 rightfunc = zeros(2,noterange);
matthiasm@8 186 leftfunc(1,:) = 2/12*((midimin:midimax)-midimin_used);
matthiasm@8 187 leftfunc(2,:) = 1/12*((midimin:midimax)-uebergang) + 0.5;
matthiasm@8 188 rightfunc(1,:) = -1/12*((midimin:midimax)-uebergang) + 0.5;
matthiasm@8 189 rightfunc(2,:) = -(.5)/12*((midimin:midimax)-midimax);
matthiasm@8 190
matthiasm@8 191 % initialise stuff
matthiasm@8 192 salience.bass = zeros(size(salience.reduced));
matthiasm@8 193 salience.treble = zeros(size(salience.reduced));
matthiasm@8 194 profiles = zeros(noterange,3);
matthiasm@8 195
matthiasm@8 196 % bass and treble profiles
matthiasm@8 197 for i = 1:2
matthiasm@8 198 profiles(:,i) = max(0,min(1,min(leftfunc(i,:),rightfunc(i,:))))';
matthiasm@8 199 end
matthiasm@8 200 % wide profile (i.e. both bass and treble)
matthiasm@8 201 profiles(:,3) = min(1,sum(profiles(:,1:2),2));
matthiasm@8 202
matthiasm@8 203 % calculate specific saliences (windowed in f0 direction)
matthiasm@8 204 salience.bass = salience.reduced .* profiles(:,ones(1,T));
matthiasm@8 205 salience.treble = salience.reduced .* profiles(:,2 * ones(1,T));
matthiasm@8 206 salience.wide = salience.reduced .* profiles(:,3 * ones(1,T));
matthiasm@8 207
matthiasm@8 208 % calculate chromagrams
matthiasm@8 209 chroma.bass = squeeze(max(reshape(salience.bass,[12,6,T]),[],2));
matthiasm@8 210 chroma.treble = squeeze(max(reshape(salience.treble,[12,6,T]),[],2));
matthiasm@8 211 chroma.wide = squeeze(max(reshape(salience.wide,[12,6,T]),[],2));
matthiasm@8 212