view _misc/featureextraction/chromagenerator.m @ 9:4ea6619cb3f5 tip

removed log files
author matthiasm
date Fri, 11 Apr 2014 15:55:11 +0100
parents b5b38998ef3b
children
line wrap: on
line source
function [chroma, t, salience, tuning, URIstring] = chromagenerator(filename, kammerton, nBin)

%% make URI string
URIstring = strcat('chromagenerator-', num2str(kammerton), '-',num2str(nBin));

%% parameters
% set the bins per semitone (i.e. binsperoctave/12)
% kammerton = 440;
% nBin = 7;
midbin = ceil(nBin/2);

% minimum and maximum MIDI bin, should be n octaves apart.
midimin = 21; % 69 has 440 Hz
% midimin = 45; % 69 has 440 Hz
midimax = 92; 
% midimax = 68; 
noterange = midimax - midimin + 1;

% midi note values including the fractions "between" actual MIDI notes
midibins = midimin-(midbin-1)/nBin:(1/nBin):midimax+(midbin-1)/nBin;
% the same in frequencies
fbins = kammerton * 2.^((midibins-69)/12);
nFbin = length(fbins);

% the sampling frequency everything is (down)sampled to
used_fs = 11025;

% intuitive setting of how long the frame is (quite short here!)
fracofsecond = 4; 

hopspersecond = 20;

% calculate a convenient FFT length
nFFT = 2^nextpow2(used_fs/fracofsecond);
% hopsize in samples
hopsize = used_fs/hopspersecond/nFFT;

% "instrument" means a particular spectral shape
used_instruments = 1:2;
nInstrument = length(used_instruments);

%% generate dictionary
% (complex) tones for all fundamental frequencies are generated. then their
% amplitude spectrum is calculated and normalised according to the L2 norm.

% preliminary t for dictionary generation
t = (1:nFFT)/used_fs;
% initialise note dictionary
A = zeros(nFbin,nFFT/2,nInstrument);

% loop over all instruments
for iInstrument = used_instruments
    % loop over all fundamental frequencies
    for iFbin = 1:nFbin;
        f0 = fbins(iFbin);
        switch iInstrument
            case 1
                wave = sum([sin(t*2*pi*f0*1); ...
                    0.9^1 * sin(t*2*pi*f0*2); ...
                    0.9^2 * sin(t*2*pi*f0*3); ...
                    0.9^3 * sin(t*2*pi*f0*4); ...
                    ...0.9^4 * sin(t*2*pi*f0*5); ...
                    ]);
            case 2
                wave = sin(t*2*pi*f0*1);
        end
        % window wave and fft
        wave = hamming(nFFT)' .* wave;
        fullfft = fft(wave);
        % calculate amplitude spectrum from fft
        A(iFbin,:,iInstrument) = abs(fullfft(1:round(nFFT/2)));
    end
    % normalise (L2 norm)
    A(:,:,iInstrument) = qnormalise(A(:,:,iInstrument),2,2);
end

%% get the fft frames from a wave file
%this is probably ending up not being used in a vamp plugin
% fprintf(1,'%s\n',filename)
% [audiosize,fs] = wavread(filename,'size');
% chunk_sec = 20;
% nChunk = ceil(audiosize(1)/fs/chunk_sec);
% songframes = [];
% start = 1;
% % extract fft from chunks (for less memory consumption)
% for iChunk = 1:nChunk
%     samplemin = (iChunk-1) * chunk_sec * fs + 1;
%     samplemax = min(audiosize(1), (iChunk * chunk_sec + 1) * fs);    
%     audiodata0 = wavread(filename,[samplemin samplemax]);   
%     audiodata = resample(mean(audiodata0,2),used_fs,fs,20);
%     songframes0 = myframefft(audiodata,nFFT,hopsize,'hamming');
%     if size(songframes0,2)>=start
%         songframes = [songframes(:,1:end-start) abs(songframes0(1:round(nFFT/2),start:end))];
%     end
%     start = round(hopspersecond/2)+1;
% end
% clear songframes0
% clear audiodata
% clear audiodata0
% t = (0:T-1)./hopspersecond;

%%
[songframes,f,t]=chunkspectrogram(filename,4096,4096*7/8,4096,inf);
hopspersecond = 1/(t(2)-t(1));
songframes = abs(songframes);
%%
T = size(songframes,2);

fprintf(1, '%d different notes, %d frequency bins, %d time frames\n', noterange,nFFT/2, T)

notespectrum0 = zeros(nFbin, T, nInstrument);
songframes = qnormalise(songframes,2,1);
songframes(isnan(songframes)) = 0;

%% calculate cosine distance
for iInstrument = 1:nInstrument
    for iTone = 1:nFbin
        notespectrum0(iTone,:,iInstrument) = A(iTone,:,iInstrument) * songframes;
    end
end
%% get only the bins with "more than 1 st. deviation"
notespectrum = zeros(size(notespectrum0));
wind_size = 6 * nBin;
wind = hamming(wind_size)/sum(hamming(wind_size));
stdthresh = 1;
for iInstrument = used_instruments
    % calculate running mean
    m = conv2(notespectrum0(:,:,iInstrument),wind(:),'same');
    % calculate running standard deviation
    stdev = sqrt(conv2((notespectrum0(:,:,iInstrument)-m).^2,wind(:),'same'));
    % take only those tones that are above 1 standard deviation from the
    % mean
    notespectrum(:,:,iInstrument) = (notespectrum0(:,:,iInstrument)-m)./stdev;
    temp = notespectrum(:,:,iInstrument);
    temp(temp<stdthresh) = 0;
    notespectrum(:,:,iInstrument) = temp;
end
% notes that are one were not "good enough" (not greater than 1 st.
% deviation reater than the mean) and are hence set to 0.
notespectrum(notespectrum<=stdthresh) = 0;
% we require that the notes be on in *both* "instruments". hence the final
% note spectrum note_s is their product, which works a bit like a minimum
note_s =  notespectrum(:,:,1) .* notespectrum(:,:,2);

note_s(1:nBin*6,:) = 0;
note_s(isnan(note_s)) = 0;
%% tuning
% I wrap the note spectrum to one semitone, kind of generating the note
% spectrum modulo one semitone. one gets a matrix of size nBin x T. from
% that matrix, the tuning can be calculated by extracting the angle of the
% sum of all columns, the sum of all columns being a nBin-dimensional
% vector.
th = tuner(note_s(36*nBin+(1:12*nBin),:), nBin)
% th = tuner(notespectrum0(36*nBin+(1:12*nBin),:,2), nBin)
tuning = kammerton*2^((th/2/pi)/12)
% tune the spectrum by interpolating the bins such that now the middle bin
% of every semitone corresponds to the right frequency in equal
% temperament
tuned_s = zeros(size(note_s));
for iFrame = 1:T
    tuned_s(:,iFrame) = interp1(1:nFbin, note_s(:,iFrame), (1:nFbin) + th / (2 * pi) * nBin,[],0);
end

%% reduce to 1 bin per semitone
% just sum the nBin bins that correspond to one semitone.
wrapped_tuned_s = reshape(tuned_s,[nBin,noterange*T]);
% if you want to emphasize the middle bin, use a fancy window, otherwise
% just use the rectangular window
wei = rectwin(nBin)';
salience.reduced = reshape(wei * wrapped_tuned_s,noterange,T);
% median filter in time direction, if you like
salience.reduced = medfilt1(salience.reduced,floor(hopspersecond/5)-1,[],2);

%% make chromagram
% actually the edge of the note spectrum is a bit noisy (due to the
% convolution three parts above. that's why I start some bins from the
% lower edge (i.e. midimin_used). maybe one could do this properly
% sometime.
midimin_used = midimin+6;
% uebergang is where bass and treble chroma overlap equally
uebergang = 50;

% some simple linear functions to generate the treble and bass profiles
leftfunc = zeros(2,noterange);
rightfunc = zeros(2,noterange);
leftfunc(1,:) = 2/12*((midimin:midimax)-midimin_used);
leftfunc(2,:) = 1/12*((midimin:midimax)-uebergang) + 0.5;
rightfunc(1,:) = -1/12*((midimin:midimax)-uebergang) + 0.5;
rightfunc(2,:) = -(.5)/12*((midimin:midimax)-midimax);

% initialise stuff
salience.bass = zeros(size(salience.reduced));
salience.treble = zeros(size(salience.reduced));
profiles = zeros(noterange,3);

% bass and treble profiles
for i = 1:2
   profiles(:,i) = max(0,min(1,min(leftfunc(i,:),rightfunc(i,:))))';
end
% wide profile (i.e. both bass and treble)
profiles(:,3) = min(1,sum(profiles(:,1:2),2));

% calculate specific saliences (windowed in f0 direction)
salience.bass = salience.reduced .* profiles(:,ones(1,T));
salience.treble = salience.reduced .* profiles(:,2 * ones(1,T));
salience.wide = salience.reduced .* profiles(:,3 * ones(1,T));

% calculate chromagrams
chroma.bass = squeeze(max(reshape(salience.bass,[12,6,T]),[],2));
chroma.treble = squeeze(max(reshape(salience.treble,[12,6,T]),[],2));
chroma.wide = squeeze(max(reshape(salience.wide,[12,6,T]),[],2));