annotate src/opus-1.3/silk/VAD.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 7aeed7906520
children
rev   line source
Chris@69 1 /***********************************************************************
Chris@69 2 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
Chris@69 3 Redistribution and use in source and binary forms, with or without
Chris@69 4 modification, are permitted provided that the following conditions
Chris@69 5 are met:
Chris@69 6 - Redistributions of source code must retain the above copyright notice,
Chris@69 7 this list of conditions and the following disclaimer.
Chris@69 8 - Redistributions in binary form must reproduce the above copyright
Chris@69 9 notice, this list of conditions and the following disclaimer in the
Chris@69 10 documentation and/or other materials provided with the distribution.
Chris@69 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
Chris@69 12 names of specific contributors, may be used to endorse or promote
Chris@69 13 products derived from this software without specific prior written
Chris@69 14 permission.
Chris@69 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
Chris@69 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
Chris@69 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Chris@69 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
Chris@69 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
Chris@69 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
Chris@69 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
Chris@69 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
Chris@69 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
Chris@69 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
Chris@69 25 POSSIBILITY OF SUCH DAMAGE.
Chris@69 26 ***********************************************************************/
Chris@69 27
Chris@69 28 #ifdef HAVE_CONFIG_H
Chris@69 29 #include "config.h"
Chris@69 30 #endif
Chris@69 31
Chris@69 32 #include "main.h"
Chris@69 33 #include "stack_alloc.h"
Chris@69 34
Chris@69 35 /* Silk VAD noise level estimation: forward declaration, the definition follows silk_VAD_GetSA_Q8_c() later in this file. When OPUS_X86_MAY_HAVE_SSE4_1 is defined no prototype is given here; presumably one comes from an included header (TODO confirm). */
Chris@69 36 # if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
Chris@69 37 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
Chris@69 38 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
Chris@69 39 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
Chris@69 40 );
Chris@69 41 #endif
Chris@69 42
Chris@69 43 /**********************************/
Chris@69 44 /* Initialization of the Silk VAD: clears the whole state, then seeds NoiseLevelBias[], NL[], inv_NL[], counter and NrgRatioSmth_Q8[] */
Chris@69 45 /**********************************/
Chris@69 46 opus_int silk_VAD_Init( /* O Return value, 0 if success (ret is never changed from 0 in this function) */
Chris@69 47 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
Chris@69 48 )
Chris@69 49 {
Chris@69 50 opus_int b, ret = 0;
Chris@69 51
Chris@69 52 /* reset state memory */
Chris@69 53 silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
Chris@69 54
Chris@69 55 /* init noise levels */
Chris@69 56 /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
Chris@69 57 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 58 psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 ); /* bias divided by ( b + 1 ); the max with 1 presumably keeps NL[] below non-zero before it is used as a divisor */
Chris@69 59 }
Chris@69 60
Chris@69 61 /* Initialize state */
Chris@69 62 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 63 psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] ); /* initial noise level: 100 times the band's bias */
Chris@69 64 psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] ); /* inverse noise level, same silk_int32_MAX / x form used in silk_VAD_GetNoiseLevels */
Chris@69 65 }
Chris@69 66 psSilk_VAD->counter = 15; /* frame counter read by silk_VAD_GetNoiseLevels, which derives its minimum smoothing coefficient from counter >> 4 */
Chris@69 67
Chris@69 68 /* init smoothed energy-to-noise ratio */
Chris@69 69 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 70 psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */
Chris@69 71 }
Chris@69 72
Chris@69 73 return( ret );
Chris@69 74 }
Chris@69 75
Chris@69 76 /* Weighting factors for tilt measure: one weight per band, applied to each band's SNR_Q7 when accumulating input_tilt in silk_VAD_GetSA_Q8_c; positive for the two lowest bands, negative for the two highest */
Chris@69 77 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
Chris@69 78
Chris@69 79 /***************************************/
Chris@69 80 /* Get the speech activity level in Q8 */
Chris@69 81 /***************************************/
Chris@69 82 opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success (ret is never changed from 0 in this function) */
Chris@69 83 silk_encoder_state *psEncC, /* I/O Encoder state: frame_length and fs_kHz are read; speech_activity_Q8, input_tilt_Q15, input_quality_bands_Q15 and the sVAD state are written */
Chris@69 84 const opus_int16 pIn[] /* I PCM input, handed to the first analysis filter bank together with frame_length */
Chris@69 85 )
Chris@69 86 {
Chris@69 87 opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
Chris@69 88 opus_int decimated_framelength1, decimated_framelength2;
Chris@69 89 opus_int decimated_framelength;
Chris@69 90 opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
Chris@69 91 opus_int32 sumSquared, smooth_coef_Q16;
Chris@69 92 opus_int16 HPstateTmp;
Chris@69 93 VARDECL( opus_int16, X );
Chris@69 94 opus_int32 Xnrg[ VAD_N_BANDS ];
Chris@69 95 opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
Chris@69 96 opus_int32 speech_nrg, x_tmp;
Chris@69 97 opus_int X_offset[ VAD_N_BANDS ];
Chris@69 98 opus_int ret = 0;
Chris@69 99 silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
Chris@69 100 SAVE_STACK;
Chris@69 101
Chris@69 102 /* Safety checks */
Chris@69 103 silk_assert( VAD_N_BANDS == 4 );
Chris@69 104 celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
Chris@69 105 celt_assert( psEncC->frame_length <= 512 );
Chris@69 106 celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) ); /* frame_length must be a multiple of 8 so the three decimation stages divide evenly */
Chris@69 107
Chris@69 108 /***********************/
Chris@69 109 /* Filter and Decimate */
Chris@69 110 /***********************/
Chris@69 111 decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 ); /* frame_length / 2 */
Chris@69 112 decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 ); /* frame_length / 4 */
Chris@69 113 decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 ); /* frame_length / 8 */
Chris@69 114 /* Decimate into 4 bands:
Chris@69 115 0 L 3L L 3L 5L
Chris@69 116 - -- - -- --
Chris@69 117 8 8 2 4 4
Chris@69 118
Chris@69 119 [0-1 kHz| temp. |1-2 kHz| 2-4 kHz | 4-8 kHz |
Chris@69 120
Chris@69 121 They're arranged to allow the minimal ( frame_length / 4 ) extra
Chris@69 122 scratch space during the downsampling process */
Chris@69 123 X_offset[ 0 ] = 0;
Chris@69 124 X_offset[ 1 ] = decimated_framelength + decimated_framelength2; /* 3 * frame_length / 8 */
Chris@69 125 X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength; /* frame_length / 2 */
Chris@69 126 X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2; /* 3 * frame_length / 4 */
Chris@69 127 ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 ); /* 5 * frame_length / 4 samples in total, matching the layout above */
Chris@69 128
Chris@69 129 /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
Chris@69 130 silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ],
Chris@69 131 X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
Chris@69 132
Chris@69 133 /* 0-4 kHz to 0-2 kHz and 2-4 kHz; X is both the input and the first output here */
Chris@69 134 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
Chris@69 135 X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
Chris@69 136
Chris@69 137 /* 0-2 kHz to 0-1 kHz and 1-2 kHz; X is again both the input and the first output */
Chris@69 138 silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
Chris@69 139 X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
Chris@69 140
Chris@69 141 /*********************************************/
Chris@69 142 /* HP filter on lowest band (differentiator) */
Chris@69 143 /*********************************************/
Chris@69 144 X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
Chris@69 145 HPstateTmp = X[ decimated_framelength - 1 ]; /* halved last sample, stored as HPstate for the next call */
Chris@69 146 for( i = decimated_framelength - 1; i > 0; i-- ) { /* runs backwards so each X[ i - 1 ] is halved before it is subtracted from X[ i ] */
Chris@69 147 X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 );
Chris@69 148 X[ i ] -= X[ i - 1 ];
Chris@69 149 }
Chris@69 150 X[ 0 ] -= psSilk_VAD->HPstate; /* first sample is differenced against the value saved by the previous call */
Chris@69 151 psSilk_VAD->HPstate = HPstateTmp;
Chris@69 152
Chris@69 153 /*************************************/
Chris@69 154 /* Calculate the energy in each band */
Chris@69 155 /*************************************/
Chris@69 156 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 157 /* Find the decimated framelength in the non-uniformly divided bands: with VAD_N_BANDS == 4 the shifts are 3, 3, 2, 1 for b = 0..3 */
Chris@69 158 decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
Chris@69 159
Chris@69 160 /* Split length into subframe lengths */
Chris@69 161 dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
Chris@69 162 dec_subframe_offset = 0;
Chris@69 163
Chris@69 164 /* Compute energy per sub-frame */
Chris@69 165 /* initialize with summed energy of last subframe (XnrgSubfr[ b ] as stored at the end of this loop body on the previous call) */
Chris@69 166 Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
Chris@69 167 for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
Chris@69 168 sumSquared = 0;
Chris@69 169 for( i = 0; i < dec_subframe_length; i++ ) {
Chris@69 170 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
Chris@69 171 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
Chris@69 172 x_tmp = silk_RSHIFT(
Chris@69 173 X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
Chris@69 174 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
Chris@69 175
Chris@69 176 /* Safety check */
Chris@69 177 silk_assert( sumSquared >= 0 );
Chris@69 178 }
Chris@69 179
Chris@69 180 /* Add/saturate summed energy of current subframe */
Chris@69 181 if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
Chris@69 182 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
Chris@69 183 } else {
Chris@69 184 /* Look-ahead subframe: the final subframe is added at half weight */
Chris@69 185 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
Chris@69 186 }
Chris@69 187
Chris@69 188 dec_subframe_offset += dec_subframe_length;
Chris@69 189 }
Chris@69 190 psSilk_VAD->XnrgSubfr[ b ] = sumSquared; /* full (unhalved) energy of the final subframe, used as the starting value of Xnrg[ b ] on the next call */
Chris@69 191 }
Chris@69 192
Chris@69 193 /********************/
Chris@69 194 /* Noise estimation */
Chris@69 195 /********************/
Chris@69 196 silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD ); /* updates psSilk_VAD->NL[], which is read below */
Chris@69 197
Chris@69 198 /***********************************************/
Chris@69 199 /* Signal-plus-noise to noise ratio estimation */
Chris@69 200 /***********************************************/
Chris@69 201 sumSquared = 0;
Chris@69 202 input_tilt = 0;
Chris@69 203 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 204 speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
Chris@69 205 if( speech_nrg > 0 ) {
Chris@69 206 /* Divide, with sufficient resolution: if bits 23..31 of Xnrg are clear it is shifted up by 8 first, otherwise the noise level is shifted down by 8 instead */
Chris@69 207 if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
Chris@69 208 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
Chris@69 209 } else {
Chris@69 210 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
Chris@69 211 }
Chris@69 212
Chris@69 213 /* Convert to log domain */
Chris@69 214 SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
Chris@69 215
Chris@69 216 /* Sum-of-squares */
Chris@69 217 sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
Chris@69 218
Chris@69 219 /* Tilt measure */
Chris@69 220 if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
Chris@69 221 /* Scale down SNR value for small subband speech energies */
Chris@69 222 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
Chris@69 223 }
Chris@69 224 input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
Chris@69 225 } else {
Chris@69 226 NrgToNoiseRatio_Q8[ b ] = 256; /* 256 in Q8; this band adds nothing to sumSquared or input_tilt */
Chris@69 227 }
Chris@69 228 }
Chris@69 229
Chris@69 230 /* Mean-of-squares */
Chris@69 231 sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
Chris@69 232
Chris@69 233 /* Root-mean-square approximation, scale to dBs, and write to output pointer */
Chris@69 234 pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
Chris@69 235
Chris@69 236 /*********************************/
Chris@69 237 /* Speech Probability Estimation */
Chris@69 238 /*********************************/
Chris@69 239 SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
Chris@69 240
Chris@69 241 /**************************/
Chris@69 242 /* Frequency Tilt Measure */
Chris@69 243 /**************************/
Chris@69 244 psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
Chris@69 245
Chris@69 246 /**************************************************/
Chris@69 247 /* Scale the sigmoid output based on power levels */
Chris@69 248 /**************************************************/
Chris@69 249 speech_nrg = 0;
Chris@69 250 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 251 /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
Chris@69 252 speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
Chris@69 253 }
Chris@69 254
Chris@69 255 if( psEncC->frame_length == 20 * psEncC->fs_kHz ) { /* halve the energy when the frame holds 20 * fs_kHz samples (presumably a 20 ms frame, TODO confirm units of fs_kHz) */
Chris@69 256 speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
Chris@69 257 }
Chris@69 258 /* Power scaling: halve SA_Q15 when there is no speech energy, scale it by ( 32768 + sqrt ) when the energy is below 16384, leave it unchanged otherwise */
Chris@69 259 if( speech_nrg <= 0 ) {
Chris@69 260 SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
Chris@69 261 } else if( speech_nrg < 16384 ) {
Chris@69 262 speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
Chris@69 263
Chris@69 264 /* square-root */
Chris@69 265 speech_nrg = silk_SQRT_APPROX( speech_nrg );
Chris@69 266 SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
Chris@69 267 }
Chris@69 268
Chris@69 269 /* Copy the resulting speech activity in Q8 (Q15 shifted down by 7, limited to silk_uint8_MAX) */
Chris@69 270 psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
Chris@69 271
Chris@69 272 /***********************************/
Chris@69 273 /* Energy Level and SNR estimation */
Chris@69 274 /***********************************/
Chris@69 275 /* Smoothing coefficient, derived from the square of SA_Q15 */
Chris@69 276 smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
Chris@69 277
Chris@69 278 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { /* halve the coefficient when the frame holds 10 * fs_kHz samples (presumably a 10 ms frame, TODO confirm) */
Chris@69 279 smooth_coef_Q16 >>= 1;
Chris@69 280 }
Chris@69 281
Chris@69 282 for( b = 0; b < VAD_N_BANDS; b++ ) {
Chris@69 283 /* compute smoothed energy-to-noise ratio per band */
Chris@69 284 psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
Chris@69 285 NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
Chris@69 286
Chris@69 287 /* signal to noise ratio in dB per band */
Chris@69 288 SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
Chris@69 289 /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
Chris@69 290 psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
Chris@69 291 }
Chris@69 292
Chris@69 293 RESTORE_STACK;
Chris@69 294 return( ret );
Chris@69 295 }
Chris@69 296
Chris@69 297 /**************************/
Chris@69 298 /* Noise level estimation: updates NL[], inv_NL[] and counter in the VAD state from the current subband energies. Has internal linkage (static OPUS_INLINE) unless OPUS_X86_MAY_HAVE_SSE4_1 is defined. */
Chris@69 299 /**************************/
Chris@69 300 # if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
Chris@69 301 static OPUS_INLINE
Chris@69 302 #endif
Chris@69 303 void silk_VAD_GetNoiseLevels(
Chris@69 304 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
Chris@69 305 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
Chris@69 306 )
Chris@69 307 {
Chris@69 308 opus_int k;
Chris@69 309 opus_int32 nl, nrg, inv_nrg;
Chris@69 310 opus_int coef, min_coef;
Chris@69 311
Chris@69 312 /* Initially faster smoothing: while counter is below 1000, min_coef is silk_int16_MAX divided by ( ( counter >> 4 ) + 1 ), so it shrinks as counter grows */
Chris@69 313 if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
Chris@69 314 min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
Chris@69 315 /* Increment frame counter */
Chris@69 316 psSilk_VAD->counter++;
Chris@69 317 } else {
Chris@69 318 min_coef = 0; /* no lower bound on coef once counter has reached 1000 */
Chris@69 319 }
Chris@69 320
Chris@69 321 for( k = 0; k < VAD_N_BANDS; k++ ) {
Chris@69 322 /* Get old noise level estimate for current band */
Chris@69 323 nl = psSilk_VAD->NL[ k ];
Chris@69 324 silk_assert( nl >= 0 );
Chris@69 325
Chris@69 326 /* Add bias */
Chris@69 327 nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
Chris@69 328 silk_assert( nrg > 0 );
Chris@69 329
Chris@69 330 /* Invert energies */
Chris@69 331 inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
Chris@69 332 silk_assert( inv_nrg >= 0 );
Chris@69 333
Chris@69 334 /* Less update when subband energy is high: an eighth of the base coefficient when nrg exceeds 8 * nl, the full coefficient when nrg is below nl, and a value scaled by inv_nrg * nl in between */
Chris@69 335 if( nrg > silk_LSHIFT( nl, 3 ) ) {
Chris@69 336 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
Chris@69 337 } else if( nrg < nl ) {
Chris@69 338 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
Chris@69 339 } else {
Chris@69 340 coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
Chris@69 341 }
Chris@69 342
Chris@69 343 /* Initially faster smoothing: coef is never allowed below min_coef */
Chris@69 344 coef = silk_max_int( coef, min_coef );
Chris@69 345
Chris@69 346 /* Smooth inverse energies: move inv_NL[ k ] towards inv_nrg by a fraction given by coef */
Chris@69 347 psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
Chris@69 348 silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
Chris@69 349
Chris@69 350 /* Compute noise level by inverting again */
Chris@69 351 nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
Chris@69 352 silk_assert( nl >= 0 );
Chris@69 353
Chris@69 354 /* Limit noise levels (guarantee 7 bits of head room) */
Chris@69 355 nl = silk_min( nl, 0x00FFFFFF );
Chris@69 356
Chris@69 357 /* Store as part of state */
Chris@69 358 psSilk_VAD->NL[ k ] = nl;
Chris@69 359 }
Chris@69 360 }