/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "SigProc_FIX.h"
#include "define.h"

#define QA                          24
#define A_LIMIT                     SILK_FIX_CONST( 0.99975, QA )

#define MUL32_FRAC_Q(a32, b32, Q)   ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))

/* The difficulty is in testing whether a 64-bit signed integer tmp64 overflows 32 bits,
 * since NEON has no 64-bit min, max or comparison instructions.
 * A failed idea is to compare the results of vmovn(tmp64) and vqmovn(tmp64) for equality.
 * However, that fails when tmp64 is something like 0xFFFFFFF980000000, where both narrow to 0x80000000.
 * Here we know that mult2Q >= 1, so the highest bit (bit 63, the sign bit) of tmp64 must equal bit 62.
 * Shift tmp64 left by 1 to get tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,
 * then bits 31 to 63 of tmp64 cannot all equal the sign bit, and therefore tmp64 overflows 32 bits.
 * That is, we test whether tmp64' > 0x00000000FFFFFFFF or tmp64' < 0xFFFFFFFF00000000.
 * Instead, we use a narrowing right shift by 31 bits, giving tmp32', to save data bandwidth and instructions.
 * That is, we test whether tmp32' > 0 or tmp32' < -1.
 */
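/* In scalar terms, the per-lane test performed by the NEON code below is
 * equivalent to the range check used in the scalar tail loop. Roughly:
 *
 *     opus_int32 tmp32 = (opus_int32)( tmp64 >> 31 );   // what vshrn_n_s64( ., 31 ) keeps per lane
 *     int overflow = ( tmp32 > 0 ) || ( tmp32 < -1 );   // same as tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN
 *
 * (illustrative sketch; tmp32 and overflow are not names used in this file).
 * The vectorized loop only accumulates these per-lane values into running
 * max/min vectors and performs the comparison once, after the main loop.
 */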

/* Compute inverse of LPC prediction gain, and                          */
/* test if LPC coefficients are stable (all poles within unit circle)   */
static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O   Returns inverse prediction gain in energy domain, Q30 */
    opus_int32 A_QA[ SILK_MAX_ORDER_LPC ],                   /* I   Prediction coefficients                               */
    const opus_int order                                     /* I   Prediction order                                      */
)
{
    opus_int   k, n, mult2Q;
    opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;
    opus_int32 max, min;
    int32x4_t  max_s32x4, min_s32x4;
    int32x2_t  max_s32x2, min_s32x2;

    max_s32x4 = vdupq_n_s32( silk_int32_MIN );
    min_s32x4 = vdupq_n_s32( silk_int32_MAX );
    invGain_Q30 = SILK_FIX_CONST( 1, 30 );
    for( k = order - 1; k > 0; k-- ) {
        int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
        int64x2_t mult2Q_s64x2;

        /* Check for stability */
        if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
            return 0;
        }

        /* Set RC equal to negated AR coef */
        rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );

        /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
        rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
        silk_assert( rc_mult1_Q30 > ( 1 << 15 ) );                   /* reduce A_LIMIT if fails */
        silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );

        /* Update inverse gain */
        /* invGain_Q30 range: [ 0 : 2^30 ] */
        invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
        silk_assert( invGain_Q30 >= 0 );
        silk_assert( invGain_Q30 <= ( 1 << 30 ) );
        if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
            return 0;
        }

        /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
        mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
        rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );

        /* Update AR coefficient */
        rc_Q31_s32x2   = vdup_n_s32( rc_Q31 );
        mult2Q_s64x2   = vdupq_n_s64( -mult2Q );
        rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );

        for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) {
            /* We always calculate extra elements of the A_QA buffer when ( k % 4 ) != 0, to take advantage of SIMD parallelization. */
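            /* In scalar terms, each iteration applies the same step-down
             * recursion as the tail loop below to four coefficients from the
             * front of A_QA and the four mirrored coefficients from the back,
             * using the pre-update values on both sides:
             *     A[ n + i ]         = ( A[ n + i ]         - MUL32_FRAC_Q( A[ k - n - 1 - i ], rc_Q31, 31 ) ) * rc_mult2 >> mult2Q (rounded)
             *     A[ k - n - 1 - i ] = ( A[ k - n - 1 - i ] - MUL32_FRAC_Q( A[ n + i ],         rc_Q31, 31 ) ) * rc_mult2 >> mult2Q (rounded)
             * The 64-bit intermediates are not range-checked here; their
             * ( >> 31 ) lanes are folded into max/min vectors and tested once
             * after the loop over k.
             */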
            int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;
            int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
            tmp1_s32x4  = vld1q_s32( A_QA + n );
            tmp2_s32x4  = vld1q_s32( A_QA + k - n - 4 );
            tmp2_s32x4  = vrev64q_s32( tmp2_s32x4 );
            tmp2_s32x4  = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) );
            t0_s32x4    = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 );
            t1_s32x4    = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 );
            t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 );
            t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );
            t0_s64x2    = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t1_s64x2    = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t2_s64x2    = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t3_s64x2    = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t0_s64x2    = vrshlq_s64( t0_s64x2, mult2Q_s64x2 );
            t1_s64x2    = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );
            t2_s64x2    = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );
            t3_s64x2    = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );
            t0_s32x4    = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) );
            t1_s32x4    = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) );
            s0_s32x4    = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) );
            s1_s32x4    = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );
            max_s32x4   = vmaxq_s32( max_s32x4, s0_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s0_s32x4 );
            max_s32x4   = vmaxq_s32( max_s32x4, s1_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s1_s32x4 );
            t1_s32x4    = vrev64q_s32( t1_s32x4 );
            t1_s32x4    = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );
            vst1q_s32( A_QA + n,         t0_s32x4 );
            vst1q_s32( A_QA + k - n - 4, t1_s32x4 );
        }
        for( ; n < ( k + 1 ) >> 1; n++ ) {
            opus_int64 tmp64;
            tmp1 = A_QA[ n ];
            tmp2 = A_QA[ k - n - 1 ];
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32( tmp1,
                  MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q );
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ n ] = ( opus_int32 )tmp64;
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32( tmp2,
                  MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2 ), mult2Q );
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;
        }
    }

    /* Check for stability */
    if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
        return 0;
    }

    max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) );
    min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );
    max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) );
    min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );
    max = vget_lane_s32( max_s32x2, 0 );
    min = vget_lane_s32( min_s32x2, 0 );
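    /* Each accumulated lane above holds bits 31..62 of one 64-bit intermediate
     * (see the note at the top of the file). If every lane is 0 or -1, all
     * intermediates fit in 32 bits; a lane > 0 or < -1 means at least one
     * coefficient update overflowed, so treat the filter as unstable. */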
    if( ( max > 0 ) || ( min < -1 ) ) {
        return 0;
    }

    /* Set RC equal to negated AR coef */
    rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );

    /* Range: [ 1 : 2^30 ] */
    rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );

    /* Update inverse gain */
    /* Range: [ 0 : 2^30 ] */
    invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
    silk_assert( invGain_Q30 >= 0 );
    silk_assert( invGain_Q30 <= ( 1 << 30 ) );
    if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
        return 0;
    }

    return invGain_Q30;
}

/* For input in Q12 domain */
opus_int32 silk_LPC_inverse_pred_gain_neon( /* O   Returns inverse prediction gain in energy domain, Q30 */
    const opus_int16 *A_Q12,                /* I   Prediction coefficients, Q12 [ order ]                 */
    const opus_int order                    /* I   Prediction order                                       */
)
{
#ifdef OPUS_CHECK_ASM
    const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );
#endif

    opus_int32 invGain_Q30;
    if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 ) ) {
        invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );
    }
    else {
        opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];
        opus_int32 DC_resp;
        int16x8_t  t0_s16x8, t1_s16x8, t2_s16x8;
        int32x4_t  t0_s32x4;
        const opus_int leftover = order & 7;

        /* Increase Q domain of the AR coefficients */
        t0_s16x8 = vld1q_s16( A_Q12 +  0 );
        t1_s16x8 = vld1q_s16( A_Q12 +  8 );
        t2_s16x8 = vld1q_s16( A_Q12 + 16 );
        t0_s32x4 = vpaddlq_s16( t0_s16x8 );

        switch( order - leftover )
        {
        case 24:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
            /* FALLTHROUGH */

        case 16:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
            /* FALLTHROUGH */

        case 8:
        {
            const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) );
            const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );
            DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );
            vst1q_s32( Atmp_QA +  8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
        }
        break;

        default:
            DC_resp = 0;
            break;
        }
        A_Q12 += order - leftover;

        switch( leftover )
        {
        case 6:
            DC_resp += (opus_int32)A_Q12[ 5 ];
            DC_resp += (opus_int32)A_Q12[ 4 ];
            /* FALLTHROUGH */

        case 4:
            DC_resp += (opus_int32)A_Q12[ 3 ];
            DC_resp += (opus_int32)A_Q12[ 2 ];
            /* FALLTHROUGH */

        case 2:
            DC_resp += (opus_int32)A_Q12[ 1 ];
            DC_resp += (opus_int32)A_Q12[ 0 ];
            /* FALLTHROUGH */

        default:
            break;
        }
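        /* 4096 is 1.0 in Q12, so DC_resp >= 4096 means the coefficients sum to
         * at least one, i.e. 1 - sum( a_i ) <= 0 at DC. */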
        /* If the DC is unstable, we don't even need to do the full calculations */
        if( DC_resp >= 4096 ) {
            invGain_Q30 = 0;
        } else {
            vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
            invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
        }
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( invGain_Q30_c == invGain_Q30 );
#endif

    return invGain_Q30;
}
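
/* Illustrative usage (a sketch, not part of the original file): with
 * SILK_MAX_ORDER_LPC == 24 and an even order the NEON path is taken,
 * otherwise the call transparently falls back to the C implementation.
 *
 *     opus_int16 A_Q12[ 16 ];                            // hypothetical Q12 AR coefficients, filled elsewhere
 *     opus_int32 invGain_Q30 = silk_LPC_inverse_pred_gain_neon( A_Q12, 16 );
 *     // invGain_Q30 == 0 flags an unstable filter or too high prediction gain;
 *     // otherwise it is the inverse prediction gain in the energy domain, Q30.
 */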