src/opus-1.3/silk/arm/LPC_inv_pred_gain_neon_intr.c

/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "SigProc_FIX.h"
#include "define.h"

#define QA                          24
#define A_LIMIT                     SILK_FIX_CONST( 0.99975, QA )

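/* Rounded ( a32 * b32 ) / 2^Q, computed with a 64-bit intermediate so the product cannot overflow before the shift. */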
#define MUL32_FRAC_Q(a32, b32, Q)   ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))

/* The difficulty is how to determine whether a 64-bit signed integer tmp64 overflows 32 bits,
 * since NEON has no 64-bit min, max or comparison instructions.
 * A tempting but incorrect idea is to compare vmovn(tmp64) and vqmovn(tmp64) for equality.
 * That idea fails when tmp64 is a value such as 0xFFFFFFF980000000.
 * Here we know that mult2Q >= 1, so the highest bit of tmp64 (bit 63, the sign bit) must equal bit 62.
 * Shift tmp64 left by 1 to get tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,
 * then bits 31 to 63 of tmp64 cannot all equal the sign bit, and therefore tmp64 overflows 32 bits.
 * That is, we test whether tmp64' > 0x00000000FFFFFFFF or tmp64' <= 0xFFFFFFFF00000000.
 * A narrowing right shift by 31 bits produces tmp32', which saves data bandwidth and instructions.
 * That is, we test whether tmp32' > 0x00000000 or tmp32' <= 0xFFFFFFFF.
 */
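
/* Worked example (illustrative, following the reasoning above): for tmp64 = 0xFFFFFFF980000000
 * (about -2.8e10, well below silk_int32_MIN), vmovn gives 0x80000000 and vqmovn saturates to
 * 0x80000000 as well, so the equality test would wrongly report "no overflow". With the scheme
 * used here, tmp32' = high_half(tmp64 << 1) = 0xFFFFFFF3, which is neither 0 nor -1, so the
 * running max/min tracking below detects the overflow and the function returns 0.
 */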

/* Compute inverse of LPC prediction gain, and                          */
/* test if LPC coefficients are stable (all poles within unit circle)   */
static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O   Returns inverse prediction gain in energy domain, Q30 */
    opus_int32 A_QA[ SILK_MAX_ORDER_LPC ],                   /* I   Prediction coefficients                                */
    const opus_int order                                     /* I   Prediction order                                       */
)
{
    opus_int   k, n, mult2Q;
    opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;
    opus_int32 max, min;
    int32x4_t  max_s32x4, min_s32x4;
    int32x2_t  max_s32x2, min_s32x2;

    max_s32x4 = vdupq_n_s32( silk_int32_MIN );
    min_s32x4 = vdupq_n_s32( silk_int32_MAX );
    invGain_Q30 = SILK_FIX_CONST( 1, 30 );
    for( k = order - 1; k > 0; k-- ) {
        int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
        int64x2_t mult2Q_s64x2;

        /* Check for stability */
        if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
            return 0;
        }

        /* Set RC equal to negated AR coef */
        rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );

        /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
        rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
        silk_assert( rc_mult1_Q30 > ( 1 << 15 ) );                   /* reduce A_LIMIT if fails */
        silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );

        /* Update inverse gain */
        /* invGain_Q30 range: [ 0 : 2^30 ] */
        invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
        silk_assert( invGain_Q30 >= 0           );
        silk_assert( invGain_Q30 <= ( 1 << 30 ) );
        if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
            return 0;
        }

        /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
        mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
        rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );

        /* Update AR coefficient */
        rc_Q31_s32x2   = vdup_n_s32( rc_Q31 );
        mult2Q_s64x2   = vdupq_n_s64( -mult2Q );
        rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );

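        /* The vectorized loop below applies, four coefficient pairs at a time, the same update as the
         * scalar tail loop further down:
         *     A_QA[ n ]         = ( A_QA[ n ]         - MUL32_FRAC_Q( A_QA[ k - n - 1 ], rc_Q31, 31 ) ) * rc_mult2 >> mult2Q
         *     A_QA[ k - n - 1 ] = ( A_QA[ k - n - 1 ] - MUL32_FRAC_Q( A_QA[ n ],         rc_Q31, 31 ) ) * rc_mult2 >> mult2Q
         * with saturating subtraction and rounding shifts; both new values are computed from the original
         * pair, and 0 is returned if any result overflows 32 bits.
         */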
        for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) {
            /* We always calculate extra elements of the A_QA buffer when ( k % 4 ) != 0, to take advantage of SIMD parallelization. */
            int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;
            int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
            tmp1_s32x4 = vld1q_s32( A_QA + n );
            tmp2_s32x4 = vld1q_s32( A_QA + k - n - 4 );
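            /* Fully reverse the four lanes of tmp2 (vrev64q swaps within each 64-bit half, the vcombine
             * swaps the halves), so that lane i holds A_QA[ k - n - 1 - i ] and lines up with lane i of
             * tmp1, which holds A_QA[ n + i ]. */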
            tmp2_s32x4  = vrev64q_s32( tmp2_s32x4 );
            tmp2_s32x4  = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) );
            t0_s32x4    = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 );
            t1_s32x4    = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 );
            t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 );
            t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );
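            /* vqrdmulhq computes the rounded doubling high half, which matches MUL32_FRAC_Q( x, rc_Q31, 31 )
             * except for saturation at the extreme input, and vqsubq is the saturating subtraction
             * silk_SUB_SAT32; together they mirror the scalar tail loop below. */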
            t0_s64x2    = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t1_s64x2    = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );
            t2_s64x2    = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t3_s64x2    = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );
            t0_s64x2    = vrshlq_s64( t0_s64x2, mult2Q_s64x2 );
            t1_s64x2    = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );
            t2_s64x2    = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );
            t3_s64x2    = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );
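            /* vmovn keeps the low 32 bits of each 64-bit result (the updated coefficients); vshrn by 31
             * keeps bits 31..62, which feed the running max/min used for the 32-bit overflow test
             * described at the top of the file. */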
            t0_s32x4    = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) );
            t1_s32x4    = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) );
            s0_s32x4    = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) );
            s1_s32x4    = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );
            max_s32x4   = vmaxq_s32( max_s32x4, s0_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s0_s32x4 );
            max_s32x4   = vmaxq_s32( max_s32x4, s1_s32x4 );
            min_s32x4   = vminq_s32( min_s32x4, s1_s32x4 );
            t1_s32x4    = vrev64q_s32( t1_s32x4 );
            t1_s32x4    = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );
            vst1q_s32( A_QA + n,         t0_s32x4 );
            vst1q_s32( A_QA + k - n - 4, t1_s32x4 );
        }
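        /* Scalar tail: process the remaining coefficient pairs (at most three) with the reference form of
         * the update, checking each 64-bit intermediate against the 32-bit range directly. */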
        for( ; n < ( k + 1 ) >> 1; n++ ) {
            opus_int64 tmp64;
            tmp1 = A_QA[ n ];
            tmp2 = A_QA[ k - n - 1 ];
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32( tmp1,
                MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q );
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ n ] = ( opus_int32 )tmp64;
            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32( tmp2,
                MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2 ), mult2Q );
            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
                return 0;
            }
            A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;
        }
    }

    /* Check for stability */
    if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
        return 0;
    }

    max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) );
    min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );
    max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) );
    min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );
    max = vget_lane_s32( max_s32x2, 0 );
    min = vget_lane_s32( min_s32x2, 0 );
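    /* max/min now hold the horizontal reduction of the narrowed high halves across all lanes; per the
     * analysis at the top of the file, a vectorized update overflowed 32 bits iff max > 0 or min < -1. */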
    if( ( max > 0 ) || ( min < -1 ) ) {
        return 0;
    }

    /* Set RC equal to negated AR coef */
    rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );

    /* Range: [ 1 : 2^30 ] */
    rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );

    /* Update inverse gain */
    /* Range: [ 0 : 2^30 ] */
    invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
    silk_assert( invGain_Q30 >= 0           );
    silk_assert( invGain_Q30 <= ( 1 << 30 ) );
    if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
        return 0;
    }

    return invGain_Q30;
}

/* For input in Q12 domain */
opus_int32 silk_LPC_inverse_pred_gain_neon( /* O   Returns inverse prediction gain in energy domain, Q30 */
    const opus_int16 *A_Q12,                /* I   Prediction coefficients, Q12 [ order ]                 */
    const opus_int   order                  /* I   Prediction order                                       */
)
{
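    /* Under OPUS_CHECK_ASM, the reference C implementation is also run so that the NEON result can be
     * verified against it at the end of this function. */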
#ifdef OPUS_CHECK_ASM
    const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );
#endif

    opus_int32 invGain_Q30;
    if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 ) ) {
        invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );
    }
    else {
        opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];
        opus_int32 DC_resp;
        int16x8_t  t0_s16x8, t1_s16x8, t2_s16x8;
        int32x4_t  t0_s32x4;
        const opus_int leftover = order & 7;

        /* Increase Q domain of the AR coefficients */
        t0_s16x8 = vld1q_s16( A_Q12 + 0  );
        t1_s16x8 = vld1q_s16( A_Q12 + 8  );
        t2_s16x8 = vld1q_s16( A_Q12 + 16 );
        t0_s32x4 = vpaddlq_s16( t0_s16x8 );

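        /* The switch below pairwise-accumulates the first ( order - leftover ) Q12 coefficients into
         * t0_s32x4 and reduces them to DC_resp in the "case 8" block, while also storing coefficients
         * 8 and up into Atmp_QA widened to QA; coefficients 0..7 are widened later, only if the DC
         * check passes. */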
        switch( order - leftover )
        {
        case 24:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 );
            /* FALLTHROUGH */

        case 16:
            t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 );
            vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
            /* FALLTHROUGH */

        case 8:
        {
            const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) );
            const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );
            DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );
            vst1q_s32( Atmp_QA + 8,  vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
        }
            break;

        default:
            DC_resp = 0;
            break;
        }
        A_Q12 += order - leftover;

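        /* Odd orders were already dispatched to the C implementation above, so leftover is 0, 2, 4 or 6
         * and every reachable case below falls through to the next one. */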
        switch( leftover )
        {
        case 6:
            DC_resp += (opus_int32)A_Q12[ 5 ];
            DC_resp += (opus_int32)A_Q12[ 4 ];
            /* FALLTHROUGH */

        case 4:
            DC_resp += (opus_int32)A_Q12[ 3 ];
            DC_resp += (opus_int32)A_Q12[ 2 ];
            /* FALLTHROUGH */

        case 2:
            DC_resp += (opus_int32)A_Q12[ 1 ];
            DC_resp += (opus_int32)A_Q12[ 0 ];
            /* FALLTHROUGH */

        default:
            break;
        }
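        /* DC_resp now holds the sum of all order coefficients; 4096 is 1.0 in Q12, so a coefficient sum
         * of at least 1.0 lets the stability check fail early without running the full recursion. */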

        /* If the DC is unstable, we don't even need to do the full calculations */
        if( DC_resp >= 4096 ) {
            invGain_Q30 = 0;
        } else {
            vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) );
            vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
            invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
        }
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( invGain_Q30_c == invGain_Q30 );
#endif

    return invGain_Q30;
}