sv-dependency-builds: src/opus-1.3/silk/arm/biquad_alt_neon

annotate src/opus-1.3/silk/arm/biquad_alt_neon_intr.c @ 158:fa7c54aeb697

Rebuild with --disable-stack-protector for mingw32

author	Chris Cannam <cannam@all-day-breakfast.com>
date	Fri, 25 Jan 2019 14:31:07 +0000
parents	4664ac0c1032
children

rev	line source
cannam@154	1 /***********************************************************************
cannam@154	2 Copyright (c) 2017 Google Inc.
cannam@154	3 Redistribution and use in source and binary forms, with or without
cannam@154	4 modification, are permitted provided that the following conditions
cannam@154	5 are met:
cannam@154	6 - Redistributions of source code must retain the above copyright notice,
cannam@154	7 this list of conditions and the following disclaimer.
cannam@154	8 - Redistributions in binary form must reproduce the above copyright
cannam@154	9 notice, this list of conditions and the following disclaimer in the
cannam@154	10 documentation and/or other materials provided with the distribution.
cannam@154	11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
cannam@154	12 names of specific contributors, may be used to endorse or promote
cannam@154	13 products derived from this software without specific prior written
cannam@154	14 permission.
cannam@154	15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
cannam@154	16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
cannam@154	17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
cannam@154	18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
cannam@154	19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
cannam@154	20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
cannam@154	21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
cannam@154	22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
cannam@154	23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
cannam@154	24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
cannam@154	25 POSSIBILITY OF SUCH DAMAGE.
cannam@154	26 ***********************************************************************/
cannam@154	27
cannam@154	28 #ifdef HAVE_CONFIG_H
cannam@154	29 #include "config.h"
cannam@154	30 #endif
cannam@154	31
cannam@154	32 #include <arm_neon.h>
cannam@154	33 #ifdef OPUS_CHECK_ASM
cannam@154	34 # include <string.h>
cannam@154	35 # include "stack_alloc.h"
cannam@154	36 #endif
cannam@154	37 #include "SigProc_FIX.h"
cannam@154	38
/* One step of the stride-2 biquad for a single pair of interleaved samples.
 *
 * The vector lanes follow the layout used by silk_biquad_alt_stride2_neon():
 *   A_L_s32x4       I    low  part of the negated AR coefficients, lanes A{0,0,1,1}
 *   A_U_s32x4       I    high part of the negated AR coefficients, lanes A{0,0,1,1}
 *   B_Q28_s32x4     I    MA coefficients B_Q28[ {1,1,2,2} ]
 *   t_s32x2         I    silk_SMULWB( B_Q28[ 0 ], in{0,1} ), precomputed by the caller
 *   in_s32x4        I    in{0,1,0,1} << 15
 *   S_s32x4         I/O  filter state; lanes {0,1} are consumed to form the outputs,
 *                        lanes {2,3} move down to {0,1} and both halves are updated
 *   out32_Q14_s32x2 O    the two Q14 outputs; rounding and saturation to 16 bits
 *                        are left to the caller
 *
 * S_s32x4 and out32_Q14_s32x2 are pointers: the kernel's only effect is to write
 * through them, and the callers pass the addresses of their local vectors.
 */
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ) */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0; */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */

    /* Duplicate the output pair so one multiply feeds both state halves. */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1} */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 ) */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND(); */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ) */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ); */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} ) */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} ); */
}
cannam@154	54
/* NEON implementation of the stride-2 (two interleaved streams) biquad filter.
 *
 * Reads 2 * len interleaved 16-bit samples from in, writes 2 * len saturated
 * 16-bit samples to out, and updates the four-element state vector S in place.
 * The main loop consumes two sample pairs per iteration; a trailing single
 * pair, if any, is handled by the leftover branch.
 *
 * When OPUS_CHECK_ASM is defined, silk_biquad_alt_stride2_c() is run on a copy
 * of the state first, and the NEON state and output are asserted to match it
 * byte for byte.
 */
void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4     = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    /* Run the C reference on a private copy of the state for the check at the end. */
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */

    /* Spread the coefficients into the {0,0,1,1} / {1,1,2,2} lane layout the kernel expects. */
    B_Q28_s32x2 = vld1_s32( B_Q28 );
    t_s32x2     = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2  = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2  = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2  = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4   = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                             /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4   = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                             /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                             /* B_Q28[ {1,1,2,2} ]        */

    /* Load the state and swap the two middle elements; the stores at the end undo the swap. */
    S_s32x4     = vld1q_s32( S );                                                                       /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2   = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                        /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4     = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    /* Main loop: two interleaved sample pairs (four 16-bit samples) per iteration. */
    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ];                                              */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                                                                   */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} )                                              */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                                                                   */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                                                                   */
        /* The second call depends on the state written by the first, so the order matters. */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        /* Lane loads and stores touch only the two remaining samples, never memory past them. */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];             */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];             */
        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                              */
        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} )         */
        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                          */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    /* Write the state back, undoing the lane swap applied after the initial load. */
    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    /* The NEON result must match the C reference exactly, state and output alike. */
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}

Mercurial > hg > sv-dependency-builds

annotate src/opus-1.3/silk/arm/biquad_alt_neon_intr.c @ 158:fa7c54aeb697