/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
#endif
#include "main.h"
#include "stack_alloc.h"

/* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
/* If there are more states, C function is called, and this optimization must be expanded. */
#define NEON_MAX_DEL_DEC_STATES 4

typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_del_decs_struct;

typedef struct {
    opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_samples_struct;

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,            /* I    Encoder State                     */
    silk_nsq_state *NSQ,                         /* I/O  NSQ state                         */
    NSQ_del_decs_struct psDelDec[],              /* I/O  Delayed decision states           */
    const opus_int16 x16[],                      /* I    Input                             */
    opus_int32 x_sc_Q10[],                       /* O    Input scaled with 1/Gain in Q10   */
    const opus_int16 sLTP[],                     /* I    Re-whitened LTP state in Q0       */
    opus_int32 sLTP_Q15[],                       /* O    LTP state matching scaled input   */
    opus_int subfr,                              /* I    Subframe number                   */
    const opus_int LTP_scale_Q14,                /* I    LTP state scaling                 */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],  /* I                                      */
    const opus_int pitchL[ MAX_NB_SUBFR ],       /* I    Pitch lag                         */
    const opus_int signal_type,                  /* I    Signal type                       */
    const opus_int decisionDelay                 /* I    Decision delay                    */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ,                         /* I/O  NSQ state                         */
    NSQ_del_decs_struct psDelDec[],              /* I/O  Delayed decision states           */
    opus_int signalType,                         /* I    Signal type                       */
    const opus_int32 x_Q10[],                    /* I                                      */
    opus_int8 pulses[],                          /* O                                      */
    opus_int16 xq[],                             /* O                                      */
    opus_int32 sLTP_Q15[],                       /* I/O  LTP filter state                  */
    opus_int32 delayedGain_Q10[],                /* I/O  Gain delay buffer                 */
    const opus_int16 a_Q12[],                    /* I    Short term prediction coefs       */
    const opus_int16 b_Q14[],                    /* I    Long term prediction coefs        */
    const opus_int16 AR_shp_Q13[],               /* I    Noise shaping coefs               */
    opus_int lag,                                /* I    Pitch lag                         */
    opus_int32 HarmShapeFIRPacked_Q14,           /* I                                      */
    opus_int Tilt_Q14,                           /* I    Spectral tilt                     */
    opus_int32 LF_shp_Q14,                       /* I                                      */
    opus_int32 Gain_Q16,                         /* I                                      */
    opus_int Lambda_Q10,                         /* I                                      */
    opus_int offset_Q10,                         /* I                                      */
    opus_int length,                             /* I    Input length                      */
    opus_int subfr,                              /* I    Subframe number                   */
    opus_int shapingLPCOrder,                    /* I    Shaping LPC filter order          */
    opus_int predictLPCOrder,                    /* I    Prediction filter order           */
    opus_int warping_Q16,                        /* I                                      */
    opus_int nStatesDelayedDecision,             /* I    Number of states in decision tree */
    opus_int *smpl_buf_idx,                      /* I/O  Index to newest samples in buffers */
    opus_int decisionDelay                       /* I                                      */
);

static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int offset,
    const opus_int last_smple_idx,
    const opus_int Winner_ind,
    const int32x2_t gain_lo_s32x2,
    const int32x2_t gain_hi_s32x2,
    const int32x4_t shift_s32x4,
    int32x4_t t0_s32x4,
    int32x4_t t1_s32x4,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
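    /* Scale Xq_Q14 by the 32-bit gain: vqdmulh with ( gain & 0xFFFF ) << 15 contributes      */
    /* ( t * ( gain & 0xFFFF ) ) >> 16, and vmla with ( gain >> 16 ) adds t * ( gain >> 16 ), */
    /* which together emulate silk_SMULWW( t, gain ); vrshl by -shift then rounds right.      */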
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}

static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int decisionDelay,
    const opus_int smpl_buf_idx,
    const opus_int Winner_ind,
    const opus_int32 gain,
    const opus_int32 shift,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    opus_int i, last_smple_idx;
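    /* Split the Q16 gain into a low part ( pre-shifted to the top of a 32-bit word ) and a */
    /* high part, so copy_winner_state_kernel() can form silk_SMULWW() with vqdmulh + vmla. */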
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
    int32x4_t t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
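    /* last_smple_idx can exceed the ring buffer length by up to two buffer lengths here;   */
    /* the two conditional subtractions below reduce it modulo DECISION_DELAY without a     */
    /* division.                                                                            */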
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    last_smple_idx += DECISION_DELAY;
    for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}

void silk_NSQ_del_dec_neon(
    const silk_encoder_state *psEncC,                              /* I    Encoder State               */
    silk_nsq_state *NSQ,                                           /* I/O  NSQ state                   */
    SideInfoIndices *psIndices,                                    /* I/O  Quantization Indices        */
    const opus_int16 x16[],                                        /* I    Input                       */
    opus_int8 pulses[],                                            /* O    Quantized pulse signal      */
    const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs  */
    const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ],              /* I    Long term shaping coefs     */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ],                       /* I    Spectral tilt               */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes     */
    const opus_int pitchL[ MAX_NB_SUBFR ],                         /* I    Pitch lags                  */
    const opus_int Lambda_Q10,                                     /* I    Rate/distortion tradeoff    */
    const opus_int LTP_scale_Q14                                   /* I    LTP state scaling           */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
        pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delay decision states. */
    if( ( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 ) ) {
        /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
        /* If there are more states, C function is called, and this optimization must be expanded. */
        /* When the number of delay decision states is less than 3, there are penalties using this */
        /* optimization, and C function is called.                                                  */
        /* When the number of delay decision states is 2, it's better to specialize another        */
        /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority)   */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int smpl_buf_idx, decisionDelay;
        const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16 *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32 HarmShapeFIRPacked_Q14;
        opus_int offset_Q10;
        opus_int32 RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
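        /* Broadcast each scalar NSQ state value to all NEON_MAX_DEL_DEC_STATES lanes, so   */
        /* that lane k of every vector holds the state of delayed-decision state k.         */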
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
            AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
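                        /* Penalize all states except the winner: subtracting the bias from  */
                        /* the winner and then adding it back to every lane leaves the       */
                        /* winner unchanged while the other states get a huge RD offset.     */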
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                        /* Copy final part of signals from winner state to output and long-term filter states */
                        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                        subfr = 0;
                    }

                    /* Rewhiten with new A coefs */
                    start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                    silk_assert( start_idx > 0 );

                    silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                        A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                    NSQ->rewhite_flag = 1;
                }
            }

            silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

            silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
                Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

            x16 += psEncC->subfr_length;
            pulses += psEncC->subfr_length;
            pxq += psEncC->subfr_length;
        }

        /* Find winner */
        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
            if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psDelDec->RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Copy final part of signals from winner state to output and long-term filter states */
        psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
        Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

        t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
        for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
        }

        for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
        }

        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
        }

        for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
        }

        /* Update states */
        NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
        NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ];
        NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

        /* Save quantized speech signal */
        silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
        silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hacking C function. */
/* Therefore here we append "_local" to the NEON function name to avoid confusion.                                      */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
{
    int16x8_t t_s16x8;
    int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
    silk_assert( order == 10 || order == 16 );
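    /* Widen the Q12 coefficients to 32 bits pre-shifted left by 15, and reverse their      */
    /* order, so that the vqdmulh-based silk_SMLAWB_lane*_neon() helpers reproduce          */
    /* silk_SMLAWB() while walking the LPC state buffer forward.                            */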
    t_s16x8 = vld1q_s16( in + 0 );                           /* 7 6 5 4 3 2 1 0 */
    t_s16x8 = vrev64q_s16( t_s16x8 );                        /* 4 5 6 7 0 1 2 3 */
    t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );  /* 4 5 6 7 */
    t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 );   /* 0 1 2 3 */

    if( order == 16 ) {
        t_s16x8 = vld1q_s16( in + 8 );                           /* F E D C B A 9 8 */
        t_s16x8 = vrev64q_s16( t_s16x8 );                        /* C D E F 8 9 A B */
        t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );  /* C D E F */
        t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 );   /* 8 9 A B */
    } else {
        int16x4_t t_s16x4;

        t0_s32x4 = vdupq_n_s32( 0 );       /* zero zero zero zero */
        t_s16x4 = vld1_s16( in + 6 );      /* 9 8 7 6 */
        t_s16x4 = vrev64_s16( t_s16x4 );   /* 6 7 8 9 */
        t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
        t1_s32x4 = vcombine_s32( vget_low_s32( t0_s32x4 ), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */
    }
    vst1q_s32( out + 0, t0_s32x4 );
    vst1q_s32( out + 4, t1_s32x4 );
    vst1q_s32( out + 8, t2_s32x4 );
    vst1q_s32( out + 12, t3_s32x4 );
}
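/* With coef pre-shifted left by 15 ( see above ), vqdmulhq_lane_s32() computes             */
/* ( 2 * in * ( coef << 15 ) ) >> 32 = ( in * coef ) >> 16, i.e. the silk_SMLAWB()          */
/* multiply, for four delayed-decision states at once.                                      */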
static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
}

static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
}

/* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
/* Therefore here we append "_local" to the function name to avoid confusion.                               */
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
{
    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
    int32x4_t LPC_pred_Q14_s32x4;

    silk_assert( order == 10 || order == 16 );
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );

    return LPC_pred_Q14_s32x4;
}

static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ,                         /* I/O  NSQ state                         */
    NSQ_del_decs_struct psDelDec[],              /* I/O  Delayed decision states           */
    opus_int signalType,                         /* I    Signal type                       */
    const opus_int32 x_Q10[],                    /* I                                      */
    opus_int8 pulses[],                          /* O                                      */
    opus_int16 xq[],                             /* O                                      */
    opus_int32 sLTP_Q15[],                       /* I/O  LTP filter state                  */
    opus_int32 delayedGain_Q10[],                /* I/O  Gain delay buffer                 */
    const opus_int16 a_Q12[],                    /* I    Short term prediction coefs       */
    const opus_int16 b_Q14[],                    /* I    Long term prediction coefs        */
    const opus_int16 AR_shp_Q13[],               /* I    Noise shaping coefs               */
    opus_int lag,                                /* I    Pitch lag                         */
    opus_int32 HarmShapeFIRPacked_Q14,           /* I                                      */
    opus_int Tilt_Q14,                           /* I    Spectral tilt                     */
    opus_int32 LF_shp_Q14,                       /* I                                      */
    opus_int32 Gain_Q16,                         /* I                                      */
    opus_int Lambda_Q10,                         /* I                                      */
    opus_int offset_Q10,                         /* I                                      */
    opus_int length,                             /* I    Input length                      */
    opus_int subfr,                              /* I    Subframe number                   */
    opus_int shapingLPCOrder,                    /* I    Shaping LPC filter order          */
    opus_int predictLPCOrder,                    /* I    Prediction filter order           */
    opus_int warping_Q16,                        /* I                                      */
    opus_int nStatesDelayedDecision,             /* I    Number of states in decision tree */
    opus_int *smpl_buf_idx,                      /* I/O  Index to newest samples in buffers */
    opus_int decisionDelay                       /* I                                      */
)
{
    opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32 Winner_rand_state;
    opus_int32 LTP_pred_Q14, n_LTP_Q14;
    opus_int32 RDmin_Q10, RDmax_Q10;
    opus_int32 Gain_Q10;
    opus_int32 *pred_lag_ptr, *shp_lag_ptr;
    opus_int32 a_Q12_arch[ MAX_LPC_ORDER ];
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
    ALLOC( psSampleState, 2, NSQ_samples_struct );

    shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
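    /* Widen AR_shp from Q13 to Q28 so that a single vqdmulh against the Q14 filter state   */
    /* yields a Q11 product ( ( 2 * x_Q14 * c_Q28 ) >> 32 = ( x * c ) >> 31 ).              */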
    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
        const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
        vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
        vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
        AR_shp_Q28[ i ] = silk_LSHIFT32( AR_shp_Q13[ i ], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[  0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither */
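        /* All four LCG seeds are advanced in one vmlaq_u32:                                 */
        /* Seed = RAND_INCREMENT + Seed * RAND_MULTIPLIER, the same recurrence as silk_RAND. */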
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder );
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16, 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 );                /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 );  /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 );                        /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 );                              /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 );         /* residual error Q10 */

        /* Flip sign depending on dither */
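        /* vclt produces an all-ones mask for negative seeds; XORing with the mask and      */
        /* subtracting it negates exactly those lanes ( two's complement negation ).        */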
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10 / 2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
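            /* Build the two nearest quantization levels q1 and q2 ( q2 = q1 + 1024 in Q10 ) */
            /* and their rate terms; the vbsl selects handle the special cases for           */
            /* q1_Q0 == 0 and q1_Q0 == -1.                                                   */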
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4;

                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1 );
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
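            /* Lanes whose RandState differs from the winner's get an all-ones mask from    */
            /* vmvn( vceq ); shifting it right by 5 turns the mask into silk_int32_MAX >> 4, */
            /* which is added to those states' RD values to effectively retire them.        */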
cannam@154
|
824 Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
|
cannam@154
|
825 t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
|
cannam@154
|
826 t_u32x4 = vmvnq_u32( t_u32x4 );
|
cannam@154
|
827 t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
|
cannam@154
|
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
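            /* numOthers counts the opus_int32[ NEON_MAX_DEL_DEC_STATES ] rows that follow sLPC_Q14 in     */
            /* NSQ_del_decs_struct, so the second loop below copies every remaining per-state field        */
            /* ( RandState, Q_Q10, and the rest ) from the better state ( RDmin_ind ) into the replaced    */
            /* one ( RDmax_ind ) in a single pass.                                                         */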
            /* Only ( predictLPCOrder - 1 ) entries of the sLPC_Q14 buffer need to be updated, though the  */
            /* first few unused sLPC_Q14[] entries will differ from the C version when predictLPCOrder is  */
            /* smaller than NSQ_LPC_BUF_LENGTH. For simplicity, a constant ( NSQ_LPC_BUF_LENGTH - 1 )      */
            /* entries are updated here.                                                                   */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
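        /* Advance each state's pseudo-random seed by its rounded quantization level ( Q_Q10 with a        */
        /* rounding shift by 10 ) and record the result in RandState[] so that expired states can later    */
        /* be compared against the winner's history.                                                       */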
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}

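/* Multiplies eight 16-bit values by a 32-bit factor and stores the top 32 bits of each 48-bit product:    */
/* widening a[ i ] by 2^15 and taking the saturating doubling multiply-high gives                          */
/* ( 2 * ( a[ i ] << 15 ) * b ) >> 32 == ( a[ i ] * b ) >> 16, the same value as the scalar silk_SMULWW()  */
/* tail loop in silk_SMULWW_loop_neon().                                                                   */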
static OPUS_INLINE void silk_SMULWB_8_neon(
    const opus_int16 *a,
    const int32x2_t b,
    opus_int32 *o
)
{
    const int16x8_t a_s16x8 = vld1q_s16( a );
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
    o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
    vst1q_s32( o, o0_s32x4 );
    vst1q_s32( o + 4, o1_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
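/* The callers pass b_s32x2 holding b already shifted left by 15 bits ( see the small-gain path in         */
/* silk_nsq_del_dec_scale_states_neon() ); within the stated range that shift cannot overflow, and         */
/* vqdmulhq_lane_s32() then yields ( a * b ) >> 16 per lane, matching the scalar silk_SMULWW() fallback.   */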
static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2)
{
    int32x4_t o_s32x4;

    o_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vld1q_s32( a );
    o1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

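/* Full-range variants: lane 0 of b_s32x2 holds the low 16 bits of b shifted left by 15 and lane 1 holds   */
/* its high 16 bits ( see the gain-adjust path below ), so the multiply-high plus multiply-accumulate pair */
/* mirrors the scalar silk_SMULWW( a, b ) for arbitrary b.                                                 */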
static OPUS_INLINE void silk_SMULWW_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2)
{
    int32x4_t a_s32x4, o_s32x4;

    a_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
    o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;

    a0_s32x4 = vld1q_s32( a );
    a1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_loop_neon(
    const opus_int16 *a,
    const opus_int32 b,
    opus_int32 *o,
    const opus_int loop_num
)
{
    opus_int i;
    int32x2_t b_s32x2;

    b_s32x2 = vdup_n_s32( b );
    for( i = 0; i < loop_num - 7; i += 8 ) {
        silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
    }
    for( ; i < loop_num; i++ ) {
        o[ i ] = silk_SMULWW( a[ i ], b );
    }
}

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16 x16[],                         /* I    Input                               */
    opus_int32 x_sc_Q10[],                          /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0         */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input     */
    opus_int subfr,                                 /* I    Subframe number                     */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                   */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                        */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                           */
    const opus_int signal_type,                     /* I    Signal type                         */
    const opus_int decisionDelay                    /* I    Decision delay                      */
)
{
    opus_int i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is unscaled, so scale with inv_gain_Q31 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */
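        /* When gain_adj_Q16 is in [ -65536, 65536 ) the *_small_b_* helpers can do the whole multiply     */
        /* with a single vqdmulh on b pre-shifted left by 15; otherwise the full-range helpers in the      */
        /* else branch split b into its low and high 16-bit halves.                                        */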
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
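            /* Full-range gain_adj_Q16: lane 0 carries its low 16 bits pre-shifted left by 15 and lane 1   */
            /* its high 16 bits, the layout expected by silk_SMULWW_4_neon() and silk_SMULWW_8_neon().     */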
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save gain for the next call's gain-change check */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}