/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

#include <xmmintrin.h>
#include "arch.h"

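/* SSE cross-correlation kernel: accumulates four correlations at once,
   sum[k] += x[j]*y[j+k] for k = 0..3 over j = 0..len-1.  Each iteration
   of the main loop consumes four x samples and, from just the two loads
   yj and y3, builds the shifted windows y[j+1..j+4] and y[j+2..j+5] with
   _mm_shuffle_ps (masks 0x49 and 0x9e); the trailing ifs handle the last
   1-3 samples one at a time. */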
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}


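/* Computes two inner products that share the same x vector:
   *xy1 = sum of x[i]*y01[i] and *xy2 = sum of x[i]*y02[i], i = 0..N-1.
   Four-wide partial sums are reduced with a movehl/shuffle horizontal add,
   then a scalar loop finishes the last N%4 elements. */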
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

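/* Single inner product: returns the sum of x[i]*y[i] for i = 0..N-1, using
   the same 4-wide accumulate, horizontal sum and scalar tail as above. */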
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
      int N)
{
   int i;
   float xy;
   __m128 sum;
   sum = _mm_setzero_ps();
   /* FIXME: We should probably go 8-way and use 2 sums. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 yi = _mm_loadu_ps(y+i);
      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
   }
   /* Horizontal sum */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&xy, sum);
   for (;i<N;i++)
   {
      xy = MAC16_16(xy, x[i], y[i]);
   }
   return xy;
}

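/* Constant-coefficient comb filter:
      y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1] + x[i-T-1])
                  + g12*(x[i-T+2] + x[i-T-2])
   (the scalar CUSTOM_MODES tail below spells this out).  Each iteration
   loads only x[i..i+3] and x[i-T+2..i-T+5]; the intermediate tap vectors
   x1v, x2v and x3v are assembled from x0v and x4v with shuffles, and x0v
   slides forward by reusing x4v on the next iteration. */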
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}


#endif