annotate src/opus-1.3/celt/x86/pitch_sse.c @ 78:7ea7031c0e5c pa_catalina

Apply patch from Tim Bunnell on PortAudio mailing list (2016-12-28, Mac 10.11 deprecation warning)
author Chris Cannam
date Wed, 30 Oct 2019 11:28:45 +0000
parents 7aeed7906520
children
/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

#include <xmmintrin.h>
#include "arch.h"

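/* Cross-correlation kernel: accumulates sum[k] += x[j]*y[j+k] for k = 0..3
   over len samples.  The main loop is unrolled by four: each x[j] is
   broadcast with a shuffle and multiplied against a 4-wide window of y;
   the intermediate windows y[j+1..j+4] and y[j+2..j+5] are assembled by
   shuffling the two loads y[j..j+3] and y[j+3..j+6] rather than issuing
   extra unaligned loads.  Two accumulators (xsum1/xsum2) keep the
   dependency chains short; they are merged into sum[] only at the end,
   after a broadcast-multiply tail has handled the remaining 0-3 samples. */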
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}


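/* Compute the two inner products *xy1 = <x, y01> and *xy2 = <x, y02> in a
   single pass over x, four floats at a time.  Each accumulator is then
   reduced horizontally, and a scalar loop picks up the last 0-3 samples. */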
void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      int N, opus_val32 *xy1, opus_val32 *xy2)
{
   int i;
   __m128 xsum1, xsum2;
   xsum1 = _mm_setzero_ps();
   xsum2 = _mm_setzero_ps();
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 y1i = _mm_loadu_ps(y01+i);
      __m128 y2i = _mm_loadu_ps(y02+i);
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
   }
   /* Horizontal sum */
   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
   _mm_store_ss(xy1, xsum1);
   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
   _mm_store_ss(xy2, xsum2);
   for (;i<N;i++)
   {
      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
   }
}

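/* Single inner product <x, y>: 4-wide accumulation, horizontal reduction,
   then a scalar loop for the remaining 0-3 samples. */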
opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
      int N)
{
   int i;
   float xy;
   __m128 sum;
   sum = _mm_setzero_ps();
   /* FIXME: We should probably go 8-way and use 2 sums. */
   for (i=0;i<N-3;i+=4)
   {
      __m128 xi = _mm_loadu_ps(x+i);
      __m128 yi = _mm_loadu_ps(y+i);
      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
   }
   /* Horizontal sum */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&xy, sum);
   for (;i<N;i++)
   {
      xy = MAC16_16(xy, x[i], y[i]);
   }
   return xy;
}

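/* Constant-gain comb filter:
      y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1] + x[i-T-1])
                  + g12*(x[i-T+2] + x[i-T-2])
   computed four samples per iteration.  The five shifted views of the
   delayed input are derived from two loads per iteration (x0v is carried
   over from the previous one) via shuffles.  The products are combined as
   two partial sums, which reorders the float additions slightly; the
   "#if 0" branch keeps the original order for bit-exactness with the
   scalar code. */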
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
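   /* Scalar remainder, compiled only for custom modes (presumably because
      N is otherwise always a multiple of four). */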
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
             + MULT16_32_Q15(g10,x[i-T])
             + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
             + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}


#endif