annotate src/opus-1.3/celt/x86/pitch_sse4_1.c @ 169:223a55898ab9 tip default

Add null config files
author Chris Cannam <cannam@all-day-breakfast.com>
date Mon, 02 Mar 2020 14:03:47 +0000
parents 4664ac0c1032
children
rev   line source
cannam@154 1 /* Copyright (c) 2014, Cisco Systems, INC
cannam@154 2 Written by XiangMingZhu WeiZhou MinPeng YanWang
cannam@154 3
cannam@154 4 Redistribution and use in source and binary forms, with or without
cannam@154 5 modification, are permitted provided that the following conditions
cannam@154 6 are met:
cannam@154 7
cannam@154 8 - Redistributions of source code must retain the above copyright
cannam@154 9 notice, this list of conditions and the following disclaimer.
cannam@154 10
cannam@154 11 - Redistributions in binary form must reproduce the above copyright
cannam@154 12 notice, this list of conditions and the following disclaimer in the
cannam@154 13 documentation and/or other materials provided with the distribution.
cannam@154 14
cannam@154 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
cannam@154 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
cannam@154 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
cannam@154 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
cannam@154 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
cannam@154 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
cannam@154 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
cannam@154 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
cannam@154 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
cannam@154 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
cannam@154 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cannam@154 26 */
cannam@154 27
cannam@154 28 #ifdef HAVE_CONFIG_H
cannam@154 29 #include "config.h"
cannam@154 30 #endif
cannam@154 31
cannam@154 32 #include <xmmintrin.h>
cannam@154 33 #include <emmintrin.h>
cannam@154 34
cannam@154 35 #include "macros.h"
cannam@154 36 #include "celt_lpc.h"
cannam@154 37 #include "stack_alloc.h"
cannam@154 38 #include "mathops.h"
cannam@154 39 #include "pitch.h"
cannam@154 40
cannam@154 41 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
cannam@154 42 #include <smmintrin.h>
cannam@154 43 #include "x86cpu.h"
cannam@154 44
cannam@154 45 opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
cannam@154 46 int N)
cannam@154 47 {
cannam@154 48 opus_int i, dataSize16;
cannam@154 49 opus_int32 sum;
cannam@154 50 __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
cannam@154 51 __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
cannam@154 52 __m128i inVec1_3210, inVec2_3210;
cannam@154 53
cannam@154 54 sum = 0;
cannam@154 55 dataSize16 = N & ~15;
cannam@154 56
cannam@154 57 acc1 = _mm_setzero_si128();
cannam@154 58 acc2 = _mm_setzero_si128();
cannam@154 59
cannam@154 60 for (i=0;i<dataSize16;i+=16) {
cannam@154 61 inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
cannam@154 62 inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
cannam@154 63
cannam@154 64 inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
cannam@154 65 inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
cannam@154 66
cannam@154 67 inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
cannam@154 68 inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
cannam@154 69
cannam@154 70 acc1 = _mm_add_epi32(acc1, inVec1_76543210);
cannam@154 71 acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
cannam@154 72 }
cannam@154 73
cannam@154 74 acc1 = _mm_add_epi32(acc1, acc2);
cannam@154 75
cannam@154 76 if (N - i >= 8)
cannam@154 77 {
cannam@154 78 inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
cannam@154 79 inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
cannam@154 80
cannam@154 81 inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
cannam@154 82
cannam@154 83 acc1 = _mm_add_epi32(acc1, inVec1_76543210);
cannam@154 84 i += 8;
cannam@154 85 }
cannam@154 86
cannam@154 87 if (N - i >= 4)
cannam@154 88 {
cannam@154 89 inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
cannam@154 90 inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
cannam@154 91
cannam@154 92 inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
cannam@154 93
cannam@154 94 acc1 = _mm_add_epi32(acc1, inVec1_3210);
cannam@154 95 i += 4;
cannam@154 96 }
cannam@154 97
cannam@154 98 acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
cannam@154 99 acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
cannam@154 100
cannam@154 101 sum += _mm_cvtsi128_si32(acc1);
cannam@154 102
cannam@154 103 for (;i<N;i++)
cannam@154 104 {
cannam@154 105 sum = silk_SMLABB(sum, x[i], y[i]);
cannam@154 106 }
cannam@154 107
cannam@154 108 return sum;
cannam@154 109 }
cannam@154 110
cannam@154 111 void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
cannam@154 112 {
cannam@154 113 int j;
cannam@154 114
cannam@154 115 __m128i vecX, vecX0, vecX1, vecX2, vecX3;
cannam@154 116 __m128i vecY0, vecY1, vecY2, vecY3;
cannam@154 117 __m128i sum0, sum1, sum2, sum3, vecSum;
cannam@154 118 __m128i initSum;
cannam@154 119
cannam@154 120 celt_assert(len >= 3);
cannam@154 121
cannam@154 122 sum0 = _mm_setzero_si128();
cannam@154 123 sum1 = _mm_setzero_si128();
cannam@154 124 sum2 = _mm_setzero_si128();
cannam@154 125 sum3 = _mm_setzero_si128();
cannam@154 126
cannam@154 127 for (j=0;j<(len-7);j+=8)
cannam@154 128 {
cannam@154 129 vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
cannam@154 130 vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
cannam@154 131 vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
cannam@154 132 vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
cannam@154 133 vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
cannam@154 134
cannam@154 135 sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
cannam@154 136 sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
cannam@154 137 sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
cannam@154 138 sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
cannam@154 139 }
cannam@154 140
cannam@154 141 sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
cannam@154 142 sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
cannam@154 143
cannam@154 144 sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
cannam@154 145 sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
cannam@154 146
cannam@154 147 sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
cannam@154 148 sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
cannam@154 149
cannam@154 150 sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
cannam@154 151 sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
cannam@154 152
cannam@154 153 vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
cannam@154 154 _mm_unpacklo_epi32(sum2, sum3));
cannam@154 155
cannam@154 156 for (;j<(len-3);j+=4)
cannam@154 157 {
cannam@154 158 vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
cannam@154 159 vecX0 = _mm_shuffle_epi32(vecX, 0x00);
cannam@154 160 vecX1 = _mm_shuffle_epi32(vecX, 0x55);
cannam@154 161 vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
cannam@154 162 vecX3 = _mm_shuffle_epi32(vecX, 0xff);
cannam@154 163
cannam@154 164 vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
cannam@154 165 vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
cannam@154 166 vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
cannam@154 167 vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
cannam@154 168
cannam@154 169 sum0 = _mm_mullo_epi32(vecX0, vecY0);
cannam@154 170 sum1 = _mm_mullo_epi32(vecX1, vecY1);
cannam@154 171 sum2 = _mm_mullo_epi32(vecX2, vecY2);
cannam@154 172 sum3 = _mm_mullo_epi32(vecX3, vecY3);
cannam@154 173
cannam@154 174 sum0 = _mm_add_epi32(sum0, sum1);
cannam@154 175 sum2 = _mm_add_epi32(sum2, sum3);
cannam@154 176 vecSum = _mm_add_epi32(vecSum, sum0);
cannam@154 177 vecSum = _mm_add_epi32(vecSum, sum2);
cannam@154 178 }
cannam@154 179
cannam@154 180 for (;j<len;j++)
cannam@154 181 {
cannam@154 182 vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
cannam@154 183 vecX0 = _mm_shuffle_epi32(vecX, 0x00);
cannam@154 184
cannam@154 185 vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
cannam@154 186
cannam@154 187 sum0 = _mm_mullo_epi32(vecX0, vecY0);
cannam@154 188 vecSum = _mm_add_epi32(vecSum, sum0);
cannam@154 189 }
cannam@154 190
cannam@154 191 initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
cannam@154 192 initSum = _mm_add_epi32(initSum, vecSum);
cannam@154 193 _mm_storeu_si128((__m128i *)sum, initSum);
cannam@154 194 }
cannam@154 195 #endif