annotate ffmpeg/libavcodec/x86/mpegaudiodec.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * MMX optimized MP3 decoding functions
yading@10 3 * Copyright (c) 2010 Vitor Sessak
yading@10 4 *
yading@10 5 * This file is part of FFmpeg.
yading@10 6 *
yading@10 7 * FFmpeg is free software; you can redistribute it and/or
yading@10 8 * modify it under the terms of the GNU Lesser General Public
yading@10 9 * License as published by the Free Software Foundation; either
yading@10 10 * version 2.1 of the License, or (at your option) any later version.
yading@10 11 *
yading@10 12 * FFmpeg is distributed in the hope that it will be useful,
yading@10 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 * Lesser General Public License for more details.
yading@10 16 *
yading@10 17 * You should have received a copy of the GNU Lesser General Public
yading@10 18 * License along with FFmpeg; if not, write to the Free Software
yading@10 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 */
yading@10 21
yading@10 22 #include "libavutil/attributes.h"
yading@10 23 #include "libavutil/cpu.h"
yading@10 24 #include "libavutil/internal.h"
yading@10 25 #include "libavutil/x86/asm.h"
yading@10 26 #include "libavutil/x86/cpu.h"
yading@10 27 #include "libavcodec/mpegaudiodsp.h"
yading@10 28
yading@10 29 #define DECL(CPU)\
yading@10 30 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
yading@10 31 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
yading@10 32
yading@10 33 DECL(sse)
yading@10 34 DECL(sse2)
yading@10 35 DECL(sse3)
yading@10 36 DECL(ssse3)
yading@10 37 DECL(avx)
yading@10 38
yading@10 39 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
yading@10 40 float *tmpbuf);
yading@10 41 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
yading@10 42 float *tmpbuf);
yading@10 43
yading@10 44 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
yading@10 45
yading@10 46 #if HAVE_SSE2_INLINE
yading@10 47
/*
 * Multiply-accumulate / multiply-subtract helpers used as the "op" argument
 * of SUM8(). The expansion is fully parenthesised so it can be used safely
 * inside a larger expression.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[64 * k], p[64 * k]) for k = 0..7.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is exactly one statement
 * and can be used as the unbraced body of an if/else.
 * Note that w and p are each evaluated eight times, so they must not have
 * side effects.
 */
#define SUM8(op, sum, w, p)                \
do {                                       \
    op(sum, (w)[0 * 64], (p)[0 * 64]);     \
    op(sum, (w)[1 * 64], (p)[1 * 64]);     \
    op(sum, (w)[2 * 64], (p)[2 * 64]);     \
    op(sum, (w)[3 * 64], (p)[3 * 64]);     \
    op(sum, (w)[4 * 64], (p)[4 * 64]);     \
    op(sum, (w)[5 * 64], (p)[5 * 64]);     \
    op(sum, (w)[6 * 64], (p)[6 * 64]);     \
    op(sum, (w)[7 * 64], (p)[7 * 64]);     \
} while (0)
yading@10 62
yading@10 63 static void apply_window(const float *buf, const float *win1,
yading@10 64 const float *win2, float *sum1, float *sum2, int len)
yading@10 65 {
yading@10 66 x86_reg count = - 4*len;
yading@10 67 const float *win1a = win1+len;
yading@10 68 const float *win2a = win2+len;
yading@10 69 const float *bufa = buf+len;
yading@10 70 float *sum1a = sum1+len;
yading@10 71 float *sum2a = sum2+len;
yading@10 72
yading@10 73
yading@10 74 #define MULT(a, b) \
yading@10 75 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
yading@10 76 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
yading@10 77 "mulps %%xmm2, %%xmm1 \n\t" \
yading@10 78 "subps %%xmm1, %%xmm0 \n\t" \
yading@10 79 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
yading@10 80 "subps %%xmm2, %%xmm4 \n\t" \
yading@10 81
yading@10 82 __asm__ volatile(
yading@10 83 "1: \n\t"
yading@10 84 "xorps %%xmm0, %%xmm0 \n\t"
yading@10 85 "xorps %%xmm4, %%xmm4 \n\t"
yading@10 86
yading@10 87 MULT( 0, 0)
yading@10 88 MULT( 256, 64)
yading@10 89 MULT( 512, 128)
yading@10 90 MULT( 768, 192)
yading@10 91 MULT(1024, 256)
yading@10 92 MULT(1280, 320)
yading@10 93 MULT(1536, 384)
yading@10 94 MULT(1792, 448)
yading@10 95
yading@10 96 "movaps %%xmm0, (%4,%0) \n\t"
yading@10 97 "movaps %%xmm4, (%5,%0) \n\t"
yading@10 98 "add $16, %0 \n\t"
yading@10 99 "jl 1b \n\t"
yading@10 100 :"+&r"(count)
yading@10 101 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
yading@10 102 );
yading@10 103
yading@10 104 #undef MULT
yading@10 105 }
yading@10 106
/**
 * Windowing stage of the float MP3 synthesis filter, using SSE inline asm.
 *
 * Produces 32 output samples at out[0], out[incr], ..., out[31 * incr].
 *
 * @param in     input samples; in[0..31] are first copied to in[512..543], so
 *               the buffer has to provide at least 544 floats. Accessed with
 *               movaps, i.e. expected to be 16-byte aligned.
 * @param win    window coefficients; also read with movaps
 * @param unused never read
 * @param out    destination; when incr == 1 it is written with movaps and is
 *               therefore expected to be 16-byte aligned
 * @param incr   distance, in floats, between consecutive output samples
 */
yading@10 107 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
yading@10 108 int incr)
yading@10 109 {
/* Partial sums: entries 0..15 are filled by apply_window(); entry 16 is an
 * extra slot so that the reversed 4-float loads in SUMS() stay in bounds. */
yading@10 110 LOCAL_ALIGNED_16(float, suma, [17]);
yading@10 111 LOCAL_ALIGNED_16(float, sumb, [17]);
yading@10 112 LOCAL_ALIGNED_16(float, sumc, [17]);
yading@10 113 LOCAL_ALIGNED_16(float, sumd, [17]);
yading@10 114
yading@10 115 float sum;
yading@10 116
yading@10 117 /* copy in[0..31] (128 bytes) to in[512..543] so the reads below never wrap */
yading@10 118 __asm__ volatile(
yading@10 119 "movaps 0(%0), %%xmm0 \n\t" \
yading@10 120 "movaps 16(%0), %%xmm1 \n\t" \
yading@10 121 "movaps 32(%0), %%xmm2 \n\t" \
yading@10 122 "movaps 48(%0), %%xmm3 \n\t" \
yading@10 123 "movaps %%xmm0, 0(%1) \n\t" \
yading@10 124 "movaps %%xmm1, 16(%1) \n\t" \
yading@10 125 "movaps %%xmm2, 32(%1) \n\t" \
yading@10 126 "movaps %%xmm3, 48(%1) \n\t" \
yading@10 127 "movaps 64(%0), %%xmm0 \n\t" \
yading@10 128 "movaps 80(%0), %%xmm1 \n\t" \
yading@10 129 "movaps 96(%0), %%xmm2 \n\t" \
yading@10 130 "movaps 112(%0), %%xmm3 \n\t" \
yading@10 131 "movaps %%xmm0, 64(%1) \n\t" \
yading@10 132 "movaps %%xmm1, 80(%1) \n\t" \
yading@10 133 "movaps %%xmm2, 96(%1) \n\t" \
yading@10 134 "movaps %%xmm3, 112(%1) \n\t"
yading@10 135 ::"r"(in), "r"(in+512)
yading@10 136 :"memory"
yading@10 137 );
yading@10 138
/* Two passes of 16 results each: (suma, sumc) from in + 16 and
 * (sumb, sumd) from in + 32, each with its own pair of window offsets. */
yading@10 139 apply_window(in + 16, win , win + 512, suma, sumc, 16);
yading@10 140 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
yading@10 141
/* suma[0] additionally accumulates eight taps of win + 32 against in + 48 */
yading@10 142 SUM8(MACS, suma[0], win + 32, in + 48);
yading@10 143
/* sumd[16] = 0 makes sample 0 come out as -suma[0] in the SSE path below,
 * matching the scalar path. */
/* NOTE(review): SUMS() never reads sumc[0] or sumb[16], but it does read
 * sumc[16], which is never initialised; that lane ends up in out[16], which
 * is overwritten by the final store of this function. */
yading@10 144 sumc[ 0] = 0;
yading@10 145 sumb[16] = 0;
yading@10 146 sumd[16] = 0;
yading@10 147
/* SUMS: all arguments are byte offsets.
 *   out[out1 .. +3] = reverse(sumd[.. 4 floats]) - suma[.. 4 floats]
 *   out[out2 .. +3] = reverse(sumc[.. 4 floats]) + sumb[.. 4 floats]
 * shufps $0x1b reverses the order of the four floats in the register;
 * movups is used for sumc/sumd because their offsets are not 16-byte
 * multiples. */
yading@10 148 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
yading@10 149 "movups " #sumd "(%4), %%xmm0 \n\t" \
yading@10 150 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
yading@10 151 "subps " #suma "(%1), %%xmm0 \n\t" \
yading@10 152 "movaps %%xmm0," #out1 "(%0) \n\t" \
yading@10 153 \
yading@10 154 "movups " #sumc "(%3), %%xmm0 \n\t" \
yading@10 155 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
yading@10 156 "addps " #sumb "(%2), %%xmm0 \n\t" \
yading@10 157 "movaps %%xmm0," #out2 "(%0) \n\t"
yading@10 158
/* Contiguous output: combine the partial sums four samples at a time. */
yading@10 159 if (incr == 1) {
yading@10 160 __asm__ volatile(
yading@10 161 SUMS( 0, 48, 4, 52, 0, 112)
yading@10 162 SUMS(16, 32, 20, 36, 16, 96)
yading@10 163 SUMS(32, 16, 36, 20, 32, 80)
yading@10 164 SUMS(48, 0, 52, 4, 48, 64)
yading@10 165
yading@10 166 :"+&r"(out)
yading@10 167 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
yading@10 168 :"memory"
yading@10 169 );
/* advance to sample 16 for the final store below */
yading@10 170 out += 16*incr;
yading@10 171 } else {
/* Strided output: scalar combination. out walks upwards from sample 1 while
 * out2 walks downwards from sample 31; out ends on sample 16. */
yading@10 172 int j;
yading@10 173 float *out2 = out + 32 * incr;
yading@10 174 out[0 ] = -suma[ 0];
yading@10 175 out += incr;
yading@10 176 out2 -= incr;
yading@10 177 for(j=1;j<16;j++) {
yading@10 178 *out = -suma[ j] + sumd[16-j];
yading@10 179 *out2 = sumb[16-j] + sumc[ j];
yading@10 180 out += incr;
yading@10 181 out2 -= incr;
yading@10 182 }
yading@10 183 }
yading@10 184
/* Sample 16: negated sum of eight taps of win + 48 against in + 32. */
yading@10 185 sum = 0;
yading@10 186 SUM8(MLSS, sum, win + 16 + 32, in + 32);
yading@10 187 *out = sum;
yading@10 188 }
yading@10 189
yading@10 190 #endif /* HAVE_SSE2_INLINE */
yading@10 191
yading@10 192 #if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(), which runs the 36-point IMDCT over `count`
 * consecutive blocks. Full groups of four blocks are handed to
 * ff_four_imdct36_float_<CPU2>(); whatever is left over goes block by block
 * through ff_imdct36_float_<CPU1>().
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                         \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,        \
                                    int count, int switch_point,              \
                                    int block_type)                           \
{                                                                             \
    const int quad_end = count - (count & 3);                                 \
    int blk = 0;                                                              \
                                                                              \
    /* four blocks per call, using the lane-interleaved window tables */      \
    while (blk < quad_end) {                                                  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                              \
        /* table set 1 only for the first group when switch_point is set */   \
        float *win4 = mdct_win_sse[switch_point && blk < 4][block_type];      \
                                                                              \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win4, tmpbuf);           \
                                                                              \
        in  += 4 * 18;                                                        \
        buf += 4 * 18;                                                        \
        out += 4;                                                             \
        blk += 4;                                                             \
    }                                                                         \
                                                                              \
    /* left-over blocks, one at a time */                                     \
    while (blk < count) {                                                     \
        int win_idx = block_type;                                             \
        float *win;                                                           \
                                                                              \
        /* with switch_point set, blocks 0 and 1 use window 0 */              \
        if (switch_point && blk < 2) {                                        \
            win_idx = 0;                                                      \
        }                                                                     \
        /* odd-numbered blocks use the table entry four rows further on */    \
        if (blk & 1) {                                                        \
            win_idx += 4;                                                     \
        }                                                                     \
        win = ff_mdct_win_float[win_idx];                                     \
                                                                              \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                         \
                                                                              \
        in += 18;                                                             \
        buf++;                                                                \
        out++;                                                                \
        blk++;                                                                \
    }                                                                         \
}

/* All SSE-family drivers share the SSE four-block kernel. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
yading@10 234 #endif /* HAVE_YASM */
yading@10 235
yading@10 236 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
yading@10 237 {
yading@10 238 int mm_flags = av_get_cpu_flags();
yading@10 239
yading@10 240 int i, j;
yading@10 241 for (j = 0; j < 4; j++) {
yading@10 242 for (i = 0; i < 40; i ++) {
yading@10 243 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
yading@10 244 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
yading@10 245 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
yading@10 246 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
yading@10 247 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
yading@10 248 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
yading@10 249 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
yading@10 250 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
yading@10 251 }
yading@10 252 }
yading@10 253
yading@10 254 #if HAVE_SSE2_INLINE
yading@10 255 if (mm_flags & AV_CPU_FLAG_SSE2) {
yading@10 256 s->apply_window_float = apply_window_mp3;
yading@10 257 }
yading@10 258 #endif /* HAVE_SSE2_INLINE */
yading@10 259
yading@10 260 #if HAVE_YASM
yading@10 261 if (EXTERNAL_AVX(mm_flags)) {
yading@10 262 s->imdct36_blocks_float = imdct36_blocks_avx;
yading@10 263 } else if (EXTERNAL_SSSE3(mm_flags)) {
yading@10 264 s->imdct36_blocks_float = imdct36_blocks_ssse3;
yading@10 265 } else if (EXTERNAL_SSE3(mm_flags)) {
yading@10 266 s->imdct36_blocks_float = imdct36_blocks_sse3;
yading@10 267 } else if (EXTERNAL_SSE2(mm_flags)) {
yading@10 268 s->imdct36_blocks_float = imdct36_blocks_sse2;
yading@10 269 } else if (EXTERNAL_SSE(mm_flags)) {
yading@10 270 s->imdct36_blocks_float = imdct36_blocks_sse;
yading@10 271 }
yading@10 272 #endif /* HAVE_YASM */
yading@10 273 }