annotate ffmpeg/libavutil/x86/float_dsp_init.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 /*
yading@11 2 * This file is part of FFmpeg.
yading@11 3 *
yading@11 4 * FFmpeg is free software; you can redistribute it and/or
yading@11 5 * modify it under the terms of the GNU Lesser General Public
yading@11 6 * License as published by the Free Software Foundation; either
yading@11 7 * version 2.1 of the License, or (at your option) any later version.
yading@11 8 *
yading@11 9 * FFmpeg is distributed in the hope that it will be useful,
yading@11 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@11 12 * Lesser General Public License for more details.
yading@11 13 *
yading@11 14 * You should have received a copy of the GNU Lesser General Public
yading@11 15 * License along with FFmpeg; if not, write to the Free Software
yading@11 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@11 17 */
yading@11 18
yading@11 19 #include "config.h"
yading@11 20
yading@11 21 #include "libavutil/cpu.h"
yading@11 22 #include "libavutil/float_dsp.h"
yading@11 23 #include "cpu.h"
yading@11 24 #include "asm.h"
yading@11 25
/*
 * Prototypes for routines that are defined outside this file.
 * ff_float_dsp_init_x86() stores their addresses in AVFloatDSPContext; the
 * _sse / _sse2 / _avx suffixes match the EXTERNAL_SSE / EXTERNAL_SSE2 /
 * EXTERNAL_AVX checks performed there.
 */

/* Candidates for AVFloatDSPContext.vector_fmul. */
void ff_vector_fmul_sse(float *dst,
                        const float *src0, const float *src1, int len);
void ff_vector_fmul_avx(float *dst,
                        const float *src0, const float *src1, int len);

/* Candidates for AVFloatDSPContext.vector_fmac_scalar. */
void ff_vector_fmac_scalar_sse(float *dst,
                               const float *src, float mul, int len);
void ff_vector_fmac_scalar_avx(float *dst,
                               const float *src, float mul, int len);

/* Candidate for AVFloatDSPContext.vector_fmul_scalar. */
void ff_vector_fmul_scalar_sse(float *dst,
                               const float *src, float mul, int len);

/* Candidates for AVFloatDSPContext.vector_dmul_scalar. */
void ff_vector_dmul_scalar_sse2(double *dst,
                                const double *src, double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst,
                               const double *src, double mul, int len);

/* Candidates for AVFloatDSPContext.vector_fmul_add. */
void ff_vector_fmul_add_sse(float *dst,
                            const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst,
                            const float *src0, const float *src1,
                            const float *src2, int len);

/* Candidates for AVFloatDSPContext.vector_fmul_reverse. */
void ff_vector_fmul_reverse_sse(float *dst,
                                const float *src0, const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst,
                                const float *src0, const float *src1, int len);

/* Candidate for AVFloatDSPContext.scalarproduct_float. */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

/* Candidate for AVFloatDSPContext.butterflies_float. */
void ff_butterflies_float_sse(float *src0, float *src1, int len);
yading@11 57
yading@11 58 #if HAVE_6REGS && HAVE_INLINE_ASM
yading@11 59 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
yading@11 60 const float *src1, const float *win,
yading@11 61 int len)
yading@11 62 {
yading@11 63 x86_reg i = -len * 4;
yading@11 64 x86_reg j = len * 4 - 8;
yading@11 65 __asm__ volatile (
yading@11 66 "1: \n"
yading@11 67 "pswapd (%5, %1), %%mm1 \n"
yading@11 68 "movq (%5, %0), %%mm0 \n"
yading@11 69 "pswapd (%4, %1), %%mm5 \n"
yading@11 70 "movq (%3, %0), %%mm4 \n"
yading@11 71 "movq %%mm0, %%mm2 \n"
yading@11 72 "movq %%mm1, %%mm3 \n"
yading@11 73 "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
yading@11 74 "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
yading@11 75 "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
yading@11 76 "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
yading@11 77 "pfadd %%mm3, %%mm2 \n"
yading@11 78 "pfsub %%mm0, %%mm1 \n"
yading@11 79 "pswapd %%mm2, %%mm2 \n"
yading@11 80 "movq %%mm1, (%2, %0) \n"
yading@11 81 "movq %%mm2, (%2, %1) \n"
yading@11 82 "sub $8, %1 \n"
yading@11 83 "add $8, %0 \n"
yading@11 84 "jl 1b \n"
yading@11 85 "femms \n"
yading@11 86 : "+r"(i), "+r"(j)
yading@11 87 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
yading@11 88 );
yading@11 89 }
yading@11 90
yading@11 91 static void vector_fmul_window_sse(float *dst, const float *src0,
yading@11 92 const float *src1, const float *win, int len)
yading@11 93 {
yading@11 94 x86_reg i = -len * 4;
yading@11 95 x86_reg j = len * 4 - 16;
yading@11 96 __asm__ volatile (
yading@11 97 "1: \n"
yading@11 98 "movaps (%5, %1), %%xmm1 \n"
yading@11 99 "movaps (%5, %0), %%xmm0 \n"
yading@11 100 "movaps (%4, %1), %%xmm5 \n"
yading@11 101 "movaps (%3, %0), %%xmm4 \n"
yading@11 102 "shufps $0x1b, %%xmm1, %%xmm1 \n"
yading@11 103 "shufps $0x1b, %%xmm5, %%xmm5 \n"
yading@11 104 "movaps %%xmm0, %%xmm2 \n"
yading@11 105 "movaps %%xmm1, %%xmm3 \n"
yading@11 106 "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
yading@11 107 "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
yading@11 108 "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
yading@11 109 "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
yading@11 110 "addps %%xmm3, %%xmm2 \n"
yading@11 111 "subps %%xmm0, %%xmm1 \n"
yading@11 112 "shufps $0x1b, %%xmm2, %%xmm2 \n"
yading@11 113 "movaps %%xmm1, (%2, %0) \n"
yading@11 114 "movaps %%xmm2, (%2, %1) \n"
yading@11 115 "sub $16, %1 \n"
yading@11 116 "add $16, %0 \n"
yading@11 117 "jl 1b \n"
yading@11 118 : "+r"(i), "+r"(j)
yading@11 119 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
yading@11 120 );
yading@11 121 }
yading@11 122 #endif /* HAVE_6REGS && HAVE_INLINE_ASM */
yading@11 123
yading@11 124 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
yading@11 125 {
yading@11 126 int mm_flags = av_get_cpu_flags();
yading@11 127
yading@11 128 #if HAVE_6REGS && HAVE_INLINE_ASM
yading@11 129 if (INLINE_AMD3DNOWEXT(mm_flags)) {
yading@11 130 fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
yading@11 131 }
yading@11 132 if (INLINE_SSE(mm_flags)) {
yading@11 133 fdsp->vector_fmul_window = vector_fmul_window_sse;
yading@11 134 }
yading@11 135 #endif
yading@11 136 if (EXTERNAL_SSE(mm_flags)) {
yading@11 137 fdsp->vector_fmul = ff_vector_fmul_sse;
yading@11 138 fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
yading@11 139 fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
yading@11 140 fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
yading@11 141 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
yading@11 142 fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
yading@11 143 fdsp->butterflies_float = ff_butterflies_float_sse;
yading@11 144 }
yading@11 145 if (EXTERNAL_SSE2(mm_flags)) {
yading@11 146 fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
yading@11 147 }
yading@11 148 if (EXTERNAL_AVX(mm_flags)) {
yading@11 149 fdsp->vector_fmul = ff_vector_fmul_avx;
yading@11 150 fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
yading@11 151 fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
yading@11 152 fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
yading@11 153 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
yading@11 154 }
yading@11 155 }