annotate ffmpeg/libavcodec/x86/dsputil_mmx.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * MMX optimized DSP utils
yading@10 3 * Copyright (c) 2000, 2001 Fabrice Bellard
yading@10 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
yading@10 5 *
yading@10 6 * This file is part of FFmpeg.
yading@10 7 *
yading@10 8 * FFmpeg is free software; you can redistribute it and/or
yading@10 9 * modify it under the terms of the GNU Lesser General Public
yading@10 10 * License as published by the Free Software Foundation; either
yading@10 11 * version 2.1 of the License, or (at your option) any later version.
yading@10 12 *
yading@10 13 * FFmpeg is distributed in the hope that it will be useful,
yading@10 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 16 * Lesser General Public License for more details.
yading@10 17 *
yading@10 18 * You should have received a copy of the GNU Lesser General Public
yading@10 19 * License along with FFmpeg; if not, write to the Free Software
yading@10 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 21 *
yading@10 22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
yading@10 23 */
yading@10 24
yading@10 25 #include "libavutil/attributes.h"
yading@10 26 #include "libavutil/cpu.h"
yading@10 27 #include "libavutil/x86/asm.h"
yading@10 28 #include "libavcodec/dsputil.h"
yading@10 29 #include "libavcodec/h264dsp.h"
yading@10 30 #include "libavcodec/mpegvideo.h"
yading@10 31 #include "libavcodec/simple_idct.h"
yading@10 32 #include "libavcodec/videodsp.h"
yading@10 33 #include "dsputil_mmx.h"
yading@10 34 #include "idct_xvid.h"
yading@10 35 #include "diracdsp_mmx.h"
yading@10 36
yading@10 37 //#undef NDEBUG
yading@10 38 //#include <assert.h>
yading@10 39
yading@10 40 /* pixel operations */
yading@10 41 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
yading@10 42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
yading@10 43 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
yading@10 44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
yading@10 45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
yading@10 46 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
yading@10 47 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
yading@10 48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
yading@10 49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
yading@10 50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
yading@10 51
yading@10 52 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
yading@10 53 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
yading@10 54
yading@10 55 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
yading@10 56 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
yading@10 57
yading@10 58
yading@10 59 #if HAVE_YASM
yading@10 60 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
yading@10 61 int dstStride, int src1Stride, int h);
yading@10 62 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
yading@10 63 uint8_t *src2, int dstStride,
yading@10 64 int src1Stride, int h);
yading@10 65 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
yading@10 66 int dstStride, int src1Stride, int h);
yading@10 67 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
yading@10 68 int dstStride, int src1Stride, int h);
yading@10 69 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
yading@10 70 int dstStride, int src1Stride, int h);
yading@10 71 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
yading@10 72 int dstStride, int src1Stride, int h);
yading@10 73 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
yading@10 74 ptrdiff_t line_size, int h);
yading@10 75
yading@10 76 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
yading@10 77 ptrdiff_t line_size, int h)
yading@10 78 {
yading@10 79 ff_put_pixels8_mmxext(block, pixels, line_size, h);
yading@10 80 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
yading@10 81 }
yading@10 82
yading@10 83 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 84 int dstStride, int srcStride, int h);
yading@10 85 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 86 int dstStride, int srcStride, int h);
yading@10 87 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 88 int dstStride, int srcStride,
yading@10 89 int h);
yading@10 90 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 91 int dstStride, int srcStride, int h);
yading@10 92 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 93 int dstStride, int srcStride, int h);
yading@10 94 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 95 int dstStride, int srcStride,
yading@10 96 int h);
yading@10 97 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 98 int dstStride, int srcStride);
yading@10 99 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 100 int dstStride, int srcStride);
yading@10 101 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 102 int dstStride, int srcStride);
yading@10 103 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 104 int dstStride, int srcStride);
yading@10 105 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 106 int dstStride, int srcStride);
yading@10 107 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
yading@10 108 int dstStride, int srcStride);
yading@10 109 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
yading@10 110 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
yading@10 111 #endif /* HAVE_YASM */
yading@10 112
yading@10 113
yading@10 114 #if HAVE_INLINE_ASM
yading@10 115
yading@10 116 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
yading@10 117 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
yading@10 118
yading@10 119 #define MOVQ_BFE(regd) \
yading@10 120 __asm__ volatile ( \
yading@10 121 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
yading@10 122 "paddb %%"#regd", %%"#regd" \n\t" ::)
yading@10 123
yading@10 124 #ifndef PIC
yading@10 125 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
yading@10 126 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
yading@10 127 #else
yading@10 128 // for shared libraries it is better to generate these constants in registers than to load them from memory
yading@10 129 // pcmpeqd -> -1
yading@10 130 #define MOVQ_BONE(regd) \
yading@10 131 __asm__ volatile ( \
yading@10 132 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
yading@10 133 "psrlw $15, %%"#regd" \n\t" \
yading@10 134 "packuswb %%"#regd", %%"#regd" \n\t" ::)
yading@10 135
yading@10 136 #define MOVQ_WTWO(regd) \
yading@10 137 __asm__ volatile ( \
yading@10 138 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
yading@10 139 "psrlw $15, %%"#regd" \n\t" \
yading@10 140 "psllw $1, %%"#regd" \n\t"::)
yading@10 141
yading@10 142 #endif
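/* Editorial note (not part of the original file): a sketch of how the
 * PIC-safe constant generation above works, traced for MOVQ_BONE:
 *   pcmpeqd  regd, regd   ; regd = 0xFFFFFFFFFFFFFFFF (all ones)
 *   psrlw    $15,  regd   ; regd = 0x0001000100010001 (each word = 1)
 *   packuswb regd, regd   ; regd = 0x0101010101010101 (each byte = 1)
 * MOVQ_WTWO shifts left by one instead of packing, giving
 * 0x0002000200020002 (each word = 2). No memory access is needed,
 * which is what makes these variants safe for position-independent
 * code. */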
yading@10 143
yading@10 144 // regr is used as a temporary and holds the output result;
yading@10 145 // the first argument is unmodified and the second is trashed
yading@10 146 // regfe is supposed to contain 0xfefefefefefefefe
yading@10 147 #define PAVGB_MMX(rega, regb, regr, regfe) \
yading@10 148 "movq "#rega", "#regr" \n\t" \
yading@10 149 "por "#regb", "#regr" \n\t" \
yading@10 150 "pxor "#rega", "#regb" \n\t" \
yading@10 151 "pand "#regfe", "#regb" \n\t" \
yading@10 152 "psrlq $1, "#regb" \n\t" \
yading@10 153 "psubb "#regb", "#regr" \n\t"
yading@10 154
yading@10 155 // mm6 is supposed to contain 0xfefefefefefefefe
yading@10 156 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
yading@10 157 "movq "#rega", "#regr" \n\t" \
yading@10 158 "movq "#regc", "#regp" \n\t" \
yading@10 159 "por "#regb", "#regr" \n\t" \
yading@10 160 "por "#regd", "#regp" \n\t" \
yading@10 161 "pxor "#rega", "#regb" \n\t" \
yading@10 162 "pxor "#regc", "#regd" \n\t" \
yading@10 163 "pand %%mm6, "#regb" \n\t" \
yading@10 164 "pand %%mm6, "#regd" \n\t" \
yading@10 165 "psrlq $1, "#regd" \n\t" \
yading@10 166 "psrlq $1, "#regb" \n\t" \
yading@10 167 "psubb "#regb", "#regr" \n\t" \
yading@10 168 "psubb "#regd", "#regp" \n\t"
yading@10 169
yading@10 170 /***********************************/
yading@10 171 /* MMX rounding */
yading@10 172
yading@10 173 #define DEF(x, y) x ## _ ## y ## _mmx
yading@10 174 #define SET_RND MOVQ_WTWO
yading@10 175 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
yading@10 176 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
yading@10 177 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
yading@10 178
yading@10 179 #include "dsputil_rnd_template.c"
yading@10 180
yading@10 181 #undef DEF
yading@10 182 #undef SET_RND
yading@10 183 #undef PAVGBP
yading@10 184 #undef PAVGB
yading@10 185 #undef OP_AVG
yading@10 186
yading@10 187 #endif /* HAVE_INLINE_ASM */
yading@10 188
yading@10 189
yading@10 190 #if HAVE_YASM
yading@10 191
yading@10 192 /***********************************/
yading@10 193 /* MMXEXT specific */
yading@10 194
yading@10 195 //FIXME the following could be optimized too ...
yading@10 196 static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
yading@10 197 int line_size, int h)
yading@10 198 {
yading@10 199 ff_avg_pixels8_mmxext(block, pixels, line_size, h);
yading@10 200 ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
yading@10 201 }
yading@10 202
yading@10 203 #endif /* HAVE_YASM */
yading@10 204
yading@10 205
yading@10 206 #if HAVE_INLINE_ASM
yading@10 207 /***********************************/
yading@10 208 /* standard MMX */
yading@10 209
yading@10 210 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
yading@10 211 int line_size)
yading@10 212 {
yading@10 213 const int16_t *p;
yading@10 214 uint8_t *pix;
yading@10 215
yading@10 216 /* read the pixels */
yading@10 217 p = block;
yading@10 218 pix = pixels;
yading@10 219 /* unrolled loop */
yading@10 220 __asm__ volatile (
yading@10 221 "movq (%3), %%mm0 \n\t"
yading@10 222 "movq 8(%3), %%mm1 \n\t"
yading@10 223 "movq 16(%3), %%mm2 \n\t"
yading@10 224 "movq 24(%3), %%mm3 \n\t"
yading@10 225 "movq 32(%3), %%mm4 \n\t"
yading@10 226 "movq 40(%3), %%mm5 \n\t"
yading@10 227 "movq 48(%3), %%mm6 \n\t"
yading@10 228 "movq 56(%3), %%mm7 \n\t"
yading@10 229 "packuswb %%mm1, %%mm0 \n\t"
yading@10 230 "packuswb %%mm3, %%mm2 \n\t"
yading@10 231 "packuswb %%mm5, %%mm4 \n\t"
yading@10 232 "packuswb %%mm7, %%mm6 \n\t"
yading@10 233 "movq %%mm0, (%0) \n\t"
yading@10 234 "movq %%mm2, (%0, %1) \n\t"
yading@10 235 "movq %%mm4, (%0, %1, 2) \n\t"
yading@10 236 "movq %%mm6, (%0, %2) \n\t"
yading@10 237 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
yading@10 238 "r"(p)
yading@10 239 : "memory");
yading@10 240 pix += line_size * 4;
yading@10 241 p += 32;
yading@10 242
yading@10 243 // if this were an exact copy of the code above, the compiler
yading@10 244 // would generate some very strange code, hence the "r"
yading@10 245 // constraint on p
yading@10 246 __asm__ volatile (
yading@10 247 "movq (%3), %%mm0 \n\t"
yading@10 248 "movq 8(%3), %%mm1 \n\t"
yading@10 249 "movq 16(%3), %%mm2 \n\t"
yading@10 250 "movq 24(%3), %%mm3 \n\t"
yading@10 251 "movq 32(%3), %%mm4 \n\t"
yading@10 252 "movq 40(%3), %%mm5 \n\t"
yading@10 253 "movq 48(%3), %%mm6 \n\t"
yading@10 254 "movq 56(%3), %%mm7 \n\t"
yading@10 255 "packuswb %%mm1, %%mm0 \n\t"
yading@10 256 "packuswb %%mm3, %%mm2 \n\t"
yading@10 257 "packuswb %%mm5, %%mm4 \n\t"
yading@10 258 "packuswb %%mm7, %%mm6 \n\t"
yading@10 259 "movq %%mm0, (%0) \n\t"
yading@10 260 "movq %%mm2, (%0, %1) \n\t"
yading@10 261 "movq %%mm4, (%0, %1, 2) \n\t"
yading@10 262 "movq %%mm6, (%0, %2) \n\t"
yading@10 263 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
yading@10 264 : "memory");
yading@10 265 }
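/* Editorial scalar sketch (not in the original) of the two asm blocks
 * above: each int16_t coefficient is packed to an unsigned byte with
 * saturation (the packuswb behaviour), 8 rows of 8 pixels in total. */
static void put_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v; /* saturate */
        }
        pixels += line_size;
    }
}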
yading@10 266
yading@10 267 #define put_signed_pixels_clamped_mmx_half(off) \
yading@10 268 "movq "#off"(%2), %%mm1 \n\t" \
yading@10 269 "movq 16 + "#off"(%2), %%mm2 \n\t" \
yading@10 270 "movq 32 + "#off"(%2), %%mm3 \n\t" \
yading@10 271 "movq 48 + "#off"(%2), %%mm4 \n\t" \
yading@10 272 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
yading@10 273 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
yading@10 274 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
yading@10 275 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
yading@10 276 "paddb %%mm0, %%mm1 \n\t" \
yading@10 277 "paddb %%mm0, %%mm2 \n\t" \
yading@10 278 "paddb %%mm0, %%mm3 \n\t" \
yading@10 279 "paddb %%mm0, %%mm4 \n\t" \
yading@10 280 "movq %%mm1, (%0) \n\t" \
yading@10 281 "movq %%mm2, (%0, %3) \n\t" \
yading@10 282 "movq %%mm3, (%0, %3, 2) \n\t" \
yading@10 283 "movq %%mm4, (%0, %1) \n\t"
yading@10 284
yading@10 285 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
yading@10 286 int line_size)
yading@10 287 {
yading@10 288 x86_reg line_skip = line_size;
yading@10 289 x86_reg line_skip3;
yading@10 290
yading@10 291 __asm__ volatile (
yading@10 292 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
yading@10 293 "lea (%3, %3, 2), %1 \n\t"
yading@10 294 put_signed_pixels_clamped_mmx_half(0)
yading@10 295 "lea (%0, %3, 4), %0 \n\t"
yading@10 296 put_signed_pixels_clamped_mmx_half(64)
yading@10 297 : "+&r"(pixels), "=&r"(line_skip3)
yading@10 298 : "r"(block), "r"(line_skip)
yading@10 299 : "memory");
yading@10 300 }
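/* Editorial sketch (not in the original) of the per-coefficient math
 * above: packsswb saturates to [-128, 127], then the paddb with
 * ff_pb_80 (a wrapping add of 0x80) re-biases that range to [0, 255]. */
static inline uint8_t signed_clamp_sketch(int16_t v)
{
    int s = v < -128 ? -128 : v > 127 ? 127 : v; /* packsswb */
    return (uint8_t)(s + 128);                   /* paddb ff_pb_80 */
}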
yading@10 301
yading@10 302 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
yading@10 303 int line_size)
yading@10 304 {
yading@10 305 const int16_t *p;
yading@10 306 uint8_t *pix;
yading@10 307 int i;
yading@10 308
yading@10 309 /* read the pixels */
yading@10 310 p = block;
yading@10 311 pix = pixels;
yading@10 312 MOVQ_ZERO(mm7);
yading@10 313 i = 4;
yading@10 314 do {
yading@10 315 __asm__ volatile (
yading@10 316 "movq (%2), %%mm0 \n\t"
yading@10 317 "movq 8(%2), %%mm1 \n\t"
yading@10 318 "movq 16(%2), %%mm2 \n\t"
yading@10 319 "movq 24(%2), %%mm3 \n\t"
yading@10 320 "movq %0, %%mm4 \n\t"
yading@10 321 "movq %1, %%mm6 \n\t"
yading@10 322 "movq %%mm4, %%mm5 \n\t"
yading@10 323 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 324 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 325 "paddsw %%mm4, %%mm0 \n\t"
yading@10 326 "paddsw %%mm5, %%mm1 \n\t"
yading@10 327 "movq %%mm6, %%mm5 \n\t"
yading@10 328 "punpcklbw %%mm7, %%mm6 \n\t"
yading@10 329 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 330 "paddsw %%mm6, %%mm2 \n\t"
yading@10 331 "paddsw %%mm5, %%mm3 \n\t"
yading@10 332 "packuswb %%mm1, %%mm0 \n\t"
yading@10 333 "packuswb %%mm3, %%mm2 \n\t"
yading@10 334 "movq %%mm0, %0 \n\t"
yading@10 335 "movq %%mm2, %1 \n\t"
yading@10 336 : "+m"(*pix), "+m"(*(pix + line_size))
yading@10 337 : "r"(p)
yading@10 338 : "memory");
yading@10 339 pix += line_size * 2;
yading@10 340 p += 16;
yading@10 341 } while (--i);
yading@10 342 }
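/* Editorial scalar sketch of the loop above: widen each pixel byte
 * (punpcklbw/punpckhbw with the zeroed mm7), add the residual with
 * signed saturation (paddsw), then pack back with unsigned saturation
 * (packuswb); the net effect is clipping pixel + block to [0, 255]. */
static void add_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[j] + block[i * 8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}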
yading@10 343
yading@10 344 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
yading@10 345 ptrdiff_t line_size, int h)
yading@10 346 {
yading@10 347 __asm__ volatile (
yading@10 348 "lea (%3, %3), %%"REG_a" \n\t"
yading@10 349 ".p2align 3 \n\t"
yading@10 350 "1: \n\t"
yading@10 351 "movq (%1 ), %%mm0 \n\t"
yading@10 352 "movq (%1, %3), %%mm1 \n\t"
yading@10 353 "movq %%mm0, (%2) \n\t"
yading@10 354 "movq %%mm1, (%2, %3) \n\t"
yading@10 355 "add %%"REG_a", %1 \n\t"
yading@10 356 "add %%"REG_a", %2 \n\t"
yading@10 357 "movq (%1 ), %%mm0 \n\t"
yading@10 358 "movq (%1, %3), %%mm1 \n\t"
yading@10 359 "movq %%mm0, (%2) \n\t"
yading@10 360 "movq %%mm1, (%2, %3) \n\t"
yading@10 361 "add %%"REG_a", %1 \n\t"
yading@10 362 "add %%"REG_a", %2 \n\t"
yading@10 363 "subl $4, %0 \n\t"
yading@10 364 "jnz 1b \n\t"
yading@10 365 : "+g"(h), "+r"(pixels), "+r"(block)
yading@10 366 : "r"((x86_reg)line_size)
yading@10 367 : "%"REG_a, "memory"
yading@10 368 );
yading@10 369 }
yading@10 370
yading@10 371 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
yading@10 372 ptrdiff_t line_size, int h)
yading@10 373 {
yading@10 374 __asm__ volatile (
yading@10 375 "lea (%3, %3), %%"REG_a" \n\t"
yading@10 376 ".p2align 3 \n\t"
yading@10 377 "1: \n\t"
yading@10 378 "movq (%1 ), %%mm0 \n\t"
yading@10 379 "movq 8(%1 ), %%mm4 \n\t"
yading@10 380 "movq (%1, %3), %%mm1 \n\t"
yading@10 381 "movq 8(%1, %3), %%mm5 \n\t"
yading@10 382 "movq %%mm0, (%2) \n\t"
yading@10 383 "movq %%mm4, 8(%2) \n\t"
yading@10 384 "movq %%mm1, (%2, %3) \n\t"
yading@10 385 "movq %%mm5, 8(%2, %3) \n\t"
yading@10 386 "add %%"REG_a", %1 \n\t"
yading@10 387 "add %%"REG_a", %2 \n\t"
yading@10 388 "movq (%1 ), %%mm0 \n\t"
yading@10 389 "movq 8(%1 ), %%mm4 \n\t"
yading@10 390 "movq (%1, %3), %%mm1 \n\t"
yading@10 391 "movq 8(%1, %3), %%mm5 \n\t"
yading@10 392 "movq %%mm0, (%2) \n\t"
yading@10 393 "movq %%mm4, 8(%2) \n\t"
yading@10 394 "movq %%mm1, (%2, %3) \n\t"
yading@10 395 "movq %%mm5, 8(%2, %3) \n\t"
yading@10 396 "add %%"REG_a", %1 \n\t"
yading@10 397 "add %%"REG_a", %2 \n\t"
yading@10 398 "subl $4, %0 \n\t"
yading@10 399 "jnz 1b \n\t"
yading@10 400 : "+g"(h), "+r"(pixels), "+r"(block)
yading@10 401 : "r"((x86_reg)line_size)
yading@10 402 : "%"REG_a, "memory"
yading@10 403 );
yading@10 404 }
yading@10 405
yading@10 406 #define CLEAR_BLOCKS(name, n) \
yading@10 407 static void name(int16_t *blocks) \
yading@10 408 { \
yading@10 409 __asm__ volatile ( \
yading@10 410 "pxor %%mm7, %%mm7 \n\t" \
yading@10 411 "mov %1, %%"REG_a" \n\t" \
yading@10 412 "1: \n\t" \
yading@10 413 "movq %%mm7, (%0, %%"REG_a") \n\t" \
yading@10 414 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
yading@10 415 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
yading@10 416 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
yading@10 417 "add $32, %%"REG_a" \n\t" \
yading@10 418 "js 1b \n\t" \
yading@10 419 :: "r"(((uint8_t *)blocks) + 128 * n), \
yading@10 420 "i"(-128 * n) \
yading@10 421 : "%"REG_a \
yading@10 422 ); \
yading@10 423 }
yading@10 424 CLEAR_BLOCKS(clear_blocks_mmx, 6)
yading@10 425 CLEAR_BLOCKS(clear_block_mmx, 1)
yading@10 426
yading@10 427 static void clear_block_sse(int16_t *block)
yading@10 428 {
yading@10 429 __asm__ volatile (
yading@10 430 "xorps %%xmm0, %%xmm0 \n"
yading@10 431 "movaps %%xmm0, (%0) \n"
yading@10 432 "movaps %%xmm0, 16(%0) \n"
yading@10 433 "movaps %%xmm0, 32(%0) \n"
yading@10 434 "movaps %%xmm0, 48(%0) \n"
yading@10 435 "movaps %%xmm0, 64(%0) \n"
yading@10 436 "movaps %%xmm0, 80(%0) \n"
yading@10 437 "movaps %%xmm0, 96(%0) \n"
yading@10 438 "movaps %%xmm0, 112(%0) \n"
yading@10 439 :: "r"(block)
yading@10 440 : "memory"
yading@10 441 );
yading@10 442 }
yading@10 443
yading@10 444 static void clear_blocks_sse(int16_t *blocks)
yading@10 445 {
yading@10 446 __asm__ volatile (
yading@10 447 "xorps %%xmm0, %%xmm0 \n"
yading@10 448 "mov %1, %%"REG_a" \n"
yading@10 449 "1: \n"
yading@10 450 "movaps %%xmm0, (%0, %%"REG_a") \n"
yading@10 451 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
yading@10 452 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
yading@10 453 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
yading@10 454 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
yading@10 455 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
yading@10 456 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
yading@10 457 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
yading@10 458 "add $128, %%"REG_a" \n"
yading@10 459 "js 1b \n"
yading@10 460 :: "r"(((uint8_t *)blocks) + 128 * 6),
yading@10 461 "i"(-128 * 6)
yading@10 462 : "%"REG_a
yading@10 463 );
yading@10 464 }
yading@10 465
yading@10 466 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
yading@10 467 {
yading@10 468 x86_reg i = 0;
yading@10 469 __asm__ volatile (
yading@10 470 "jmp 2f \n\t"
yading@10 471 "1: \n\t"
yading@10 472 "movq (%1, %0), %%mm0 \n\t"
yading@10 473 "movq (%2, %0), %%mm1 \n\t"
yading@10 474 "paddb %%mm0, %%mm1 \n\t"
yading@10 475 "movq %%mm1, (%2, %0) \n\t"
yading@10 476 "movq 8(%1, %0), %%mm0 \n\t"
yading@10 477 "movq 8(%2, %0), %%mm1 \n\t"
yading@10 478 "paddb %%mm0, %%mm1 \n\t"
yading@10 479 "movq %%mm1, 8(%2, %0) \n\t"
yading@10 480 "add $16, %0 \n\t"
yading@10 481 "2: \n\t"
yading@10 482 "cmp %3, %0 \n\t"
yading@10 483 "js 1b \n\t"
yading@10 484 : "+r"(i)
yading@10 485 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
yading@10 486 );
yading@10 487 for ( ; i < w; i++)
yading@10 488 dst[i + 0] += src[i + 0];
yading@10 489 }
yading@10 490
yading@10 491 #if HAVE_7REGS
yading@10 492 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
yading@10 493 const uint8_t *diff, int w,
yading@10 494 int *left, int *left_top)
yading@10 495 {
yading@10 496 x86_reg w2 = -w;
yading@10 497 x86_reg x;
yading@10 498 int l = *left & 0xff;
yading@10 499 int tl = *left_top & 0xff;
yading@10 500 int t;
yading@10 501 __asm__ volatile (
yading@10 502 "mov %7, %3 \n"
yading@10 503 "1: \n"
yading@10 504 "movzbl (%3, %4), %2 \n"
yading@10 505 "mov %2, %k3 \n"
yading@10 506 "sub %b1, %b3 \n"
yading@10 507 "add %b0, %b3 \n"
yading@10 508 "mov %2, %1 \n"
yading@10 509 "cmp %0, %2 \n"
yading@10 510 "cmovg %0, %2 \n"
yading@10 511 "cmovg %1, %0 \n"
yading@10 512 "cmp %k3, %0 \n"
yading@10 513 "cmovg %k3, %0 \n"
yading@10 514 "mov %7, %3 \n"
yading@10 515 "cmp %2, %0 \n"
yading@10 516 "cmovl %2, %0 \n"
yading@10 517 "add (%6, %4), %b0 \n"
yading@10 518 "mov %b0, (%5, %4) \n"
yading@10 519 "inc %4 \n"
yading@10 520 "jl 1b \n"
yading@10 521 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
yading@10 522 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
yading@10 523 );
yading@10 524 *left = l;
yading@10 525 *left_top = tl;
yading@10 526 }
yading@10 527 #endif
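/* Editorial sketch (not in the original) of the median predictor the
 * cmov loop above implements: each output byte is diff plus the median
 * of left, top and the gradient left + top - topleft, which reduces to
 * clamping the gradient into [min(left, top), max(left, top)]. */
static inline int median_predict_sketch(int left, int top, int topleft)
{
    int g  = left + top - topleft;
    int mn = left < top ? left : top;
    int mx = left < top ? top  : left;
    return g < mn ? mn : g > mx ? mx : g;
}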
yading@10 528
yading@10 529 /* Draw the edges of width 'w' of an image of size width x height;
yading@10 530 * this MMX version can only handle w == 4, w == 8 or w == 16. */
yading@10 531 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
yading@10 532 int w, int h, int sides)
yading@10 533 {
yading@10 534 uint8_t *ptr, *last_line;
yading@10 535 int i;
yading@10 536
yading@10 537 last_line = buf + (height - 1) * wrap;
yading@10 538 /* left and right */
yading@10 539 ptr = buf;
yading@10 540 if (w == 8) {
yading@10 541 __asm__ volatile (
yading@10 542 "1: \n\t"
yading@10 543 "movd (%0), %%mm0 \n\t"
yading@10 544 "punpcklbw %%mm0, %%mm0 \n\t"
yading@10 545 "punpcklwd %%mm0, %%mm0 \n\t"
yading@10 546 "punpckldq %%mm0, %%mm0 \n\t"
yading@10 547 "movq %%mm0, -8(%0) \n\t"
yading@10 548 "movq -8(%0, %2), %%mm1 \n\t"
yading@10 549 "punpckhbw %%mm1, %%mm1 \n\t"
yading@10 550 "punpckhwd %%mm1, %%mm1 \n\t"
yading@10 551 "punpckhdq %%mm1, %%mm1 \n\t"
yading@10 552 "movq %%mm1, (%0, %2) \n\t"
yading@10 553 "add %1, %0 \n\t"
yading@10 554 "cmp %3, %0 \n\t"
yading@10 555 "jb 1b \n\t"
yading@10 556 : "+r"(ptr)
yading@10 557 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
yading@10 558 );
yading@10 559 } else if (w == 16) {
yading@10 560 __asm__ volatile (
yading@10 561 "1: \n\t"
yading@10 562 "movd (%0), %%mm0 \n\t"
yading@10 563 "punpcklbw %%mm0, %%mm0 \n\t"
yading@10 564 "punpcklwd %%mm0, %%mm0 \n\t"
yading@10 565 "punpckldq %%mm0, %%mm0 \n\t"
yading@10 566 "movq %%mm0, -8(%0) \n\t"
yading@10 567 "movq %%mm0, -16(%0) \n\t"
yading@10 568 "movq -8(%0, %2), %%mm1 \n\t"
yading@10 569 "punpckhbw %%mm1, %%mm1 \n\t"
yading@10 570 "punpckhwd %%mm1, %%mm1 \n\t"
yading@10 571 "punpckhdq %%mm1, %%mm1 \n\t"
yading@10 572 "movq %%mm1, (%0, %2) \n\t"
yading@10 573 "movq %%mm1, 8(%0, %2) \n\t"
yading@10 574 "add %1, %0 \n\t"
yading@10 575 "cmp %3, %0 \n\t"
yading@10 576 "jb 1b \n\t"
yading@10 577 : "+r"(ptr)
yading@10 578 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
yading@10 579 );
yading@10 580 } else {
yading@10 581 av_assert1(w == 4);
yading@10 582 __asm__ volatile (
yading@10 583 "1: \n\t"
yading@10 584 "movd (%0), %%mm0 \n\t"
yading@10 585 "punpcklbw %%mm0, %%mm0 \n\t"
yading@10 586 "punpcklwd %%mm0, %%mm0 \n\t"
yading@10 587 "movd %%mm0, -4(%0) \n\t"
yading@10 588 "movd -4(%0, %2), %%mm1 \n\t"
yading@10 589 "punpcklbw %%mm1, %%mm1 \n\t"
yading@10 590 "punpckhwd %%mm1, %%mm1 \n\t"
yading@10 591 "punpckhdq %%mm1, %%mm1 \n\t"
yading@10 592 "movd %%mm1, (%0, %2) \n\t"
yading@10 593 "add %1, %0 \n\t"
yading@10 594 "cmp %3, %0 \n\t"
yading@10 595 "jb 1b \n\t"
yading@10 596 : "+r"(ptr)
yading@10 597 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
yading@10 598 );
yading@10 599 }
yading@10 600
yading@10 601 /* top and bottom (and hopefully also the corners) */
yading@10 602 if (sides & EDGE_TOP) {
yading@10 603 for (i = 0; i < h; i += 4) {
yading@10 604 ptr = buf - (i + 1) * wrap - w;
yading@10 605 __asm__ volatile (
yading@10 606 "1: \n\t"
yading@10 607 "movq (%1, %0), %%mm0 \n\t"
yading@10 608 "movq %%mm0, (%0) \n\t"
yading@10 609 "movq %%mm0, (%0, %2) \n\t"
yading@10 610 "movq %%mm0, (%0, %2, 2) \n\t"
yading@10 611 "movq %%mm0, (%0, %3) \n\t"
yading@10 612 "add $8, %0 \n\t"
yading@10 613 "cmp %4, %0 \n\t"
yading@10 614 "jb 1b \n\t"
yading@10 615 : "+r"(ptr)
yading@10 616 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
yading@10 617 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
yading@10 618 );
yading@10 619 }
yading@10 620 }
yading@10 621
yading@10 622 if (sides & EDGE_BOTTOM) {
yading@10 623 for (i = 0; i < h; i += 4) {
yading@10 624 ptr = last_line + (i + 1) * wrap - w;
yading@10 625 __asm__ volatile (
yading@10 626 "1: \n\t"
yading@10 627 "movq (%1, %0), %%mm0 \n\t"
yading@10 628 "movq %%mm0, (%0) \n\t"
yading@10 629 "movq %%mm0, (%0, %2) \n\t"
yading@10 630 "movq %%mm0, (%0, %2, 2) \n\t"
yading@10 631 "movq %%mm0, (%0, %3) \n\t"
yading@10 632 "add $8, %0 \n\t"
yading@10 633 "cmp %4, %0 \n\t"
yading@10 634 "jb 1b \n\t"
yading@10 635 : "+r"(ptr)
yading@10 636 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
yading@10 637 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
yading@10 638 "r"(ptr + width + 2 * w)
yading@10 639 );
yading@10 640 }
yading@10 641 }
yading@10 642 }
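/* Editorial scalar sketch (not in the original) of the left/right
 * padding done above: the first and last pixel of each row are
 * replicated outward by w bytes (the asm broadcasts a pixel with
 * punpck* and stores 4/8/16 bytes at a time instead of one). */
static void pad_row_sketch(uint8_t *row, int width, int w)
{
    int i;
    for (i = 1; i <= w; i++) {
        row[-i]            = row[0];         /* left edge  */
        row[width - 1 + i] = row[width - 1]; /* right edge */
    }
}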
yading@10 643 #endif /* HAVE_INLINE_ASM */
yading@10 644
yading@10 645
yading@10 646 #if HAVE_YASM
yading@10 647 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
yading@10 648 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
yading@10 649 ptrdiff_t stride) \
yading@10 650 { \
yading@10 651 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
yading@10 652 } \
yading@10 653 \
yading@10 654 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 655 ptrdiff_t stride) \
yading@10 656 { \
yading@10 657 uint64_t temp[8]; \
yading@10 658 uint8_t * const half = (uint8_t*)temp; \
yading@10 659 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
yading@10 660 stride, 8); \
yading@10 661 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
yading@10 662 stride, stride, 8); \
yading@10 663 } \
yading@10 664 \
yading@10 665 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 666 ptrdiff_t stride) \
yading@10 667 { \
yading@10 668 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
yading@10 669 stride, 8); \
yading@10 670 } \
yading@10 671 \
yading@10 672 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 673 ptrdiff_t stride) \
yading@10 674 { \
yading@10 675 uint64_t temp[8]; \
yading@10 676 uint8_t * const half = (uint8_t*)temp; \
yading@10 677 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
yading@10 678 stride, 8); \
yading@10 679 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
yading@10 680 stride, 8); \
yading@10 681 } \
yading@10 682 \
yading@10 683 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 684 ptrdiff_t stride) \
yading@10 685 { \
yading@10 686 uint64_t temp[8]; \
yading@10 687 uint8_t * const half = (uint8_t*)temp; \
yading@10 688 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
yading@10 689 8, stride); \
yading@10 690 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
yading@10 691 stride, stride, 8); \
yading@10 692 } \
yading@10 693 \
yading@10 694 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 695 ptrdiff_t stride) \
yading@10 696 { \
yading@10 697 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
yading@10 698 stride, stride); \
yading@10 699 } \
yading@10 700 \
yading@10 701 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 702 ptrdiff_t stride) \
yading@10 703 { \
yading@10 704 uint64_t temp[8]; \
yading@10 705 uint8_t * const half = (uint8_t*)temp; \
yading@10 706 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
yading@10 707 8, stride); \
yading@10 708 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
yading@10 709 stride, 8); \
yading@10 710 } \
yading@10 711 \
yading@10 712 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 713 ptrdiff_t stride) \
yading@10 714 { \
yading@10 715 uint64_t half[8 + 9]; \
yading@10 716 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 717 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 718 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 719 stride, 9); \
yading@10 720 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
yading@10 721 stride, 9); \
yading@10 722 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 723 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 724 stride, 8, 8); \
yading@10 725 } \
yading@10 726 \
yading@10 727 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 728 ptrdiff_t stride) \
yading@10 729 { \
yading@10 730 uint64_t half[8 + 9]; \
yading@10 731 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 732 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 733 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 734 stride, 9); \
yading@10 735 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
yading@10 736 stride, 9); \
yading@10 737 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 738 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 739 stride, 8, 8); \
yading@10 740 } \
yading@10 741 \
yading@10 742 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 743 ptrdiff_t stride) \
yading@10 744 { \
yading@10 745 uint64_t half[8 + 9]; \
yading@10 746 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 747 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 748 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 749 stride, 9); \
yading@10 750 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
yading@10 751 stride, 9); \
yading@10 752 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 753 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
yading@10 754 stride, 8, 8); \
yading@10 755 } \
yading@10 756 \
yading@10 757 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 758 ptrdiff_t stride) \
yading@10 759 { \
yading@10 760 uint64_t half[8 + 9]; \
yading@10 761 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 762 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 763 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 764 stride, 9); \
yading@10 765 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
yading@10 766 stride, 9); \
yading@10 767 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 768 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
yading@10 769 stride, 8, 8); \
yading@10 770 } \
yading@10 771 \
yading@10 772 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 773 ptrdiff_t stride) \
yading@10 774 { \
yading@10 775 uint64_t half[8 + 9]; \
yading@10 776 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 777 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 778 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 779 stride, 9); \
yading@10 780 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 781 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 782 stride, 8, 8); \
yading@10 783 } \
yading@10 784 \
yading@10 785 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 786 ptrdiff_t stride) \
yading@10 787 { \
yading@10 788 uint64_t half[8 + 9]; \
yading@10 789 uint8_t * const halfH = ((uint8_t*)half) + 64; \
yading@10 790 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 791 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 792 stride, 9); \
yading@10 793 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
yading@10 794 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
yading@10 795 stride, 8, 8); \
yading@10 796 } \
yading@10 797 \
yading@10 798 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 799 ptrdiff_t stride) \
yading@10 800 { \
yading@10 801 uint64_t half[8 + 9]; \
yading@10 802 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 803 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 804 stride, 9); \
yading@10 805 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
yading@10 806 8, stride, 9); \
yading@10 807 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
yading@10 808 stride, 8); \
yading@10 809 } \
yading@10 810 \
yading@10 811 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 812 ptrdiff_t stride) \
yading@10 813 { \
yading@10 814 uint64_t half[8 + 9]; \
yading@10 815 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 816 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 817 stride, 9); \
yading@10 818 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
yading@10 819 stride, 9); \
yading@10 820 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
yading@10 821 stride, 8); \
yading@10 822 } \
yading@10 823 \
yading@10 824 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 825 ptrdiff_t stride) \
yading@10 826 { \
yading@10 827 uint64_t half[9]; \
yading@10 828 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 829 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
yading@10 830 stride, 9); \
yading@10 831 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
yading@10 832 stride, 8); \
yading@10 833 } \
yading@10 834 \
yading@10 835 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
yading@10 836 ptrdiff_t stride) \
yading@10 837 { \
yading@10 838 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
yading@10 839 } \
yading@10 840 \
yading@10 841 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 842 ptrdiff_t stride) \
yading@10 843 { \
yading@10 844 uint64_t temp[32]; \
yading@10 845 uint8_t * const half = (uint8_t*)temp; \
yading@10 846 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
yading@10 847 stride, 16); \
yading@10 848 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
yading@10 849 stride, 16); \
yading@10 850 } \
yading@10 851 \
yading@10 852 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 853 ptrdiff_t stride) \
yading@10 854 { \
yading@10 855 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
yading@10 856 stride, stride, 16);\
yading@10 857 } \
yading@10 858 \
yading@10 859 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 860 ptrdiff_t stride) \
yading@10 861 { \
yading@10 862 uint64_t temp[32]; \
yading@10 863 uint8_t * const half = (uint8_t*)temp; \
yading@10 864 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
yading@10 865 stride, 16); \
yading@10 866 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
yading@10 867 stride, stride, 16); \
yading@10 868 } \
yading@10 869 \
yading@10 870 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 871 ptrdiff_t stride) \
yading@10 872 { \
yading@10 873 uint64_t temp[32]; \
yading@10 874 uint8_t * const half = (uint8_t*)temp; \
yading@10 875 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
yading@10 876 stride); \
yading@10 877 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
yading@10 878 stride, 16); \
yading@10 879 } \
yading@10 880 \
yading@10 881 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 882 ptrdiff_t stride) \
yading@10 883 { \
yading@10 884 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
yading@10 885 stride, stride); \
yading@10 886 } \
yading@10 887 \
yading@10 888 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 889 ptrdiff_t stride) \
yading@10 890 { \
yading@10 891 uint64_t temp[32]; \
yading@10 892 uint8_t * const half = (uint8_t*)temp; \
yading@10 893 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
yading@10 894 stride); \
yading@10 895 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
yading@10 896 stride, stride, 16); \
yading@10 897 } \
yading@10 898 \
yading@10 899 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 900 ptrdiff_t stride) \
yading@10 901 { \
yading@10 902 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 903 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 904 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 905 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 906 stride, 17); \
yading@10 907 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
yading@10 908 stride, 17); \
yading@10 909 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 910 16, 16); \
yading@10 911 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 912 stride, 16, 16); \
yading@10 913 } \
yading@10 914 \
yading@10 915 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 916 ptrdiff_t stride) \
yading@10 917 { \
yading@10 918 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 919 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 920 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 921 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 922 stride, 17); \
yading@10 923 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
yading@10 924 stride, 17); \
yading@10 925 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 926 16, 16); \
yading@10 927 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 928 stride, 16, 16); \
yading@10 929 } \
yading@10 930 \
yading@10 931 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 932 ptrdiff_t stride) \
yading@10 933 { \
yading@10 934 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 935 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 936 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 937 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 938 stride, 17); \
yading@10 939 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
yading@10 940 stride, 17); \
yading@10 941 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 942 16, 16); \
yading@10 943 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
yading@10 944 stride, 16, 16); \
yading@10 945 } \
yading@10 946 \
yading@10 947 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 948 ptrdiff_t stride) \
yading@10 949 { \
yading@10 950 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 951 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 952 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 953 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 954 stride, 17); \
yading@10 955 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
yading@10 956 stride, 17); \
yading@10 957 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 958 16, 16); \
yading@10 959 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
yading@10 960 stride, 16, 16); \
yading@10 961 } \
yading@10 962 \
yading@10 963 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 964 ptrdiff_t stride) \
yading@10 965 { \
yading@10 966 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 967 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 968 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 969 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 970 stride, 17); \
yading@10 971 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 972 16, 16); \
yading@10 973 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
yading@10 974 stride, 16, 16); \
yading@10 975 } \
yading@10 976 \
yading@10 977 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 978 ptrdiff_t stride) \
yading@10 979 { \
yading@10 980 uint64_t half[16 * 2 + 17 * 2]; \
yading@10 981 uint8_t * const halfH = ((uint8_t*)half) + 256; \
yading@10 982 uint8_t * const halfHV = ((uint8_t*)half); \
yading@10 983 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 984 stride, 17); \
yading@10 985 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
yading@10 986 16, 16); \
yading@10 987 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
yading@10 988 stride, 16, 16); \
yading@10 989 } \
yading@10 990 \
yading@10 991 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 992 ptrdiff_t stride) \
yading@10 993 { \
yading@10 994 uint64_t half[17 * 2]; \
yading@10 995 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 996 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 997 stride, 17); \
yading@10 998 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
yading@10 999 stride, 17); \
yading@10 1000 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
yading@10 1001 stride, 16); \
yading@10 1002 } \
yading@10 1003 \
yading@10 1004 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 1005 ptrdiff_t stride) \
yading@10 1006 { \
yading@10 1007 uint64_t half[17 * 2]; \
yading@10 1008 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 1009 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 1010 stride, 17); \
yading@10 1011 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
yading@10 1012 stride, 17); \
yading@10 1013 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
yading@10 1014 stride, 16); \
yading@10 1015 } \
yading@10 1016 \
yading@10 1017 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
yading@10 1018 ptrdiff_t stride) \
yading@10 1019 { \
yading@10 1020 uint64_t half[17 * 2]; \
yading@10 1021 uint8_t * const halfH = ((uint8_t*)half); \
yading@10 1022 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
yading@10 1023 stride, 17); \
yading@10 1024 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
yading@10 1025 stride, 16); \
yading@10 1026 }
yading@10 1027
yading@10 1028 QPEL_OP(put_, ff_pw_16, _, mmxext)
yading@10 1029 QPEL_OP(avg_, ff_pw_16, _, mmxext)
yading@10 1030 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
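/* Editorial note: the mcXY suffixes generated above encode the
 * quarter-pel position, X horizontal and Y vertical, in quarter-pixel
 * units (0..3). mc00 is the fullpel copy, mc20/mc02 the pure half-pel
 * lowpass cases, and the mixed positions (mc11, mc31, ...) build an
 * H-filtered and an HV-filtered intermediate and average them, as the
 * macro bodies show. */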
yading@10 1031 #endif /* HAVE_YASM */
yading@10 1032
yading@10 1033
yading@10 1034 #if HAVE_INLINE_ASM
yading@10 1035 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1036 {
yading@10 1037 put_pixels8_xy2_mmx(dst, src, stride, 8);
yading@10 1038 }
yading@10 1039 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1040 {
yading@10 1041 put_pixels16_xy2_mmx(dst, src, stride, 16);
yading@10 1042 }
yading@10 1043 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1044 {
yading@10 1045 avg_pixels8_xy2_mmx(dst, src, stride, 8);
yading@10 1046 }
yading@10 1047 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1048 {
yading@10 1049 avg_pixels16_xy2_mmx(dst, src, stride, 16);
yading@10 1050 }
yading@10 1051
yading@10 1052 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
yading@10 1053 ptrdiff_t linesize, int block_w, int block_h,
yading@10 1054 int src_x, int src_y, int w, int h);
yading@10 1055
yading@10 1056 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
yading@10 1057 int stride, int h, int ox, int oy,
yading@10 1058 int dxx, int dxy, int dyx, int dyy,
yading@10 1059 int shift, int r, int width, int height,
yading@10 1060 emulated_edge_mc_func *emu_edge_fn)
yading@10 1061 {
yading@10 1062 const int w = 8;
yading@10 1063 const int ix = ox >> (16 + shift);
yading@10 1064 const int iy = oy >> (16 + shift);
yading@10 1065 const int oxs = ox >> 4;
yading@10 1066 const int oys = oy >> 4;
yading@10 1067 const int dxxs = dxx >> 4;
yading@10 1068 const int dxys = dxy >> 4;
yading@10 1069 const int dyxs = dyx >> 4;
yading@10 1070 const int dyys = dyy >> 4;
yading@10 1071 const uint16_t r4[4] = { r, r, r, r };
yading@10 1072 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
yading@10 1073 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
yading@10 1074 const uint64_t shift2 = 2 * shift;
yading@10 1075 #define MAX_STRIDE 4096U
yading@10 1076 #define MAX_H 8U
yading@10 1077 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
yading@10 1078 int x, y;
yading@10 1079
yading@10 1080 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
yading@10 1081 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
yading@10 1082 const int dxh = dxy * (h - 1);
yading@10 1083 const int dyw = dyx * (w - 1);
yading@10 1084 int need_emu = (unsigned)ix >= width - w ||
yading@10 1085 (unsigned)iy >= height - h;
yading@10 1086
yading@10 1087 if ( // non-constant fullpel offset (3% of blocks)
yading@10 1088 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
yading@10 1089 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
yading@10 1090 // uses more than 16 bits of subpel mv (only at huge resolution)
yading@10 1091 || (dxx | dxy | dyx | dyy) & 15
yading@10 1092 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
yading@10 1093 // FIXME could still use mmx for some of the rows
yading@10 1094 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
yading@10 1095 shift, r, width, height);
yading@10 1096 return;
yading@10 1097 }
yading@10 1098
yading@10 1099 src += ix + iy * stride;
yading@10 1100 if (need_emu) {
yading@10 1101 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
yading@10 1102 src = edge_buf;
yading@10 1103 }
yading@10 1104
yading@10 1105 __asm__ volatile (
yading@10 1106 "movd %0, %%mm6 \n\t"
yading@10 1107 "pxor %%mm7, %%mm7 \n\t"
yading@10 1108 "punpcklwd %%mm6, %%mm6 \n\t"
yading@10 1109 "punpcklwd %%mm6, %%mm6 \n\t"
yading@10 1110 :: "r"(1<<shift)
yading@10 1111 );
yading@10 1112
yading@10 1113 for (x = 0; x < w; x += 4) {
yading@10 1114 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
yading@10 1115 oxs - dxys + dxxs * (x + 1),
yading@10 1116 oxs - dxys + dxxs * (x + 2),
yading@10 1117 oxs - dxys + dxxs * (x + 3) };
yading@10 1118 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
yading@10 1119 oys - dyys + dyxs * (x + 1),
yading@10 1120 oys - dyys + dyxs * (x + 2),
yading@10 1121 oys - dyys + dyxs * (x + 3) };
yading@10 1122
yading@10 1123 for (y = 0; y < h; y++) {
yading@10 1124 __asm__ volatile (
yading@10 1125 "movq %0, %%mm4 \n\t"
yading@10 1126 "movq %1, %%mm5 \n\t"
yading@10 1127 "paddw %2, %%mm4 \n\t"
yading@10 1128 "paddw %3, %%mm5 \n\t"
yading@10 1129 "movq %%mm4, %0 \n\t"
yading@10 1130 "movq %%mm5, %1 \n\t"
yading@10 1131 "psrlw $12, %%mm4 \n\t"
yading@10 1132 "psrlw $12, %%mm5 \n\t"
yading@10 1133 : "+m"(*dx4), "+m"(*dy4)
yading@10 1134 : "m"(*dxy4), "m"(*dyy4)
yading@10 1135 );
yading@10 1136
yading@10 1137 __asm__ volatile (
yading@10 1138 "movq %%mm6, %%mm2 \n\t"
yading@10 1139 "movq %%mm6, %%mm1 \n\t"
yading@10 1140 "psubw %%mm4, %%mm2 \n\t"
yading@10 1141 "psubw %%mm5, %%mm1 \n\t"
yading@10 1142 "movq %%mm2, %%mm0 \n\t"
yading@10 1143 "movq %%mm4, %%mm3 \n\t"
yading@10 1144 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
yading@10 1145 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
yading@10 1146 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
yading@10 1147 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
yading@10 1148
yading@10 1149 "movd %4, %%mm5 \n\t"
yading@10 1150 "movd %3, %%mm4 \n\t"
yading@10 1151 "punpcklbw %%mm7, %%mm5 \n\t"
yading@10 1152 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 1153 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
yading@10 1154 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
yading@10 1155
yading@10 1156 "movd %2, %%mm5 \n\t"
yading@10 1157 "movd %1, %%mm4 \n\t"
yading@10 1158 "punpcklbw %%mm7, %%mm5 \n\t"
yading@10 1159 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 1160 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
yading@10 1161 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
yading@10 1162 "paddw %5, %%mm1 \n\t"
yading@10 1163 "paddw %%mm3, %%mm2 \n\t"
yading@10 1164 "paddw %%mm1, %%mm0 \n\t"
yading@10 1165 "paddw %%mm2, %%mm0 \n\t"
yading@10 1166
yading@10 1167 "psrlw %6, %%mm0 \n\t"
yading@10 1168 "packuswb %%mm0, %%mm0 \n\t"
yading@10 1169 "movd %%mm0, %0 \n\t"
yading@10 1170
yading@10 1171 : "=m"(dst[x + y * stride])
yading@10 1172 : "m"(src[0]), "m"(src[1]),
yading@10 1173 "m"(src[stride]), "m"(src[stride + 1]),
yading@10 1174 "m"(*r4), "m"(shift2)
yading@10 1175 );
yading@10 1176 src += stride;
yading@10 1177 }
yading@10 1178 src += 4 - h * stride;
yading@10 1179 }
yading@10 1180 }
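/* Editorial sketch (not in the original) of the per-pixel bilinear
 * blend in the inner asm loop above, with s = 1 << shift and dx, dy
 * the subpel fractions for this pixel: */
static inline uint8_t gmc_bilinear_sketch(const uint8_t *src, int stride,
                                          int dx, int dy, int shift, int r)
{
    int s = 1 << shift;
    int v = (s - dx) * (s - dy) * src[0]          /* top-left     */
          +      dx  * (s - dy) * src[1]          /* top-right    */
          + (s - dx) *      dy  * src[stride]     /* bottom-left  */
          +      dx  *      dy  * src[stride + 1] /* bottom-right */
          + r;                                    /* rounding     */
    return (uint8_t)(v >> (2 * shift));
}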
yading@10 1181
yading@10 1182
yading@10 1183 #if CONFIG_VIDEODSP
yading@10 1184 #if HAVE_YASM
yading@10 1185 #if ARCH_X86_32
yading@10 1186 static void gmc_mmx(uint8_t *dst, uint8_t *src,
yading@10 1187 int stride, int h, int ox, int oy,
yading@10 1188 int dxx, int dxy, int dyx, int dyy,
yading@10 1189 int shift, int r, int width, int height)
yading@10 1190 {
yading@10 1191 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
yading@10 1192 width, height, &ff_emulated_edge_mc_8);
yading@10 1193 }
yading@10 1194 #endif
yading@10 1195 static void gmc_sse(uint8_t *dst, uint8_t *src,
yading@10 1196 int stride, int h, int ox, int oy,
yading@10 1197 int dxx, int dxy, int dyx, int dyy,
yading@10 1198 int shift, int r, int width, int height)
yading@10 1199 {
yading@10 1200 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
yading@10 1201 width, height, &ff_emulated_edge_mc_8);
yading@10 1202 }
yading@10 1203 #else
yading@10 1204 static void gmc_mmx(uint8_t *dst, uint8_t *src,
yading@10 1205 int stride, int h, int ox, int oy,
yading@10 1206 int dxx, int dxy, int dyx, int dyy,
yading@10 1207 int shift, int r, int width, int height)
yading@10 1208 {
yading@10 1209 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
yading@10 1210 width, height, &ff_emulated_edge_mc_8);
yading@10 1211 }
yading@10 1212 #endif
yading@10 1213 #endif
yading@10 1214
yading@10 1215 /* CAVS-specific */
yading@10 1216 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1217 {
yading@10 1218 put_pixels8_mmx(dst, src, stride, 8);
yading@10 1219 }
yading@10 1220
yading@10 1221 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1222 {
yading@10 1223 avg_pixels8_mmx(dst, src, stride, 8);
yading@10 1224 }
yading@10 1225
yading@10 1226 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1227 {
yading@10 1228 put_pixels16_mmx(dst, src, stride, 16);
yading@10 1229 }
yading@10 1230
yading@10 1231 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
yading@10 1232 {
yading@10 1233 avg_pixels16_mmx(dst, src, stride, 16);
yading@10 1234 }
yading@10 1235
yading@10 1236 /* VC-1-specific */
yading@10 1237 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
yading@10 1238 ptrdiff_t stride, int rnd)
yading@10 1239 {
yading@10 1240 put_pixels8_mmx(dst, src, stride, 8);
yading@10 1241 }
yading@10 1242
yading@10 1243 #if CONFIG_DIRAC_DECODER
yading@10 1244 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
yading@10 1245 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
yading@10 1246 {\
yading@10 1247 if (h&3)\
yading@10 1248 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
yading@10 1249 else\
yading@10 1250 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
yading@10 1251 }\
yading@10 1252 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
yading@10 1253 {\
yading@10 1254 if (h&3)\
yading@10 1255 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
yading@10 1256 else\
yading@10 1257 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
yading@10 1258 }\
yading@10 1259 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
yading@10 1260 {\
yading@10 1261 if (h&3) {\
yading@10 1262 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
yading@10 1263 } else {\
yading@10 1264 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
yading@10 1265 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
yading@10 1266 }\
yading@10 1267 }
yading@10 1268
yading@10 1269 #if HAVE_MMX_INLINE
yading@10 1270 DIRAC_PIXOP(put, put, mmx)
yading@10 1271 DIRAC_PIXOP(avg, avg, mmx)
yading@10 1272 #endif
yading@10 1273
yading@10 1274 #if HAVE_YASM
yading@10 1275 DIRAC_PIXOP(avg, ff_avg, mmxext)
yading@10 1276
yading@10 1277 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
yading@10 1278 {
yading@10 1279 if (h&3)
yading@10 1280 ff_put_dirac_pixels16_c(dst, src, stride, h);
yading@10 1281 else
yading@10 1282 ff_put_pixels16_sse2(dst, src[0], stride, h);
yading@10 1283 }
yading@10 1284 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
yading@10 1285 {
yading@10 1286 if (h&3)
yading@10 1287 ff_avg_dirac_pixels16_c(dst, src, stride, h);
yading@10 1288 else
yading@10 1289 ff_avg_pixels16_sse2(dst, src[0], stride, h);
yading@10 1290 }
yading@10 1291 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
yading@10 1292 {
yading@10 1293 if (h&3) {
yading@10 1294 ff_put_dirac_pixels32_c(dst, src, stride, h);
yading@10 1295 } else {
yading@10 1296 ff_put_pixels16_sse2(dst , src[0] , stride, h);
yading@10 1297 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
yading@10 1298 }
yading@10 1299 }
yading@10 1300 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
yading@10 1301 {
yading@10 1302 if (h&3) {
yading@10 1303 ff_avg_dirac_pixels32_c(dst, src, stride, h);
yading@10 1304 } else {
yading@10 1305 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
yading@10 1306 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
yading@10 1307 }
yading@10 1308 }
yading@10 1309 #endif
yading@10 1310 #endif
yading@10 1311
yading@10 1312 static void vector_clipf_sse(float *dst, const float *src,
yading@10 1313 float min, float max, int len)
yading@10 1314 {
yading@10 1315 x86_reg i = (len - 16) * 4;
yading@10 1316 __asm__ volatile (
yading@10 1317 "movss %3, %%xmm4 \n\t"
yading@10 1318 "movss %4, %%xmm5 \n\t"
yading@10 1319 "shufps $0, %%xmm4, %%xmm4 \n\t"
yading@10 1320 "shufps $0, %%xmm5, %%xmm5 \n\t"
yading@10 1321 "1: \n\t"
yading@10 1322 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
yading@10 1323 "movaps 16(%2, %0), %%xmm1 \n\t"
yading@10 1324 "movaps 32(%2, %0), %%xmm2 \n\t"
yading@10 1325 "movaps 48(%2, %0), %%xmm3 \n\t"
yading@10 1326 "maxps %%xmm4, %%xmm0 \n\t"
yading@10 1327 "maxps %%xmm4, %%xmm1 \n\t"
yading@10 1328 "maxps %%xmm4, %%xmm2 \n\t"
yading@10 1329 "maxps %%xmm4, %%xmm3 \n\t"
yading@10 1330 "minps %%xmm5, %%xmm0 \n\t"
yading@10 1331 "minps %%xmm5, %%xmm1 \n\t"
yading@10 1332 "minps %%xmm5, %%xmm2 \n\t"
yading@10 1333 "minps %%xmm5, %%xmm3 \n\t"
yading@10 1334 "movaps %%xmm0, (%1, %0) \n\t"
yading@10 1335 "movaps %%xmm1, 16(%1, %0) \n\t"
yading@10 1336 "movaps %%xmm2, 32(%1, %0) \n\t"
yading@10 1337 "movaps %%xmm3, 48(%1, %0) \n\t"
yading@10 1338 "sub $64, %0 \n\t"
yading@10 1339 "jge 1b \n\t"
yading@10 1340 : "+&r"(i)
yading@10 1341 : "r"(dst), "r"(src), "m"(min), "m"(max)
yading@10 1342 : "memory"
yading@10 1343 );
yading@10 1344 }
yading@10 1345
yading@10 1346 #endif /* HAVE_INLINE_ASM */
yading@10 1347
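/* Prototypes for routines implemented in external (yasm) assembly; the
 * dsputil_init_* functions below install them into the DSPContext
 * according to the CPU features detected at runtime. */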
yading@10 1348 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
yading@10 1349 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
yading@10 1350
yading@10 1351 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
yading@10 1352 int order);
yading@10 1353 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
yading@10 1354 int order);
yading@10 1355 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
yading@10 1356 const int16_t *v3,
yading@10 1357 int order, int mul);
yading@10 1358 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
yading@10 1359 const int16_t *v3,
yading@10 1360 int order, int mul);
yading@10 1361 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
yading@10 1362 const int16_t *v3,
yading@10 1363 int order, int mul);
yading@10 1364
yading@10 1365 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
yading@10 1366 const int16_t *window, unsigned int len);
yading@10 1367 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
yading@10 1368 const int16_t *window, unsigned int len);
yading@10 1369 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
yading@10 1370 const int16_t *window, unsigned int len);
yading@10 1371 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
yading@10 1372 const int16_t *window, unsigned int len);
yading@10 1373 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
yading@10 1374 const int16_t *window, unsigned int len);
yading@10 1375 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
yading@10 1376 const int16_t *window, unsigned int len);
yading@10 1377
yading@10 1378 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
yading@10 1379 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
yading@10 1380
yading@10 1381 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
yading@10 1382 const uint8_t *diff, int w,
yading@10 1383 int *left, int *left_top);
yading@10 1384 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
yading@10 1385 int w, int left);
yading@10 1386 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
yading@10 1387 int w, int left);
yading@10 1388
yading@10 1389 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
yading@10 1390 int32_t min, int32_t max, unsigned int len);
yading@10 1391 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
yading@10 1392 int32_t min, int32_t max, unsigned int len);
yading@10 1393 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
yading@10 1394 int32_t min, int32_t max, unsigned int len);
yading@10 1395 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
yading@10 1396 int32_t min, int32_t max, unsigned int len);
yading@10 1397
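/* Fill one row of a qpel function-pointer table.  The 16 entries cover the
 * quarter-pel positions mc<x><y> (x = horizontal, y = vertical fractional
 * offset in quarter pixels), stored at index 4*y + x; PREFIX lets callers
 * prepend e.g. "ff_" for externally assembled symbols.  For example,
 *
 *     SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
 *
 * installs put_qpel16_mc00_mmxext through put_qpel16_mc33_mmxext into
 * c->put_qpel_pixels_tab[0][0..15]. */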
yading@10 1398 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
yading@10 1399 do { \
yading@10 1400 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
yading@10 1401 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
yading@10 1402 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
yading@10 1403 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
yading@10 1404 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
yading@10 1405 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
yading@10 1406 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
yading@10 1407 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
yading@10 1408 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
yading@10 1409 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
yading@10 1410 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
yading@10 1411 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
yading@10 1412 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
yading@10 1413 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
yading@10 1414 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
yading@10 1415 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
yading@10 1416 } while (0)
yading@10 1417
yading@10 1418 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
yading@10 1419 int mm_flags)
yading@10 1420 {
yading@10 1421 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
yading@10 1422
yading@10 1423 #if HAVE_INLINE_ASM
yading@10 1424 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
yading@10 1425 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
yading@10 1426 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
yading@10 1427
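    /* The routines below assume 8-bit samples, so they are skipped for
     * high-bit-depth content. */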
yading@10 1428 if (!high_bit_depth) {
yading@10 1429 c->clear_block = clear_block_mmx;
yading@10 1430 c->clear_blocks = clear_blocks_mmx;
yading@10 1431 c->draw_edges = draw_edges_mmx;
yading@10 1432 }
yading@10 1433
yading@10 1434 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
yading@10 1435 c->gmc = gmc_mmx;
yading@10 1436 #endif /* CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) */
yading@10 1437
yading@10 1438 c->add_bytes = add_bytes_mmx;
yading@10 1439 #endif /* HAVE_INLINE_ASM */
yading@10 1440
yading@10 1441 #if HAVE_YASM
yading@10 1442 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
yading@10 1443 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
yading@10 1444 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
yading@10 1445 }
yading@10 1446
yading@10 1447 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
yading@10 1448 #endif /* HAVE_YASM */
yading@10 1449
yading@10 1450 }
yading@10 1451
yading@10 1452 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
yading@10 1453 int mm_flags)
yading@10 1454 {
yading@10 1455
yading@10 1456 #if HAVE_YASM
yading@10 1457 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
yading@10 1458 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
yading@10 1459
yading@10 1460 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
yading@10 1461 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
yading@10 1462 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
yading@10 1463 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
yading@10 1464 #endif /* HAVE_YASM */
yading@10 1465
yading@10 1466 #if HAVE_MMXEXT_EXTERNAL
yading@10 1467 /* slower than the cmov version on AMD CPUs (detected via the 3DNow! flag), so keep cmov there */
yading@10 1468 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
yading@10 1469 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
yading@10 1470
yading@10 1471 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
yading@10 1472 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
yading@10 1473
yading@10 1474 if (avctx->flags & CODEC_FLAG_BITEXACT) {
yading@10 1475 c->apply_window_int16 = ff_apply_window_int16_mmxext;
yading@10 1476 } else {
yading@10 1477 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
yading@10 1478 }
yading@10 1479 #endif /* HAVE_MMXEXT_EXTERNAL */
yading@10 1480 }
yading@10 1481
yading@10 1482 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
yading@10 1483 int mm_flags)
yading@10 1484 {
yading@10 1485 #if HAVE_INLINE_ASM
yading@10 1486 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
yading@10 1487
yading@10 1488 if (!high_bit_depth) {
yading@10 1489 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
yading@10 1490 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
yading@10 1491 c->clear_block = clear_block_sse;
yading@10 1492 c->clear_blocks = clear_blocks_sse;
yading@10 1493 }
yading@10 1494 }
yading@10 1495
yading@10 1496 c->vector_clipf = vector_clipf_sse;
yading@10 1497 #endif /* HAVE_INLINE_ASM */
yading@10 1498
yading@10 1499 #if HAVE_YASM
yading@10 1500 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
yading@10 1501 c->gmc = gmc_sse;
yading@10 1502 #endif /* HAVE_INLINE_ASM && CONFIG_VIDEODSP */
yading@10 1503 #endif /* HAVE_YASM */
yading@10 1504 }
yading@10 1505
yading@10 1506 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
yading@10 1507 int mm_flags)
yading@10 1508 {
yading@10 1509 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
yading@10 1510
yading@10 1511 #if HAVE_SSE2_INLINE
yading@10 1512 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
yading@10 1513 c->idct_put = ff_idct_xvid_sse2_put;
yading@10 1514 c->idct_add = ff_idct_xvid_sse2_add;
yading@10 1515 c->idct = ff_idct_xvid_sse2;
yading@10 1516 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
yading@10 1517 }
yading@10 1518 #endif /* HAVE_SSE2_INLINE */
yading@10 1519
yading@10 1520 #if HAVE_SSE2_EXTERNAL
yading@10 1521 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
yading@10 1522 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
yading@10 1523 if (mm_flags & AV_CPU_FLAG_ATOM) {
yading@10 1524 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
yading@10 1525 } else {
yading@10 1526 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
yading@10 1527 }
yading@10 1528 if (avctx->flags & CODEC_FLAG_BITEXACT) {
yading@10 1529 c->apply_window_int16 = ff_apply_window_int16_sse2;
yading@10 1530 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
yading@10 1531 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
yading@10 1532 }
yading@10 1533 c->bswap_buf = ff_bswap32_buf_sse2;
yading@10 1534 #endif /* HAVE_SSE2_EXTERNAL */
yading@10 1535 }
yading@10 1536
yading@10 1537 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
yading@10 1538 int mm_flags)
yading@10 1539 {
yading@10 1540 #if HAVE_SSSE3_EXTERNAL
yading@10 1541 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
yading@10 1542 if (mm_flags & AV_CPU_FLAG_SSE4) // does not actually require SSE4; the flag merely keeps it off Conroe, where it is slow
yading@10 1543 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
yading@10 1544
yading@10 1545 if (mm_flags & AV_CPU_FLAG_ATOM)
yading@10 1546 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
yading@10 1547 else
yading@10 1548 c->apply_window_int16 = ff_apply_window_int16_ssse3;
yading@10 1549 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit: only a win where unaligned loads across cache lines are costly
yading@10 1550 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
yading@10 1551 c->bswap_buf = ff_bswap32_buf_ssse3;
yading@10 1552 #endif /* HAVE_SSSE3_EXTERNAL */
yading@10 1553 }
yading@10 1554
yading@10 1555 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
yading@10 1556 int mm_flags)
yading@10 1557 {
yading@10 1558 #if HAVE_SSE4_EXTERNAL
yading@10 1559 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
yading@10 1560 #endif /* HAVE_SSE4_EXTERNAL */
yading@10 1561 }
yading@10 1562
yading@10 1563 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
yading@10 1564 {
yading@10 1565 int mm_flags = av_get_cpu_flags();
yading@10 1566
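    /* The cmov-based median predictor needs the CMOV instruction plus seven
     * spare general-purpose registers (HAVE_7REGS), hence both guards. */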
yading@10 1567 #if HAVE_7REGS && HAVE_INLINE_ASM
yading@10 1568 if (mm_flags & AV_CPU_FLAG_CMOV)
yading@10 1569 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
yading@10 1570 #endif /* HAVE_7REGS && HAVE_INLINE_ASM */
yading@10 1571
yading@10 1572 if (mm_flags & AV_CPU_FLAG_MMX) {
yading@10 1573 #if HAVE_INLINE_ASM
yading@10 1574 const int idct_algo = avctx->idct_algo;
yading@10 1575
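        /* IDCT selection: FF_IDCT_AUTO and FF_IDCT_SIMPLEMMX install the MMX
         * simple IDCT; FF_IDCT_XVIDMMX installs the best Xvid IDCT variant
         * the CPU supports (SSE2, else MMXEXT, else plain MMX).  These
         * routines handle 8-bit, full-resolution blocks only, hence the
         * lowres/bit-depth gate below. */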
yading@10 1576 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
yading@10 1577 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
yading@10 1578 c->idct_put = ff_simple_idct_put_mmx;
yading@10 1579 c->idct_add = ff_simple_idct_add_mmx;
yading@10 1580 c->idct = ff_simple_idct_mmx;
yading@10 1581 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
yading@10 1582 } else if (idct_algo == FF_IDCT_XVIDMMX) {
yading@10 1583 if (mm_flags & AV_CPU_FLAG_SSE2) {
yading@10 1584 c->idct_put = ff_idct_xvid_sse2_put;
yading@10 1585 c->idct_add = ff_idct_xvid_sse2_add;
yading@10 1586 c->idct = ff_idct_xvid_sse2;
yading@10 1587 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
yading@10 1588 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
yading@10 1589 c->idct_put = ff_idct_xvid_mmxext_put;
yading@10 1590 c->idct_add = ff_idct_xvid_mmxext_add;
yading@10 1591 c->idct = ff_idct_xvid_mmxext;
yading@10 1592 } else {
yading@10 1593 c->idct_put = ff_idct_xvid_mmx_put;
yading@10 1594 c->idct_add = ff_idct_xvid_mmx_add;
yading@10 1595 c->idct = ff_idct_xvid_mmx;
yading@10 1596 }
yading@10 1597 }
yading@10 1598 }
yading@10 1599 #endif /* HAVE_INLINE_ASM */
yading@10 1600
yading@10 1601 dsputil_init_mmx(c, avctx, mm_flags);
yading@10 1602 }
yading@10 1603
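    /* The per-level init calls below run in ascending order of capability;
     * each may overwrite pointers installed by a lower level, so the most
     * capable implementation the CPU supports ends up in the context. */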
yading@10 1604 if (mm_flags & AV_CPU_FLAG_MMXEXT)
yading@10 1605 dsputil_init_mmxext(c, avctx, mm_flags);
yading@10 1606
yading@10 1607 if (mm_flags & AV_CPU_FLAG_SSE)
yading@10 1608 dsputil_init_sse(c, avctx, mm_flags);
yading@10 1609
yading@10 1610 if (mm_flags & AV_CPU_FLAG_SSE2)
yading@10 1611 dsputil_init_sse2(c, avctx, mm_flags);
yading@10 1612
yading@10 1613 if (mm_flags & AV_CPU_FLAG_SSSE3)
yading@10 1614 dsputil_init_ssse3(c, avctx, mm_flags);
yading@10 1615
yading@10 1616 if (mm_flags & AV_CPU_FLAG_SSE4)
yading@10 1617 dsputil_init_sse4(c, avctx, mm_flags);
yading@10 1618
yading@10 1619 if (CONFIG_ENCODERS)
yading@10 1620 ff_dsputilenc_init_mmx(c, avctx);
yading@10 1621 }