/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

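/*
 * Rounding constants for the half-pel averaging code below: 0, 1 and 2
 * replicated into each of the four 16-bit lanes. round_tab[1] is used for
 * the two-point (x2/y2) average, round_tab[2] for the four-point (xy2)
 * average.
 */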
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

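/*
 * 0x01 in every byte. sad8_4_mmxext() subtracts this from one operand
 * before chaining pavgb operations, to compensate for the upward bias
 * that two cascaded round-to-plus-infinity averages would otherwise
 * introduce.
 */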
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

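/*
 * SAD of an 8-pixel-wide block over h rows, plain MMX.
 *
 * MMX has no psadbw, so the per-byte absolute difference is built from two
 * saturating subtractions: |a - b| == (a -sat b) | (b -sat a). The bytes
 * are then widened against the zero register mm7 and accumulated as four
 * word sums in mm6; the caller clears mm6/mm7 and reduces via sum_mmx().
 * The loop counts a negative byte offset in REG_a up towards zero, two
 * rows per iteration. Roughly equivalent scalar code (a sketch, not the
 * exact implementation):
 *
 *     int sum = 0;
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 8; x++)
 *             sum += FFABS(blk1[y * stride + x] - blk2[y * stride + x]);
 */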
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile(
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

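/*
 * Same as sad8_1_mmx() but using the MMXEXT psadbw instruction, which sums
 * the absolute byte differences of a whole qword in one step. Two rows are
 * processed per iteration; partial sums accumulate in mm6.
 */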
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile(
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

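/*
 * SAD of a 16-pixel-wide block using SSE2. movdqu tolerates unaligned
 * sources; psadbw leaves two partial sums in the low and high qwords of
 * xmm2, which movhlps folds together before the final movd extract.
 */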
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
    int ret;
    __asm__ volatile(
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "movd %%xmm2, %3                \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg)stride)
    );
    return ret;
}

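/*
 * Horizontal (x2) half-pel SAD, MMXEXT: each source row is averaged with
 * itself shifted by one pixel via pavgb (the 1(%1) operands) before the
 * psadbw compare against the reference.
 */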
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile(
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

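/*
 * Vertical (y2) half-pel SAD, MMXEXT: mm0 carries the previous row, so
 * each iteration pavgb-averages two vertically adjacent rows before the
 * psadbw compare; mm2 becomes the next iteration's previous row.
 */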
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile(
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

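/*
 * Two-dimensional (xy2) half-pel SAD, MMXEXT. The exact four-point average
 * (a + b + c + d + 2) >> 2 is approximated by cascading pavgb: horizontal
 * averages first, then a vertical average of those, with "bone" subtracted
 * from one operand to counter the accumulated round-to-plus-infinity bias.
 * The result is not bit-exact, so the init code only installs it when
 * CODEC_FLAG_BITEXACT is unset.
 */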
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile(
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

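/*
 * SAD against the rounded average of two source blocks (used for x2/y2
 * half-pel on plain MMX). The caller preloads round_tab[1] into mm5; rows
 * are widened to words, summed, rounded and shifted back down, then run
 * through the same psubusb/por absolute-difference trick as sad8_1_mmx().
 * In scalar terms, per pixel (a sketch):
 *
 *     avg  = (blk1a[i] + blk1b[i] + 1) >> 1;
 *     sum += FFABS(avg - blk2[i]);
 */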
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile(
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

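/*
 * Bit-exact xy2 half-pel SAD on plain MMX: the four neighbouring pixels
 * are summed at word precision, round_tab[2] is added and the total
 * shifted down by two, i.e. avg = (a + b + c + d + 2) >> 2 per pixel.
 * blk1 and blk1 + stride are passed as separate operands so each
 * iteration can reuse the previous row's horizontal sums in mm0/mm1.
 */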
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile(
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

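/*
 * Reduce the four word sums accumulated in mm6 to a single scalar with two
 * shift-and-add steps. The upper words hold leftovers from the reduction,
 * hence the final & 0xFFFF; 16 bits is enough for the block sizes used
 * here (at most 2 * 8 * 16 * 255 = 65280).
 */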
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret & 0xFFFF;
}

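/*
 * With psadbw the running total already sits in the low word of mm6, so
 * the reduction is a single movd.
 */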
static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}

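/*
 * Plain-MMX x2/y2 half-pel SAD, expressed through sad8_2_mmx() with the
 * second source block offset by one pixel or one row respectively.
 */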
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

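/*
 * PIX_SAD() expands to the DSPContext-facing wrappers for one instruction
 * set: sad8/sad16 plus the x2, y2 and xy2 half-pel variants. Each wrapper
 * clears the accumulator mm6 and the zero register mm7 (loading the
 * rounding constant into mm5 where the kernel expects it), runs the
 * 8-pixel-wide kernel once or twice, and reduces the result with
 * sum_<suffix>(). Note the parameter order (v, blk2, blk1): the half-pel
 * filtering is applied to the third argument, which the kernels receive
 * first.
 */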
#define PIX_SAD(suf) \
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t":); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     "movq %0, %%mm5    \n\t" \
                     :: "m" (round_tab[1]) \
                     ); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     "movq %0, %%mm5    \n\t" \
                     :: "m" (round_tab[1]) \
                     ); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t":); \
 \
    sad8_1_ ## suf(blk1,     blk2,     stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     "movq %0, %%mm5    \n\t" \
                     :: "m" (round_tab[1]) \
                     ); \
 \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     "movq %0, %%mm5    \n\t" \
                     :: "m" (round_tab[1]) \
                     ); \
 \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) \
{ \
    __asm__ volatile("pxor %%mm7, %%mm7 \n\t" \
                     "pxor %%mm6, %%mm6 \n\t" \
                     ::); \
 \
    sad8_4_ ## suf(blk1,     blk2,     stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
}

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

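/*
 * Install the SAD functions according to the detected CPU flags. The
 * approximate MMXEXT half-pel variants are skipped when the caller
 * requests bit-exact results. sad16_sse2 is skipped on 3DNow-capable
 * CPUs (presumably because the unaligned SSE2 loads are slower than the
 * MMXEXT path there) and for the Snow codec.
 */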
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;
    }
    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][1] = sad16_x2_mmxext;
            c->pix_abs[0][2] = sad16_y2_mmxext;
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][1] = sad8_x2_mmxext;
            c->pix_abs[1][2] = sad8_y2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;
        }
    }
    if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }
#endif /* HAVE_INLINE_ASM */
}