annotate ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
yading@10 3 * Copyright (c) 2000, 2001 Fabrice Bellard
yading@10 4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
yading@10 5 *
yading@10 6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
yading@10 7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
yading@10 8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
yading@10 9 *
yading@10 10 * This file is part of FFmpeg.
yading@10 11 *
yading@10 12 * FFmpeg is free software; you can redistribute it and/or
yading@10 13 * modify it under the terms of the GNU Lesser General Public
yading@10 14 * License as published by the Free Software Foundation; either
yading@10 15 * version 2.1 of the License, or (at your option) any later version.
yading@10 16 *
yading@10 17 * FFmpeg is distributed in the hope that it will be useful,
yading@10 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 20 * Lesser General Public License for more details.
yading@10 21 *
yading@10 22 * You should have received a copy of the GNU Lesser General Public
yading@10 23 * License along with FFmpeg; if not, write to the Free Software
yading@10 24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 25 */
yading@10 26
yading@10 27 // put_pixels
/*
 * Horizontal half-pel "put", 8 pixels wide.
 *
 * For every row, the 8 bytes at pixels[0..7] and the 8 bytes at pixels[1..8]
 * are combined with PAVGBP and the 8-byte result is stored to block.
 * PAVGBP is defined outside this file; presumably it is a packed byte
 * average whose rounding depends on the rnd/no_rnd build -- TODO confirm.
 *
 * block     - destination, advanced by line_size per row
 * pixels    - source; 9 bytes are read from each row
 * line_size - byte stride used for both source and destination
 * h         - row count; each loop pass handles 4 rows and the loop only
 *             exits when the counter reaches exactly zero, so h has to be a
 *             positive multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 28 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 29 {
yading@10 30 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant PAVGBP expects in mm6 -- TODO confirm
yading@10 31 __asm__ volatile(
yading@10 32 "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
yading@10 33 ".p2align 3 \n\t"
yading@10 34 "1: \n\t"
yading@10 35 "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
yading@10 36 "movq 1(%1), %%mm1 \n\t" // row 0, bytes 1..8
yading@10 37 "movq (%1, %3), %%mm2 \n\t" // row 1, bytes 0..7
yading@10 38 "movq 1(%1, %3), %%mm3 \n\t" // row 1, bytes 1..8
yading@10 39 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) // results are taken from mm4 (row 0) and mm5 (row 1)
yading@10 40 "movq %%mm4, (%2) \n\t"
yading@10 41 "movq %%mm5, (%2, %3) \n\t"
yading@10 42 "add %%"REG_a", %1 \n\t" // advance source by two rows
yading@10 43 "add %%"REG_a", %2 \n\t" // advance destination by two rows
yading@10 44 "movq (%1), %%mm0 \n\t" // same sequence again for rows 2 and 3
yading@10 45 "movq 1(%1), %%mm1 \n\t"
yading@10 46 "movq (%1, %3), %%mm2 \n\t"
yading@10 47 "movq 1(%1, %3), %%mm3 \n\t"
yading@10 48 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
yading@10 49 "movq %%mm4, (%2) \n\t"
yading@10 50 "movq %%mm5, (%2, %3) \n\t"
yading@10 51 "add %%"REG_a", %1 \n\t"
yading@10 52 "add %%"REG_a", %2 \n\t"
yading@10 53 "subl $4, %0 \n\t" // four rows done
yading@10 54 "jnz 1b \n\t"
yading@10 55 :"+g"(h), "+S"(pixels), "+D"(block)
yading@10 56 :"r"((x86_reg)line_size)
yading@10 57 :REG_a, "memory");
yading@10 58 }
yading@10 59
/*
 * Horizontal half-pel "put", 16 pixels wide.
 *
 * Same scheme as the 8-wide version, applied to two 8-byte columns per row:
 * bytes [0..7] are combined with bytes [1..8], and bytes [8..15] with bytes
 * [9..16], through PAVGBP (defined outside this file; presumably a packed
 * byte average -- TODO confirm). 17 source bytes are read per row.
 *
 * block     - destination, 16 bytes written per row
 * pixels    - source
 * line_size - byte stride used for both source and destination
 * h         - row count; 4 rows per loop pass and the loop exits only on an
 *             exact zero, so h has to be a positive multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 60 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 61 {
yading@10 62 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant PAVGBP expects in mm6 -- TODO confirm
yading@10 63 __asm__ volatile(
yading@10 64 "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
yading@10 65 ".p2align 3 \n\t"
yading@10 66 "1: \n\t"
yading@10 67 "movq (%1), %%mm0 \n\t" // rows 0 and 1, left 8-byte column
yading@10 68 "movq 1(%1), %%mm1 \n\t"
yading@10 69 "movq (%1, %3), %%mm2 \n\t"
yading@10 70 "movq 1(%1, %3), %%mm3 \n\t"
yading@10 71 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
yading@10 72 "movq %%mm4, (%2) \n\t"
yading@10 73 "movq %%mm5, (%2, %3) \n\t"
yading@10 74 "movq 8(%1), %%mm0 \n\t" // rows 0 and 1, right 8-byte column
yading@10 75 "movq 9(%1), %%mm1 \n\t"
yading@10 76 "movq 8(%1, %3), %%mm2 \n\t"
yading@10 77 "movq 9(%1, %3), %%mm3 \n\t"
yading@10 78 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
yading@10 79 "movq %%mm4, 8(%2) \n\t"
yading@10 80 "movq %%mm5, 8(%2, %3) \n\t"
yading@10 81 "add %%"REG_a", %1 \n\t" // advance source by two rows
yading@10 82 "add %%"REG_a", %2 \n\t" // advance destination by two rows
yading@10 83 "movq (%1), %%mm0 \n\t" // rows 2 and 3, left 8-byte column
yading@10 84 "movq 1(%1), %%mm1 \n\t"
yading@10 85 "movq (%1, %3), %%mm2 \n\t"
yading@10 86 "movq 1(%1, %3), %%mm3 \n\t"
yading@10 87 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
yading@10 88 "movq %%mm4, (%2) \n\t"
yading@10 89 "movq %%mm5, (%2, %3) \n\t"
yading@10 90 "movq 8(%1), %%mm0 \n\t" // rows 2 and 3, right 8-byte column
yading@10 91 "movq 9(%1), %%mm1 \n\t"
yading@10 92 "movq 8(%1, %3), %%mm2 \n\t"
yading@10 93 "movq 9(%1, %3), %%mm3 \n\t"
yading@10 94 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
yading@10 95 "movq %%mm4, 8(%2) \n\t"
yading@10 96 "movq %%mm5, 8(%2, %3) \n\t"
yading@10 97 "add %%"REG_a", %1 \n\t"
yading@10 98 "add %%"REG_a", %2 \n\t"
yading@10 99 "subl $4, %0 \n\t" // four rows done
yading@10 100 "jnz 1b \n\t"
yading@10 101 :"+g"(h), "+S"(pixels), "+D"(block)
yading@10 102 :"r"((x86_reg)line_size)
yading@10 103 :REG_a, "memory");
yading@10 104 }
yading@10 105
/*
 * Vertical half-pel "put", 8 pixels wide.
 *
 * Each output row is produced by PAVGBP from a source row and the row
 * directly below it (PAVGBP is defined outside this file; presumably a
 * packed byte average -- TODO confirm). The most recently loaded row is kept
 * in a register (mm0 at the top of the loop) so every source row is loaded
 * once; h + 1 source rows are read in total.
 *
 * block     - destination, 8 bytes written per row
 * pixels    - source
 * line_size - byte stride used for both source and destination
 * h         - row count; 4 rows per loop pass and the loop exits only on an
 *             exact zero, so h has to be a positive multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 106 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 107 {
yading@10 108 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant PAVGBP expects in mm6 -- TODO confirm
yading@10 109 __asm__ volatile(
yading@10 110 "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
yading@10 111 "movq (%1), %%mm0 \n\t" // prime mm0 with source row 0
yading@10 112 ".p2align 3 \n\t"
yading@10 113 "1: \n\t"
yading@10 114 "movq (%1, %3), %%mm1 \n\t" // source row 1
yading@10 115 "movq (%1, %%"REG_a"),%%mm2 \n\t" // source row 2
yading@10 116 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) // pairs (row1,row0) and (row2,row1); results taken from mm4/mm5
yading@10 117 "movq %%mm4, (%2) \n\t"
yading@10 118 "movq %%mm5, (%2, %3) \n\t"
yading@10 119 "add %%"REG_a", %1 \n\t" // advance source by two rows
yading@10 120 "add %%"REG_a", %2 \n\t" // advance destination by two rows
yading@10 121 "movq (%1, %3), %%mm1 \n\t" // source row 3
yading@10 122 "movq (%1, %%"REG_a"),%%mm0 \n\t" // source row 4, stays in mm0 for the next pass
yading@10 123 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) // pairs (row3,row2) and (row4,row3)
yading@10 124 "movq %%mm4, (%2) \n\t"
yading@10 125 "movq %%mm5, (%2, %3) \n\t"
yading@10 126 "add %%"REG_a", %1 \n\t"
yading@10 127 "add %%"REG_a", %2 \n\t"
yading@10 128 "subl $4, %0 \n\t" // four rows done
yading@10 129 "jnz 1b \n\t"
yading@10 130 :"+g"(h), "+S"(pixels), "+D"(block)
yading@10 131 :"r"((x86_reg)line_size)
yading@10 132 :REG_a, "memory");
yading@10 133 }
yading@10 134
/*
 * Diagonal (x and y) half-pel "put", 8 pixels wide.
 *
 * For every output pixel the code adds the four neighbouring source pixels
 * (x and x+1 on the current row and on the row below) as 16-bit words, adds
 * the constant held in mm6, shifts right by 2 and packs back to bytes.
 * The horizontal pair sums of the lower row are kept in registers and reused
 * as the upper row of the next output row.
 *
 * block     - destination, 8 bytes written per row; the pointer itself is not
 *             modified (input operand), rows are addressed through REG_a
 * pixels    - source; 9 bytes are read from each of h + 1 rows
 * line_size - byte stride used for both source and destination
 * h         - row count; 2 rows per loop pass and the loop exits only on an
 *             exact zero, so h has to be a positive even number
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 135 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 136 {
yading@10 137 MOVQ_ZERO(mm7); // macro defined elsewhere; presumably clears mm7, used below as the zero half for byte->word unpacking
yading@10 138 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
yading@10 139 __asm__ volatile(
yading@10 140 "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
yading@10 141 "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
yading@10 142 "movq %%mm0, %%mm1 \n\t"
yading@10 143 "movq %%mm4, %%mm5 \n\t"
yading@10 144 "punpcklbw %%mm7, %%mm0 \n\t" // interleave with mm7: low four bytes become words
yading@10 145 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 146 "punpckhbw %%mm7, %%mm1 \n\t" // high four bytes become words
yading@10 147 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 148 "paddusw %%mm0, %%mm4 \n\t" // mm4 = row0[x] + row0[x+1], pixels 0..3
yading@10 149 "paddusw %%mm1, %%mm5 \n\t" // mm5 = row0[x] + row0[x+1], pixels 4..7
yading@10 150 "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = running byte offset of the current row, starts at 0
yading@10 151 "add %3, %1 \n\t" // pixels += line_size, so (%1, REG_a) is the row below the one written at (%2, REG_a)
yading@10 152 ".p2align 3 \n\t"
yading@10 153 "1: \n\t"
yading@10 154 "movq (%1, %%"REG_a"), %%mm0 \n\t" // lower row, bytes 0..7
yading@10 155 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // lower row, bytes 1..8
yading@10 156 "movq %%mm0, %%mm1 \n\t"
yading@10 157 "movq %%mm2, %%mm3 \n\t"
yading@10 158 "punpcklbw %%mm7, %%mm0 \n\t"
yading@10 159 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 160 "punpckhbw %%mm7, %%mm1 \n\t"
yading@10 161 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 162 "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal pair sums of the lower row
yading@10 163 "paddusw %%mm3, %%mm1 \n\t"
yading@10 164 "paddusw %%mm6, %%mm4 \n\t" // add the constant from mm6 to the upper-row sums
yading@10 165 "paddusw %%mm6, %%mm5 \n\t"
yading@10 166 "paddusw %%mm0, %%mm4 \n\t" // plus lower-row sums: four-pixel total
yading@10 167 "paddusw %%mm1, %%mm5 \n\t"
yading@10 168 "psrlw $2, %%mm4 \n\t" // divide by 4
yading@10 169 "psrlw $2, %%mm5 \n\t"
yading@10 170 "packuswb %%mm5, %%mm4 \n\t" // back to 8 bytes
yading@10 171 "movq %%mm4, (%2, %%"REG_a") \n\t"
yading@10 172 "add %3, %%"REG_a" \n\t" // next row
yading@10 173
yading@10 174 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
yading@10 175 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" // second half: mm0/mm1 now hold the upper-row sums, mm4/mm5 receive the new lower row
yading@10 176 "movq %%mm2, %%mm3 \n\t"
yading@10 177 "movq %%mm4, %%mm5 \n\t"
yading@10 178 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 179 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 180 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 181 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 182 "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of the new lower row, reused by the next pass
yading@10 183 "paddusw %%mm3, %%mm5 \n\t"
yading@10 184 "paddusw %%mm6, %%mm0 \n\t"
yading@10 185 "paddusw %%mm6, %%mm1 \n\t"
yading@10 186 "paddusw %%mm4, %%mm0 \n\t"
yading@10 187 "paddusw %%mm5, %%mm1 \n\t"
yading@10 188 "psrlw $2, %%mm0 \n\t"
yading@10 189 "psrlw $2, %%mm1 \n\t"
yading@10 190 "packuswb %%mm1, %%mm0 \n\t"
yading@10 191 "movq %%mm0, (%2, %%"REG_a") \n\t"
yading@10 192 "add %3, %%"REG_a" \n\t"
yading@10 193
yading@10 194 "subl $2, %0 \n\t" // two rows done
yading@10 195 "jnz 1b \n\t"
yading@10 196 :"+g"(h), "+S"(pixels)
yading@10 197 :"D"(block), "r"((x86_reg)line_size)
yading@10 198 :REG_a, "memory");
yading@10 199 }
yading@10 200
yading@10 201 // avg_pixels
yading@10 202 #ifndef NO_RND
yading@10 203 // in case more speed is needed - unrolling would certainly help
/*
 * Full-pel "avg", 8 pixels wide: blends the source into the destination.
 *
 * For every row, 8 bytes of block and 8 bytes of pixels are combined with
 * OP_AVG and the result is written back to block. OP_AVG is defined outside
 * this file; presumably a packed byte average using the constant in mm6 --
 * TODO confirm.
 *
 * block     - destination, read and rewritten in place
 * pixels    - source
 * line_size - byte stride used for both source and destination
 * h         - row count; the do/while body runs before the counter is
 *             tested, so h has to be at least 1
 */
yading@10 204 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 205 {
yading@10 206 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant OP_AVG expects in mm6 -- TODO confirm
yading@10 207 JUMPALIGN();
yading@10 208 do {
yading@10 209 __asm__ volatile(
yading@10 210 "movq %0, %%mm0 \n\t" // current destination row
yading@10 211 "movq %1, %%mm1 \n\t" // source row
yading@10 212 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) // result is taken from mm2
yading@10 213 "movq %%mm2, %0 \n\t"
yading@10 214 :"+m"(*block) // operand type is a single byte; the "memory" clobber covers the rest of the 8-byte access
yading@10 215 :"m"(*pixels)
yading@10 216 :"memory");
yading@10 217 pixels += line_size;
yading@10 218 block += line_size;
yading@10 219 }
yading@10 220 while (--h);
yading@10 221 }
yading@10 222 #endif // NO_RND
yading@10 223
/*
 * Full-pel "avg", 16 pixels wide: blends the source into the destination.
 *
 * For every row, the two 8-byte halves of block are each combined with the
 * matching 8 bytes of pixels through OP_AVG and written back. OP_AVG is
 * defined outside this file; presumably a packed byte average using the
 * constant in mm6 -- TODO confirm.
 *
 * block     - destination, read and rewritten in place
 * pixels    - source
 * line_size - byte stride used for both source and destination
 * h         - row count; the do/while body runs before the counter is
 *             tested, so h has to be at least 1
 *
 * NOTE(review): "8%0" / "8%1" paste the digit 8 in front of whatever text the
 * compiler emits for the memory operand. That only yields a valid "8(reg)"
 * address when the operand is printed without a displacement of its own --
 * confirm this holds for every supported compiler and optimisation level.
 */
yading@10 224 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 225 {
yading@10 226 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant OP_AVG expects in mm6 -- TODO confirm
yading@10 227 JUMPALIGN();
yading@10 228 do {
yading@10 229 __asm__ volatile(
yading@10 230 "movq %0, %%mm0 \n\t" // left half: destination bytes 0..7
yading@10 231 "movq %1, %%mm1 \n\t" // left half: source bytes 0..7
yading@10 232 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) // result is taken from mm2
yading@10 233 "movq %%mm2, %0 \n\t"
yading@10 234 "movq 8%0, %%mm0 \n\t" // right half: destination bytes 8..15
yading@10 235 "movq 8%1, %%mm1 \n\t" // right half: source bytes 8..15
yading@10 236 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
yading@10 237 "movq %%mm2, 8%0 \n\t"
yading@10 238 :"+m"(*block) // operand type is a single byte; the "memory" clobber covers the rest of the access
yading@10 239 :"m"(*pixels)
yading@10 240 :"memory");
yading@10 241 pixels += line_size;
yading@10 242 block += line_size;
yading@10 243 }
yading@10 244 while (--h);
yading@10 245 }
yading@10 246
yading@10 247 #ifndef NO_RND
/*
 * Horizontal half-pel "avg", 8 pixels wide.
 *
 * For every row, pixels[0..7] and pixels[1..8] are combined with PAVGB, and
 * that intermediate is then combined with the existing 8 bytes of block via
 * OP_AVG before being written back. Both macros are defined outside this
 * file; presumably packed byte averages using the constant in mm6 -- TODO
 * confirm.
 *
 * block     - destination, read and rewritten in place
 * pixels    - source; 9 bytes are read per row
 * line_size - byte stride used for both source and destination
 * h         - row count; the do/while body runs before the counter is
 *             tested, so h has to be at least 1
 *
 * NOTE(review): "1%1" pastes the digit 1 in front of the text the compiler
 * emits for the memory operand; this only forms a valid "1(reg)" address when
 * the operand is printed without a displacement of its own -- confirm.
 */
yading@10 248 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 249 {
yading@10 250 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant the averaging macros expect in mm6 -- TODO confirm
yading@10 251 JUMPALIGN();
yading@10 252 do {
yading@10 253 __asm__ volatile(
yading@10 254 "movq %1, %%mm0 \n\t" // source bytes 0..7
yading@10 255 "movq 1%1, %%mm1 \n\t" // source bytes 1..8
yading@10 256 "movq %0, %%mm3 \n\t" // current destination row
yading@10 257 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) // horizontal interpolation, result taken from mm2
yading@10 258 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) // blend with destination, result taken from mm0
yading@10 259 "movq %%mm0, %0 \n\t"
yading@10 260 :"+m"(*block)
yading@10 261 :"m"(*pixels)
yading@10 262 :"memory");
yading@10 263 pixels += line_size;
yading@10 264 block += line_size;
yading@10 265 } while (--h);
yading@10 266 }
yading@10 267 #endif // NO_RND
yading@10 268
/*
 * Horizontal half-pel "avg", 16 pixels wide.
 *
 * Per row and per 8-byte half: the source bytes at offset k and k+1 are
 * combined with PAVGB, the intermediate is combined with the existing
 * destination bytes via OP_AVG, and the result is written back. Both macros
 * are defined outside this file; presumably packed byte averages using the
 * constant in mm6 -- TODO confirm.
 *
 * block     - destination, read and rewritten in place (16 bytes per row)
 * pixels    - source; 17 bytes are read per row
 * line_size - byte stride used for both source and destination
 * h         - row count; the do/while body runs before the counter is
 *             tested, so h has to be at least 1
 *
 * NOTE(review): "1%1", "8%1", "9%1" and "8%0" paste a digit in front of the
 * text the compiler emits for the memory operand; this only forms a valid
 * "N(reg)" address when the operand is printed without a displacement of its
 * own -- confirm this holds for every supported compiler.
 */
yading@10 269 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 270 {
yading@10 271 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant the averaging macros expect in mm6 -- TODO confirm
yading@10 272 JUMPALIGN();
yading@10 273 do {
yading@10 274 __asm__ volatile(
yading@10 275 "movq %1, %%mm0 \n\t" // left half: source bytes 0..7
yading@10 276 "movq 1%1, %%mm1 \n\t" // left half: source bytes 1..8
yading@10 277 "movq %0, %%mm3 \n\t" // left half: destination bytes 0..7
yading@10 278 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) // horizontal interpolation, result taken from mm2
yading@10 279 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6) // blend with destination, result taken from mm0
yading@10 280 "movq %%mm0, %0 \n\t"
yading@10 281 "movq 8%1, %%mm0 \n\t" // right half: source bytes 8..15
yading@10 282 "movq 9%1, %%mm1 \n\t" // right half: source bytes 9..16
yading@10 283 "movq 8%0, %%mm3 \n\t" // right half: destination bytes 8..15
yading@10 284 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
yading@10 285 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
yading@10 286 "movq %%mm0, 8%0 \n\t"
yading@10 287 :"+m"(*block)
yading@10 288 :"m"(*pixels)
yading@10 289 :"memory");
yading@10 290 pixels += line_size;
yading@10 291 block += line_size;
yading@10 292 } while (--h);
yading@10 293 }
yading@10 294
/*
 * Vertical half-pel "avg", 8 pixels wide.
 *
 * Each source row is combined with the row below it through PAVGBP, and that
 * intermediate is then combined with the existing destination row through
 * OP_AVG before being stored. Both macros are defined outside this file;
 * presumably packed byte averages using the constant in mm6 -- TODO confirm.
 * The last loaded source row is carried between loop halves/passes in a
 * register, so h + 1 source rows are read in total.
 *
 * block     - destination, read and rewritten in place
 * pixels    - source
 * line_size - byte stride used for both source and destination
 * h         - row count; 4 rows per loop pass and the loop exits only on an
 *             exact zero, so h has to be a positive multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 295 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 296 {
yading@10 297 MOVQ_BFE(mm6); // macro defined elsewhere; presumably loads the constant the averaging macros expect in mm6 -- TODO confirm
yading@10 298 __asm__ volatile(
yading@10 299 "lea (%3, %3), %%"REG_a" \n\t" // REG_a = 2 * line_size
yading@10 300 "movq (%1), %%mm0 \n\t" // prime mm0 with source row 0
yading@10 301 ".p2align 3 \n\t"
yading@10 302 "1: \n\t"
yading@10 303 "movq (%1, %3), %%mm1 \n\t" // source row 1
yading@10 304 "movq (%1, %%"REG_a"), %%mm2 \n\t" // source row 2, kept in mm2 for the second half
yading@10 305 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) // pairs (row1,row0) and (row2,row1); results taken from mm4/mm5
yading@10 306 "movq (%2), %%mm3 \n\t" // existing destination row 0
yading@10 307 OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6) // blended row 0, taken from mm0
yading@10 308 "movq (%2, %3), %%mm3 \n\t" // existing destination row 1
yading@10 309 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) // blended row 1, taken from mm1
yading@10 310 "movq %%mm0, (%2) \n\t"
yading@10 311 "movq %%mm1, (%2, %3) \n\t"
yading@10 312 "add %%"REG_a", %1 \n\t" // advance source by two rows
yading@10 313 "add %%"REG_a", %2 \n\t" // advance destination by two rows
yading@10 314
yading@10 315 "movq (%1, %3), %%mm1 \n\t" // source row 3
yading@10 316 "movq (%1, %%"REG_a"), %%mm0 \n\t" // source row 4, stays in mm0 for the next pass
yading@10 317 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) // pairs (row3,row2) and (row4,row3)
yading@10 318 "movq (%2), %%mm3 \n\t"
yading@10 319 OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6) // blended row 2, taken from mm2
yading@10 320 "movq (%2, %3), %%mm3 \n\t"
yading@10 321 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6) // blended row 3, taken from mm1
yading@10 322 "movq %%mm2, (%2) \n\t"
yading@10 323 "movq %%mm1, (%2, %3) \n\t"
yading@10 324 "add %%"REG_a", %1 \n\t"
yading@10 325 "add %%"REG_a", %2 \n\t"
yading@10 326
yading@10 327 "subl $4, %0 \n\t" // four rows done
yading@10 328 "jnz 1b \n\t"
yading@10 329 :"+g"(h), "+S"(pixels), "+D"(block)
yading@10 330 :"r"((x86_reg)line_size)
yading@10 331 :REG_a, "memory");
yading@10 332 }
yading@10 333
yading@10 334 // this routine is 'slightly' suboptimal but mostly unused
/*
 * Diagonal (x and y) half-pel "avg", 8 pixels wide.
 *
 * The interpolated value is computed as in the "put" xy2 variant: the four
 * neighbouring source pixels are added as 16-bit words together with the
 * constant in mm6, shifted right by 2 and packed to bytes. That value is then
 * combined with the existing destination row through OP_AVG (defined outside
 * this file; presumably a packed byte average -- TODO confirm) and stored.
 *
 * mm6 is occupied by the rounding constant here, so the fourth OP_AVG
 * argument is rebuilt in mm2 before every use: pcmpeqd sets all bits and
 * paddb doubles each byte, leaving 0xFE in every byte.
 *
 * block     - destination, read and rewritten in place; the pointer itself is
 *             not modified (input operand), rows are addressed through REG_a
 * pixels    - source; 9 bytes are read from each of h + 1 rows
 * line_size - byte stride used for both source and destination
 * h         - row count; 2 rows per loop pass and the loop exits only on an
 *             exact zero, so h has to be a positive even number
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size.
 */
yading@10 335 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 336 {
yading@10 337 MOVQ_ZERO(mm7); // macro defined elsewhere; presumably clears mm7, used below as the zero half for byte->word unpacking
yading@10 338 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
yading@10 339 __asm__ volatile(
yading@10 340 "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
yading@10 341 "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
yading@10 342 "movq %%mm0, %%mm1 \n\t"
yading@10 343 "movq %%mm4, %%mm5 \n\t"
yading@10 344 "punpcklbw %%mm7, %%mm0 \n\t" // interleave with mm7: low four bytes become words
yading@10 345 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 346 "punpckhbw %%mm7, %%mm1 \n\t" // high four bytes become words
yading@10 347 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 348 "paddusw %%mm0, %%mm4 \n\t" // mm4 = row0[x] + row0[x+1], pixels 0..3
yading@10 349 "paddusw %%mm1, %%mm5 \n\t" // mm5 = row0[x] + row0[x+1], pixels 4..7
yading@10 350 "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = running byte offset of the current row, starts at 0
yading@10 351 "add %3, %1 \n\t" // pixels += line_size, so (%1, REG_a) is the row below the one written at (%2, REG_a)
yading@10 352 ".p2align 3 \n\t"
yading@10 353 "1: \n\t"
yading@10 354 "movq (%1, %%"REG_a"), %%mm0 \n\t" // lower row, bytes 0..7
yading@10 355 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // lower row, bytes 1..8
yading@10 356 "movq %%mm0, %%mm1 \n\t"
yading@10 357 "movq %%mm2, %%mm3 \n\t"
yading@10 358 "punpcklbw %%mm7, %%mm0 \n\t"
yading@10 359 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 360 "punpckhbw %%mm7, %%mm1 \n\t"
yading@10 361 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 362 "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal pair sums of the lower row
yading@10 363 "paddusw %%mm3, %%mm1 \n\t"
yading@10 364 "paddusw %%mm6, %%mm4 \n\t" // add the constant from mm6 to the upper-row sums
yading@10 365 "paddusw %%mm6, %%mm5 \n\t"
yading@10 366 "paddusw %%mm0, %%mm4 \n\t" // plus lower-row sums: four-pixel total
yading@10 367 "paddusw %%mm1, %%mm5 \n\t"
yading@10 368 "psrlw $2, %%mm4 \n\t" // divide by 4
yading@10 369 "psrlw $2, %%mm5 \n\t"
yading@10 370 "movq (%2, %%"REG_a"), %%mm3 \n\t" // existing destination row
yading@10 371 "packuswb %%mm5, %%mm4 \n\t" // interpolated row back to 8 bytes
yading@10 372 "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all bits set
yading@10 373 "paddb %%mm2, %%mm2 \n\t" // 0xFF + 0xFF wraps to 0xFE in every byte
yading@10 374 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) // blend with destination, result taken from mm5
yading@10 375 "movq %%mm5, (%2, %%"REG_a") \n\t"
yading@10 376 "add %3, %%"REG_a" \n\t" // next row
yading@10 377
yading@10 378 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
yading@10 379 "movq 1(%1, %%"REG_a"), %%mm4 \n\t" // second half: mm0/mm1 now hold the upper-row sums, mm4/mm5 receive the new lower row
yading@10 380 "movq %%mm2, %%mm3 \n\t"
yading@10 381 "movq %%mm4, %%mm5 \n\t"
yading@10 382 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 383 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 384 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 385 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 386 "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of the new lower row, reused by the next pass
yading@10 387 "paddusw %%mm3, %%mm5 \n\t"
yading@10 388 "paddusw %%mm6, %%mm0 \n\t"
yading@10 389 "paddusw %%mm6, %%mm1 \n\t"
yading@10 390 "paddusw %%mm4, %%mm0 \n\t"
yading@10 391 "paddusw %%mm5, %%mm1 \n\t"
yading@10 392 "psrlw $2, %%mm0 \n\t"
yading@10 393 "psrlw $2, %%mm1 \n\t"
yading@10 394 "movq (%2, %%"REG_a"), %%mm3 \n\t" // existing destination row
yading@10 395 "packuswb %%mm1, %%mm0 \n\t"
yading@10 396 "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild 0xFE-per-byte constant in mm2
yading@10 397 "paddb %%mm2, %%mm2 \n\t"
yading@10 398 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) // blend with destination, result taken from mm1
yading@10 399 "movq %%mm1, (%2, %%"REG_a") \n\t"
yading@10 400 "add %3, %%"REG_a" \n\t"
yading@10 401
yading@10 402 "subl $2, %0 \n\t" // two rows done
yading@10 403 "jnz 1b \n\t"
yading@10 404 :"+g"(h), "+S"(pixels)
yading@10 405 :"D"(block), "r"((x86_reg)line_size)
yading@10 406 :REG_a, "memory");
yading@10 407 }
yading@10 408
yading@10 409 //FIXME optimize
/*
 * Vertical half-pel "put", 16 pixels wide, built from two calls to the
 * 8-wide routine: one for columns 0..7 and one for columns 8..15.
 * The arguments are forwarded unchanged, so the 8-wide routine's
 * requirements on h apply here as well.
 */
yading@10 410 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 411 DEF(put, pixels8_y2)(block , pixels , line_size, h); // left 8 columns
yading@10 412 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); // right 8 columns
yading@10 413 }
yading@10 414
/*
 * Diagonal half-pel "put", 16 pixels wide, built from two calls to the
 * 8-wide routine: one for columns 0..7 and one for columns 8..15.
 * The arguments are forwarded unchanged, so the 8-wide routine's
 * requirements on h apply here as well.
 */
yading@10 415 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 416 DEF(put, pixels8_xy2)(block , pixels , line_size, h); // left 8 columns
yading@10 417 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); // right 8 columns
yading@10 418 }
yading@10 419
/*
 * Vertical half-pel "avg", 16 pixels wide, built from two calls to the
 * 8-wide routine: one for columns 0..7 and one for columns 8..15.
 * The arguments are forwarded unchanged, so the 8-wide routine's
 * requirements on h apply here as well.
 */
yading@10 420 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 421 DEF(avg, pixels8_y2)(block , pixels , line_size, h); // left 8 columns
yading@10 422 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); // right 8 columns
yading@10 423 }
yading@10 424
/*
 * Diagonal half-pel "avg", 16 pixels wide, built from two calls to the
 * 8-wide routine: one for columns 0..7 and one for columns 8..15.
 * The arguments are forwarded unchanged, so the 8-wide routine's
 * requirements on h apply here as well.
 */
yading@10 425 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 426 DEF(avg, pixels8_xy2)(block , pixels , line_size, h); // left 8 columns
yading@10 427 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); // right 8 columns
yading@10 428 }