annotate ffmpeg/libavcodec/x86/dsputil_rnd_template.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
yading@10 3 * Copyright (c) 2000, 2001 Fabrice Bellard
yading@10 4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
yading@10 5 *
yading@10 6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
yading@10 7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
yading@10 8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
yading@10 9 *
yading@10 10 * This file is part of FFmpeg.
yading@10 11 *
yading@10 12 * FFmpeg is free software; you can redistribute it and/or
yading@10 13 * modify it under the terms of the GNU Lesser General Public
yading@10 14 * License as published by the Free Software Foundation; either
yading@10 15 * version 2.1 of the License, or (at your option) any later version.
yading@10 16 *
yading@10 17 * FFmpeg is distributed in the hope that it will be useful,
yading@10 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 20 * Lesser General Public License for more details.
yading@10 21 *
yading@10 22 * You should have received a copy of the GNU Lesser General Public
yading@10 23 * License along with FFmpeg; if not, write to the Free Software
yading@10 24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 25 */
yading@10 26
yading@10 27 // put_pixels
/*
 * put, pixels8_xy2: write h rows of 8 bytes to block, each output byte being
 *   (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
 * where p is the source and rnd is the per-word constant SET_RND places in mm6.
 *
 * block     - destination; 8 bytes are stored per row, rows line_size apart
 * pixels    - source; 9 bytes are read from each of h+1 rows (movq at +0 and +1)
 * line_size - byte stride applied to both source and destination
 * h         - row count; two rows are produced per loop pass and the loop
 *             exits only when "subl $2" reaches exactly 0, so h has to be a
 *             positive even number
 *
 * DEF, MOVQ_ZERO, SET_RND and REG_a are macros defined elsewhere. mm7 is
 * presumably all-zero after MOVQ_ZERO (it is the second operand of every
 * punpck*bw below, i.e. the bytes interleaved into the high half of each
 * word) - confirm against the macro definition.
 * NOTE(review): the MMX registers used are not named in the clobber list.
 */
yading@10 28 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 29 {
yading@10 30 MOVQ_ZERO(mm7);
yading@10 31 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
yading@10 32 __asm__ volatile(
yading@10 33 "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
yading@10 34 "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
yading@10 35 "movq %%mm0, %%mm1 \n\t"
yading@10 36 "movq %%mm4, %%mm5 \n\t"
yading@10 37 "punpcklbw %%mm7, %%mm0 \n\t" // widen low 4 bytes to words
yading@10 38 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 39 "punpckhbw %%mm7, %%mm1 \n\t" // widen high 4 bytes to words
yading@10 40 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 41 "paddusw %%mm0, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of row 0
yading@10 42 "paddusw %%mm1, %%mm5 \n\t"
yading@10 43 "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = running byte offset, starts at 0
yading@10 44 "add %3, %1 \n\t" // pixels now points at row 1
yading@10 45 ".p2align 3 \n\t"
yading@10 46 "1: \n\t"
yading@10 47 "movq (%1, %%"REG_a"), %%mm0 \n\t" // next source row, bytes 0..7
yading@10 48 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // next source row, bytes 1..8
yading@10 49 "movq %%mm0, %%mm1 \n\t"
yading@10 50 "movq %%mm2, %%mm3 \n\t"
yading@10 51 "punpcklbw %%mm7, %%mm0 \n\t"
yading@10 52 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 53 "punpckhbw %%mm7, %%mm1 \n\t"
yading@10 54 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 55 "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = pair sums of this row (kept for the next output row)
yading@10 56 "paddusw %%mm3, %%mm1 \n\t"
yading@10 57 "paddusw %%mm6, %%mm4 \n\t" // previous row sums + rounding constant
yading@10 58 "paddusw %%mm6, %%mm5 \n\t"
yading@10 59 "paddusw %%mm0, %%mm4 \n\t" // + this row sums = 4-pixel total
yading@10 60 "paddusw %%mm1, %%mm5 \n\t"
yading@10 61 "psrlw $2, %%mm4 \n\t" // divide by 4
yading@10 62 "psrlw $2, %%mm5 \n\t"
yading@10 63 "packuswb %%mm5, %%mm4 \n\t" // back to 8 bytes
yading@10 64 "movq %%mm4, (%2, %%"REG_a") \n\t" // store output row
yading@10 65 "add %3, %%"REG_a" \n\t" // advance offset by one row
yading@10 66
yading@10 67 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
yading@10 68 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
yading@10 69 "movq %%mm2, %%mm3 \n\t"
yading@10 70 "movq %%mm4, %%mm5 \n\t"
yading@10 71 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 72 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 73 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 74 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 75 "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of this row (kept for the next pass)
yading@10 76 "paddusw %%mm3, %%mm5 \n\t"
yading@10 77 "paddusw %%mm6, %%mm0 \n\t" // previous row sums (mm0/mm1) + rounding constant
yading@10 78 "paddusw %%mm6, %%mm1 \n\t"
yading@10 79 "paddusw %%mm4, %%mm0 \n\t"
yading@10 80 "paddusw %%mm5, %%mm1 \n\t"
yading@10 81 "psrlw $2, %%mm0 \n\t"
yading@10 82 "psrlw $2, %%mm1 \n\t"
yading@10 83 "packuswb %%mm1, %%mm0 \n\t"
yading@10 84 "movq %%mm0, (%2, %%"REG_a") \n\t" // store second output row of this pass
yading@10 85 "add %3, %%"REG_a" \n\t"
yading@10 86
yading@10 87 "subl $2, %0 \n\t" // two rows done
yading@10 88 "jnz 1b \n\t" // loop until h is exactly 0
yading@10 89 :"+g"(h), "+S"(pixels) // %0 = h, %1 = pixels (both modified)
yading@10 90 :"D"(block), "r"((x86_reg)line_size) // %2 = block, %3 = line_size
yading@10 91 :REG_a, "memory");
yading@10 92 }
yading@10 93
yading@10 94 // in case more speed is needed - unrolling would certainly help
/*
 * avg, pixels8: for each of h rows, combine 8 destination bytes with 8 source
 * bytes through OP_AVG and store the result back into the destination.
 *
 * block     - destination, read and rewritten 8 bytes per row
 * pixels    - source, 8 bytes read per row
 * line_size - byte stride added to both pointers after every row
 * h         - row count; the do/while body runs before h is tested, so h
 *             must be at least 1
 *
 * DEF, MOVQ_BFE, JUMPALIGN and OP_AVG are macros defined elsewhere.
 * Presumably OP_AVG averages its first two registers into the third using the
 * byte mask MOVQ_BFE prepared in mm6 - confirm against the macro definitions.
 */
yading@10 95 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 96 {
yading@10 97 MOVQ_BFE(mm6);
yading@10 98 JUMPALIGN();
yading@10 99 do {
yading@10 100 __asm__ volatile(
yading@10 101 "movq %0, %%mm0 \n\t" // current destination bytes
yading@10 102 "movq %1, %%mm1 \n\t" // source bytes
yading@10 103 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6) // result is taken from mm2 below
yading@10 104 "movq %%mm2, %0 \n\t" // write back to destination
yading@10 105 :"+m"(*block)
yading@10 106 :"m"(*pixels)
yading@10 107 :"memory"); // operands are typed as one byte but 8 are accessed; the memory clobber covers the rest
yading@10 108 pixels += line_size;
yading@10 109 block += line_size;
yading@10 110 }
yading@10 111 while (--h);
yading@10 112 }
yading@10 113
/*
 * avg, pixels16: 16-byte-wide counterpart of the 8-byte avg routine. Each row
 * is handled as two 8-byte halves inside a single asm statement: bytes 0..7
 * first, then bytes 8..15 via the "8%0" / "8%1" operands.
 *
 * block     - destination, read and rewritten 16 bytes per row
 * pixels    - source, 16 bytes read per row
 * line_size - byte stride added to both pointers after every row
 * h         - row count; the do/while body runs before h is tested, so h
 *             must be at least 1
 *
 * DEF, MOVQ_BFE, JUMPALIGN and OP_AVG are macros defined elsewhere.
 * Presumably OP_AVG averages its first two registers into the third using the
 * byte mask MOVQ_BFE prepared in mm6 - confirm against the macro definitions.
 */
yading@10 114 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 115 {
yading@10 116 MOVQ_BFE(mm6);
yading@10 117 JUMPALIGN();
yading@10 118 do {
yading@10 119 __asm__ volatile(
yading@10 120 "movq %0, %%mm0 \n\t" // destination bytes 0..7
yading@10 121 "movq %1, %%mm1 \n\t" // source bytes 0..7
yading@10 122 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
yading@10 123 "movq %%mm2, %0 \n\t"
yading@10 124 "movq 8%0, %%mm0 \n\t" // "8" is prepended to the memory operand text as a displacement: destination bytes 8..15
yading@10 125 "movq 8%1, %%mm1 \n\t" // source bytes 8..15
yading@10 126 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
yading@10 127 "movq %%mm2, 8%0 \n\t"
yading@10 128 :"+m"(*block)
yading@10 129 :"m"(*pixels)
yading@10 130 :"memory"); // operands are typed as one byte but 16 are accessed; the memory clobber covers the rest
yading@10 131 pixels += line_size;
yading@10 132 block += line_size;
yading@10 133 }
yading@10 134 while (--h);
yading@10 135 }
yading@10 136
yading@10 137 // this routine is 'slightly' suboptimal but mostly unused
/*
 * avg, pixels8_xy2: for each of h rows compute the 2x2 interpolated value
 *   (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
 * for 8 pixels, then combine it with the 8 bytes already in block through
 * OP_AVG and store the result back.
 *
 * block     - destination, read and rewritten 8 bytes per row
 * pixels    - source; 9 bytes are read from each of h+1 rows (movq at +0 and +1)
 * line_size - byte stride applied to both source and destination
 * h         - row count; two rows are produced per loop pass and the loop
 *             exits only when "subl $2" reaches exactly 0, so h has to be a
 *             positive even number
 *
 * DEF, MOVQ_ZERO, SET_RND, OP_AVG and REG_a are macros defined elsewhere.
 * mm7 is presumably all-zero after MOVQ_ZERO (it is the second operand of
 * every punpck*bw below) and OP_AVG presumably averages its first two
 * registers into the third - confirm against the macro definitions.
 * NOTE(review): the MMX registers used are not named in the clobber list.
 */
yading@10 138 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
yading@10 139 {
yading@10 140 MOVQ_ZERO(mm7);
yading@10 141 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
yading@10 142 __asm__ volatile(
yading@10 143 "movq (%1), %%mm0 \n\t" // row 0, bytes 0..7
yading@10 144 "movq 1(%1), %%mm4 \n\t" // row 0, bytes 1..8
yading@10 145 "movq %%mm0, %%mm1 \n\t"
yading@10 146 "movq %%mm4, %%mm5 \n\t"
yading@10 147 "punpcklbw %%mm7, %%mm0 \n\t" // widen low 4 bytes to words
yading@10 148 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 149 "punpckhbw %%mm7, %%mm1 \n\t" // widen high 4 bytes to words
yading@10 150 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 151 "paddusw %%mm0, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of row 0
yading@10 152 "paddusw %%mm1, %%mm5 \n\t"
yading@10 153 "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = running byte offset, starts at 0
yading@10 154 "add %3, %1 \n\t" // pixels now points at row 1
yading@10 155 ".p2align 3 \n\t"
yading@10 156 "1: \n\t"
yading@10 157 "movq (%1, %%"REG_a"), %%mm0 \n\t" // next source row, bytes 0..7
yading@10 158 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // next source row, bytes 1..8
yading@10 159 "movq %%mm0, %%mm1 \n\t"
yading@10 160 "movq %%mm2, %%mm3 \n\t"
yading@10 161 "punpcklbw %%mm7, %%mm0 \n\t"
yading@10 162 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 163 "punpckhbw %%mm7, %%mm1 \n\t"
yading@10 164 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 165 "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = pair sums of this row (kept for the next output row)
yading@10 166 "paddusw %%mm3, %%mm1 \n\t"
yading@10 167 "paddusw %%mm6, %%mm4 \n\t" // previous row sums + rounding constant
yading@10 168 "paddusw %%mm6, %%mm5 \n\t"
yading@10 169 "paddusw %%mm0, %%mm4 \n\t" // + this row sums = 4-pixel total
yading@10 170 "paddusw %%mm1, %%mm5 \n\t"
yading@10 171 "psrlw $2, %%mm4 \n\t" // divide by 4
yading@10 172 "psrlw $2, %%mm5 \n\t"
yading@10 173 "movq (%2, %%"REG_a"), %%mm3 \n\t" // current destination bytes
yading@10 174 "packuswb %%mm5, %%mm4 \n\t" // interpolated row back to 8 bytes
yading@10 175 "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all ones
yading@10 176 "paddb %%mm2, %%mm2 \n\t" // 0xFF + 0xFF per byte -> 0xFE in every byte
yading@10 177 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) // result is taken from mm5 below
yading@10 178 "movq %%mm5, (%2, %%"REG_a") \n\t" // store output row
yading@10 179 "add %3, %%"REG_a" \n\t" // advance offset by one row
yading@10 180
yading@10 181 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
yading@10 182 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
yading@10 183 "movq %%mm2, %%mm3 \n\t"
yading@10 184 "movq %%mm4, %%mm5 \n\t"
yading@10 185 "punpcklbw %%mm7, %%mm2 \n\t"
yading@10 186 "punpcklbw %%mm7, %%mm4 \n\t"
yading@10 187 "punpckhbw %%mm7, %%mm3 \n\t"
yading@10 188 "punpckhbw %%mm7, %%mm5 \n\t"
yading@10 189 "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = pair sums of this row (kept for the next pass)
yading@10 190 "paddusw %%mm3, %%mm5 \n\t"
yading@10 191 "paddusw %%mm6, %%mm0 \n\t" // previous row sums (mm0/mm1) + rounding constant
yading@10 192 "paddusw %%mm6, %%mm1 \n\t"
yading@10 193 "paddusw %%mm4, %%mm0 \n\t"
yading@10 194 "paddusw %%mm5, %%mm1 \n\t"
yading@10 195 "psrlw $2, %%mm0 \n\t"
yading@10 196 "psrlw $2, %%mm1 \n\t"
yading@10 197 "movq (%2, %%"REG_a"), %%mm3 \n\t" // current destination bytes
yading@10 198 "packuswb %%mm1, %%mm0 \n\t"
yading@10 199 "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE byte mask (mm2 was reused above)
yading@10 200 "paddb %%mm2, %%mm2 \n\t"
yading@10 201 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) // result is taken from mm1 below
yading@10 202 "movq %%mm1, (%2, %%"REG_a") \n\t" // store second output row of this pass
yading@10 203 "add %3, %%"REG_a" \n\t"
yading@10 204
yading@10 205 "subl $2, %0 \n\t" // two rows done
yading@10 206 "jnz 1b \n\t" // loop until h is exactly 0
yading@10 207 :"+g"(h), "+S"(pixels) // %0 = h, %1 = pixels (both modified)
yading@10 208 :"D"(block), "r"((x86_reg)line_size) // %2 = block, %3 = line_size
yading@10 209 :REG_a, "memory");
yading@10 210 }
yading@10 211
yading@10 212 //FIXME optimize
/*
 * put, pixels16_xy2: 16-byte-wide variant built from the 8-byte routine.
 * Runs the put pixels8_xy2 function twice with the same line_size and h:
 * once at block/pixels and once at block+8/pixels+8.
 */
yading@10 213 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 214 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
yading@10 215 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
yading@10 216 }
yading@10 217
/*
 * avg, pixels16_xy2: 16-byte-wide variant built from the 8-byte routine.
 * Runs the avg pixels8_xy2 function twice with the same line_size and h:
 * once at block/pixels and once at block+8/pixels+8.
 */
yading@10 218 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
yading@10 219 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
yading@10 220 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
yading@10 221 }