annotate ffmpeg/libavcodec/x86/simple_idct.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * Simple IDCT MMX
yading@10 3 *
yading@10 4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
yading@10 5 *
yading@10 6 * This file is part of FFmpeg.
yading@10 7 *
yading@10 8 * FFmpeg is free software; you can redistribute it and/or
yading@10 9 * modify it under the terms of the GNU Lesser General Public
yading@10 10 * License as published by the Free Software Foundation; either
yading@10 11 * version 2.1 of the License, or (at your option) any later version.
yading@10 12 *
yading@10 13 * FFmpeg is distributed in the hope that it will be useful,
yading@10 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 16 * Lesser General Public License for more details.
yading@10 17 *
yading@10 18 * You should have received a copy of the GNU Lesser General Public
yading@10 19 * License along with FFmpeg; if not, write to the Free Software
yading@10 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 21 */
yading@10 22 #include "libavcodec/simple_idct.h"
yading@10 23 #include "libavutil/mem.h"
yading@10 24 #include "dsputil_mmx.h"
yading@10 25
yading@10 26 #if HAVE_INLINE_ASM
yading@10 27
yading@10 28 /* Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, one per line:
yading@10 29 23170.475006
yading@10 30 22725.260826
yading@10 31 21406.727617
yading@10 32 19265.545870
yading@10 33 16384.000000
yading@10 34 12872.826198
yading@10 35 8866.956905
yading@10 36 4520.335430
yading@10 37 */
yading@10 38 #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 39 #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 40 #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 41 #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 42 #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note the minus: 16383, one below the exact 16384)
yading@10 43 #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 44 #define C6 8867 // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 45 #define C7 4520 // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to integer
yading@10 46
yading@10 47 #define ROW_SHIFT 11 // used below to build the rounding bias 1<<(ROW_SHIFT-1) in coeffs; the row-pass macro invocations pass the literal 11 as their shift
yading@10 48 #define COL_SHIFT 20 // 6 (only referenced from comments in the visible part of this file; the disabled COL_IDCT invocations pass the literal 20)
yading@10 49
yading@10 50 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // pand mask keeping bits 16-31 and 48-63; DC_COND_IDCT uses it to exclude the words labelled r0/R0 from its "rest of the row pair is zero" test
yading@10 51 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // bias added to %%mm0 in DC_COND_IDCT's all-zero shortcut, between "pslld $16" and "psrad $13"; only the low 32-bit lane is non-zero
yading@10 52
yading@10 53 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // 8-byte-aligned constant table, 4 x int16_t (8 bytes) per row; row order is significant: the byte offsets noted on each row match the N(%2) displacements used by the asm in idct() (presumably %2 is this table - the operand list is outside this view)
yading@10 54 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: rounding bias 1<<(ROW_SHIFT-1) as the first int16_t of each pair; passed as rounder "paddd (%2)"
yading@10 55 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
yading@10 56 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
yading@10 57 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: same bias plus an extra 1 in the second int16_t of the first pair; passed as rounder "paddd 8(%2)" for the first row pair
yading@10 58 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) i.e. ((1<<19)/16383)<<11 = 32<<11 = 1<<16, which is what a 1 in the upper int16_t of a little-endian 32-bit lane is worth
yading@10 59 // 0, 0, 0, 0,
yading@10 60 // 0, 0, 0, 0,
yading@10 61
yading@10 62 C4, C4, C4, C4, // offset 16: with the R0/R4 words (asm comments list words high-to-low: "C4 C4 C4 C4")
yading@10 63 C4, -C4, C4, -C4, // offset 24: with the R0/R4 words
yading@10 64
yading@10 65 C2, C6, C2, C6, // offset 32: with the R2/R6 words
yading@10 66 C6, -C2, C6, -C2, // offset 40: with the R2/R6 words
yading@10 67
yading@10 68 C1, C3, C1, C3, // offset 48: with the R1/R3 words, contributes to B0
yading@10 69 C5, C7, C5, C7, // offset 56: with the R5/R7 words, contributes to B0
yading@10 70
yading@10 71 C3, -C7, C3, -C7, // offset 64: with the R1/R3 words, contributes to B1
yading@10 72 -C1, -C5, -C1, -C5, // offset 72: with the R5/R7 words, contributes to B1
yading@10 73
yading@10 74 C5, -C1, C5, -C1, // offset 80: with the R1/R3 words, contributes to B2
yading@10 75 C7, C3, C7, C3, // offset 88: with the R5/R7 words, contributes to B2
yading@10 76
yading@10 77 C7, -C5, C7, -C5, // offset 96: with the R1/R3 words, contributes to B3
yading@10 78 C3, -C1, C3, -C1 // offset 104: with the R5/R7 words, contributes to B3
yading@10 79 };
yading@10 80
yading@10 81 static inline void idct(int16_t *block)
yading@10 82 {
yading@10 83 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
yading@10 84 int16_t * const temp= (int16_t*)align_tmp;
yading@10 85
yading@10 86 __asm__ volatile(
yading@10 87 #if 0 //Alternative, simpler variant
yading@10 88
yading@10 89 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
yading@10 90 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 91 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 92 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 93 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 94 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 95 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 96 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 97 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 98 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 99 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 100 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 101 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 102 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 103 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 104 #rounder ", %%mm4 \n\t"\
yading@10 105 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 106 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 107 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 108 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
yading@10 109 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 110 #rounder ", %%mm0 \n\t"\
yading@10 111 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 112 "paddd %%mm0, %%mm0 \n\t" \
yading@10 113 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
yading@10 114 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 115 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
yading@10 116 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 117 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 118 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 119 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 120 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 121 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
yading@10 122 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 123 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 124 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
yading@10 125 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
yading@10 126 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 127 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 128 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 129 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
yading@10 130 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
yading@10 131 "movq %%mm7, " #dst " \n\t"\
yading@10 132 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
yading@10 133 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 134 "movq %%mm2, 24+" #dst " \n\t"\
yading@10 135 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 136 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 137 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 138 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 139 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
yading@10 140 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 141 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 142 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 143 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
yading@10 144 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 145 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 146 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 147 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
yading@10 148 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 149 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 150 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 151 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
yading@10 152 "movq %%mm2, 8+" #dst " \n\t"\
yading@10 153 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 154 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
yading@10 155 "movq %%mm4, 16+" #dst " \n\t"\
yading@10 156
yading@10 157 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 158 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 159 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 160 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 161 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 162 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 163 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 164 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 165 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 166 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 167 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 168 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 169 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 170 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 171 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 172 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 173 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 174 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 175 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 176 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
yading@10 177 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
yading@10 178 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
yading@10 179 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 180 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 181 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
yading@10 182 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 183 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 184 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 185 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 186 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 187 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
yading@10 188 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 189 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 190 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
yading@10 191 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 192 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 193 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 194 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 195 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 196 "movd %%mm7, " #dst " \n\t"\
yading@10 197 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 198 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 199 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 200 "movd %%mm2, 96+" #dst " \n\t"\
yading@10 201 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 202 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 203 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
yading@10 204 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 205 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 206 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 207 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 208 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 209 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
yading@10 210 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 211 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 212 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 213 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 214 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 215 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 216 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 217 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
yading@10 218 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 219 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 220 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 221 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 222 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 223 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 224 "movd %%mm2, 32+" #dst " \n\t"\
yading@10 225 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
yading@10 226 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 227 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 228 "movd %%mm4, 64+" #dst " \n\t"\
yading@10 229 "movd %%mm5, 80+" #dst " \n\t"\
yading@10 230
yading@10 231
yading@10 232 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
yading@10 233 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 234 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 235 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 236 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 237 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
yading@10 238 "pand %%mm0, %%mm4 \n\t"\
yading@10 239 "por %%mm1, %%mm4 \n\t"\
yading@10 240 "por %%mm2, %%mm4 \n\t"\
yading@10 241 "por %%mm3, %%mm4 \n\t"\
yading@10 242 "packssdw %%mm4,%%mm4 \n\t"\
yading@10 243 "movd %%mm4, %%eax \n\t"\
yading@10 244 "orl %%eax, %%eax \n\t"\
yading@10 245 "jz 1f \n\t"\
yading@10 246 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 247 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 248 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 249 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 250 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 251 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 252 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 253 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 254 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 255 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 256 #rounder ", %%mm4 \n\t"\
yading@10 257 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 258 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 259 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 260 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
yading@10 261 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 262 #rounder ", %%mm0 \n\t"\
yading@10 263 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 264 "paddd %%mm0, %%mm0 \n\t" \
yading@10 265 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
yading@10 266 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 267 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
yading@10 268 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 269 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 270 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 271 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 272 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 273 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
yading@10 274 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 275 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 276 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
yading@10 277 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
yading@10 278 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 279 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 280 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 281 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
yading@10 282 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
yading@10 283 "movq %%mm7, " #dst " \n\t"\
yading@10 284 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
yading@10 285 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 286 "movq %%mm2, 24+" #dst " \n\t"\
yading@10 287 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 288 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 289 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 290 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 291 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
yading@10 292 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 293 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 294 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 295 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
yading@10 296 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 297 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 298 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 299 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
yading@10 300 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 301 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 302 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 303 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
yading@10 304 "movq %%mm2, 8+" #dst " \n\t"\
yading@10 305 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 306 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
yading@10 307 "movq %%mm4, 16+" #dst " \n\t"\
yading@10 308 "jmp 2f \n\t"\
yading@10 309 "1: \n\t"\
yading@10 310 "pslld $16, %%mm0 \n\t"\
yading@10 311 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
yading@10 312 "psrad $13, %%mm0 \n\t"\
yading@10 313 "packssdw %%mm0, %%mm0 \n\t"\
yading@10 314 "movq %%mm0, " #dst " \n\t"\
yading@10 315 "movq %%mm0, 8+" #dst " \n\t"\
yading@10 316 "movq %%mm0, 16+" #dst " \n\t"\
yading@10 317 "movq %%mm0, 24+" #dst " \n\t"\
yading@10 318 "2: \n\t"
yading@10 319
yading@10 320
yading@10 321 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
yading@10 322 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
yading@10 323 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
yading@10 324 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
yading@10 325 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
yading@10 326
yading@10 327 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
yading@10 328 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
yading@10 329 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
yading@10 330
yading@10 331
yading@10 332 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 333 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 334 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 335 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 336 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 337
yading@10 338 #else
yading@10 339
yading@10 340 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
yading@10 341 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 342 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 343 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 344 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 345 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
yading@10 346 "pand %%mm0, %%mm4 \n\t"\
yading@10 347 "por %%mm1, %%mm4 \n\t"\
yading@10 348 "por %%mm2, %%mm4 \n\t"\
yading@10 349 "por %%mm3, %%mm4 \n\t"\
yading@10 350 "packssdw %%mm4,%%mm4 \n\t"\
yading@10 351 "movd %%mm4, %%eax \n\t"\
yading@10 352 "orl %%eax, %%eax \n\t"\
yading@10 353 "jz 1f \n\t"\
yading@10 354 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 355 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 356 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 357 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 358 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 359 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 360 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 361 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 362 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 363 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 364 #rounder ", %%mm4 \n\t"\
yading@10 365 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 366 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 367 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 368 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
yading@10 369 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 370 #rounder ", %%mm0 \n\t"\
yading@10 371 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 372 "paddd %%mm0, %%mm0 \n\t" \
yading@10 373 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
yading@10 374 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 375 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
yading@10 376 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 377 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 378 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 379 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 380 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 381 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
yading@10 382 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 383 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 384 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
yading@10 385 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
yading@10 386 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 387 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 388 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 389 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
yading@10 390 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
yading@10 391 "movq %%mm7, " #dst " \n\t"\
yading@10 392 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
yading@10 393 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 394 "movq %%mm2, 24+" #dst " \n\t"\
yading@10 395 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 396 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 397 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 398 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 399 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
yading@10 400 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 401 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 402 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 403 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
yading@10 404 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 405 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 406 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 407 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
yading@10 408 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 409 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 410 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 411 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
yading@10 412 "movq %%mm2, 8+" #dst " \n\t"\
yading@10 413 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 414 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
yading@10 415 "movq %%mm4, 16+" #dst " \n\t"\
yading@10 416 "jmp 2f \n\t"\
yading@10 417 "1: \n\t"\
yading@10 418 "pslld $16, %%mm0 \n\t"\
yading@10 419 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
yading@10 420 "psrad $13, %%mm0 \n\t"\
yading@10 421 "packssdw %%mm0, %%mm0 \n\t"\
yading@10 422 "movq %%mm0, " #dst " \n\t"\
yading@10 423 "movq %%mm0, 8+" #dst " \n\t"\
yading@10 424 "movq %%mm0, 16+" #dst " \n\t"\
yading@10 425 "movq %%mm0, 24+" #dst " \n\t"\
yading@10 426 "2: \n\t"
yading@10 427
yading@10 428 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
yading@10 429 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 430 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 431 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 432 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 433 "movq %%mm0, %%mm4 \n\t"\
yading@10 434 "por %%mm1, %%mm4 \n\t"\
yading@10 435 "por %%mm2, %%mm4 \n\t"\
yading@10 436 "por %%mm3, %%mm4 \n\t"\
yading@10 437 "packssdw %%mm4,%%mm4 \n\t"\
yading@10 438 "movd %%mm4, %%eax \n\t"\
yading@10 439 "orl %%eax, %%eax \n\t"\
yading@10 440 "jz " #bt " \n\t"\
yading@10 441 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 442 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 443 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 444 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 445 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 446 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 447 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 448 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 449 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 450 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 451 #rounder ", %%mm4 \n\t"\
yading@10 452 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 453 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 454 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 455 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
yading@10 456 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 457 #rounder ", %%mm0 \n\t"\
yading@10 458 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 459 "paddd %%mm0, %%mm0 \n\t" \
yading@10 460 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
yading@10 461 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 462 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
yading@10 463 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 464 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 465 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 466 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 467 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 468 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
yading@10 469 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 470 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 471 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
yading@10 472 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
yading@10 473 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 474 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 475 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 476 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
yading@10 477 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
yading@10 478 "movq %%mm7, " #dst " \n\t"\
yading@10 479 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
yading@10 480 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 481 "movq %%mm2, 24+" #dst " \n\t"\
yading@10 482 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 483 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 484 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 485 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 486 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
yading@10 487 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 488 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 489 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 490 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
yading@10 491 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 492 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 493 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 494 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
yading@10 495 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 496 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 497 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 498 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
yading@10 499 "movq %%mm2, 8+" #dst " \n\t"\
yading@10 500 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 501 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
yading@10 502 "movq %%mm4, 16+" #dst " \n\t"\
yading@10 503
yading@10 504 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
yading@10 505 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 506 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 507 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 508 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 509 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 510 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 511 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 512 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 513 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 514 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 515 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 516 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 517 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 518 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 519 #rounder ", %%mm4 \n\t"\
yading@10 520 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 521 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 522 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 523 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
yading@10 524 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 525 #rounder ", %%mm0 \n\t"\
yading@10 526 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 527 "paddd %%mm0, %%mm0 \n\t" \
yading@10 528 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
yading@10 529 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 530 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
yading@10 531 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 532 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 533 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 534 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 535 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 536 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
yading@10 537 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 538 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 539 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
yading@10 540 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
yading@10 541 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 542 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 543 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 544 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
yading@10 545 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
yading@10 546 "movq %%mm7, " #dst " \n\t"\
yading@10 547 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
yading@10 548 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 549 "movq %%mm2, 24+" #dst " \n\t"\
yading@10 550 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 551 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 552 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 553 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 554 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
yading@10 555 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 556 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 557 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 558 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
yading@10 559 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 560 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 561 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 562 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
yading@10 563 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 564 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 565 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 566 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
yading@10 567 "movq %%mm2, 8+" #dst " \n\t"\
yading@10 568 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 569 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
yading@10 570 "movq %%mm4, 16+" #dst " \n\t"\
yading@10 571
yading@10 572 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
yading@10 573 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
yading@10 574 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
yading@10 575 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
yading@10 576 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
yading@10 577
yading@10 578 #undef IDCT
yading@10 579 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 580 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 581 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 582 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 583 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 584 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 585 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 586 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 587 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 588 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 589 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 590 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 591 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 592 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 593 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 594 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 595 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 596 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 597 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 598 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
yading@10 599 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
yading@10 600 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
yading@10 601 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 602 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 603 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
yading@10 604 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 605 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 606 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 607 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 608 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 609 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
yading@10 610 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 611 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 612 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
yading@10 613 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 614 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 615 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 616 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 617 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 618 "movd %%mm7, " #dst " \n\t"\
yading@10 619 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 620 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 621 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 622 "movd %%mm2, 96+" #dst " \n\t"\
yading@10 623 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 624 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 625 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
yading@10 626 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 627 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 628 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 629 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 630 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 631 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
yading@10 632 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 633 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 634 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 635 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 636 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 637 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 638 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 639 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
yading@10 640 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 641 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 642 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 643 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 644 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 645 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 646 "movd %%mm2, 32+" #dst " \n\t"\
yading@10 647 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
yading@10 648 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 649 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 650 "movd %%mm4, 64+" #dst " \n\t"\
yading@10 651 "movd %%mm5, 80+" #dst " \n\t"
yading@10 652
yading@10 653
yading@10 654 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 655 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 656 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 657 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 658 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 659 "jmp 9f \n\t"
yading@10 660
yading@10 661 "# .p2align 4 \n\t"\
yading@10 662 "4: \n\t"
yading@10 663 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
yading@10 664 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
yading@10 665
yading@10 666 #undef IDCT
yading@10 667 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 668 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 669 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 670 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 671 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 672 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 673 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 674 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 675 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 676 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 677 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 678 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 679 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 680 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 681 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 682 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 683 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
yading@10 684 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
yading@10 685 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
yading@10 686 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 687 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 688 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 689 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
yading@10 690 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 691 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 692 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 693 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 694 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
yading@10 695 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 696 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 697 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 698 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 699 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
yading@10 700 "movd %%mm1, " #dst " \n\t"\
yading@10 701 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 702 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 703 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 704 "movd %%mm2, 96+" #dst " \n\t"\
yading@10 705 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 706 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 707 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
yading@10 708 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 709 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
yading@10 710 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 711 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 712 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 713 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 714 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 715 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
yading@10 716 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 717 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
yading@10 718 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 719 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 720 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 721 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 722 "movd %%mm2, 32+" #dst " \n\t"\
yading@10 723 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
yading@10 724 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 725 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 726 "movd %%mm1, 64+" #dst " \n\t"\
yading@10 727 "movd %%mm5, 80+" #dst " \n\t"
yading@10 728
yading@10 729 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 730 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 731 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 732 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 733 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 734 "jmp 9f \n\t"
yading@10 735
yading@10 736 "# .p2align 4 \n\t"\
yading@10 737 "6: \n\t"
yading@10 738 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
yading@10 739
yading@10 740 #undef IDCT
yading@10 741 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 742 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 743 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 744 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 745 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 746 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 747 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 748 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 749 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 750 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
yading@10 751 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 752 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 753 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 754 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
yading@10 755 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 756 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 757 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 758 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 759 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
yading@10 760 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 761 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 762 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 763 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 764 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
yading@10 765 "movd %%mm1, " #dst " \n\t"\
yading@10 766 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 767 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 768 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 769 "movd %%mm2, 96+" #dst " \n\t"\
yading@10 770 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 771 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 772 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
yading@10 773 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 774 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
yading@10 775 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 776 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 777 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 778 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 779 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 780 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
yading@10 781 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 782 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
yading@10 783 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 784 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 785 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 786 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 787 "movd %%mm2, 32+" #dst " \n\t"\
yading@10 788 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
yading@10 789 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 790 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 791 "movd %%mm1, 64+" #dst " \n\t"\
yading@10 792 "movd %%mm5, 80+" #dst " \n\t"
yading@10 793
yading@10 794
yading@10 795 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 796 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 797 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 798 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 799 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 800 "jmp 9f \n\t"
yading@10 801
yading@10 802 "# .p2align 4 \n\t"\
yading@10 803 "2: \n\t"
yading@10 804 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
yading@10 805
yading@10 806 #undef IDCT
yading@10 807 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 808 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 809 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 810 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
yading@10 811 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 812 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 813 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 814 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 815 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 816 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 817 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 818 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 819 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
yading@10 820 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
yading@10 821 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 822 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
yading@10 823 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
yading@10 824 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
yading@10 825 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 826 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 827 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 828 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
yading@10 829 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 830 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 831 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
yading@10 832 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 833 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 834 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 835 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 836 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 837 "movd %%mm7, " #dst " \n\t"\
yading@10 838 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 839 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 840 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
yading@10 841 "movd %%mm2, 96+" #dst " \n\t"\
yading@10 842 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 843 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 844 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
yading@10 845 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 846 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 847 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
yading@10 848 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 849 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
yading@10 850 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
yading@10 851 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
yading@10 852 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
yading@10 853 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 854 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 855 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 856 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 857 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 858 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
yading@10 859 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 860 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 861 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 862 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 863 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
yading@10 864 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 865 "movd %%mm2, 32+" #dst " \n\t"\
yading@10 866 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
yading@10 867 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 868 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 869 "movd %%mm4, 64+" #dst " \n\t"\
yading@10 870 "movd %%mm5, 80+" #dst " \n\t"
yading@10 871
yading@10 872 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 873 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 874 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 875 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 876 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 877 "jmp 9f \n\t"
yading@10 878
yading@10 879 "# .p2align 4 \n\t"\
yading@10 880 "3: \n\t"
yading@10 881 #undef IDCT
yading@10 882 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 883 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 884 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 885 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 886 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 887 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 888 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 889 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 890 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 891 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 892 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 893 "movq 64(%2), %%mm3 \n\t"\
yading@10 894 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 895 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 896 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 897 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 898 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 899 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 900 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
yading@10 901 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 902 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
yading@10 903 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 904 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 905 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 906 "movd %%mm7, " #dst " \n\t"\
yading@10 907 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 908 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 909 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
yading@10 910 "movd %%mm1, 96+" #dst " \n\t"\
yading@10 911 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 912 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 913 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 914 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 915 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 916 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
yading@10 917 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
yading@10 918 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 919 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 920 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 921 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 922 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 923 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 924 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 925 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 926 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
yading@10 927 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 928 "movd %%mm1, 32+" #dst " \n\t"\
yading@10 929 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
yading@10 930 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 931 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 932 "movd %%mm4, 64+" #dst " \n\t"\
yading@10 933 "movd %%mm5, 80+" #dst " \n\t"
yading@10 934
yading@10 935
yading@10 936 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 937 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 938 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 939 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 940 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 941 "jmp 9f \n\t"
yading@10 942
yading@10 943 "# .p2align 4 \n\t"\
yading@10 944 "5: \n\t"
yading@10 945 #undef IDCT
yading@10 946 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 947 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 948 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 949 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 950 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 951 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 952 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 953 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 954 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 955 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 956 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 957 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 958 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 959 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 960 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 961 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
yading@10 962 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
yading@10 963 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
yading@10 964 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
yading@10 965 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
yading@10 966 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 967 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 968 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 969 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
yading@10 970 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 971 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 972 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
yading@10 973 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
yading@10 974 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
yading@10 975 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
yading@10 976 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
yading@10 977 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
yading@10 978 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 979 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 980 "psrad $" #shift ", %%mm3 \n\t"\
yading@10 981 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
yading@10 982 "movq %%mm4, " #dst " \n\t"\
yading@10 983 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 984 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
yading@10 985 "movq %%mm0, 16+" #dst " \n\t"\
yading@10 986 "movq %%mm0, 96+" #dst " \n\t"\
yading@10 987 "movq %%mm4, 112+" #dst " \n\t"\
yading@10 988 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 989 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 990 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 991 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 992 "movq %%mm5, 32+" #dst " \n\t"\
yading@10 993 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 994 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 995 "movq %%mm6, 48+" #dst " \n\t"\
yading@10 996 "movq %%mm6, 64+" #dst " \n\t"\
yading@10 997 "movq %%mm5, 80+" #dst " \n\t"
yading@10 998
yading@10 999
yading@10 1000 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 1001 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 1002 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 1003 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 1004 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 1005 "jmp 9f \n\t"
yading@10 1006
yading@10 1007
yading@10 1008 "# .p2align 4 \n\t"\
yading@10 1009 "1: \n\t"
yading@10 1010 #undef IDCT
yading@10 1011 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 1012 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 1013 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
yading@10 1014 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
yading@10 1015 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 1016 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 1017 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 1018 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 1019 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
yading@10 1020 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
yading@10 1021 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
yading@10 1022 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
yading@10 1023 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 1024 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
yading@10 1025 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
yading@10 1026 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
yading@10 1027 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
yading@10 1028 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 1029 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
yading@10 1030 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
yading@10 1031 "movq 64(%2), %%mm1 \n\t"\
yading@10 1032 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
yading@10 1033 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 1034 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
yading@10 1035 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 1036 "psrad $" #shift ", %%mm7 \n\t"\
yading@10 1037 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 1038 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
yading@10 1039 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 1040 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
yading@10 1041 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 1042 "psrad $" #shift ", %%mm3 \n\t"\
yading@10 1043 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
yading@10 1044 "movd %%mm7, " #dst " \n\t"\
yading@10 1045 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
yading@10 1046 "movd %%mm0, 16+" #dst " \n\t"\
yading@10 1047 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
yading@10 1048 "movd %%mm3, 96+" #dst " \n\t"\
yading@10 1049 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
yading@10 1050 "movd %%mm4, 112+" #dst " \n\t"\
yading@10 1051 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
yading@10 1052 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
yading@10 1053 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
yading@10 1054 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
yading@10 1055 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
yading@10 1056 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
yading@10 1057 "psrad $" #shift ", %%mm3 \n\t"\
yading@10 1058 "psrad $" #shift ", %%mm5 \n\t"\
yading@10 1059 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
yading@10 1060 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 1061 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
yading@10 1062 "psrad $" #shift ", %%mm6 \n\t"\
yading@10 1063 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
yading@10 1064 "movd %%mm3, 32+" #dst " \n\t"\
yading@10 1065 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 1066 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
yading@10 1067 "movd %%mm6, 48+" #dst " \n\t"\
yading@10 1068 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
yading@10 1069 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
yading@10 1070 "movd %%mm4, 64+" #dst " \n\t"\
yading@10 1071 "movd %%mm5, 80+" #dst " \n\t"
yading@10 1072
yading@10 1073
yading@10 1074 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 1075 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 1076 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 1077 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 1078 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 1079 "jmp 9f \n\t"
yading@10 1080
yading@10 1081
yading@10 1082 "# .p2align 4 \n\t"
yading@10 1083 "7: \n\t"
yading@10 1084 #undef IDCT
yading@10 1085 #define IDCT(src0, src4, src1, src5, dst, shift) \
yading@10 1086 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
yading@10 1087 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
yading@10 1088 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 1089 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 1090 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 1091 "psrad $" #shift ", %%mm4 \n\t"\
yading@10 1092 "psrad $" #shift ", %%mm0 \n\t"\
yading@10 1093 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
yading@10 1094 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
yading@10 1095 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
yading@10 1096 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
yading@10 1097 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
yading@10 1098 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
yading@10 1099 "psrad $" #shift ", %%mm1 \n\t"\
yading@10 1100 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
yading@10 1101 "movq %%mm4, " #dst " \n\t"\
yading@10 1102 "psrad $" #shift ", %%mm2 \n\t"\
yading@10 1103 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
yading@10 1104 "movq %%mm0, 16+" #dst " \n\t"\
yading@10 1105 "movq %%mm0, 96+" #dst " \n\t"\
yading@10 1106 "movq %%mm4, 112+" #dst " \n\t"\
yading@10 1107 "movq %%mm0, 32+" #dst " \n\t"\
yading@10 1108 "movq %%mm4, 48+" #dst " \n\t"\
yading@10 1109 "movq %%mm4, 64+" #dst " \n\t"\
yading@10 1110 "movq %%mm0, 80+" #dst " \n\t"
yading@10 1111
yading@10 1112 //IDCT( src0, src4, src1, src5, dst, shift)
yading@10 1113 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
yading@10 1114 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
yading@10 1115 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
yading@10 1116 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
yading@10 1117
yading@10 1118
yading@10 1119 #endif
yading@10 1120
yading@10 1121 /*
yading@10 1122 Input
yading@10 1123 00 40 04 44 20 60 24 64
yading@10 1124 10 30 14 34 50 70 54 74
yading@10 1125 01 41 03 43 21 61 23 63
yading@10 1126 11 31 13 33 51 71 53 73
yading@10 1127 02 42 06 46 22 62 26 66
yading@10 1128 12 32 16 36 52 72 56 76
yading@10 1129 05 45 07 47 25 65 27 67
yading@10 1130 15 35 17 37 55 75 57 77
yading@10 1131
yading@10 1132 Temp
yading@10 1133 00 04 10 14 20 24 30 34
yading@10 1134 40 44 50 54 60 64 70 74
yading@10 1135 01 03 11 13 21 23 31 33
yading@10 1136 41 43 51 53 61 63 71 73
yading@10 1137 02 06 12 16 22 26 32 36
yading@10 1138 42 46 52 56 62 66 72 76
yading@10 1139 05 07 15 17 25 27 35 37
yading@10 1140 45 47 55 57 65 67 75 77
yading@10 1141 */
yading@10 1142
yading@10 1143 "9: \n\t"
yading@10 1144 :: "r" (block), "r" (temp), "r" (coeffs)
yading@10 1145 : "%eax"
yading@10 1146 );
yading@10 1147 }
yading@10 1148
/**
 * Exported entry point for the MMX simple IDCT.
 *
 * Forwards directly to idct(), defined earlier in this file. The second
 * pass of that routine stores its results back through the block pointer,
 * so the transform happens in place.
 *
 * @param block coefficient buffer; overwritten with the IDCT output
 *              (presumably one 8x8 block of 64 int16_t values, going by
 *              the layout tables in idct() -- confirm against callers)
 */
yading@10 1149 void ff_simple_idct_mmx(int16_t *block)
yading@10 1150 {
yading@10 1151 idct(block);
yading@10 1152 }
yading@10 1153
yading@10 1154 //FIXME merge add/put into the idct
yading@10 1155
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_put_pixels_clamped_mmx().
 *
 * The transform must run first: idct() rewrites block in place and the
 * helper consumes the transformed samples.
 *
 * @param dest      destination passed through to ff_put_pixels_clamped_mmx()
 *                  (presumably the output pixel plane -- helper is declared
 *                  outside this file, confirm there)
 * @param line_size passed through unchanged (presumably the byte stride
 *                  between rows of dest -- confirm against the helper)
 * @param block     coefficient buffer; overwritten by the IDCT
 */
yading@10 1156 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
yading@10 1157 {
yading@10 1158 idct(block);
yading@10 1159 ff_put_pixels_clamped_mmx(block, dest, line_size);
yading@10 1160 }
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_add_pixels_clamped_mmx().
 *
 * The transform must run first: idct() rewrites block in place and the
 * helper consumes the transformed samples.
 *
 * @param dest      destination passed through to ff_add_pixels_clamped_mmx()
 *                  (presumably existing pixels that the samples are added
 *                  to with clamping -- helper is declared outside this file,
 *                  confirm there)
 * @param line_size passed through unchanged (presumably the byte stride
 *                  between rows of dest -- confirm against the helper)
 * @param block     coefficient buffer; overwritten by the IDCT
 */
yading@10 1161 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
yading@10 1162 {
yading@10 1163 idct(block);
yading@10 1164 ff_add_pixels_clamped_mmx(block, dest, line_size);
yading@10 1165 }
yading@10 1166
yading@10 1167 #endif /* HAVE_INLINE_ASM */