annotate ffmpeg/libavcodec/sh4/idct_sh4.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * idct for sh4
yading@10 3 *
yading@10 4 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
yading@10 5 *
yading@10 6 * This file is part of FFmpeg.
yading@10 7 *
yading@10 8 * FFmpeg is free software; you can redistribute it and/or
yading@10 9 * modify it under the terms of the GNU Lesser General Public
yading@10 10 * License as published by the Free Software Foundation; either
yading@10 11 * version 2.1 of the License, or (at your option) any later version.
yading@10 12 *
yading@10 13 * FFmpeg is distributed in the hope that it will be useful,
yading@10 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 16 * Lesser General Public License for more details.
yading@10 17 *
yading@10 18 * You should have received a copy of the GNU Lesser General Public
yading@10 19 * License along with FFmpeg; if not, write to the Free Software
yading@10 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 21 */
yading@10 22
yading@10 23 #include "dsputil_sh4.h"
yading@10 24 #include "sh4.h"
yading@10 25
/*
 * Cosine constants used to build the two coefficient tables below. Each one
 * is sqrt(2)*cos(k*pi/16), as spelled out in the per-line comments. The names
 * are temporary: all seven are #undef'd once the tables have been defined.
 */
yading@10 26 #define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */
yading@10 27 #define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */
yading@10 28 #define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */
yading@10 29 #define c4 1.00000000000000000000 /* sqrt(2)*cos(4*pi/16) */
yading@10 30 #define c5 0.78569495838710234903 /* sqrt(2)*cos(5*pi/16) */
yading@10 31 #define c6 0.54119610014619712324 /* sqrt(2)*cos(6*pi/16) */
yading@10 32 #define c7 0.27589937928294311353 /* sqrt(2)*cos(7*pi/16) */
yading@10 33
/*
 * 4x4 coefficient table (16 floats) applied to the even-indexed inputs
 * (elements 0, 2, 4, 6 of a row or column) in ff_idct_sh4(). It is loaded
 * with load_matrix() and consumed by ftrv().
 *
 * NOTE(review): assuming SH-4 ftrv semantics (fv0 = XMTRX * fv0 with XMTRX
 * filled column by column), each line of four values below holds the weights
 * that one input element contributes to the four outputs -- confirm against
 * the SH-4 programming manual.
 * NOTE(review): the 8-byte alignment is presumably required by the paired
 * fmov loads in load_matrix().
 */
yading@10 34 static const float even_table[] __attribute__ ((aligned(8))) = {
yading@10 35 c4, c4, c4, c4,
yading@10 36 c2, c6,-c6,-c2,
yading@10 37 c4,-c4,-c4, c4,
yading@10 38 c6,-c2, c2,-c6
yading@10 39 };
yading@10 40
/*
 * 4x4 coefficient table (16 floats) applied to the odd-indexed inputs
 * (elements 1, 3, 5, 7 of a row or column) in ff_idct_sh4(). Same layout and
 * alignment as even_table.
 */
yading@10 41 static const float odd_table[] __attribute__ ((aligned(8))) = {
yading@10 42 c1, c3, c5, c7,
yading@10 43 c3,-c7,-c1,-c5,
yading@10 44 c5,-c1, c7, c3,
yading@10 45 c7,-c5, c3,-c1
yading@10 46 };
yading@10 47
/* The cosine shorthands are only needed for the table initializers above. */
yading@10 48 #undef c1
yading@10 49 #undef c2
yading@10 50 #undef c3
yading@10 51 #undef c4
yading@10 52 #undef c5
yading@10 53 #undef c6
yading@10 54 #undef c7
yading@10 55
/*
 * load_matrix(table): load a coefficient table into registers xd0..xd14.
 *
 * A local pointer `t` is initialised from `table` and handed to the asm as a
 * read-write register operand. Each of the eight `fmov @t+,xdN` instructions
 * loads through it with post-increment, and the whole sequence is bracketed
 * by a pair of `fschg` instructions. Wrapped in do { } while (0) so that
 * `load_matrix(x);` is a single statement.
 *
 * NOTE(review): per the SH-4 ISA, fschg toggles the FPSCR transfer-size bit
 * so that each fmov moves a 64-bit register pair (8 loads = 16 floats), and
 * xd0..xd14 name the bank that ftrv reads as XMTRX -- confirm against the
 * SH-4 programming manual.
 * NOTE(review): the asm declares no clobbers and no memory input operand; it
 * relies on `volatile` and on the compiler not using those registers itself.
 */
yading@10 56 #define load_matrix(table) \
yading@10 57 do { \
yading@10 58 const float *t = table; \
yading@10 59 __asm__ volatile( \
yading@10 60 " fschg\n" \
yading@10 61 " fmov @%0+,xd0\n" \
yading@10 62 " fmov @%0+,xd2\n" \
yading@10 63 " fmov @%0+,xd4\n" \
yading@10 64 " fmov @%0+,xd6\n" \
yading@10 65 " fmov @%0+,xd8\n" \
yading@10 66 " fmov @%0+,xd10\n" \
yading@10 67 " fmov @%0+,xd12\n" \
yading@10 68 " fmov @%0+,xd14\n" \
yading@10 69 " fschg\n" \
yading@10 70 : "+r"(t) \
yading@10 71 ); \
yading@10 72 } while (0)
yading@10 73
/*
 * ftrv(): issue the SH-4 `ftrv xmtrx,fv0` instruction on the four floats
 * held in fr0..fr3, which are both inputs and outputs of the asm.
 *
 * The locals fr0..fr3 must be in scope (see DEFREG), and the matrix must
 * have been loaded beforehand with load_matrix().
 *
 * The expansion deliberately carries no trailing semicolon, so `ftrv();` is
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define ftrv() \
    __asm__ volatile("ftrv xmtrx,fv0" \
                     : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3))
yading@10 77
/*
 * DEFREG: declare four float locals named fr0..fr3, each bound to the
 * hardware register of the same name through GCC's explicit register
 * variable syntax. These are the operands that ftrv() reads and writes.
 *
 * The last declaration has no trailing semicolon, so the macro is used as
 * `DEFREG;` at the top of a function body.
 */
yading@10 78 #define DEFREG \
yading@10 79 register float fr0 __asm__("fr0"); \
yading@10 80 register float fr1 __asm__("fr1"); \
yading@10 81 register float fr2 __asm__("fr2"); \
yading@10 82 register float fr3 __asm__("fr3")
yading@10 83
/*
 * DESCALE(x, n): scale x down by 2^n using a single-precision multiply by
 * the constant 1/(1<<n).
 *
 * The whole expansion is parenthesized so that it composes correctly inside
 * larger expressions (for example `a / DESCALE(b, n)`).
 */
#define DESCALE(x,n) ((x)*(1.0f/(1<<(n))))
yading@10 85
yading@10 86 /* This code works worse with GCC from CVS; GCC 3.2.3 works fine. */
yading@10 87
yading@10 88
yading@10 89 //optimized
yading@10 90
/**
 * In-place 8x8 inverse DCT using the SH-4 ftrv instruction.
 *
 * The transform runs as a row pass followed by a column pass. Each pass is
 * split into an "even" half (inputs 0, 2, 4, 6 with even_table) and an "odd"
 * half (inputs 1, 3, 5, 7 with odd_table); the two halves are combined as
 * sum and difference. The row pass writes floats into a local 8x8 scratch
 * buffer, and the column pass writes the final values back into block,
 * scaled by 1/8.
 *
 * @param block 64 int16_t coefficients in row-major order (row stride 8);
 *              read as input and overwritten with the result.
 */
yading@10 91 void ff_idct_sh4(int16_t *block)
yading@10 92 {
/* Declares fr0..fr3, the register-bound operands of ftrv(). */
yading@10 93 DEFREG;
yading@10 94
yading@10 95 int i;
/* tblock is the float scratch matrix; fblock is the cursor into it. */
yading@10 96 float tblock[8*8],*fblock;
/* Byte offsets of rows 2, 4 and 6 relative to fblock (set in the column pass). */
yading@10 97 int ofs1,ofs2,ofs3;
yading@10 98 int fpscr;
yading@10 99
/*
 * NOTE(review): fp_single_enter/fp_single_leave are defined outside this
 * file; presumably they save FPSCR in `fpscr`, switch the FPU to single
 * precision and restore it at the end -- confirm in sh4.h.
 */
yading@10 100 fp_single_enter(fpscr);
yading@10 101
yading@10 102 /* row */
yading@10 103
yading@10 104 /* even part */
yading@10 105 load_matrix(even_table);
yading@10 106
/*
 * Start four floats into the first scratch row: the results are stored with
 * pre-decrement, so they land in elements 0..3 of the row.
 */
yading@10 107 fblock = tblock+4;
yading@10 108 i = 8;
yading@10 109 do {
/* Even-indexed coefficients of the current row (int16_t converted to float). */
yading@10 110 fr0 = block[0];
yading@10 111 fr1 = block[2];
yading@10 112 fr2 = block[4];
yading@10 113 fr3 = block[6];
yading@10 114 block+=8;
yading@10 115 ftrv();
/* Store fr3..fr0 backwards into row[3]..row[0]. */
yading@10 116 *--fblock = fr3;
yading@10 117 *--fblock = fr2;
yading@10 118 *--fblock = fr1;
yading@10 119 *--fblock = fr0;
/* Jump to element 4 of the next row. */
yading@10 120 fblock+=8+4;
yading@10 121 } while(--i);
/*
 * Rewind both cursors to the start of their matrices.
 * NOTE(review): at this point fblock equals tblock+68, which is past the
 * one-past-the-end address of the 64-element array; forming that pointer is
 * formally outside what ISO C guarantees, although it is never dereferenced.
 */
yading@10 122 block-=8*8;
yading@10 123 fblock-=8*8+4;
yading@10 124
yading@10 125 load_matrix(odd_table);
yading@10 126
yading@10 127 i = 8;
yading@10 128
yading@10 129 do {
yading@10 130 float t0,t1,t2,t3;
/* Odd-indexed coefficients of the current row. */
yading@10 131 fr0 = block[1];
yading@10 132 fr1 = block[3];
yading@10 133 fr2 = block[5];
yading@10 134 fr3 = block[7];
yading@10 135 block+=8;
yading@10 136 ftrv();
/* Fetch the even-half results stored in row[0]..row[3] by the previous loop. */
yading@10 137 t0 = *fblock++;
yading@10 138 t1 = *fblock++;
yading@10 139 t2 = *fblock++;
yading@10 140 t3 = *fblock++;
/* Move to one past the end of the row, then fill it backwards. */
yading@10 141 fblock+=4;
/* row[7..4] = even - odd (mirrored order), row[3..0] = even + odd. */
yading@10 142 *--fblock = t0 - fr0;
yading@10 143 *--fblock = t1 - fr1;
yading@10 144 *--fblock = t2 - fr2;
yading@10 145 *--fblock = t3 - fr3;
yading@10 146 *--fblock = t3 + fr3;
yading@10 147 *--fblock = t2 + fr2;
yading@10 148 *--fblock = t1 + fr1;
yading@10 149 *--fblock = t0 + fr0;
/* Next row. */
yading@10 150 fblock+=8;
yading@10 151 } while(--i);
/* Rewind both cursors again for the column pass. */
yading@10 152 block-=8*8;
yading@10 153 fblock-=8*8;
yading@10 154
yading@10 155 /* col */
yading@10 156
yading@10 157 /* even part */
yading@10 158 load_matrix(even_table);
yading@10 159
/* Byte offsets from a column's row-0 element to its row-2, row-4 and row-6 elements. */
yading@10 160 ofs1 = sizeof(float)*2*8;
yading@10 161 ofs2 = sizeof(float)*4*8;
yading@10 162 ofs3 = sizeof(float)*6*8;
yading@10 163
yading@10 164 i = 8;
yading@10 165
/*
 * OA(p, ofs): the float located `ofs` bytes after p.
 * NOTE(review): the arguments are not parenthesized and the macro is never
 * #undef'd, so it stays defined for the rest of the translation unit.
 */
yading@10 166 #define OA(fblock,ofs) *(float*)((char*)fblock + ofs)
yading@10 167
yading@10 168 do {
/* Even rows (0, 2, 4, 6) of the current column. */
yading@10 169 fr0 = OA(fblock, 0);
yading@10 170 fr1 = OA(fblock,ofs1);
yading@10 171 fr2 = OA(fblock,ofs2);
yading@10 172 fr3 = OA(fblock,ofs3);
yading@10 173 ftrv();
/* Write the even-half results back over the same four elements. */
yading@10 174 OA(fblock,0 ) = fr0;
yading@10 175 OA(fblock,ofs1) = fr1;
yading@10 176 OA(fblock,ofs2) = fr2;
yading@10 177 OA(fblock,ofs3) = fr3;
/* Next column. */
yading@10 178 fblock++;
yading@10 179 } while(--i);
/* Back to column 0. */
yading@10 180 fblock-=8;
yading@10 181
yading@10 182 load_matrix(odd_table);
yading@10 183
yading@10 184 i=8;
yading@10 185 do {
yading@10 186 float t0,t1,t2,t3;
/* Even-half results left in rows 0, 2, 4, 6 by the previous loop. */
yading@10 187 t0 = OA(fblock, 0); /* [8*0] */
yading@10 188 t1 = OA(fblock,ofs1); /* [8*2] */
yading@10 189 t2 = OA(fblock,ofs2); /* [8*4] */
yading@10 190 t3 = OA(fblock,ofs3); /* [8*6] */
/* Step down one row so the same offsets now address rows 1, 3, 5, 7. */
yading@10 191 fblock+=8;
yading@10 192 fr0 = OA(fblock, 0); /* [8*1] */
yading@10 193 fr1 = OA(fblock,ofs1); /* [8*3] */
yading@10 194 fr2 = OA(fblock,ofs2); /* [8*5] */
yading@10 195 fr3 = OA(fblock,ofs3); /* [8*7] */
/* Back up to row 0 and advance to the next column. */
yading@10 196 fblock+=-8+1;
yading@10 197 ftrv();
/*
 * Combine the halves: rows 0..3 get even + odd, rows 7..4 get even - odd.
 * Each value is multiplied by 1/8 (DESCALE with n = 3) and then implicitly
 * converted from float to int16_t by the assignment; no explicit rounding
 * or clamping is applied here.
 */
yading@10 198 block[8*0] = DESCALE(t0 + fr0,3);
yading@10 199 block[8*7] = DESCALE(t0 - fr0,3);
yading@10 200 block[8*1] = DESCALE(t1 + fr1,3);
yading@10 201 block[8*6] = DESCALE(t1 - fr1,3);
yading@10 202 block[8*2] = DESCALE(t2 + fr2,3);
yading@10 203 block[8*5] = DESCALE(t2 - fr2,3);
yading@10 204 block[8*3] = DESCALE(t3 + fr3,3);
yading@10 205 block[8*4] = DESCALE(t3 - fr3,3);
/* Next output column. */
yading@10 206 block++;
yading@10 207 } while(--i);
yading@10 208
yading@10 209 fp_single_leave(fpscr);
yading@10 210 }