yading@10: /* yading@10: * idct for sh4 yading@10: * yading@10: * Copyright (c) 2001-2003 BERO yading@10: * yading@10: * This file is part of FFmpeg. yading@10: * yading@10: * FFmpeg is free software; you can redistribute it and/or yading@10: * modify it under the terms of the GNU Lesser General Public yading@10: * License as published by the Free Software Foundation; either yading@10: * version 2.1 of the License, or (at your option) any later version. yading@10: * yading@10: * FFmpeg is distributed in the hope that it will be useful, yading@10: * but WITHOUT ANY WARRANTY; without even the implied warranty of yading@10: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU yading@10: * Lesser General Public License for more details. yading@10: * yading@10: * You should have received a copy of the GNU Lesser General Public yading@10: * License along with FFmpeg; if not, write to the Free Software yading@10: * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA yading@10: */ yading@10: yading@10: #include "dsputil_sh4.h" yading@10: #include "sh4.h" yading@10: yading@10: #define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */ yading@10: #define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */ yading@10: #define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */ yading@10: #define c4 1.00000000000000000000 /* sqrt(2)*cos(4*pi/16) */ yading@10: #define c5 0.78569495838710234903 /* sqrt(2)*cos(5*pi/16) */ yading@10: #define c6 0.54119610014619712324 /* sqrt(2)*cos(6*pi/16) */ yading@10: #define c7 0.27589937928294311353 /* sqrt(2)*cos(7*pi/16) */ yading@10: yading@10: static const float even_table[] __attribute__ ((aligned(8))) = { yading@10: c4, c4, c4, c4, yading@10: c2, c6,-c6,-c2, yading@10: c4,-c4,-c4, c4, yading@10: c6,-c2, c2,-c6 yading@10: }; yading@10: yading@10: static const float odd_table[] __attribute__ ((aligned(8))) = { yading@10: c1, c3, c5, c7, yading@10: c3,-c7,-c1,-c5, yading@10: c5,-c1, c7, c3, yading@10: c7,-c5, c3,-c1 yading@10: }; yading@10: yading@10: #undef c1 yading@10: #undef c2 yading@10: #undef c3 yading@10: #undef c4 yading@10: #undef c5 yading@10: #undef c6 yading@10: #undef c7 yading@10: yading@10: #define load_matrix(table) \ yading@10: do { \ yading@10: const float *t = table; \ yading@10: __asm__ volatile( \ yading@10: " fschg\n" \ yading@10: " fmov @%0+,xd0\n" \ yading@10: " fmov @%0+,xd2\n" \ yading@10: " fmov @%0+,xd4\n" \ yading@10: " fmov @%0+,xd6\n" \ yading@10: " fmov @%0+,xd8\n" \ yading@10: " fmov @%0+,xd10\n" \ yading@10: " fmov @%0+,xd12\n" \ yading@10: " fmov @%0+,xd14\n" \ yading@10: " fschg\n" \ yading@10: : "+r"(t) \ yading@10: ); \ yading@10: } while (0) yading@10: yading@10: #define ftrv() \ yading@10: __asm__ volatile("ftrv xmtrx,fv0" \ yading@10: : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3)); yading@10: yading@10: #define DEFREG \ yading@10: register float fr0 __asm__("fr0"); \ yading@10: register float fr1 __asm__("fr1"); \ yading@10: register float fr2 __asm__("fr2"); \ yading@10: register float fr3 __asm__("fr3") yading@10: yading@10: #define DESCALE(x,n) (x)*(1.0f/(1<<(n))) yading@10: yading@10: /* this code work worse on gcc cvs. 3.2.3 work fine */ yading@10: yading@10: yading@10: //optimized yading@10: yading@10: void ff_idct_sh4(int16_t *block) yading@10: { yading@10: DEFREG; yading@10: yading@10: int i; yading@10: float tblock[8*8],*fblock; yading@10: int ofs1,ofs2,ofs3; yading@10: int fpscr; yading@10: yading@10: fp_single_enter(fpscr); yading@10: yading@10: /* row */ yading@10: yading@10: /* even part */ yading@10: load_matrix(even_table); yading@10: yading@10: fblock = tblock+4; yading@10: i = 8; yading@10: do { yading@10: fr0 = block[0]; yading@10: fr1 = block[2]; yading@10: fr2 = block[4]; yading@10: fr3 = block[6]; yading@10: block+=8; yading@10: ftrv(); yading@10: *--fblock = fr3; yading@10: *--fblock = fr2; yading@10: *--fblock = fr1; yading@10: *--fblock = fr0; yading@10: fblock+=8+4; yading@10: } while(--i); yading@10: block-=8*8; yading@10: fblock-=8*8+4; yading@10: yading@10: load_matrix(odd_table); yading@10: yading@10: i = 8; yading@10: yading@10: do { yading@10: float t0,t1,t2,t3; yading@10: fr0 = block[1]; yading@10: fr1 = block[3]; yading@10: fr2 = block[5]; yading@10: fr3 = block[7]; yading@10: block+=8; yading@10: ftrv(); yading@10: t0 = *fblock++; yading@10: t1 = *fblock++; yading@10: t2 = *fblock++; yading@10: t3 = *fblock++; yading@10: fblock+=4; yading@10: *--fblock = t0 - fr0; yading@10: *--fblock = t1 - fr1; yading@10: *--fblock = t2 - fr2; yading@10: *--fblock = t3 - fr3; yading@10: *--fblock = t3 + fr3; yading@10: *--fblock = t2 + fr2; yading@10: *--fblock = t1 + fr1; yading@10: *--fblock = t0 + fr0; yading@10: fblock+=8; yading@10: } while(--i); yading@10: block-=8*8; yading@10: fblock-=8*8; yading@10: yading@10: /* col */ yading@10: yading@10: /* even part */ yading@10: load_matrix(even_table); yading@10: yading@10: ofs1 = sizeof(float)*2*8; yading@10: ofs2 = sizeof(float)*4*8; yading@10: ofs3 = sizeof(float)*6*8; yading@10: yading@10: i = 8; yading@10: yading@10: #define OA(fblock,ofs) *(float*)((char*)fblock + ofs) yading@10: yading@10: do { yading@10: fr0 = OA(fblock, 0); yading@10: fr1 = OA(fblock,ofs1); yading@10: fr2 = OA(fblock,ofs2); yading@10: fr3 = OA(fblock,ofs3); yading@10: ftrv(); yading@10: OA(fblock,0 ) = fr0; yading@10: OA(fblock,ofs1) = fr1; yading@10: OA(fblock,ofs2) = fr2; yading@10: OA(fblock,ofs3) = fr3; yading@10: fblock++; yading@10: } while(--i); yading@10: fblock-=8; yading@10: yading@10: load_matrix(odd_table); yading@10: yading@10: i=8; yading@10: do { yading@10: float t0,t1,t2,t3; yading@10: t0 = OA(fblock, 0); /* [8*0] */ yading@10: t1 = OA(fblock,ofs1); /* [8*2] */ yading@10: t2 = OA(fblock,ofs2); /* [8*4] */ yading@10: t3 = OA(fblock,ofs3); /* [8*6] */ yading@10: fblock+=8; yading@10: fr0 = OA(fblock, 0); /* [8*1] */ yading@10: fr1 = OA(fblock,ofs1); /* [8*3] */ yading@10: fr2 = OA(fblock,ofs2); /* [8*5] */ yading@10: fr3 = OA(fblock,ofs3); /* [8*7] */ yading@10: fblock+=-8+1; yading@10: ftrv(); yading@10: block[8*0] = DESCALE(t0 + fr0,3); yading@10: block[8*7] = DESCALE(t0 - fr0,3); yading@10: block[8*1] = DESCALE(t1 + fr1,3); yading@10: block[8*6] = DESCALE(t1 - fr1,3); yading@10: block[8*2] = DESCALE(t2 + fr2,3); yading@10: block[8*5] = DESCALE(t2 - fr2,3); yading@10: block[8*3] = DESCALE(t3 + fr3,3); yading@10: block[8*4] = DESCALE(t3 - fr3,3); yading@10: block++; yading@10: } while(--i); yading@10: yading@10: fp_single_leave(fpscr); yading@10: }