/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

/*
 * Special optimized version of dct_unquantize_h263_helper_c. It
 * requires the block to be at least 8-byte aligned, and may process
 * more elements than requested. It is guaranteed never to process
 * more than 64 elements provided that the count argument is <= 64,
 * so it is safe. This function is optimized for a common distribution
 * of values for nCoeffs (they are mostly multiples of 8 plus one or
 * two extra elements). So this function processes data as 8 elements
 * per loop iteration and contains an optional 2-element processing
 * step at the end.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 *
 * Per-element operation, as implemented below:
 *     c > 0:  c = c * mul + add
 *     c < 0:  c = c * mul - add
 *     c == 0: c stays 0
 * Only the bottom 16 bits of the multiplier are used, and results are
 * stored back as 16-bit halfwords.
 */

/*
 * dequant_t: dequantize the TOP halfword of \src.
 *
 *   \dst  receives top16(\src) * bottom16(\mul) +/- \add when the top
 *         halfword is non-zero; it is not written when it is zero.
 *   \tmp  scratch. It first receives the sign-extended top halfword
 *         (ip must hold 0), then +\add or -\add depending on the sign.
 *
 * The call sites pass the same register for \dst and \tmp, so a zero
 * coefficient leaves 0 in \dst (written by the rsbs).
 */
.macro  dequant_t       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, asr #16
        it              gt
        addgt           \tmp, \add, #0
        it              lt
        rsblt           \tmp, \add, #0
        it              ne
        smlatbne        \dst, \src, \mul, \tmp
.endm

/*
 * dequant_b: dequantize the BOTTOM halfword of \src.
 *
 *   \dst  receives bottom16(\src) * bottom16(\mul) +/- \add when the
 *         bottom halfword is non-zero; it is not written when it is zero.
 *   \tmp  scratch. It first receives \src shifted left by 16 (used only
 *         to set the flags; ip must hold 0), then +\add or -\add.
 *
 * The call sites pass the same register for \dst and \src, so a zero
 * coefficient leaves the original (zero) bottom halfword in place for
 * the following strh.
 */
.macro  dequant_b       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, lsl #16
        it              gt
        addgt           \tmp, \add, #0
        it              lt
        rsblt           \tmp, \add, #0
        it              ne
        smlabbne        \dst, \src, \mul, \tmp
.endm

/*
 * ff_dct_unquantize_h263_armv5te
 *
 * Register usage on entry (presumably the C arguments
 * block, qmul, qadd, count -- confirm against the C prototype):
 *   r0  pointer to the 16-bit coefficients, modified in place;
 *       advanced by 2 bytes per stored element
 *   r1  multiplier (bottom halfword is used)
 *   r2  value added to positive / subtracted from negative coefficients
 *   r3  number of elements to process
 *
 * Internal registers:
 *   ip      constant 0, required by the dequant_* macros
 *   r4-r7   four words holding eight packed coefficients
 *   r8      scratch for dequant_b and the 2-element tail
 *   r9, lr  results for the top halfwords / tail elements
 *
 * r4-r9 and lr are saved on entry and restored on return.
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0
        /* Bias the count by -2 so that "<= 0" means only the tail is needed. */
        subs            r3, r3, #2
        ble             2f
        ldrd            r4, r5, [r0, #0]
1:
        ldrd            r6, r7, [r0, #8]

        /*
         * The top halves must be handled first: dequant_b overwrites its
         * source register with the result.
         */
        dequant_t       r9, r4, r1, r2, r9
        dequant_t       lr, r5, r1, r2, lr
        dequant_b       r4, r4, r1, r2, r8
        dequant_b       r5, r5, r1, r2, r8

        /* Store order: bottom halfword of each word first, then its top. */
        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        dequant_t       r9, r6, r1, r2, r9
        dequant_t       lr, r7, r1, r2, lr
        dequant_b       r6, r6, r1, r2, r8
        dequant_b       r7, r7, r1, r2, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8
        it              gt
        ldrdgt          r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        /* Undo the -2 bias; return unless one or two elements remain. */
        adds            r3, r3, #2
        it              le
        pople           {r4-r9,pc}
2:
        /* Tail: always processes exactly two elements. */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        mov             r8, r2
        cmp             r9, #0
        it              lt
        rsblt           r8, r2, #0
        it              ne
        smlabbne        r9, r9, r1, r8
        mov             r8, r2
        cmp             lr, #0
        it              lt
        rsblt           r8, r2, #0
        it              ne
        smlabbne        lr, lr, r1, r8
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
endfunc