yading@10
|
/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
yading@10
|
21
|
yading@10
|
22 #include "config.h"
|
yading@10
|
23 #include "libavutil/arm/asm.S"
|
yading@10
|
24
|
yading@10
|
/*
 * Special optimized version of dct_unquantize_h263_helper_c. It
 * requires the block to be at least 8-byte aligned, and may process
 * more elements than requested. But it is guaranteed to never
 * process more than 64 elements provided that the count argument is <= 64,
 * so it is safe. This function is optimized for a common distribution
 * of values for nCoeffs (they are mostly a multiple of 8 plus one or
 * two extra elements). So this function processes data as 8 elements
 * per loop iteration and contains optional processing of 2 elements at
 * the end.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 */
|
yading@10
|
38
|
yading@10
|
/*
 * dequant_t dst, src, mul, add, tmp
 *
 * Dequantize the signed 16-bit element held in the TOP halfword of \src:
 *   element >  0 : \dst = element * bottom16(\mul) + \add
 *   element <  0 : \dst = element * bottom16(\mul) - \add
 *   element == 0 : \dst is not written; \tmp is left holding 0
 *
 * Relies on ip containing 0 (ff_dct_unquantize_h263_armv5te sets it up
 * before expanding this macro). Overwrites \tmp and the condition flags.
 * \dst and \tmp may name the same register; in that case a zero element
 * also leaves 0 in \dst.
 */
.macro  dequant_t       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, asr #16         /* tmp = element - ip; flags give its sign */
        it              gt
        addgt           \tmp, \add, #0                  /* positive: accumulate +add */
        it              lt
        rsblt           \tmp, \add, #0                  /* negative: accumulate -add */
        it              ne
        smlatbne        \dst, \src, \mul, \tmp          /* top16(src) * bottom16(mul) + tmp */
.endm
|
yading@10
|
48
|
yading@10
|
/*
 * dequant_b dst, src, mul, add, tmp
 *
 * Dequantize the signed 16-bit element held in the BOTTOM halfword of \src:
 *   element >  0 : \dst = element * bottom16(\mul) + \add
 *   element <  0 : \dst = element * bottom16(\mul) - \add
 *   element == 0 : \dst is not written; \tmp is left holding 0
 *
 * Relies on ip containing 0 (ff_dct_unquantize_h263_armv5te sets it up
 * before expanding this macro). Overwrites \tmp and the condition flags.
 * When \dst and \src are the same register, a zero element leaves that
 * register untouched, so its bottom halfword still reads as 0.
 */
.macro  dequant_b       dst, src, mul, add, tmp
        rsbs            \tmp, ip, \src, lsl #16         /* tmp = (element << 16) - ip; flags give its sign */
        it              gt
        addgt           \tmp, \add, #0                  /* positive: accumulate +add */
        it              lt
        rsblt           \tmp, \add, #0                  /* negative: accumulate -add */
        it              ne
        smlabbne        \dst, \src, \mul, \tmp          /* bottom16(src) * bottom16(mul) + tmp */
.endm
|
yading@10
|
58
|
yading@10
|
/*
 * ff_dct_unquantize_h263_armv5te
 *
 * Rewrites an array of signed 16-bit elements in place: every non-zero
 * element becomes element * bottom16(r1) +/- r2 (sign of the offset follows
 * the sign of the element); zero elements are stored back as zero.
 *
 * Register usage, as established by the code below:
 *   r0     - pointer to the elements; advanced by 2 bytes per element stored
 *   r1     - multiplier (only its bottom halfword is used)
 *   r2     - offset magnitude
 *   r3     - element count
 *   ip     - constant 0, required by dequant_t / dequant_b
 *   r4-r7  - packed input, two elements per register
 *   r8     - scratch for the signed offset
 *   r9, lr - results for the elements taken from the top halfwords
 * NOTE(review): the C-side arguments are presumably
 * (block, qmul, qadd, count); confirm against the prototype.
 *
 * Elements are handled 8 per loop iteration, followed by an optional
 * 2-element tail. A count of 2 or less goes straight to the tail, which
 * always processes two elements.
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  /* zero operand for the rsbs in the macros */
        subs            r3, r3, #2              /* bias the count by the 2-element tail */
        ble             2f                      /* count <= 2: tail only */
        ldrd            r4, r5, [r0, #0]        /* elements 0..3 of the first group */
1:
        ldrd            r6, r7, [r0, #8]        /* elements 4..7 of this group */

        /*
         * Top halfwords first: dequant_b overwrites r4/r5 in place, which
         * would destroy the top-halfword elements.
         */
        dequant_t       r9, r4, r1, r2, r9
        dequant_t       lr, r5, r1, r2, lr
        dequant_b       r4, r4, r1, r2, r8
        dequant_b       r5, r5, r1, r2, r8

        /* bottom halfword is the earlier element of each pair */
        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        dequant_t       r9, r6, r1, r2, r9
        dequant_t       lr, r7, r1, r2, lr
        dequant_b       r6, r6, r1, r2, r8
        dequant_b       r7, r7, r1, r2, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8              /* 8 elements done */
        it              gt
        ldrdgt          r4, r5, [r0, #0]        /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2              /* undo the bias applied on entry */
        it              le
        pople           {r4-r9,pc}              /* nothing left for the tail: return */
2:
        /* tail: two elements, loaded individually with sign extension */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        mov             r8, r2                  /* assume +offset */
        cmp             r9, #0
        it              lt
        rsblt           r8, r2, #0              /* negative element: -offset */
        it              ne
        smlabbne        r9, r9, r1, r8          /* skipped for zero, r9 stays 0 */
        mov             r8, r2                  /* assume +offset */
        cmp             lr, #0
        it              lt
        rsblt           r8, r2, #0              /* negative element: -offset */
        it              ne
        smlabbne        lr, lr, r1, r8          /* skipped for zero, lr stays 0 */
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
endfunc
|