;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text

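; DIAG4 base, off0, off1, off2, off3, dst
; Descriptive note (added): filters 8 pixels with the 4-tap kernel.  The four
; source vectors at base+off0..off3 are widened to words, multiplied by the
; four weights, summed with saturation, rounded (+64, >>7) and packed back to
; bytes at [dst].  The MMX variant reads the splatted weights from the stack
; slots at rsp+8*11..14; the SSE2 variant keeps them in m4, m5, m6 and m3.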
%macro DIAG4 6
%if mmsize == 8
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    movq        m3, m0
    movq        m4, m1
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpckhbw   m3, m7
    punpckhbw   m4, m7
    pmullw      m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw      m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw      m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw      m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw       m0, m1
    paddw       m3, m4
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    movq        m4, m1
    movq        m5, m2
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpckhbw   m4, m7
    punpckhbw   m5, m7
    pmullw      m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw      m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw      m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw      m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddw       m4, m5
    paddsw      m0, m1
    paddsw      m3, m4
    paddsw      m0, m6         ; Add 64
    paddsw      m3, m6         ; Add 64
    psraw       m0, 7
    psraw       m3, 7
    packuswb    m0, m3
    movq      [%6], m0
%else ; mmsize == 16
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    pmullw      m0, m4         ; src[x-8 ] * biweight [0]
    pmullw      m1, m5         ; src[x   ] * biweight [1]
    paddw       m0, m1
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    pmullw      m1, m6         ; src[x+8 ] * biweight [2]
    pmullw      m2, m3         ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddsw      m0, m1
    paddsw      m0, [pw_64]    ; Add 64
    psraw       m0, 7
    packuswb    m0, m0
    movq      [%6], m0
%endif ; mmsize == 8/16
%endmacro

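; SPLAT4REGS
; Descriptive note (added): broadcasts each of the four 16-bit weights packed
; in m3 across a full register: into m4, m5, m6 and m3 for SSE2, or into the
; stack slots at rsp+8*11..14 for MMX, where all eight mm registers are needed
; for pixel data and constants inside DIAG4.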
%macro SPLAT4REGS 0
%if mmsize == 8
    movq        m5, m3
    punpcklwd   m3, m3
    movq        m4, m3
    punpckldq   m3, m3
    punpckhdq   m4, m4
    punpckhwd   m5, m5
    movq        m2, m5
    punpckhdq   m2, m2
    punpckldq   m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m2
%else ; mmsize == 16
    pshuflw     m4, m3, 0x0
    pshuflw     m5, m3, 0x55
    pshuflw     m6, m3, 0xAA
    pshuflw     m3, m3, 0xFF
    punpcklqdq  m4, m4
    punpcklqdq  m5, m5
    punpcklqdq  m6, m6
    punpcklqdq  m3, m3
%endif ; mmsize == 8/16
%endmacro

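; Descriptive note (added): the diagonal filter is applied as two separable
; 4-tap passes.  The horizontal pass filters 11 rows of 8 pixels (from one row
; above to two rows below the 8x8 block) into a temporary buffer on the stack;
; the vertical pass then filters that buffer down to the final 8x8 block in dst.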
%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8
    mov         r5, rsp          ; backup stack pointer
    and        rsp, ~(mmsize-1)  ; align stack
%if mmsize == 16
    sub        rsp, 8*11
%else
    sub        rsp, 8*15
    movq        m6, [pw_64]
%endif
%if ARCH_X86_64
    movsxd      r2, r2d
%endif

    sub         r1, r2

    pxor        m7, m7
    movq        m3, [r3]
    SPLAT4REGS

    mov         r3, rsp
    mov         r6, 11
.nextrow:
    DIAG4       r1, -1, 0, 1, 2, r3
    add         r3, 8
    add         r1, r2
    dec         r6
    jnz .nextrow

    movq        m3, [r4]
    SPLAT4REGS

    lea         r3, [rsp+8]
    mov         r6, 8
.nextcol:
    DIAG4       r3, -8, 0, 8, 16, r0
    add         r3, 8
    add         r0, r2
    dec         r6
    jnz .nextcol

    mov        rsp, r5           ; restore stack pointer
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4
%endif

INIT_XMM sse2
vp6_filter_diag4