;******************************************************************************
;* MMX optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_1991: times 4 dw 9,-1
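; Note: pw_1991 stores the interleaved tap pair (9, -1); pmaddwd against a
; word pair (a, b) then yields 9*a - b, which the lifting steps below combine
; into the (-1, 9, 9, -1) Deslauriers-Dubuc filter.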

section .text

; %1 -= (%2 + %3 + 2) >> 2; %4 is pw_2
%macro COMPOSE_53iL0 4
    paddw   %2, %3
    paddw   %2, %4
    psraw   %2, 2
    psubw   %1, %2
%endm

; m1 = %1 + (-m0 + 9*m1 + 9*%2 - %3 + 8) >> 4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered   m3: pw_8   m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3          ; m0 = m0 + %3
    paddw   m1, %2          ; m1 = m1 + %2
    psubw   m0, m3          ; m0 = m0 + %3 - 8
    mova    m2, m1
    punpcklwd m1, m0        ; interleave words so pmaddwd with (9,-1) yields
    punpckhwd m2, m0        ; 9*(m1+%2) - (m0+%3-8) in each dword lane
    pmaddwd m1, m4
    pmaddwd m2, m4
%if %0 > 3
    movu    %1, %4
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2
    paddw   m1, %1
%endm

%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
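; per element, this computes: b1[i] -= (b0[i] + b2[i] + 2) >> 2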
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd  ; zero-extend the 32-bit width argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
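; per element, this computes: b1[i] += (b0[i] + b2[i] + 1) >> 1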
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
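; per element, this computes: b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4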
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
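; per element, this computes: b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5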
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]
    paddw   m1, [b3q+2*widthq]
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
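; per element, this computes: b0[i] -= (b1[i] + 1) >> 1, then b1[i] += b0[i]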
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova    m3, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively,
; using %3 as a scratch register
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
%assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
%assign %%i %%i+1
%endrep
%endmacro
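; A scalar sketch of the macro above (illustrative C, not part of the build):
;     for (i = 1; i <= %1; i++) tmp[-i]     = tmp[0];
;     for (i = 0; i <  %2; i++) tmp[w2 + i] = tmp[w2 - 1];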


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
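; Scalar sketch with w2 = w/2 (illustrative; %2 selects the extra rounding shift):
;     low  = b[i] - ((b[w2+i] + 1) >> 1)
;     high = b[w2+i] + low
;     if %2: low = (low + 1) >> 1, high = (high + 1) >> 1
;     b[2*i] = low, b[2*i+1] = high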
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xq, xq
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    mova    m3, [pw_1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    xor     xq, xq
    and     w2q, ~(mmsize/2 - 1)
    cmp     w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
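; Scalar sketch with w2 = w/2 and hi = b + w2 (illustrative; tmp[] is
; edge-extended by EDGE_EXTENSION between the two passes):
;     tmp[i]   = b[i] - ((hi[i-1] + hi[i] + 2) >> 2)
;     b[2*i]   = (tmp[i] + 1) >> 1
;     b[2*i+1] = (hi[i] + ((-tmp[i-1] + 9*tmp[i] + 9*tmp[i+1] - tmp[i+2] + 8) >> 4) + 1) >> 1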
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xd, xd
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    movu    m4, [bq+wq]
    mova    m7, [pw_2]
    pslldq  m4, 14          ; move hi[0] to the top word: the left edge is clamped
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14      ; m1 = hi[x-1 .. x+6]
    mova    m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 values for the C code to handle
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.highpass_loop:
    mova    m6, m0
    palignr m0, m7, 14      ; m0 = tmp[x-1 .. x+6]
    mova    m7, [tmpq + 2*xq + 16]
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2       ; m1 = tmp[x+1 .. x+8]
    palignr m2, m6, 4       ; m2 = tmp[x+2 .. x+9]
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7
    mova    m7, m6

    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET


%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1