annotate ffmpeg/libavcodec/x86/dwt_yasm.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;******************************************************************************
yading@10 2 ;* MMX optimized discrete wavelet transform
yading@10 3 ;* Copyright (c) 2010 David Conrad
yading@10 4 ;*
yading@10 5 ;* This file is part of FFmpeg.
yading@10 6 ;*
yading@10 7 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 8 ;* modify it under the terms of the GNU Lesser General Public
yading@10 9 ;* License as published by the Free Software Foundation; either
yading@10 10 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 11 ;*
yading@10 12 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 ;* Lesser General Public License for more details.
yading@10 16 ;*
yading@10 17 ;* You should have received a copy of the GNU Lesser General Public
yading@10 18 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 ;******************************************************************************
yading@10 21
yading@10 22 %include "libavutil/x86/x86util.asm"
yading@10 23
yading@10 24 SECTION_RODATA
yading@10 25 pw_1: times 8 dw 1
yading@10 26 pw_2: times 8 dw 2
yading@10 27 pw_8: times 8 dw 8
yading@10 28 pw_16: times 8 dw 16
yading@10 29 pw_1991: times 4 dw 9,-1
yading@10 30
yading@10 31 section .text
yading@10 32
yading@10 33 ; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
yading@10 34 %macro COMPOSE_53iL0 4
yading@10 35 paddw %2, %3
yading@10 36 paddw %2, %4
yading@10 37 psraw %2, 2
yading@10 38 psubw %1, %2
yading@10 39 %endm
yading@10 40
yading@10 41 ; m1 = %1 + (-m0 + 9*m1 + 9*%2 - %3 + 8)>>4
yading@10 42 ; if %4 is supplied, %1 is loaded unaligned from there
yading@10 43 ; m2: clobbered m3: pw_8 m4: pw_1991
yading@10 44 %macro COMPOSE_DD97iH0 3-4
yading@10 45 paddw m0, %3
yading@10 46 paddw m1, %2
yading@10 47 psubw m0, m3
yading@10 48 mova m2, m1
yading@10 49 punpcklwd m1, m0
yading@10 50 punpckhwd m2, m0
yading@10 51 pmaddwd m1, m4
yading@10 52 pmaddwd m2, m4
yading@10 53 %if %0 > 3
yading@10 54 movu %1, %4
yading@10 55 %endif
yading@10 56 psrad m1, 4
yading@10 57 psrad m2, 4
yading@10 58 packssdw m1, m2
yading@10 59 paddw m1, %1
yading@10 60 %endm
yading@10 61
yading@10 62 %macro COMPOSE_VERTICAL 1
yading@10 63 ; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
yading@10 64 ; int width)
yading@10 65 cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
yading@10 66 mova m2, [pw_2]
yading@10 67 %if ARCH_X86_64
yading@10 68 mov widthd, widthd
yading@10 69 %endif
yading@10 70 .loop:
yading@10 71 sub widthq, mmsize/2
yading@10 72 mova m1, [b0q+2*widthq]
yading@10 73 mova m0, [b1q+2*widthq]
yading@10 74 COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
yading@10 75 mova [b1q+2*widthq], m0
yading@10 76 jg .loop
yading@10 77 REP_RET
yading@10 78
yading@10 79 ; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
yading@10 80 ; int width)
yading@10 81 cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
yading@10 82 mova m1, [pw_1]
yading@10 83 %if ARCH_X86_64
yading@10 84 mov widthd, widthd
yading@10 85 %endif
yading@10 86 .loop:
yading@10 87 sub widthq, mmsize/2
yading@10 88 mova m0, [b0q+2*widthq]
yading@10 89 paddw m0, [b2q+2*widthq]
yading@10 90 paddw m0, m1
yading@10 91 psraw m0, 1
yading@10 92 paddw m0, [b1q+2*widthq]
yading@10 93 mova [b1q+2*widthq], m0
yading@10 94 jg .loop
yading@10 95 REP_RET
yading@10 96
yading@10 97 ; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
yading@10 98 ; IDWTELEM *b3, IDWTELEM *b4, int width)
yading@10 99 cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
yading@10 100 mova m3, [pw_8]
yading@10 101 mova m4, [pw_1991]
yading@10 102 %if ARCH_X86_64
yading@10 103 mov widthd, widthd
yading@10 104 %endif
yading@10 105 .loop:
yading@10 106 sub widthq, mmsize/2
yading@10 107 mova m0, [b0q+2*widthq]
yading@10 108 mova m1, [b1q+2*widthq]
yading@10 109 COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
yading@10 110 mova [b2q+2*widthq], m1
yading@10 111 jg .loop
yading@10 112 REP_RET
yading@10 113
yading@10 114 ; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
yading@10 115 ; IDWTELEM *b3, IDWTELEM *b4, int width)
yading@10 116 cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
yading@10 117 mova m3, [pw_16]
yading@10 118 mova m4, [pw_1991]
yading@10 119 %if ARCH_X86_64
yading@10 120 mov widthd, widthd
yading@10 121 %endif
yading@10 122 .loop:
yading@10 123 sub widthq, mmsize/2
yading@10 124 mova m0, [b0q+2*widthq]
yading@10 125 mova m1, [b1q+2*widthq]
yading@10 126 mova m5, [b2q+2*widthq]
yading@10 127 paddw m0, [b4q+2*widthq]
yading@10 128 paddw m1, [b3q+2*widthq]
yading@10 129 psubw m0, m3
yading@10 130 mova m2, m1
yading@10 131 punpcklwd m1, m0
yading@10 132 punpckhwd m2, m0
yading@10 133 pmaddwd m1, m4
yading@10 134 pmaddwd m2, m4
yading@10 135 psrad m1, 5
yading@10 136 psrad m2, 5
yading@10 137 packssdw m1, m2
yading@10 138 psubw m5, m1
yading@10 139 mova [b2q+2*widthq], m5
yading@10 140 jg .loop
yading@10 141 REP_RET
yading@10 142
yading@10 143 ; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
yading@10 144 cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
yading@10 145 mova m3, [pw_1]
yading@10 146 %if ARCH_X86_64
yading@10 147 mov widthd, widthd
yading@10 148 %endif
yading@10 149 .loop:
yading@10 150 sub widthq, mmsize/2
yading@10 151 mova m1, [b1q+2*widthq]
yading@10 152 mova m0, [b0q+2*widthq]
yading@10 153 mova m2, m1
yading@10 154 paddw m1, m3
yading@10 155 psraw m1, 1
yading@10 156 psubw m0, m1
yading@10 157 mova [b0q+2*widthq], m0
yading@10 158 paddw m2, m0
yading@10 159 mova [b1q+2*widthq], m2
yading@10 160 jg .loop
yading@10 161 REP_RET
yading@10 162 %endmacro
yading@10 163
yading@10 164 ; extend the left and right edges of the tmp array by %1 and %2 respectively
yading@10 165 %macro EDGE_EXTENSION 3
yading@10 166 mov %3, [tmpq]
yading@10 167 %assign %%i 1
yading@10 168 %rep %1
yading@10 169 mov [tmpq-2*%%i], %3
yading@10 170 %assign %%i %%i+1
yading@10 171 %endrep
yading@10 172 mov %3, [tmpq+2*w2q-2]
yading@10 173 %assign %%i 0
yading@10 174 %rep %2
yading@10 175 mov [tmpq+2*w2q+2*%%i], %3
yading@10 176 %assign %%i %%i+1
yading@10 177 %endrep
yading@10 178 %endmacro
yading@10 179
yading@10 180
yading@10 181 %macro HAAR_HORIZONTAL 2
yading@10 182 ; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
yading@10 183 cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
yading@10 184 mov w2d, wd
yading@10 185 xor xq, xq
yading@10 186 shr w2d, 1
yading@10 187 lea b_w2q, [bq+wq]
yading@10 188 mova m3, [pw_1]
yading@10 189 .lowpass_loop:
yading@10 190 movu m1, [b_w2q + 2*xq]
yading@10 191 mova m0, [bq + 2*xq]
yading@10 192 paddw m1, m3
yading@10 193 psraw m1, 1
yading@10 194 psubw m0, m1
yading@10 195 mova [tmpq + 2*xq], m0
yading@10 196 add xq, mmsize/2
yading@10 197 cmp xq, w2q
yading@10 198 jl .lowpass_loop
yading@10 199
yading@10 200 xor xq, xq
yading@10 201 and w2q, ~(mmsize/2 - 1)
yading@10 202 cmp w2q, mmsize/2
yading@10 203 jl .end
yading@10 204
yading@10 205 .highpass_loop:
yading@10 206 movu m1, [b_w2q + 2*xq]
yading@10 207 mova m0, [tmpq + 2*xq]
yading@10 208 paddw m1, m0
yading@10 209
yading@10 210 ; shift and interleave
yading@10 211 %if %2 == 1
yading@10 212 paddw m0, m3
yading@10 213 paddw m1, m3
yading@10 214 psraw m0, 1
yading@10 215 psraw m1, 1
yading@10 216 %endif
yading@10 217 mova m2, m0
yading@10 218 punpcklwd m0, m1
yading@10 219 punpckhwd m2, m1
yading@10 220 mova [bq+4*xq], m0
yading@10 221 mova [bq+4*xq+mmsize], m2
yading@10 222
yading@10 223 add xq, mmsize/2
yading@10 224 cmp xq, w2q
yading@10 225 jl .highpass_loop
yading@10 226 .end:
yading@10 227 REP_RET
yading@10 228 %endmacro
yading@10 229
yading@10 230
yading@10 231 INIT_XMM
yading@10 232 ; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
yading@10 233 cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
yading@10 234 mov w2d, wd
yading@10 235 xor xd, xd
yading@10 236 shr w2d, 1
yading@10 237 lea b_w2q, [bq+wq]
yading@10 238 movu m4, [bq+wq]
yading@10 239 mova m7, [pw_2]
yading@10 240 pslldq m4, 14
yading@10 241 .lowpass_loop:
yading@10 242 movu m1, [b_w2q + 2*xq]
yading@10 243 mova m0, [bq + 2*xq]
yading@10 244 mova m2, m1
yading@10 245 palignr m1, m4, 14
yading@10 246 mova m4, m2
yading@10 247 COMPOSE_53iL0 m0, m1, m2, m7
yading@10 248 mova [tmpq + 2*xq], m0
yading@10 249 add xd, mmsize/2
yading@10 250 cmp xd, w2d
yading@10 251 jl .lowpass_loop
yading@10 252
yading@10 253 EDGE_EXTENSION 1, 2, xw
yading@10 254 ; leave the last up to 7 (sse) or 3 (mmx) values for C
yading@10 255 xor xd, xd
yading@10 256 and w2d, ~(mmsize/2 - 1)
yading@10 257 cmp w2d, mmsize/2
yading@10 258 jl .end
yading@10 259
yading@10 260 mova m7, [tmpq-mmsize]
yading@10 261 mova m0, [tmpq]
yading@10 262 mova m5, [pw_1]
yading@10 263 mova m3, [pw_8]
yading@10 264 mova m4, [pw_1991]
yading@10 265 .highpass_loop:
yading@10 266 mova m6, m0
yading@10 267 palignr m0, m7, 14
yading@10 268 mova m7, [tmpq + 2*xq + 16]
yading@10 269 mova m1, m7
yading@10 270 mova m2, m7
yading@10 271 palignr m1, m6, 2
yading@10 272 palignr m2, m6, 4
yading@10 273 COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
yading@10 274 mova m0, m7
yading@10 275 mova m7, m6
yading@10 276
yading@10 277 ; shift and interleave
yading@10 278 paddw m6, m5
yading@10 279 paddw m1, m5
yading@10 280 psraw m6, 1
yading@10 281 psraw m1, 1
yading@10 282 mova m2, m6
yading@10 283 punpcklwd m6, m1
yading@10 284 punpckhwd m2, m1
yading@10 285 mova [bq+4*xq], m6
yading@10 286 mova [bq+4*xq+mmsize], m2
yading@10 287
yading@10 288 add xd, mmsize/2
yading@10 289 cmp xd, w2d
yading@10 290 jl .highpass_loop
yading@10 291 .end:
yading@10 292 REP_RET
yading@10 293
yading@10 294
yading@10 295 %if ARCH_X86_64 == 0
yading@10 296 INIT_MMX
yading@10 297 COMPOSE_VERTICAL mmx
yading@10 298 HAAR_HORIZONTAL mmx, 0
yading@10 299 HAAR_HORIZONTAL mmx, 1
yading@10 300 %endif
yading@10 301
yading@10 302 ;;INIT_XMM
yading@10 303 INIT_XMM
yading@10 304 COMPOSE_VERTICAL sse2
yading@10 305 HAAR_HORIZONTAL sse2, 0
yading@10 306 HAAR_HORIZONTAL sse2, 1