annotate ffmpeg/libswscale/x86/output.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 ;******************************************************************************
yading@11 2 ;* x86-optimized vertical line scaling functions
yading@11 3 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
yading@11 4 ;* Kieran Kunhya <kieran@kunhya.com>
yading@11 5 ;*
yading@11 6 ;* This file is part of Libav.
yading@11 7 ;*
yading@11 8 ;* Libav is free software; you can redistribute it and/or
yading@11 9 ;* modify it under the terms of the GNU Lesser General Public
yading@11 10 ;* License as published by the Free Software Foundation; either
yading@11 11 ;* version 2.1 of the License, or (at your option) any later version.
yading@11 12 ;*
yading@11 13 ;* Libav is distributed in the hope that it will be useful,
yading@11 14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@11 16 ;* Lesser General Public License for more details.
yading@11 17 ;*
yading@11 18 ;* You should have received a copy of the GNU Lesser General Public
yading@11 19 ;* License along with Libav; if not, write to the Free Software
yading@11 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@11 21 ;******************************************************************************
yading@11 22
yading@11 23 %include "libavutil/x86/x86util.asm"
yading@11 24
yading@11 25 SECTION_RODATA
yading@11 26
yading@11 27 minshort: times 8 dw 0x8000 ; added with paddw after packssdw in the 16-bit paths to undo a -0x8000 bias
yading@11 28 yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 ; yuv2planeX 16-bit accumulator start: 0x4000 rounds the >>15, -0x40000000 becomes -0x8000 after it
yading@11 29 yuv2yuvX_10_start: times 4 dd 0x10000 ; yuv2planeX 10-bit accumulator start (half of 1<<17, the shift used for 10-bit)
yading@11 30 yuv2yuvX_9_start: times 4 dd 0x20000 ; yuv2planeX 9-bit accumulator start (half of 1<<18, the shift used for 9-bit)
yading@11 31 yuv2yuvX_10_upper: times 8 dw 0x3ff ; pminsw upper bound for yuv2planeX 10-bit output
yading@11 32 yuv2yuvX_9_upper: times 8 dw 0x1ff ; pminsw upper bound for yuv2planeX 9-bit output
yading@11 33 pd_4: times 4 dd 4 ; yuv2plane1 16-bit (sse4/avx): value added before psrad 3
yading@11 34 pd_4min0x40000:times 4 dd 4 - (0x40000) ; yuv2plane1 16-bit (mmx/sse2): same, plus a bias equal to -0x8000 after psrad 3
yading@11 35 pw_16: times 8 dw 16 ; yuv2plane1 10-bit: value added before psraw 5
yading@11 36 pw_32: times 8 dw 32 ; yuv2plane1 9-bit: value added before psraw 6
yading@11 37 pw_512: times 8 dw 512 ; yuv2plane1 9-bit: pminsw upper bound
yading@11 38 pw_1024: times 8 dw 1024 ; yuv2plane1 10-bit: pminsw upper bound
yading@11 39
yading@11 40 SECTION .text
yading@11 41
yading@11 42 ;-----------------------------------------------------------------------------
yading@11 43 ; vertical line scaling
yading@11 44 ;
yading@11 45 ; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
yading@11 46 ; const uint8_t *dither, int offset)
yading@11 47 ; and
yading@11 48 ; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
yading@11 49 ; const int16_t **src, uint8_t *dst, int dstW,
yading@11 50 ; const uint8_t *dither, int offset)
yading@11 51 ;
yading@11 52 ; Scale one or $filterSize lines of source data to generate one line of output
yading@11 53 ; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
yading@11 54 ; int32_t if $output_size is 16. $filter is 12-bit. $filterSize is a multiple
yading@11 55 ; of 2. $offset is either 0 or 3. $dither holds 8 values.
yading@11 56 ;-----------------------------------------------------------------------------
yading@11 57
yading@11 58 %macro yuv2planeX_fn 3 ; %1 = output bit depth (8/9/10/16); %2 and %3 are forwarded to cglobal below
yading@11 59
yading@11 60 %if ARCH_X86_32
yading@11 61 %define cntr_reg fltsizeq
yading@11 62 %define movsx mov
yading@11 63 %else
yading@11 64 %define cntr_reg r7
yading@11 65 %define movsx movsxd
yading@11 66 %endif
yading@11 67
yading@11 68 cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset ; NOTE(review): presumably %3 = args loaded into registers, %2 = xmm registers used (x86inc cglobal) - confirm
yading@11 69 %if %1 == 8 || %1 == 9 || %1 == 10
yading@11 70 pxor m6, m6 ; m6 = 0: zero source for the dither unpacks (8-bit) and lower bound for pmaxsw (9/10-bit)
yading@11 71 %endif ; %1 == 8/9/10
yading@11 72
yading@11 73 %if %1 == 8
yading@11 74 %if ARCH_X86_32
yading@11 75 %assign pad 0x2c - (stack_offset & 15)
yading@11 76 SUB rsp, pad ; reserve stack space for the expanded dither vectors spilled below
yading@11 77 %define m_dith m7
yading@11 78 %else ; x86-64
yading@11 79 %define m_dith m9
yading@11 80 %endif ; x86-32
yading@11 81
yading@11 82 ; create registers holding dither
yading@11 83 movq m_dith, [ditherq] ; dither
yading@11 84 test offsetd, offsetd ; offset == 0: keep the dither bytes in their loaded order
yading@11 85 jz .no_rot
yading@11 86 %if mmsize == 16
yading@11 87 punpcklqdq m_dith, m_dith ; duplicate the low qword; presumably so the 3-byte shift below wraps around
yading@11 88 %endif ; mmsize == 16
yading@11 89 PALIGNR m_dith, m_dith, 3, m0 ; non-zero offset: shift the dither bytes by 3 (m0 is scratch)
yading@11 90 .no_rot:
yading@11 91 %if mmsize == 16
yading@11 92 punpcklbw m_dith, m6 ; 8 dither bytes -> 8 words
yading@11 93 %if ARCH_X86_64
yading@11 94 punpcklwd m8, m_dith, m6 ; m8 = dither words 0-3 as dwords
yading@11 95 pslld m8, 12 ; scale dither by 1<<12
yading@11 96 %else ; x86-32
yading@11 97 punpcklwd m5, m_dith, m6 ; m5 = dither words 0-3 as dwords
yading@11 98 pslld m5, 12 ; scale dither by 1<<12
yading@11 99 %endif ; x86-32/64
yading@11 100 punpckhwd m_dith, m6 ; m_dith = dither words 4-7 as dwords
yading@11 101 pslld m_dith, 12 ; scale dither by 1<<12
yading@11 102 %if ARCH_X86_32
yading@11 103 mova [rsp+ 0], m5 ; spill both dither vectors; the pixel loop reloads them from the stack
yading@11 104 mova [rsp+16], m_dith
yading@11 105 %endif
yading@11 106 %else ; mmsize == 8
yading@11 107 punpcklbw m5, m_dith, m6 ; m5 = dither bytes 0-3 as words
yading@11 108 punpckhbw m_dith, m6 ; m_dith = dither bytes 4-7 as words
yading@11 109 punpcklwd m4, m5, m6 ; m4 = dither 0-1 as dwords
yading@11 110 punpckhwd m5, m6 ; m5 = dither 2-3 as dwords
yading@11 111 punpcklwd m3, m_dith, m6 ; m3 = dither 4-5 as dwords
yading@11 112 punpckhwd m_dith, m6 ; m_dith = dither 6-7 as dwords
yading@11 113 pslld m4, 12 ; scale all four dither vectors by 1<<12
yading@11 114 pslld m5, 12
yading@11 115 pslld m3, 12
yading@11 116 pslld m_dith, 12
yading@11 117 mova [rsp+ 0], m4 ; spill the four dither vectors in pixel order for the unrolled loop
yading@11 118 mova [rsp+ 8], m5
yading@11 119 mova [rsp+16], m3
yading@11 120 mova [rsp+24], m_dith
yading@11 121 %endif ; mmsize == 8/16
yading@11 122 %endif ; %1 == 8
yading@11 123
yading@11 124 xor r5, r5 ; r5 = index of the next output pixel
yading@11 125
yading@11 126 .pixelloop:
yading@11 127 %assign %%i 0
yading@11 128 ; the rep here is for the 8bit output mmx case, where dither covers
yading@11 129 ; 8 pixels but we can only handle 2 pixels per register, and thus 4
yading@11 130 ; pixels per iteration. In order to not have to keep track of where
yading@11 131 ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
yading@11 132 %if %1 == 8
yading@11 133 %assign %%repcnt 16/mmsize
yading@11 134 %else
yading@11 135 %assign %%repcnt 1
yading@11 136 %endif
yading@11 137
yading@11 138 %rep %%repcnt
yading@11 139
yading@11 140 %if %1 == 8
yading@11 141 %if ARCH_X86_32
yading@11 142 mova m2, [rsp+mmsize*(0+%%i)] ; m2/m1 = accumulators seeded with the spilled dither; %%i steps by 2 per unrolled copy
yading@11 143 mova m1, [rsp+mmsize*(1+%%i)]
yading@11 144 %else ; x86-64
yading@11 145 mova m2, m8 ; m2/m1 = accumulators seeded with the dither kept in m8/m_dith
yading@11 146 mova m1, m_dith
yading@11 147 %endif ; x86-32/64
yading@11 148 %else ; %1 == 9/10/16
yading@11 149 mova m1, [yuv2yuvX_%1_start] ; m1/m2 = accumulators seeded with the per-depth start constant
yading@11 150 mova m2, m1
yading@11 151 %endif ; %1 == 8/9/10/16
yading@11 152 movsx cntr_reg, fltsizem ; cntr_reg = filterSize; the filter loop walks it down two taps at a time
yading@11 153 .filterloop_ %+ %%i:
yading@11 154 ; input pixels
yading@11 155 mov r6, [srcq+gprsize*cntr_reg-2*gprsize] ; r6 = src[cntr_reg-2]
yading@11 156 %if %1 == 16
yading@11 157 mova m3, [r6+r5*4] ; m3/m5 = two vectors of 32-bit samples from that line
yading@11 158 mova m5, [r6+r5*4+mmsize]
yading@11 159 %else ; %1 == 8/9/10
yading@11 160 mova m3, [r6+r5*2] ; m3 = one vector of 16-bit samples from that line
yading@11 161 %endif ; %1 == 8/9/10/16
yading@11 162 mov r6, [srcq+gprsize*cntr_reg-gprsize] ; r6 = src[cntr_reg-1]
yading@11 163 %if %1 == 16
yading@11 164 mova m4, [r6+r5*4] ; m4/m6 = matching samples from the second line
yading@11 165 mova m6, [r6+r5*4+mmsize]
yading@11 166 %else ; %1 == 8/9/10
yading@11 167 mova m4, [r6+r5*2] ; m4 = matching samples from the second line
yading@11 168 %endif ; %1 == 8/9/10/16
yading@11 169
yading@11 170 ; coefficients
yading@11 171 movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
yading@11 172 %if %1 == 16
yading@11 173 pshuflw m7, m0, 0 ; coeff[0]
yading@11 174 pshuflw m0, m0, 0x55 ; coeff[1]
yading@11 175 pmovsxwd m7, m7 ; word -> dword
yading@11 176 pmovsxwd m0, m0 ; word -> dword
yading@11 177
yading@11 178 pmulld m3, m7 ; first line * coeff[0]
yading@11 179 pmulld m5, m7
yading@11 180 pmulld m4, m0 ; second line * coeff[1]
yading@11 181 pmulld m6, m0
yading@11 182
yading@11 183 paddd m2, m3 ; accumulate: m2 = first vector of pixels, m1 = second vector
yading@11 184 paddd m1, m5
yading@11 185 paddd m2, m4
yading@11 186 paddd m1, m6
yading@11 187 %else ; %1 == 10/9/8
yading@11 188 punpcklwd m5, m3, m4 ; interleave the two lines so each dword holds one (line0, line1) sample pair
yading@11 189 punpckhwd m3, m4
yading@11 190 SPLATD m0 ; broadcast the (coeff[0], coeff[1]) pair to every dword (x86util macro)
yading@11 191
yading@11 192 pmaddwd m5, m0 ; line0*coeff[0] + line1*coeff[1] per pixel
yading@11 193 pmaddwd m3, m0
yading@11 194
yading@11 195 paddd m2, m5 ; accumulate: m2 = low half of the pixels, m1 = high half
yading@11 196 paddd m1, m3
yading@11 197 %endif ; %1 == 8/9/10/16
yading@11 198
yading@11 199 sub cntr_reg, 2 ; two filter taps consumed per iteration
yading@11 200 jg .filterloop_ %+ %%i
yading@11 201
yading@11 202 %if %1 == 16
yading@11 203 psrad m2, 31 - %1 ; scale the sums down to the output range
yading@11 204 psrad m1, 31 - %1
yading@11 205 %else ; %1 == 10/9/8
yading@11 206 psrad m2, 27 - %1 ; scale the sums down to the output range
yading@11 207 psrad m1, 27 - %1
yading@11 208 %endif ; %1 == 8/9/10/16
yading@11 209
yading@11 210 %if %1 == 8
yading@11 211 packssdw m2, m1 ; dwords -> words (signed saturation)
yading@11 212 packuswb m2, m2 ; words -> bytes (unsigned saturation)
yading@11 213 movh [dstq+r5*1], m2 ; store the packed 8-bit pixels
yading@11 214 %else ; %1 == 9/10/16
yading@11 215 %if %1 == 16
yading@11 216 packssdw m2, m1 ; signed pack; values carry a -0x8000 bias from yuv2yuvX_16_start
yading@11 217 paddw m2, [minshort] ; add 0x8000 to remove that bias
yading@11 218 %else ; %1 == 9/10
yading@11 219 %if cpuflag(sse4)
yading@11 220 packusdw m2, m1 ; unsigned-saturating pack, so negatives become 0
yading@11 221 %else ; mmxext/sse2
yading@11 222 packssdw m2, m1
yading@11 223 pmaxsw m2, m6 ; clamp negatives to 0 (m6 is zero)
yading@11 224 %endif ; mmxext/sse2/sse4/avx
yading@11 225 pminsw m2, [yuv2yuvX_%1_upper] ; clamp to the per-depth upper bound
yading@11 226 %endif ; %1 == 9/10/16
yading@11 227 mova [dstq+r5*2], m2 ; store the 16-bit output pixels
yading@11 228 %endif ; %1 == 8/9/10/16
yading@11 229
yading@11 230 add r5, mmsize/2 ; mmsize/2 pixels produced per repetition
yading@11 231 sub wd, mmsize/2 ; remaining width; flags from the last repetition feed the jg below
yading@11 232
yading@11 233 %assign %%i %%i+2
yading@11 234 %endrep
yading@11 235 jg .pixelloop ; uses the flags set by the final `sub wd` above
yading@11 236
yading@11 237 %if %1 == 8
yading@11 238 %if ARCH_X86_32
yading@11 239 ADD rsp, pad ; release the dither spill area reserved at entry
yading@11 240 RET
yading@11 241 %else ; x86-64
yading@11 242 REP_RET
yading@11 243 %endif ; x86-32/64
yading@11 244 %else ; %1 == 9/10/16
yading@11 245 REP_RET
yading@11 246 %endif ; %1 == 8/9/10/16
yading@11 247 %endmacro
yading@11 248
yading@11 249 %if ARCH_X86_32
yading@11 250 INIT_MMX mmxext ; the 8-byte-register versions are only assembled for 32-bit x86
yading@11 251 yuv2planeX_fn 8, 0, 7
yading@11 252 yuv2planeX_fn 9, 0, 5
yading@11 253 yuv2planeX_fn 10, 0, 5
yading@11 254 %endif
yading@11 255
yading@11 256 INIT_XMM sse2
yading@11 257 yuv2planeX_fn 8, 10, 7
yading@11 258 yuv2planeX_fn 9, 7, 5
yading@11 259 yuv2planeX_fn 10, 7, 5
yading@11 260
yading@11 261 INIT_XMM sse4
yading@11 262 yuv2planeX_fn 8, 10, 7
yading@11 263 yuv2planeX_fn 9, 7, 5
yading@11 264 yuv2planeX_fn 10, 7, 5
yading@11 265 yuv2planeX_fn 16, 8, 5 ; 16-bit output is only instantiated here; its filter loop uses pmovsxwd/pmulld (SSE4.1)
yading@11 266
yading@11 267 %if HAVE_AVX_EXTERNAL
yading@11 268 INIT_XMM avx
yading@11 269 yuv2planeX_fn 8, 10, 7
yading@11 270 yuv2planeX_fn 9, 7, 5
yading@11 271 yuv2planeX_fn 10, 7, 5
yading@11 272 %endif
yading@11 273
yading@11 274 ; %1=output-bpc, %2=alignment (u/a)
yading@11 275 %macro yuv2plane1_mainloop 2 ; %1 = output bit depth, %2 = a/u suffix selecting mova/movu for the stores
yading@11 276 .loop_%2:
yading@11 277 %if %1 == 8
yading@11 278 paddsw m0, m2, [srcq+wq*2+mmsize*0] ; source words + dither (saturating); m2/m3 are set up by the caller macro
yading@11 279 paddsw m1, m3, [srcq+wq*2+mmsize*1]
yading@11 280 psraw m0, 7 ; scale down to 8 bits
yading@11 281 psraw m1, 7
yading@11 282 packuswb m0, m1 ; words -> bytes with unsigned saturation
yading@11 283 mov%2 [dstq+wq], m0
yading@11 284 %elif %1 == 16
yading@11 285 paddd m0, m4, [srcq+wq*4+mmsize*0] ; 32-bit source + constant in m4 (pd_4 or pd_4min0x40000)
yading@11 286 paddd m1, m4, [srcq+wq*4+mmsize*1]
yading@11 287 paddd m2, m4, [srcq+wq*4+mmsize*2]
yading@11 288 paddd m3, m4, [srcq+wq*4+mmsize*3]
yading@11 289 psrad m0, 3 ; scale down by 3 bits
yading@11 290 psrad m1, 3
yading@11 291 psrad m2, 3
yading@11 292 psrad m3, 3
yading@11 293 %if cpuflag(sse4) ; avx/sse4
yading@11 294 packusdw m0, m1 ; dwords -> words with unsigned saturation
yading@11 295 packusdw m2, m3
yading@11 296 %else ; mmx/sse2
yading@11 297 packssdw m0, m1 ; signed pack of the biased values (bias comes from pd_4min0x40000)
yading@11 298 packssdw m2, m3
yading@11 299 paddw m0, m5 ; m5 = minshort (0x8000): removes the bias after packing
yading@11 300 paddw m2, m5
yading@11 301 %endif ; mmx/sse2/sse4/avx
yading@11 302 mov%2 [dstq+wq*2+mmsize*0], m0
yading@11 303 mov%2 [dstq+wq*2+mmsize*1], m2
yading@11 304 %else ; %1 == 9/10
yading@11 305 paddsw m0, m2, [srcq+wq*2+mmsize*0] ; source words + constant in m2 (pw_32 or pw_16), saturating
yading@11 306 paddsw m1, m2, [srcq+wq*2+mmsize*1]
yading@11 307 psraw m0, 15 - %1 ; scale down to the output bit depth
yading@11 308 psraw m1, 15 - %1
yading@11 309 pmaxsw m0, m4 ; lower clamp: m4 is zero for 9/10-bit
yading@11 310 pmaxsw m1, m4
yading@11 311 pminsw m0, m3 ; upper bound held in m3 (pw_512 or pw_1024)
yading@11 312 pminsw m1, m3
yading@11 313 mov%2 [dstq+wq*2+mmsize*0], m0
yading@11 314 mov%2 [dstq+wq*2+mmsize*1], m1
yading@11 315 %endif
yading@11 316 add wq, mmsize ; wq is a negative index (negated by the caller macro) counting up towards 0
yading@11 317 jl .loop_%2
yading@11 318 %endmacro
yading@11 319
yading@11 320 %macro yuv2plane1_fn 3 ; %1 = output bit depth (8/9/10/16); %2 and %3 are forwarded to cglobal below
yading@11 321 cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset ; NOTE(review): presumably %3 = number of args/GPRs, %2 = xmm registers used (x86inc cglobal) - confirm
yading@11 322 movsxdifnidn wq, wd ; x86inc helper; presumably sign-extends w to pointer width where needed
yading@11 323 add wq, mmsize - 1 ; round w up to a multiple of mmsize
yading@11 324 and wq, ~(mmsize - 1)
yading@11 325 %if %1 == 8
yading@11 326 add dstq, wq ; point dst at the end of the (rounded-up) row: 1 byte per pixel
yading@11 327 %else ; %1 != 8
yading@11 328 lea dstq, [dstq+wq*2] ; point dst at the end of the row: 2 bytes per pixel
yading@11 329 %endif ; %1 == 8
yading@11 330 %if %1 == 16
yading@11 331 lea srcq, [srcq+wq*4] ; point src at the end of the row: 4 bytes per sample
yading@11 332 %else ; %1 != 16
yading@11 333 lea srcq, [srcq+wq*2] ; point src at the end of the row: 2 bytes per sample
yading@11 334 %endif ; %1 == 16
yading@11 335 neg wq ; wq becomes a negative index that the main loop counts up to 0
yading@11 336
yading@11 337 %if %1 == 8
yading@11 338 pxor m4, m4 ; zero
yading@11 339
yading@11 340 ; create registers holding dither
yading@11 341 movq m3, [ditherq] ; dither
yading@11 342 test offsetd, offsetd ; offset == 0: keep the dither bytes in their loaded order
yading@11 343 jz .no_rot
yading@11 344 %if mmsize == 16
yading@11 345 punpcklqdq m3, m3 ; duplicate the low qword; presumably so the 3-byte shift below wraps around
yading@11 346 %endif ; mmsize == 16
yading@11 347 PALIGNR m3, m3, 3, m2 ; non-zero offset: shift the dither bytes by 3 (m2 is scratch)
yading@11 348 .no_rot:
yading@11 349 %if mmsize == 8
yading@11 350 mova m2, m3
yading@11 351 punpckhbw m3, m4 ; byte->word
yading@11 352 punpcklbw m2, m4 ; byte->word
yading@11 353 %else
yading@11 354 punpcklbw m3, m4 ; all 8 dither bytes -> words
yading@11 355 mova m2, m3 ; m2 and m3 both hold the same 8 dither words
yading@11 356 %endif
yading@11 357 %elif %1 == 9
yading@11 358 pxor m4, m4 ; m4 = 0: lower clamp for pmaxsw
yading@11 359 mova m3, [pw_512] ; m3 = upper bound for pminsw
yading@11 360 mova m2, [pw_32] ; m2 = value added before the shift
yading@11 361 %elif %1 == 10
yading@11 362 pxor m4, m4 ; m4 = 0: lower clamp for pmaxsw
yading@11 363 mova m3, [pw_1024] ; m3 = upper bound for pminsw
yading@11 364 mova m2, [pw_16] ; m2 = value added before the shift
yading@11 365 %else ; %1 == 16
yading@11 366 %if cpuflag(sse4) ; sse4/avx
yading@11 367 mova m4, [pd_4] ; m4 = value added before psrad 3
yading@11 368 %else ; mmx/sse2
yading@11 369 mova m4, [pd_4min0x40000] ; m4 = same value plus a bias, since this path packs with packssdw
yading@11 370 mova m5, [minshort] ; m5 = 0x8000, added after packing to remove the bias
yading@11 371 %endif ; mmx/sse2/sse4/avx
yading@11 372 %endif ; %1 == ..
yading@11 373
yading@11 374 ; actual pixel scaling
yading@11 375 %if mmsize == 8
yading@11 376 yuv2plane1_mainloop %1, a
yading@11 377 %else ; mmsize == 16
yading@11 378 test dstq, 15 ; pick aligned or unaligned stores based on dst's 16-byte alignment
yading@11 379 jnz .unaligned
yading@11 380 yuv2plane1_mainloop %1, a
yading@11 381 REP_RET
yading@11 382 .unaligned:
yading@11 383 yuv2plane1_mainloop %1, u
yading@11 384 %endif ; mmsize == 8/16
yading@11 385 REP_RET
yading@11 386 %endmacro
yading@11 387
yading@11 388 %if ARCH_X86_32
yading@11 389 INIT_MMX mmx ; the 8-byte-register versions are only assembled for 32-bit x86
yading@11 390 yuv2plane1_fn 8, 0, 5
yading@11 391 yuv2plane1_fn 16, 0, 3
yading@11 392
yading@11 393 INIT_MMX mmxext ; the 9/10-bit main loop uses pmaxsw/pminsw, hence mmxext rather than plain mmx
yading@11 394 yuv2plane1_fn 9, 0, 3
yading@11 395 yuv2plane1_fn 10, 0, 3
yading@11 396 %endif
yading@11 397
yading@11 398 INIT_XMM sse2
yading@11 399 yuv2plane1_fn 8, 5, 5
yading@11 400 yuv2plane1_fn 9, 5, 3
yading@11 401 yuv2plane1_fn 10, 5, 3
yading@11 402 yuv2plane1_fn 16, 6, 3
yading@11 403
yading@11 404 INIT_XMM sse4
yading@11 405 yuv2plane1_fn 16, 5, 3 ; only the 16-bit version has a cpuflag(sse4) branch (packusdw), so only it is rebuilt here
yading@11 406
yading@11 407 %if HAVE_AVX_EXTERNAL
yading@11 408 INIT_XMM avx
yading@11 409 yuv2plane1_fn 8, 5, 5
yading@11 410 yuv2plane1_fn 9, 5, 3
yading@11 411 yuv2plane1_fn 10, 5, 3
yading@11 412 yuv2plane1_fn 16, 5, 3
yading@11 413 %endif