;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_5

section .text

; %1: unpack suffix (bw), %2: dst_low, %3: dst_high (also the src), %4: zero
; zero-extends one vector of packed bytes into two vectors of 16-bit words
%macro UNPACK_8TO16 4
    mova      m%2, m%3
    punpckh%1 m%3, m%4
    punpckl%1 m%2, m%4
%endmacro

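; Scatters four words of %5 to the destinations %1-%4.  With SSE4, %6 is the
; index of the first word lane to extract; otherwise %6 is a scratch GPR used
; to shuttle the four lowest words out to memory.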
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw    %1, %5, %6+0
    pextrw    %2, %5, %6+1
    pextrw    %3, %5, %6+2
    pextrw    %4, %5, %6+3
%else
    movd      %6d, %5
%if mmsize==16
    psrldq    %5, 4
%else
    psrlq     %5, 32
%endif
    mov       %1, %6w
    shr       %6, 16
    mov       %2, %6w
    movd      %6d, %5
    mov       %3, %6w
    shr       %6, 16
    mov       %4, %6w
%endif
%endmacro

; in: p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
    psubw     %1, %4
    psubw     %2, %3
    paddw     %1, %1
    pmullw    %2, [pw_5]
    psubw     %1, %2
    paddw     %1, [pw_4]
    psraw     %1, 3
%endmacro
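
; Instruction by instruction this evaluates the formula above on 16-bit lanes
; (the inputs must already be zero-extended from bytes).  A rough per-lane
; sketch, for illustration only:
;     t  = 2 * (p1 - q1);        ; psubw %1, %4  /  paddw %1, %1
;     u  = 5 * (p0 - q0);        ; psubw %2, %3  /  pmullw %2, [pw_5]
;     %1 = (t - u + 4) >> 3;     ; psubw %1, %2 / paddw %1, [pw_4] / psraw %1, 3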

; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
    PABSW     m4, m7
    PABSW     m3, m6
    PABSW     m2, m5
    mova      m6, m4
    pminsw    m3, m2
    pcmpgtw   m6, m3 ; if (a2 < a0 || a1 < a0)
    psubw     m3, m4
    pmullw    m3, [pw_5] ; 5*(a3 - a0)
    PABSW     m2, m3
    psraw     m2, 3 ; abs(d/8)
    pxor      m7, m3 ; d_sign ^= a0_sign

    pxor      m5, m5
    movd      m3, r2d
%if %1 > 4
    punpcklbw m3, m3
%endif
    punpcklbw m3, m5
    pcmpgtw   m3, m4 ; if (a0 < pq)
    pand      m6, m3

    mova      m3, m0
    psubw     m3, m1
    PABSW     m4, m3
    psraw     m4, 1
    pxor      m3, m7 ; d_sign ^ clip_sign
    psraw     m3, 15
    pminsw    m2, m4 ; min(d, clip)
    pcmpgtw   m4, m5
    pand      m6, m4 ; filt3 (C return value)

    ; a group of 4 pixels is only filtered if its 3rd pixel would be filtered
%if mmsize==16
    pshuflw   m4, m6, 0xaa
%if %1 > 4
    pshufhw   m4, m4, 0xaa
%endif
%else
    pshufw    m4, m6, 0xaa
%endif
    pandn     m3, m4
    pand      m2, m6
    pand      m3, m2 ; d final

    psraw     m7, 15
    pxor      m3, m7
    psubw     m3, m7
    psubw     m0, m3
    paddw     m1, m3
    packuswb  m0, m0
    packuswb  m1, m1
%endmacro
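
; For reference, a rough scalar sketch of the per-pixel decision the two
; macros above implement branch-free with masks.  It is modelled on the C
; loop filter (vc1_filter_line() in libavcodec/vc1dsp.c) and is for
; illustration only; x[i] stands for the pixel i steps across the edge
; (x[-1] = p0, x[0] = q0) and pq is the quantizer threshold held in r2:
;
;     a0 = (2*(x[-2] - x[ 1]) - 5*(x[-1] - x[ 0]) + 4) >> 3;
;     if (FFABS(a0) < pq) {
;         a1 = FFABS((2*(x[-4] - x[-1]) - 5*(x[-3] - x[-2]) + 4) >> 3);
;         a2 = FFABS((2*(x[ 0] - x[ 3]) - 5*(x[ 1] - x[ 2]) + 4) >> 3);
;         if (a1 < FFABS(a0) || a2 < FFABS(a0)) {
;             clip      = x[-1] - x[0];
;             clip_sign = clip >> 31;
;             clip      = FFABS(clip) >> 1;
;             if (clip) {
;                 d      = 5 * (FFMIN(a1, a2) - FFABS(a0));
;                 d_sign = (d >> 31) ^ (a0 >> 31);
;                 d      = FFABS(d) >> 3;
;                 if (d_sign == clip_sign) {
;                     d     = FFMIN(d, clip);
;                     d     = (d ^ clip_sign) - clip_sign;   // restore sign
;                     x[-1] = av_clip_uint8(x[-1] - d);      // p0'
;                     x[ 0] = av_clip_uint8(x[ 0] + d);      // q0'
;                 }
;             }
;         }
;     }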

; 1st param: size of filter
; 2nd param: mov size suffix matching the filter size (d for 4 pixels, q for 8)
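; Expects the setup from START_V_FILTER (defined below): r0 = first row below
; the horizontal edge, r4 = r0 - 4*stride, r1 = stride, r3 = 3*stride and
; r2 = pq replicated into its low bytes.  Loads the four rows above and below
; the edge, derives a1/a0/a2 with VC1_LOOP_FILTER_A0 and stores the filtered
; p0/q0 rows back.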
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5
    mov%2     m6, [r4]
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
    mov%2     m1, [r0]
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4

    VC1_FILTER %1
    mov%2     [r4+r3], m0
    mov%2     [r0], m1
%endmacro

; 1st param: size of filter
; NOTE: after the transpose, this many 8-bit pixels sit in the low half of
;       each register; UNPACK_8TO16 then widens them to words
; 2nd (optional) param: temp GPR for STORE_4_WORDS (not needed with SSE4)
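; Expects the setup from START_H_FILTER (defined below): r0 = first row of the
; block, r1 = stride, r3 = 3*stride, r4 = r0 + 4*stride (8-pixel case) and
; r2 = pq replicated into its low bytes.  Reads 8 bytes per row starting
; 4 pixels left of the vertical edge, transposes so the filter works on
; columns, then scatters the filtered p0/q0 pairs back with STORE_4_WORDS.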
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq      m0, [r0     -4]
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq      m0, [r0     -4]
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor      m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
    UNPACK_8TO16 bw, 4, 2, 5
    mova      m0, m1 ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
    UNPACK_8TO16 bw, 1, 3, 5
    mova      m5, m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
    SWAP 1, 4 ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1
%if %0 > 1
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq    m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro

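; Common setup for the vertical (horizontal-edge) filters: r4 = src - 4*stride,
; r3 = 3*stride, and pq (r2) replicated into four bytes so VC1_FILTER can
; broadcast it with movd + punpcklbw.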
%macro START_V_FILTER 0
    mov       r4, r0
    lea       r3, [4*r1]
    sub       r4, r3
    lea       r3, [r1+2*r1]
    imul      r2, 0x01010101
%endmacro

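; Common setup for the horizontal (vertical-edge) filters: r3 = 3*stride,
; r4 = src + 4*stride for the 8-pixel filters, and pq (r2) replicated into
; four bytes as above.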
%macro START_H_FILTER 1
    lea       r3, [r1+2*r1]
%if %1 > 4
    lea       r4, [r0+4*r1]
%endif
    imul      r2, 0x01010101
%endmacro

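; mmxext instantiations: the 4-pixel filters are thin wrappers around the
; *_internal helpers, and the 8-pixel filters simply run the 4-pixel internal
; twice, offset by four pixels.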
%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    add       r4, 4
    add       r0, 4
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    lea       r0, [r0+4*r1]
    call vc1_h_loop_filter_internal
    RET
%endmacro

INIT_MMX mmxext
VC1_LF

INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET