annotate ffmpeg/libavfilter/x86/yadif-16.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 ;*****************************************************************************
yading@11 2 ;* x86-optimized functions for yadif filter
yading@11 3 ;*
yading@11 4 ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
yading@11 5 ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
yading@11 6 ;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
yading@11 7 ;*
yading@11 8 ;* This file is part of FFmpeg.
yading@11 9 ;*
yading@11 10 ;* FFmpeg is free software; you can redistribute it and/or modify
yading@11 11 ;* it under the terms of the GNU General Public License as published by
yading@11 12 ;* the Free Software Foundation; either version 2 of the License, or
yading@11 13 ;* (at your option) any later version.
yading@11 14 ;*
yading@11 15 ;* FFmpeg is distributed in the hope that it will be useful,
yading@11 16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
yading@11 18 ;* GNU General Public License for more details.
yading@11 19 ;*
yading@11 20 ;* You should have received a copy of the GNU General Public License along
yading@11 21 ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
yading@11 22 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
yading@11 23 ;******************************************************************************
yading@11 24
yading@11 25 %include "libavutil/x86/x86util.asm"
yading@11 26
; Read-only vector constants (16 bytes each) referenced by the macros below.
yading@11 27 SECTION_RODATA
yading@11 28
; pw_1: eight words of 1 -- CHECK uses it to isolate the low bit of each word.
yading@11 29 pw_1: times 8 dw 1
; pw_8000: eight words of 0x8000 -- added back by the non-SSE4 path of PACK.
yading@11 30 pw_8000: times 8 dw 0x8000
; pd_1: four dwords of 1 -- used by CHECK2 and FILTER.
yading@11 31 pd_1: times 4 dd 1
; pd_8000: four dwords of 0x8000 -- subtracted by the non-SSE4 path of PACK.
yading@11 32 pd_8000: times 4 dd 0x8000
yading@11 33
yading@11 34 SECTION .text
yading@11 35
; PIXSHIFT1 reg
; Shifts %1 right by 16 bits, i.e. by one 16-bit sample.
; With sse2 the whole register is shifted right by 2 bytes (psrldq);
; otherwise a 16-bit quadword shift (psrlq) is used.
; NOTE(review): not invoked anywhere in the visible source; CHECK open-codes
; the same shift keyed on mmsize instead.
yading@11 36 %macro PIXSHIFT1 1
yading@11 37 %if cpuflag(sse2)
yading@11 38 psrldq %1, 2
yading@11 39 %else
yading@11 40 psrlq %1, 16
yading@11 41 %endif
yading@11 42 %endmacro
yading@11 43
; PIXSHIFT2 reg
; Shifts %1 right by 32 bits, i.e. by two 16-bit samples.
; With sse2 the whole register is shifted right by 4 bytes (psrldq);
; otherwise a 32-bit quadword shift (psrlq) is used.
; NOTE(review): not invoked anywhere in the visible source; CHECK and FILTER
; open-code the same shift keyed on mmsize instead.
yading@11 44 %macro PIXSHIFT2 1
yading@11 45 %if cpuflag(sse2)
yading@11 46 psrldq %1, 4
yading@11 47 %else
yading@11 48 psrlq %1, 32
yading@11 49 %endif
yading@11 50 %endmacro
yading@11 51
; PABS reg, tmp
; Replaces each signed dword in %1 with its absolute value.
; %2 is a scratch register: it is overwritten on the non-SSSE3 path and left
; untouched on the SSSE3 path.
yading@11 52 %macro PABS 2
yading@11 53 %if cpuflag(ssse3)
yading@11 54 pabsd %1, %1
yading@11 55 %else
; Fallback: build a per-lane sign mask, then negate the negative lanes
; with (x ^ mask) - mask.
yading@11 56 pxor %2, %2
; %2 = all-ones in lanes where 0 > %1, zero elsewhere.
yading@11 57 pcmpgtd %2, %1
yading@11 58 pxor %1, %2
yading@11 59 psubd %1, %2
yading@11 60 %endif
yading@11 61 %endmacro
yading@11 62
; PACK reg
; Narrows the dwords in %1 to words with unsigned saturation.
; Both pack operands are %1, so the packed words appear in both halves of
; the register.
yading@11 63 %macro PACK 1
yading@11 64 %if cpuflag(sse4)
yading@11 65 packusdw %1, %1
yading@11 66 %else
; Fallback without packusdw: bias by -0x8000 so the signed-saturating pack
; clamps to the biased range, then add 0x8000 back to every word.
yading@11 67 psubd %1, [pd_8000]
yading@11 68 packssdw %1, %1
yading@11 69 paddw %1, [pw_8000]
yading@11 70 %endif
yading@11 71 %endmacro
yading@11 72
; PMINSD dst, src, tmp
; %1 = per-lane signed dword minimum of %1 and %2.
; %3 is a scratch register, overwritten only on the non-SSE4 path.
yading@11 73 %macro PMINSD 3
yading@11 74 %if cpuflag(sse4)
yading@11 75 pminsd %1, %2
yading@11 76 %else
; Fallback: mask = (%2 > %1); result = (%1 & mask) | (%2 & ~mask).
yading@11 77 mova %3, %2
yading@11 78 pcmpgtd %3, %1
yading@11 79 pand %1, %3
yading@11 80 pandn %3, %2
yading@11 81 por %1, %3
yading@11 82 %endif
yading@11 83 %endmacro
yading@11 84
; PMAXSD dst, src, tmp
; %1 = per-lane signed dword maximum of %1 and %2.
; %3 is a scratch register, overwritten only on the non-SSE4 path.
yading@11 85 %macro PMAXSD 3
yading@11 86 %if cpuflag(sse4)
yading@11 87 pmaxsd %1, %2
yading@11 88 %else
; Fallback: mask = (%1 > %2); result = (%1 & mask) | (%2 & ~mask).
yading@11 89 mova %3, %1
yading@11 90 pcmpgtd %3, %2
yading@11 91 pand %1, %3
yading@11 92 pandn %3, %2
yading@11 93 por %1, %3
yading@11 94 %endif
yading@11 95 %endmacro
yading@11 96
; PMAXUW dst, src
; %1 = per-lane unsigned word maximum of %1 and %2. %2 is not modified.
yading@11 97 %macro PMAXUW 2
yading@11 98 %if cpuflag(sse4)
yading@11 99 pmaxuw %1, %2
yading@11 100 %else
; Fallback: the saturating subtract gives max(%1 - %2, 0); adding %2 back
; yields %1 where %1 >= %2 and %2 otherwise.
yading@11 101 psubusw %1, %2
yading@11 102 paddusw %1, %2
yading@11 103 %endif
yading@11 104 %endmacro
yading@11 105
; CHECK off1, off2
; Evaluates one candidate pair of rows for FILTER.
; Inputs:
;   %1  sample offset (scaled by 2 bytes) applied to the row at curq+t1
;   %2  sample offset (scaled by 2 bytes) applied to the row at curq+t0
;   m7  must be zero (FILTER clears it at the top of every loop iteration)
; Outputs (zero-extended to dwords):
;   m5  floor average of the two rows, taken starting one word in
;   m2  sum of the absolute differences of three adjacent words
; Clobbers m3 and m4.
yading@11 106 %macro CHECK 2
yading@11 107 movu m2, [curq+t1+%1*2]
yading@11 108 movu m3, [curq+t0+%2*2]
yading@11 109 mova m4, m2
yading@11 110 mova m5, m2
; pavgw rounds up; subtracting the low bit of (a ^ b) turns the result into
; the rounded-down average.
yading@11 111 pxor m4, m3
yading@11 112 pavgw m5, m3
yading@11 113 pand m4, [pw_1]
yading@11 114 psubusw m5, m4
; Drop the first word so that the average lines up with the middle word of
; the three-word difference window computed below.
yading@11 115 %if mmsize == 16
yading@11 116 psrldq m5, 2
yading@11 117 %else
yading@11 118 psrlq m5, 16
yading@11 119 %endif
; Zero-extend the low words to dwords (m7 == 0).
yading@11 120 punpcklwd m5, m7
; m2 = |a - b| per unsigned word: max of the two saturating differences.
yading@11 121 mova m4, m2
yading@11 122 psubusw m2, m3
yading@11 123 psubusw m3, m4
yading@11 124 PMAXUW m2, m3
; m3 / m4 = the same differences shifted by one / two words.
yading@11 125 mova m3, m2
yading@11 126 mova m4, m2
yading@11 127 %if mmsize == 16
yading@11 128 psrldq m3, 2
yading@11 129 psrldq m4, 4
yading@11 130 %else
yading@11 131 psrlq m3, 16
yading@11 132 psrlq m4, 32
yading@11 133 %endif
; Widen to dwords and sum the three neighbouring differences into m2.
yading@11 134 punpcklwd m2, m7
yading@11 135 punpcklwd m3, m7
yading@11 136 punpcklwd m4, m7
yading@11 137 paddd m2, m3
yading@11 138 paddd m2, m4
yading@11 139 %endmacro
yading@11 140
; CHECK1
; Per-lane "keep the better candidate" step.
; Inputs:  m0 = best score so far, m1 = value paired with it,
;          m2 = new score, m5 = new value (both as left by CHECK).
; Outputs: m0 = min(m0, m2); m1 = m5 in lanes where the old m0 was greater
;          than m2, unchanged elsewhere; m6 = that comparison mask
;          (all-ones where the new candidate won), consumed by CHECK2.
; Clobbers m3 and m5.
yading@11 141 %macro CHECK1 0
; m3 = mask of lanes where the current score m0 is greater than m2.
yading@11 142 mova m3, m0
yading@11 143 pcmpgtd m3, m2
; m6 is only scratch for PMINSD here; it is overwritten with the mask next.
yading@11 144 PMINSD m0, m2, m6
yading@11 145 mova m6, m3
; m1 = (m5 & mask) | (m1 & ~mask)
yading@11 146 pand m5, m3
yading@11 147 pandn m3, m1
yading@11 148 por m3, m5
yading@11 149 mova m1, m3
yading@11 150 %endmacro
yading@11 151
; CHECK2
; Same selection step as CHECK1, but gated on the mask CHECK1 left in m6.
; Inputs:  m0, m1, m2, m5 as for CHECK1; m6 = CHECK1 mask (all-ones or zero
;          per lane -- the intervening CHECK in FILTER does not touch m6).
; Outputs: m0 = min(m0, penalised m2); m1 updated in lanes where the
;          penalised score is lower than m0.
; Clobbers m2, m3, m4, m5 and m6.
yading@11 152 %macro CHECK2 0
; mask + 1 is 0 where CHECK1 selected its candidate and 1 elsewhere; shifted
; left by 30 this becomes a penalty of 0 or 0x40000000 added to the score.
yading@11 153 paddd m6, [pd_1]
yading@11 154 pslld m6, 30
yading@11 155 paddd m2, m6
; m3 = mask of lanes where m0 is greater than the penalised score.
yading@11 156 mova m3, m0
yading@11 157 pcmpgtd m3, m2
yading@11 158 PMINSD m0, m2, m4
; m1 = (m5 & mask) | (m1 & ~mask)
yading@11 159 pand m5, m3
yading@11 160 pandn m3, m1
yading@11 161 por m3, m5
yading@11 162 mova m1, m3
yading@11 163 %endmacro
yading@11 164
yading@11 165 ; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
yading@11 166 ; am not sure whether it is any faster. A rewrite or refactor of the filter
yading@11 167 ; code should make it possible to eliminate the move instruction at the end. It
yading@11 168 ; exists to satisfy the expectation that the "score" values are in m1.
yading@11 169
yading@11 170 ; %macro CHECK2 0
yading@11 171 ; mova m3, m0
yading@11 172 ; pcmpgtd m0, m2
yading@11 173 ; pand m0, m6
yading@11 174 ; mova m6, m0
yading@11 175 ; pand m5, m6
yading@11 176 ; pand m2, m0
yading@11 177 ; pandn m6, m1
yading@11 178 ; pandn m0, m3
yading@11 179 ; por m6, m5
yading@11 180 ; por m0, m2
yading@11 181 ; mova m1, m6
yading@11 182 ; %endmacro
yading@11 183
; LOAD reg, mem
; Loads 16-bit samples from %2 into %1 and interleaves them with the words of
; m7, which zero-extends them to dwords when m7 is zero (FILTER clears m7 at
; the top of every loop iteration).
; NOTE(review): movh comes from the included x86util/x86inc macros and
; presumably loads half a register's worth of data -- confirm there.
yading@11 184 %macro LOAD 2
yading@11 185 movh %1, %2
yading@11 186 punpcklwd %1, m7
yading@11 187 %endmacro
yading@11 188
; FILTER suffix, ptrA, ptrB
; Main loop body of the line filter; one expansion per parity branch.
;   %1  suffix that makes the .loop/.end labels unique per expansion
;   %2, %3  the two row pointers that are averaged for the temporal term
; Register expectations (set up by YADIF): dstq/prevq/curq/nextq are the row
; pointers, t0 holds prefs and t1 holds mrefs. r4m is the remaining width and
; r8m the mode argument.
; Stack slots used: [rsp+0] = row at curq+t1, [rsp+16] = average of %2 and %3,
; [rsp+32] = row at curq+t0, [rsp+48] = temporal difference bound.
; Each iteration produces mmsize/4 output samples (mmsize/2 bytes).
yading@11 189 %macro FILTER 3
yading@11 190 .loop%1:
; m7 is the zero register for LOAD/CHECK; it is reused as scratch further
; down, so it has to be cleared again on every iteration.
yading@11 191 pxor m7, m7
; m0 = cur[t1], m1 = cur[t0], m2 = %2[0], m3 = %3[0] (all widened to dwords).
yading@11 192 LOAD m0, [curq+t1]
yading@11 193 LOAD m1, [curq+t0]
yading@11 194 LOAD m2, [%2]
yading@11 195 LOAD m3, [%3]
; m3 = (%2[0] + %3[0]) >> 1, the temporal average; m4 keeps %3[0].
yading@11 196 mova m4, m3
yading@11 197 paddd m3, m2
yading@11 198 psrad m3, 1
; Spill the three values that are needed again after the CHECK sequence.
yading@11 199 mova [rsp+ 0], m0
yading@11 200 mova [rsp+16], m3
yading@11 201 mova [rsp+32], m1
; m2 = |%2[0] - %3[0]|
yading@11 202 psubd m2, m4
yading@11 203 PABS m2, m4
; m3 = |prev[t1] - cur[t1]| + |prev[t0] - cur[t0]|
yading@11 204 LOAD m3, [prevq+t1]
yading@11 205 LOAD m4, [prevq+t0]
yading@11 206 psubd m3, m0
yading@11 207 psubd m4, m1
yading@11 208 PABS m3, m5
yading@11 209 PABS m4, m5
yading@11 210 paddd m3, m4
; Halve both terms and keep the larger one in m2.
yading@11 211 psrld m2, 1
yading@11 212 psrld m3, 1
yading@11 213 PMAXSD m2, m3, m6
; Same pair of differences against the next field, halved and folded into m2.
yading@11 214 LOAD m3, [nextq+t1]
yading@11 215 LOAD m4, [nextq+t0]
yading@11 216 psubd m3, m0
yading@11 217 psubd m4, m1
yading@11 218 PABS m3, m5
yading@11 219 PABS m4, m5
yading@11 220 paddd m3, m4
yading@11 221 psrld m3, 1
yading@11 222 PMAXSD m2, m3, m6
; [rsp+48] = temporal difference bound (max of the three halved terms).
yading@11 223 mova [rsp+48], m2
yading@11 224
; m1 = (cur[t1] + cur[t0]) >> 1, the initial spatial prediction;
; m0 = |cur[t1] - cur[t0]|, the start of the initial spatial score.
yading@11 225 paddd m1, m0
yading@11 226 paddd m0, m0
yading@11 227 psubd m0, m1
yading@11 228 psrld m1, 1
yading@11 229 PABS m0, m2
yading@11 230
; Add the absolute differences of the samples one position to the left and
; one position to the right: the rows are loaded starting one sample early,
; so word 0 is the left neighbour and word 2 the right neighbour.
yading@11 231 movu m2, [curq+t1-1*2]
yading@11 232 movu m3, [curq+t0-1*2]
yading@11 233 mova m4, m2
yading@11 234 psubusw m2, m3
yading@11 235 psubusw m3, m4
yading@11 236 PMAXUW m2, m3
yading@11 237 %if mmsize == 16
yading@11 238 mova m3, m2
yading@11 239 psrldq m3, 4
yading@11 240 %else
yading@11 241 mova m3, m2
yading@11 242 psrlq m3, 32
yading@11 243 %endif
yading@11 244 punpcklwd m2, m7
yading@11 245 punpcklwd m3, m7
yading@11 246 paddd m0, m2
yading@11 247 paddd m0, m3
; Subtract 1 from the initial score.
yading@11 248 psubd m0, [pd_1]
yading@11 249
; Try the diagonal candidates. Each CHECK leaves a score in m2 and a value in
; m5; CHECK1/CHECK2 keep the best score in m0 and its value in m1. A CHECK2
; candidate can only win in lanes where the preceding CHECK1 candidate won.
yading@11 250 CHECK -2, 0
yading@11 251 CHECK1
yading@11 252 CHECK -3, 1
yading@11 253 CHECK2
yading@11 254 CHECK 0, -2
yading@11 255 CHECK1
yading@11 256 CHECK 1, -3
yading@11 257 CHECK2
yading@11 258
; m6 = temporal difference bound. The refinement below is skipped when the
; mode argument (r8m) is 2 or greater.
yading@11 259 mova m6, [rsp+48]
yading@11 260 cmp DWORD r8m, 2
yading@11 261 jge .end%1
; m2 = average of %2 and %3 at offset t1*2; m3 = the same at offset t0*2.
yading@11 262 LOAD m2, [%2+t1*2]
yading@11 263 LOAD m4, [%3+t1*2]
yading@11 264 LOAD m3, [%2+t0*2]
yading@11 265 LOAD m5, [%3+t0*2]
yading@11 266 paddd m2, m4
yading@11 267 paddd m3, m5
yading@11 268 psrld m2, 1
yading@11 269 psrld m3, 1
; Reload the spilled values: m4 = cur[t1], m5 = temporal average,
; m7 = cur[t0] (m7 stops being zero from here on).
yading@11 270 mova m4, [rsp+ 0]
yading@11 271 mova m5, [rsp+16]
yading@11 272 mova m7, [rsp+32]
; Four signed differences: m2 and m3 against the current rows, m5 and m0 =
; temporal average minus cur[t1] / cur[t0].
yading@11 273 psubd m2, m4
yading@11 274 psubd m3, m7
yading@11 275 mova m0, m5
yading@11 276 psubd m5, m4
yading@11 277 psubd m0, m7
yading@11 278 mova m4, m2
; m2 = max(m5, m0, min(m2, m3)); m3 = min(m5, m0, max(m2, m3)).
; m7 is only scratch for the min/max macros.
yading@11 279 PMINSD m2, m3, m7
yading@11 280 PMAXSD m3, m4, m7
yading@11 281 PMAXSD m2, m5, m7
yading@11 282 PMINSD m3, m5, m7
yading@11 283 PMAXSD m2, m0, m7
yading@11 284 PMINSD m3, m0, m7
; m6 = max(m6, m3, -m2)
yading@11 285 pxor m4, m4
yading@11 286 PMAXSD m6, m3, m7
yading@11 287 psubd m4, m2
yading@11 288 PMAXSD m6, m4, m7
yading@11 289
yading@11 290 .end%1:
; Clamp the spatial prediction m1 to [average - m6, average + m6].
yading@11 291 mova m2, [rsp+16]
yading@11 292 mova m3, m2
yading@11 293 psubd m2, m6
yading@11 294 paddd m3, m6
yading@11 295 PMAXSD m1, m2, m7
yading@11 296 PMINSD m1, m3, m7
; Narrow back to 16-bit samples and store them.
yading@11 297 PACK m1
yading@11 298
yading@11 299 movh [dstq], m1
; Advance every row pointer by the mmsize/2 bytes just produced and decrease
; the remaining width by mmsize/4 samples; loop while it is still positive.
yading@11 300 add dstq, mmsize/2
yading@11 301 add prevq, mmsize/2
yading@11 302 add curq, mmsize/2
yading@11 303 add nextq, mmsize/2
yading@11 304 sub DWORD r4m, mmsize/4
yading@11 305 jg .loop%1
yading@11 306 %endmacro
yading@11 307
; YADIF
; Emits yadif_filter_line_16bit for the instruction set selected by the
; preceding INIT_* line.
; Named arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity,
; mode. Only the first four are requested in registers; w, parity and mode are
; accessed through their r4m / paritym / r8m forms.
; The cglobal line also asks for 6 (x86_32) or 7 (x86_64) general-purpose
; registers, 8 vector registers and 80 bytes of stack, of which FILTER uses
; offsets 0 through 48.
yading@11 308 %macro YADIF 0
yading@11 309 %if ARCH_X86_32
yading@11 310 cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
yading@11 311 prefs, mrefs, parity, mode
yading@11 312 %else
yading@11 313 cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
yading@11 314 prefs, mrefs, parity, mode
yading@11 315 %endif
; Load prefs into t0 and mrefs into t1. On x86_32 r4 and r5 are used as the
; temporaries; on x86_64 the 32-bit values are sign-extended into r5 and r6.
yading@11 316 %if ARCH_X86_32
yading@11 317 mov r4, r5mp
yading@11 318 mov r5, r6mp
yading@11 319 DECLARE_REG_TMP 4,5
yading@11 320 %else
yading@11 321 movsxd r5, DWORD r5m
yading@11 322 movsxd r6, DWORD r6m
yading@11 323 DECLARE_REG_TMP 5,6
yading@11 324 %endif
yading@11 325
; Non-zero parity: average prev and cur for the temporal term.
; Zero parity: average cur and next instead.
yading@11 326 cmp DWORD paritym, 0
yading@11 327 je .parity0
yading@11 328 FILTER 1, prevq, curq
yading@11 329 jmp .ret
yading@11 330
yading@11 331 .parity0:
yading@11 332 FILTER 0, curq, nextq
yading@11 333
yading@11 334 .ret:
yading@11 335 RET
yading@11 336 %endmacro
yading@11 337
; Instantiate the filter once per supported instruction set. The cpuflag()
; and mmsize tests inside the macros above pick the matching code paths.
yading@11 338 INIT_XMM sse4
yading@11 339 YADIF
yading@11 340 INIT_XMM ssse3
yading@11 341 YADIF
yading@11 342 INIT_XMM sse2
yading@11 343 YADIF
; The MMX-register (mmxext) variant is only built for 32-bit x86.
yading@11 344 %if ARCH_X86_32
yading@11 345 INIT_MMX mmxext
yading@11 346 YADIF
yading@11 347 %endif