annotate ffmpeg/libavfilter/x86/yadif-10.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 ;*****************************************************************************
yading@11 2 ;* x86-optimized functions for yadif filter
yading@11 3 ;*
yading@11 4 ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
yading@11 5 ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
yading@11 6 ;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
yading@11 7 ;*
yading@11 8 ;* This file is part of FFmpeg.
yading@11 9 ;*
yading@11 10 ;* FFmpeg is free software; you can redistribute it and/or modify
yading@11 11 ;* it under the terms of the GNU General Public License as published by
yading@11 12 ;* the Free Software Foundation; either version 2 of the License, or
yading@11 13 ;* (at your option) any later version.
yading@11 14 ;*
yading@11 15 ;* FFmpeg is distributed in the hope that it will be useful,
yading@11 16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
yading@11 18 ;* GNU General Public License for more details.
yading@11 19 ;*
yading@11 20 ;* You should have received a copy of the GNU General Public License along
yading@11 21 ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
yading@11 22 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
yading@11 23 ;******************************************************************************
yading@11 24
yading@11 25 %include "libavutil/x86/x86util.asm"
yading@11 26
yading@11 27 SECTION_RODATA
yading@11 28
; pw_1: eight 16-bit words, each equal to 1 (16 bytes in total).
; Used below as a memory operand: as a per-word LSB mask in CHECK and as the
; per-word constant 1 subtracted from the initial score in FILTER.
yading@11 29 pw_1: times 8 dw 1
yading@11 30
yading@11 31 SECTION .text
yading@11 32
; PABS dst, tmp
;   Per-word absolute value of dst, computed in place (signed 16-bit lanes).
;   %1 = register to take the absolute value of (input and output)
;   %2 = scratch register; clobbered only on the non-SSSE3 path
yading@11 33 %macro PABS 2
yading@11 34 %if cpuflag(ssse3)
yading@11 35 pabsw %1, %1 ; single-instruction form
yading@11 36 %else
yading@11 37 pxor %2, %2 ; tmp = 0
yading@11 38 pcmpgtw %2, %1 ; tmp = all-ones in lanes where 0 > dst (dst negative), else 0
yading@11 39 pxor %1, %2 ; bitwise-invert the negative lanes
yading@11 40 psubw %1, %2 ; subtract -1 (i.e. add 1) in those lanes: two's-complement negate
yading@11 41 %endif
yading@11 42 %endmacro
yading@11 43
; PMAXUW dst, src
;   Per-word unsigned maximum: dst = max(dst, src) on unsigned 16-bit lanes.
;   %1 = first operand and destination
;   %2 = second operand (not modified)
yading@11 44 %macro PMAXUW 2
yading@11 45 %if cpuflag(sse4)
yading@11 46 pmaxuw %1, %2
yading@11 47 %else
; Emulation: the saturating subtract yields max(dst - src, 0); adding src back
; gives dst where dst > src and src otherwise.
yading@11 48 psubusw %1, %2
yading@11 49 paddusw %1, %2
yading@11 50 %endif
yading@11 51 %endmacro
yading@11 52
; CHECK off1, off0
;   Evaluates one candidate from the two rows at curq+t1 and curq+t0.
;   %1 = offset, in 16-bit samples, applied to the row at curq+t1
;   %2 = offset, in 16-bit samples, applied to the row at curq+t0
;   (both offsets are scaled by 2 to obtain a byte displacement)
;   Out: m5 = per-word truncating average of the two loaded vectors, moved down
;             by one word lane (lane i holds the average of loaded lane i+1)
;        m2 = per-word sum of the absolute differences of loaded lanes i, i+1, i+2
;   Clobbers: m3, m4
;   Because of the lane shifts, zeros are shifted into the top lane of m5 and
;   into the top two lanes of the m2 sum.
yading@11 53 %macro CHECK 2
yading@11 54 movu m2, [curq+t1+%1*2] ; a = row at t1, shifted by %1 samples
yading@11 55 movu m3, [curq+t0+%2*2] ; b = row at t0, shifted by %2 samples
yading@11 56 mova m4, m2
yading@11 57 mova m5, m2
yading@11 58 pxor m4, m3 ; a ^ b
yading@11 59 pavgw m5, m3 ; (a + b + 1) >> 1, i.e. average rounded up
yading@11 60 pand m4, [pw_1] ; 1 in lanes where a + b is odd
yading@11 61 psubusw m5, m4 ; remove the round-up: m5 = (a + b) >> 1
yading@11 62 %if mmsize == 16
yading@11 63 psrldq m5, 2 ; whole-register shift right by one word (2 bytes)
yading@11 64 %else
yading@11 65 psrlq m5, 16 ; same one-word shift for 8-byte registers
yading@11 66 %endif
yading@11 67 mova m4, m2
yading@11 68 psubusw m2, m3 ; max(a - b, 0)
yading@11 69 psubusw m3, m4 ; max(b - a, 0)
yading@11 70 PMAXUW m2, m3 ; one of the two is zero, so this is |a - b|
yading@11 71 mova m3, m2
yading@11 72 mova m4, m2
yading@11 73 %if mmsize == 16
yading@11 74 psrldq m3, 2 ; |a - b| moved down one lane
yading@11 75 psrldq m4, 4 ; |a - b| moved down two lanes
yading@11 76 %else
yading@11 77 psrlq m3, 16
yading@11 78 psrlq m4, 32
yading@11 79 %endif
yading@11 80 paddw m2, m3 ; score = sum of three horizontally adjacent |a - b|
yading@11 81 paddw m2, m4
yading@11 82 %endmacro
yading@11 83
; CHECK1
;   Folds the candidate left in m2/m5 by CHECK into the running best.
;   In:  m0 = best score so far, m1 = best prediction so far,
;        m2 = candidate score,   m5 = candidate prediction
;   Out: m0 = min(m0, m2) per signed word
;        m1 = m5 in lanes where the old m0 > m2, unchanged elsewhere
;        m6 = the per-lane "candidate won" mask (read by a following CHECK2)
;   Clobbers: m3, m5
yading@11 84 %macro CHECK1 0
yading@11 85 mova m3, m0
yading@11 86 pcmpgtw m3, m2 ; mask = all-ones where best score > candidate score
yading@11 87 pminsw m0, m2 ; keep the lower score
yading@11 88 mova m6, m3 ; remember which lanes improved
yading@11 89 pand m5, m3 ; candidate prediction in winning lanes
yading@11 90 pandn m3, m1 ; previous prediction in the other lanes
yading@11 91 por m3, m5 ; merge the two
yading@11 92 mova m1, m3
yading@11 93 %endmacro
yading@11 94
yading@11 95 ; %macro CHECK2 0
yading@11 96 ; paddw m6, [pw_1]
yading@11 97 ; psllw m6, 14
yading@11 98 ; paddsw m2, m6
yading@11 99 ; mova m3, m0
yading@11 100 ; pcmpgtw m3, m2
yading@11 101 ; pminsw m0, m2
yading@11 102 ; pand m5, m3
yading@11 103 ; pandn m3, m1
yading@11 104 ; por m3, m5
yading@11 105 ; mova m1, m3
yading@11 106 ; %endmacro
yading@11 107
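yading@11 108 ; This version of CHECK2 is required for 14-bit samples. The old code (kept
yading@11 109 ; commented out above) penalised lanes by adding a value shifted left by 14 to
; the score; that penalty is not large enough to correctly select pixels or
; scores when samples are 14 bits wide, so the mask is applied explicitly here.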
yading@11 110
; CHECK2
;   Like CHECK1, but a lane may only be updated if the preceding CHECK1 also
;   updated it (mask in m6).
;   In:  m0 = best score so far, m1 = best prediction so far,
;        m2 = candidate score,   m5 = candidate prediction,
;        m6 = "candidate won" mask left by CHECK1
;   Out: with mask = (m0 > m2, signed words) AND m6:
;        m0 = m2 where mask is set, old m0 elsewhere
;        m1 = m5 where mask is set, old m1 elsewhere
;   Clobbers: m2, m3, m5, m6
yading@11 111 %macro CHECK2 0
yading@11 112 mova m3, m0 ; save the current best score
yading@11 113 pcmpgtw m0, m2 ; all-ones where best score > candidate score
yading@11 114 pand m0, m6 ; ...and only where CHECK1 also improved
yading@11 115 mova m6, m0 ; m6 and m0 now both hold the combined mask
yading@11 116 pand m5, m6 ; candidate prediction in selected lanes
yading@11 117 pand m2, m0 ; candidate score in selected lanes
yading@11 118 pandn m6, m1 ; previous prediction in unselected lanes
yading@11 119 pandn m0, m3 ; previous score in unselected lanes
yading@11 120 por m6, m5 ; merged prediction
yading@11 121 por m0, m2 ; merged score
yading@11 122 mova m1, m6
yading@11 123 %endmacro
yading@11 124
; LOAD dst, src
;   Thin wrapper around movu: loads one full vector from src into dst with no
;   alignment requirement on src.
;   %1 = destination register, %2 = source operand
yading@11 125 %macro LOAD 2
yading@11 126 movu %1, %2
yading@11 127 %endmacro
yading@11 128
; FILTER suffix, ptrA, ptrB
;   Body of the per-line loop; expanded once per parity inside YADIF.
;   %1 = suffix appended to the local labels (.loop%1 / .end%1) so that two
;        expansions can coexist in one function
;   %2, %3 = the two pointer registers whose samples are averaged to form the
;        temporal prediction (prevq/curq or curq/nextq, chosen by YADIF)
;   Uses: dstq, prevq, curq, nextq; t0 and t1 (set from prefs and mrefs in YADIF);
;         r4m (the w argument) as loop counter; r8m (the mode argument).
;   Stack slots written and re-read here:
;     [rsp+ 0] = cur[t1]                 [rsp+16] = ([%2] + [%3]) >> 1
;     [rsp+32] = cur[t0]                 [rsp+48] = temporal difference
;   All samples are handled as 16-bit words. Each iteration stores mmsize bytes
;   to dst but advances by only mmsize-4 bytes (two samples fewer), so the top two
;   lanes are recomputed by the following iteration.
;   NOTE(review): the final store can extend past w samples - presumably the
;   caller guarantees padding; confirm.
yading@11 129 %macro FILTER 3
yading@11 130 .loop%1:
yading@11 131 pxor m7, m7
yading@11 132 LOAD m0, [curq+t1] ; m0 = cur[t1]
yading@11 133 LOAD m1, [curq+t0] ; m1 = cur[t0]
yading@11 134 LOAD m2, [%2]
yading@11 135 LOAD m3, [%3]
yading@11 136 mova m4, m3
yading@11 137 paddw m3, m2
yading@11 138 psraw m3, 1 ; temporal average of the two fields
yading@11 139 mova [rsp+ 0], m0
yading@11 140 mova [rsp+16], m3
yading@11 141 mova [rsp+32], m1
yading@11 142 psubw m2, m4
yading@11 143 PABS m2, m4 ; |[%2] - [%3]|
yading@11 144 LOAD m3, [prevq+t1]
yading@11 145 LOAD m4, [prevq+t0]
yading@11 146 psubw m3, m0
yading@11 147 psubw m4, m1
yading@11 148 PABS m3, m5
yading@11 149 PABS m4, m5
yading@11 150 paddw m3, m4 ; |prev[t1] - cur[t1]| + |prev[t0] - cur[t0]|
yading@11 151 psrlw m2, 1
yading@11 152 psrlw m3, 1
yading@11 153 pmaxsw m2, m3 ; keep the larger of the two halved differences
yading@11 154 LOAD m3, [nextq+t1]
yading@11 155 LOAD m4, [nextq+t0]
yading@11 156 psubw m3, m0
yading@11 157 psubw m4, m1
yading@11 158 PABS m3, m5
yading@11 159 PABS m4, m5
yading@11 160 paddw m3, m4 ; |next[t1] - cur[t1]| + |next[t0] - cur[t0]|
yading@11 161 psrlw m3, 1
yading@11 162 pmaxsw m2, m3
yading@11 163 mova [rsp+48], m2 ; temporal difference = max of the three terms
yading@11 164
; Initial spatial prediction (m1) and its score (m0).
yading@11 165 paddw m1, m0 ; cur[t0] + cur[t1]
yading@11 166 paddw m0, m0 ; 2 * cur[t1]
yading@11 167 psubw m0, m1 ; cur[t1] - cur[t0]
yading@11 168 psrlw m1, 1 ; m1 = (cur[t0] + cur[t1]) >> 1
yading@11 169 PABS m0, m2 ; m0 = |cur[t1] - cur[t0]|
yading@11 170
yading@11 171 movu m2, [curq+t1-1*2] ; both rows again, one sample to the left
yading@11 172 movu m3, [curq+t0-1*2]
yading@11 173 mova m4, m2
yading@11 174 psubusw m2, m3
yading@11 175 psubusw m3, m4
yading@11 176 PMAXUW m2, m3 ; absolute difference at sample x-1 in lane x
yading@11 177 %if mmsize == 16
yading@11 178 mova m3, m2
yading@11 179 psrldq m3, 4 ; two lanes down: absolute difference at sample x+1
yading@11 180 %else
yading@11 181 mova m3, m2
yading@11 182 psrlq m3, 32
yading@11 183 %endif
yading@11 184 paddw m0, m2
yading@11 185 paddw m0, m3
yading@11 186 psubw m0, [pw_1] ; score = sum of the three differences, minus 1
yading@11 187
; Try four other candidates. Each CHECK2 only takes effect in lanes where the
; CHECK1 immediately before it won.
yading@11 188 CHECK -2, 0
yading@11 189 CHECK1
yading@11 190 CHECK -3, 1
yading@11 191 CHECK2
yading@11 192 CHECK 0, -2
yading@11 193 CHECK1
yading@11 194 CHECK 1, -3
yading@11 195 CHECK2
yading@11 196
yading@11 197 mova m6, [rsp+48] ; m6 = temporal difference
yading@11 198 cmp DWORD r8m, 2 ; mode argument
yading@11 199 jge .end%1 ; mode >= 2: skip the extra check below
; mode < 2: widen the allowed difference using the rows at 2*t1 and 2*t0.
yading@11 200 LOAD m2, [%2+t1*2]
yading@11 201 LOAD m4, [%3+t1*2]
yading@11 202 LOAD m3, [%2+t0*2]
yading@11 203 LOAD m5, [%3+t0*2]
yading@11 204 paddw m2, m4
yading@11 205 paddw m3, m5
yading@11 206 psrlw m2, 1 ; average of the two fields at offset 2*t1
yading@11 207 psrlw m3, 1 ; average of the two fields at offset 2*t0
yading@11 208 mova m4, [rsp+ 0] ; cur[t1]
yading@11 209 mova m5, [rsp+16] ; temporal average
yading@11 210 mova m7, [rsp+32] ; cur[t0]
yading@11 211 psubw m2, m4 ; (avg at 2*t1) - cur[t1]
yading@11 212 psubw m3, m7 ; (avg at 2*t0) - cur[t0]
yading@11 213 mova m0, m5
yading@11 214 psubw m5, m4 ; temporal average - cur[t1]
yading@11 215 psubw m0, m7 ; temporal average - cur[t0]
yading@11 216 mova m4, m2
yading@11 217 pminsw m2, m3
yading@11 218 pmaxsw m3, m4
yading@11 219 pmaxsw m2, m5
yading@11 220 pminsw m3, m5
yading@11 221 pmaxsw m2, m0 ; m2 = max(min(first two), third, fourth)
yading@11 222 pminsw m3, m0 ; m3 = min(max(first two), third, fourth)
yading@11 223 pxor m4, m4
yading@11 224 pmaxsw m6, m3
yading@11 225 psubw m4, m2 ; -m2
yading@11 226 pmaxsw m6, m4 ; diff = max(diff, m3, -m2)
yading@11 227
yading@11 228 .end%1:
; Clamp the spatial prediction to [temporal average - diff, temporal average + diff].
yading@11 229 mova m2, [rsp+16]
yading@11 230 mova m3, m2
yading@11 231 psubw m2, m6
yading@11 232 paddw m3, m6
yading@11 233 pmaxsw m1, m2
yading@11 234 pminsw m1, m3
yading@11 235
yading@11 236 movu [dstq], m1
yading@11 237 add dstq, mmsize-4 ; advance two samples short of a full vector
yading@11 238 add prevq, mmsize-4
yading@11 239 add curq, mmsize-4
yading@11 240 add nextq, mmsize-4
yading@11 241 sub DWORD r4m, mmsize/2-2 ; w -= samples advanced ((mmsize-4)/2)
yading@11 242 jg .loop%1 ; continue while samples remain
yading@11 243 %endmacro
yading@11 244
; YADIF
;   Emits the yadif_filter_line_10bit function for the instruction set selected
;   by the preceding INIT_* line.
;   Named arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
;   The cglobal line requests 4 arguments preloaded into registers, 6 (x86-32) or
;   7 (x86-64) general-purpose registers, 8 vector registers and 80 bytes of
;   stack (argument meaning per x86inc's cglobal convention).
;   prefs and mrefs are fetched by hand below and become t0 and t1 respectively;
;   w (r4m) and mode (r8m) are accessed through their m-suffixed names in FILTER.
yading@11 245 %macro YADIF 0
yading@11 246 %if ARCH_X86_32
yading@11 247 cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
yading@11 248 prefs, mrefs, parity, mode
yading@11 249 %else
yading@11 250 cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
yading@11 251 prefs, mrefs, parity, mode
yading@11 252 %endif
yading@11 253 %if ARCH_X86_32
yading@11 254 mov r4, r5mp ; r4 = prefs (argument 5)
yading@11 255 mov r5, r6mp ; r5 = mrefs (argument 6)
yading@11 256 DECLARE_REG_TMP 4,5 ; t0 = r4 (prefs), t1 = r5 (mrefs)
yading@11 257 %else
yading@11 258 movsxd r5, DWORD r5m ; sign-extend the 32-bit prefs to 64 bits
yading@11 259 movsxd r6, DWORD r6m ; sign-extend the 32-bit mrefs to 64 bits
yading@11 260 DECLARE_REG_TMP 5,6 ; t0 = r5 (prefs), t1 = r6 (mrefs)
yading@11 261 %endif
yading@11 262
yading@11 263 cmp DWORD paritym, 0
yading@11 264 je .parity0
; parity != 0: temporal prediction averages prev and cur
yading@11 265 FILTER 1, prevq, curq
yading@11 266 jmp .ret
yading@11 267
yading@11 268 .parity0:
; parity == 0: temporal prediction averages cur and next
yading@11 269 FILTER 0, curq, nextq
yading@11 270
yading@11 271 .ret:
yading@11 272 RET
yading@11 273 %endmacro
yading@11 274
; Expand YADIF once per supported instruction set: SSSE3 and SSE2 with XMM
; registers on every target, plus an MMX (mmxext) build on 32-bit x86 only.
yading@11 275 INIT_XMM ssse3
yading@11 276 YADIF
yading@11 277 INIT_XMM sse2
yading@11 278 YADIF
yading@11 279 %if ARCH_X86_32
yading@11 280 INIT_MMX mmxext
yading@11 281 YADIF
yading@11 282 %endif