annotate ffmpeg/libavcodec/x86/h264_weight.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;*****************************************************************************
yading@10 2 ;* SSE2-optimized weighted prediction code
yading@10 3 ;*****************************************************************************
yading@10 4 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
yading@10 5 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
yading@10 6 ;*
yading@10 7 ;* This file is part of FFmpeg.
yading@10 8 ;*
yading@10 9 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 10 ;* modify it under the terms of the GNU Lesser General Public
yading@10 11 ;* License as published by the Free Software Foundation; either
yading@10 12 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 13 ;*
yading@10 14 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 17 ;* Lesser General Public License for more details.
yading@10 18 ;*
yading@10 19 ;* You should have received a copy of the GNU Lesser General Public
yading@10 20 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 22 ;******************************************************************************
yading@10 23
yading@10 24 %include "libavutil/x86/x86util.asm"
yading@10 25
yading@10 26 SECTION .text
yading@10 27
yading@10 28 ;-----------------------------------------------------------------------------
yading@10 29 ; biweight pred:
yading@10 30 ;
yading@10 31 ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
yading@10 32 ; int height, int log2_denom, int weightd,
yading@10 33 ; int weights, int offset);
yading@10 34 ; and
yading@10 35 ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
yading@10 36 ; int log2_denom, int weight, int offset);
yading@10 37 ;-----------------------------------------------------------------------------
yading@10 38
;-----------------------------------------------------------------------------
; WEIGHT_SETUP: load the loop-invariant constants for h264_weight_*.
;
; in:  r3d = log2_denom, r4d = weight, r5d = offset
; out: m3 = weight in every word lane
;      m5 = low word of (((2*offset + 1) << log2_denom) >> 1) in every word lane
;      m6 = log2_denom (used as the shift count)
;      m7 = 0 (zero source for byte -> word unpacking)
; clobbers: r5
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    pxor        m7, m7
    movd        m6, r3d
    movd        m3, r4d
    ; r5 = 2*offset + 1
    add         r5, r5
    inc         r5
    movd        m5, r5d
    ; scale by 2^log2_denom, then drop the extra factor of two again
    pslld       m5, m6
    psrld       m5, 1
%if mmsize == 8
    pshufw      m3, m3, 0
    pshufw      m5, m5, 0
%else
    ; broadcast word 0 across the low qword, then duplicate it into the high one
    pshuflw     m3, m3, 0
    punpcklqdq  m3, m3
    pshuflw     m5, m5, 0
    punpcklqdq  m5, m5
%endif
%endmacro
yading@10 58
;-----------------------------------------------------------------------------
; WEIGHT_OP off_a, off_b
;
; Weights two half-register groups of pixels, read from [r0+off_a] and
; [r0+off_b], and leaves them packed as unsigned saturated bytes in m0
; (group A in the low half, group B in the high half).
;
; Expects the registers prepared by WEIGHT_SETUP:
;   m3 = weight, m5 = rounding/offset term, m6 = shift count, m7 = 0
; clobbers: m0, m1
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 2
    ; group A -> m0
    movh        m0, [r0+%1]
    punpcklbw   m0, m7
    pmullw      m0, m3
    paddsw      m0, m5
    psraw       m0, m6
    ; group B -> m1
    movh        m1, [r0+%2]
    punpcklbw   m1, m7
    pmullw      m1, m3
    paddsw      m1, m5
    psraw       m1, m6
    ; words -> bytes with unsigned saturation
    packuswb    m0, m1
%endmacro
yading@10 72
;-----------------------------------------------------------------------------
; h264_weight_16 (mmxext): weight a 16-pixel-wide block in place.
;   r0 = dst, r1 = stride, r2d = height (row counter)
;   r3d/r4d/r5d = log2_denom/weight/offset, consumed by WEIGHT_SETUP
; Each row is handled as two 8-byte halves, since an MMX register holds 8 bytes.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.row_loop:
%assign row_off 0
%rep 2
    WEIGHT_OP row_off, row_off+4
    mova [r0+row_off], m0
%assign row_off row_off+8
%endrep
    add         r0, r1
    dec        r2d
    jnz .row_loop
    REP_RET
yading@10 85
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM width, xmm_regs
;
; Emits h264_weight_<width> for the case where one row is exactly one vector
; register wide (8 pixels with MMX, 16 pixels with SSE2).
;   r0 = dst, r1 = stride, r2d = height (row counter)
;   r3d/r4d/r5d = log2_denom/weight/offset, consumed by WEIGHT_SETUP
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.row_loop:
    ; low and high halves of the row are weighted and packed into m0
    WEIGHT_OP 0, mmsize/2
    mova      [r0], m0
    add         r0, r1
    dec        r2d
    jnz .row_loop
    REP_RET
%endmacro

; 8-wide rows fit one MMX register
INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
; 16-wide rows fit one XMM register
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
yading@10 102
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM width, xmm_regs
;
; Emits h264_weight_<width> for the case where one row is only half a vector
; register wide (4 pixels with MMX, 8 pixels with SSE2). Two rows are processed
; per iteration: row N in the low half of m0, row N+1 in the high half.
;   r0 = dst, r1 = stride, r2d = height
;   r3 is reused for 2*stride once WEIGHT_SETUP has copied log2_denom into m6
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    ; r3 = distance between row pairs
    lea         r3, [r1+r1]
    ; two rows per iteration -> height/2 iterations
    sar        r2d, 1
.pair_loop:
    WEIGHT_OP 0, r1
    movh      [r0], m0
%if mmsize == 8
    ; bring the second row down into the low dword before storing it
    psrlq       m0, 32
    movh   [r0+r1], m0
%else
    movhps [r0+r1], m0
%endif
    add         r0, r3
    dec        r2d
    jnz .pair_loop
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
yading@10 127
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP: load the loop-invariant constants for h264_biweight_*.
;
; in:  r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
; out: m6 = shift count: log2_denom + 1, or log2_denom when weightd == 128
;      m5 = low word of ((off << shift) >> 1) in every word lane, where
;           off = (offset + 1) | 1, halved when weightd == 128
;      SSSE3 build:  m4 = (weightd, weights) byte pair in every word lane
;      other builds: m3 = weightd, m4 = weights in every word lane, m7 = 0
; clobbers: r4d, r5d, r6d, off_regd, and m0 in the SSSE3 build.
;           off_regd is r3d on x86-32, so callers must reload the height from
;           r3m after invoking this macro.
;
; All scalar arguments are C ints: only their low 32 bits are defined, so the
; integer arithmetic below deliberately uses the dword register names.
;
; NOTE(review): only weightd == 128 takes the halving path. weights == 128
; would also not fit the signed byte that pmaddubsw reads in the SSSE3 build;
; confirm with the callers whether that value can occur.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov   off_regd, r7m
    add   off_regd, 1
    or    off_regd, 1
    add        r4d, 1
    cmp        r5d, 128
    jne .normal
    ; weightd == 128: halve both weights and the offset, and undo the +1 on
    ; the shift count so the overall scale stays the same
    sar        r5d, 1
    sar        r6d, 1
    sar   off_regd, 1
    sub        r4d, 1
.normal:
%if cpuflag(ssse3)
    movd        m4, r5d
    movd        m0, r6d
%else
    movd        m3, r5d
    movd        m4, r6d
%endif
    movd        m5, off_regd
    movd        m6, r4d
    pslld       m5, m6
    psrld       m5, 1
%if cpuflag(ssse3)
    ; interleave the two weights as bytes so pmaddubsw can apply both at once
    punpcklbw   m4, m0
    pshuflw     m4, m4, 0
    pshuflw     m5, m5, 0
    punpcklqdq  m4, m4
    punpcklqdq  m5, m5

%else
%if mmsize == 16
    pshuflw     m3, m3, 0
    pshuflw     m4, m4, 0
    pshuflw     m5, m5, 0
    punpcklqdq  m3, m3
    punpcklqdq  m4, m4
    punpcklqdq  m5, m5
%else
    pshufw      m3, m3, 0
    pshufw      m4, m4, 0
    pshufw      m5, m5, 0
%endif
    ; zero register for byte -> word unpacking in BIWEIGHT_STEPA
    pxor        m7, m7
%endif
%endmacro
yading@10 179
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPA acc, tmp, off
;
; m<acc> = sat16(dst[off..] * m3 + src[off..] * m4) for half a register's
; worth of pixels, where dst is read through r0 and src through r1.
; Requires m7 = 0. <acc> and <tmp> must be different registers.
; clobbers: m<tmp>
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPA 3
    ; dst pixels, widened to words and scaled by weightd
    movh        m%1, [r0+%3]
    punpcklbw   m%1, m7
    pmullw      m%1, m3
    ; src pixels, widened to words and scaled by weights
    movh        m%2, [r1+%3]
    punpcklbw   m%2, m7
    pmullw      m%2, m4
    paddsw      m%1, m%2
%endmacro
yading@10 189
;-----------------------------------------------------------------------------
; BIWEIGHT_STEPB: finish the two word sums left in m0 and m1 by
; BIWEIGHT_STEPA. Adds the rounding/offset term (m5), shifts right
; arithmetically by m6, and packs both to unsigned saturated bytes in m0.
; clobbers: m1
;-----------------------------------------------------------------------------
%macro BIWEIGHT_STEPB 0
    paddsw      m0, m5
    psraw       m0, m6
    paddsw      m1, m5
    psraw       m1, m6
    packuswb    m0, m1
%endmacro
yading@10 197
;-----------------------------------------------------------------------------
; h264_biweight_16 (mmxext): blend a 16-pixel-wide src block into dst.
;   r0 = dst, r1 = src, r2 = stride, r3d = height (row counter)
;   log2_denom/weightd/weights/offset are consumed by BIWEIGHT_SETUP
; Each row is handled as two 8-byte halves, since an MMX register holds 8 bytes.
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    ; on x86-32 the setup used r3d as scratch, so fetch the height again
    movifnidn  r3d, r3m
.row_loop:
%assign row_off 0
%rep 2
    BIWEIGHT_STEPA 0, 1, row_off
    BIWEIGHT_STEPA 1, 2, row_off+4
    BIWEIGHT_STEPB
    mova [r0+row_off], m0
%assign row_off row_off+8
%endrep
    add         r1, r2
    add         r0, r2
    dec        r3d
    jnz .row_loop
    REP_RET
yading@10 216
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_MM width, xmm_regs
;
; Emits h264_biweight_<width> for the case where one row is exactly one vector
; register wide (8 pixels with MMX, 16 pixels with SSE2).
;   r0 = dst, r1 = src, r2 = stride, r3d = height (row counter)
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    ; on x86-32 the setup used r3d as scratch, so fetch the height again
    movifnidn  r3d, r3m
.row_loop:
    ; low half of the row -> m0, high half -> m1
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova      [r0], m0
    add         r1, r2
    add         r0, r2
    dec        r3d
    jnz .row_loop
    REP_RET
%endmacro

; 8-wide rows fit one MMX register
INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
; 16-wide rows fit one XMM register
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
yading@10 237
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
;
; Emits h264_biweight_<width> for the case where one row is only half a vector
; register wide (4 pixels with MMX, 8 pixels with SSE2). Two rows are processed
; per iteration: row N in the low half of m0, row N+1 in the high half.
;   r0 = dst, r1 = src, r2 = stride, r3d = height
;   r4 is reused for 2*stride once BIWEIGHT_SETUP has consumed log2_denom
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    ; on x86-32 the setup used r3d as scratch, so fetch the height again
    movifnidn  r3d, r3m
    ; Two rows per iteration. height is a 32-bit int, so shift the dword
    ; register: a full-width shift would pull bit 32 into the loop counter.
    sar        r3d, 1
    lea         r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh      [r0], m0
%if mmsize == 16
    movhps [r0+r2], m0
%else
    ; bring the second row down into the low dword before storing it
    psrlq       m0, 32
    movh   [r0+r2], m0
%endif
    add         r0, r4
    add         r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
yading@10 266
;-----------------------------------------------------------------------------
; BIWEIGHT_SSSE3_OP: weight the interleaved (dst, src) byte pairs held in m0
; and m2.
;   m4 = packed (weightd, weights) byte pairs, m5 = rounding/offset term,
;   m6 = shift count (all prepared by BIWEIGHT_SETUP in an SSSE3 build)
; Result: 16 unsigned saturated bytes in m0 (from m0 in the low half, from m2
; in the high half).
; clobbers: m2
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SSSE3_OP 0
    ; pmaddubsw multiplies each byte pair by its weights and sums the products
    pmaddubsw   m0, m4
    paddsw      m0, m5
    psraw       m0, m6
    pmaddubsw   m2, m4
    paddsw      m2, m5
    psraw       m2, m6
    packuswb    m0, m2
%endmacro
yading@10 276
;-----------------------------------------------------------------------------
; h264_biweight_16 (ssse3): blend a 16-pixel-wide src block into dst.
;   r0 = dst, r1 = src, r2 = stride, r3d = height (row counter)
; m3 is free to use as scratch here: the SSSE3 setup keeps both weights in m4.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    ; on x86-32 the setup used r3d as scratch, so fetch the height again
    movifnidn  r3d, r3m

.row_loop:
    ; pixels 0-7: interleave dst bytes with src bytes (full-width load of [r1])
    movh        m0, [r0]
    punpcklbw   m0, [r1]
    ; pixels 8-15
    movh        m2, [r0+8]
    movh        m3, [r1+8]
    punpcklbw   m2, m3
    BIWEIGHT_SSSE3_OP
    mova      [r0], m0
    add         r1, r2
    add         r0, r2
    dec        r3d
    jnz .row_loop
    REP_RET
yading@10 295
;-----------------------------------------------------------------------------
; h264_biweight_8 (ssse3): blend an 8-pixel-wide src block into dst, two rows
; per iteration.
;   r0 = dst, r1 = src, r2 = stride, r3d = height
;   r4 is reused for 2*stride once BIWEIGHT_SETUP has consumed log2_denom
; m1 and m3 are free to use as scratch here: the SSSE3 setup keeps both
; weights in m4.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    ; on x86-32 the setup used r3d as scratch, so fetch the height again
    movifnidn  r3d, r3m
    ; Two rows per iteration. height is a 32-bit int, so shift the dword
    ; register: a full-width shift would pull bit 32 into the loop counter.
    sar        r3d, 1
    lea         r4, [r2*2]

.nextrow:
    movh        m0, [r0]
    movh        m1, [r1]
    movh        m2, [r0+r2]
    movh        m3, [r1+r2]
    ; interleave dst and src bytes: row N in m0, row N+1 in m2
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    BIWEIGHT_SSSE3_OP
    movh      [r0], m0
    movhps [r0+r2], m0
    add         r0, r4
    add         r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET