;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
%macro SECTION_RODATA 0-1 16
    ; Kludge: Something on OS X fails to align .rodata even given an align
    ; attribute, so use a different read-only section. This has been fixed in
    ; yasm 0.8.0 and nasm 2.6.
    %ifdef __YASM_VERSION_ID__
        %if __YASM_VERSION_ID__ < 00080000h
            %define NEED_MACHO_RODATA_KLUDGE
        %endif
    %elifdef __NASM_VERSION_ID__
        %if __NASM_VERSION_ID__ < 02060000h
            %define NEED_MACHO_RODATA_KLUDGE
        %endif
    %endif

    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        %ifndef NEED_MACHO_RODATA_KLUDGE
            SECTION .rodata align=%1
        %else
            %ifidn __OUTPUT_FORMAT__,macho64
                SECTION .text align=%1
            %elifidn __OUTPUT_FORMAT__,macho
                SECTION .text align=%1
                fakegot:
            %else
                SECTION .rodata align=%1
            %endif
        %endif
    %endif

    %undef NEED_MACHO_RODATA_KLUDGE
%endmacro
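
; Usage sketch (illustrative, not part of the original file; pw_1 is a
; hypothetical constant): the optional parameter is the section alignment.
;     SECTION_RODATA 32
;     pw_1: times 16 dw 1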

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
    ; x86_32 doesn't require PIC.
    ; Some distros prefer shared objects to be PIC, but nothing breaks if
    ; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
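
; Example (an illustrative sketch, not part of the original file; the
; function and argument names are hypothetical). A 16-byte-at-a-time copy
; built with the macros documented above:
;
;     INIT_XMM sse2
;     cglobal copy16n, 3,3,1, dst, src, len ; 3 args, 3 gprs, 1 xmm reg
;     .loop:                                ; (assumes len is a positive multiple of 16)
;         movu  m0, [srcq+lenq-16]
;         mova  [dstq+lenq-16], m0
;         sub   lenq, 16
;         jg    .loop
;         REP_RET   ; nothing was pushed, so this emits the 2-byte rep ret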

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments
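; e.g. (hypothetical usage): give t0..t2 a different gpr mapping per arch:
;     %if ARCH_X86_64
;         DECLARE_REG_TMP 6, 7, 8    ; t0=r6, t1=r7, t2=r8
;     %else
;         DECLARE_REG_TMP 2, 0, 1
;     %endif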

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if mmsize != 8
                %assign xmm_regs_used %2
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                %endif
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %assign stack_size_padded stack_size
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                    %if mmsize == 32 && xmm_regs_used & 1
                        ; re-align to 32 bytes
                        %assign stack_size_padded (stack_size_padded + 16)
                    %endif
                %endif
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            %if xmm_regs_used > 6
                WIN64_PUSH_XMM
            %endif
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    %assign %%i xmm_regs_used
    %rep (xmm_regs_used-6)
        %assign %%i %%i-1
        movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
    %endrep
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        WIN64_PUSH_XMM
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
        %endrep
        %if stack_size_padded == 0
            add %1, (xmm_regs_used-6)*16+16
        %endif
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
        %endif
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
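; e.g. (illustrative): with private_prefix x264 and SUFFIX _sse2, writing
; "cglobal foo, ..." emits the symbol x264_foo_sse2 (plus an extra leading
; underscore when PREFIX is defined), and later uses of "foo" resolve to it.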
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
%2:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %xdefine rstk rsp
    %assign stack_offset 0
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
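; e.g. (illustrative): "INIT_XMM ssse3" sets SUFFIX to _ssse3 and makes
; cpuflag(sse2) and cpuflag(ssse3) true; "INIT_YMM avx2" additionally sets
; avx_enabled and mmsize == 32.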
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(sse2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
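
; e.g. (an illustrative sketch): if a computation leaves its result in m1,
; "SWAP 0, 1" renames the registers so that following code can refer to the
; result as m0; no instruction is emitted, only the name mapping changes:
;     paddw m1, m2   ; result lands in xmm1
;     SWAP 0, 1      ; now m0 refers to xmm1 (and m1 to xmm0)
;     mova [r0], m0  ; stores xmm1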

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
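; e.g. (an illustrative sketch; "helper" is a hypothetical leaf function):
;     cglobal helper
;         ...
;         SWAP 0, 3                ; result ends up named m0
;         SAVE_MM_PERMUTATION      ; records the name->register mapping
;         ret
; A later "call helper" (via the call macro below) reloads that mapping,
; so the caller can read the result from m0.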
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
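; e.g.: "add rax, 128" needs a 4-byte immediate, while the equivalent
; "sub rax, -128" fits in a sign-extended 1-byte immediate, saving 3 bytes.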
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
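
; e.g. (illustrative): after "AVX_INSTR addps, 1, 0, 1" below, writing
; "addps m0, m1, m2" emits "vaddps xmm0, xmm1, xmm2" when avx_enabled,
; and "movaps xmm0, xmm1" + "addps xmm0, xmm2" otherwise.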

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtpd2dq, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
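
; e.g.: q3120 expands to 0xD8, so "pshufd m0, m1, q3120" picks source dwords
; 3,1,2,0 (highest destination lane first, matching the digit order).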

%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR fmaddps, mulps, addps
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
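
; e.g. (illustrative): "pmacsdd m0, m1, m2, m0, m4" computes m0 = m1*m2 + m0,
; as a single vpmacsdd on XOP cpus, and as pmulld m4, m1, m2 followed by
; paddd m0, m4 elsewhere (m4 serves as scratch since dst == addend here).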

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
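
; e.g.: "tzcnt eax, ecx" assembles here as "rep bsf eax, ecx" (F3 0F BC),
; which executes as tzcnt on BMI1 cpus and as plain bsf on older ones.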