;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of (aligned) stack space at %6
; (except on SSE+x86-64, where a spare register is available)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro
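
; The HADAMARD8 macro below chains SUMSUB_BADC butterflies over m0..m7, one
; 16-bit lane per column.  As a rough scalar sketch of what this does to the
; eight values x[0..7] of one column (illustrative only, not part of the
; build; the SIMD version produces some outputs negated and reordered, which
; is harmless because only absolute values are summed afterwards):
;
;   for (int s = 1; s <= 4; s <<= 1)        // levels with pair distance 1, 2, 4
;       for (int i = 0; i < 8; i++)
;           if (!(i & s)) {
;               int a = x[i], b = x[i + s];
;               x[i]     = a + b;
;               x[i + s] = a - b;
;           }
;
; Applied once, transposed, then applied again, this gives an (unnormalized)
; 2-D 8x8 Walsh-Hadamard transform.
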
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get
; up to about 100k on extreme inputs. But that's very unlikely to occur in
; natural video, and it's even more unlikely to not have any alternative
; mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro
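
; What hadamard8x8_diff (built by the HADAMARD8_DIFF macro below) computes,
; as a rough C model (orientation only, not compiled here; the helper names
; are illustrative): the sum of absolute transformed differences (SATD),
; i.e. the sum of absolute coefficients of the 2-D Walsh-Hadamard transform
; of the 8x8 difference block.  The SIMD version accumulates with paddusw,
; so the result saturates at 0xFFFF (see the HSUM FIXME above).
;
;   int16_t d[8][8];
;   for (i = 0; i < 8; i++)
;       for (j = 0; j < 8; j++)
;           d[i][j] = src1[i*stride + j] - src2[i*stride + j];
;   hadamard_1d_on_each_column(d);   // HADAMARD8
;   transpose(d);                    // TRANSPOSE8x8W / TRANSPOSE4x4W
;   hadamard_1d_on_each_column(d);   // HADAMARD8 again
;   sum = 0;
;   for (i = 0; i < 8; i++)
;       for (j = 0; j < 8; j++)
;           sum += abs(d[i][j]);
;   return sum & 0xFFFF;
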
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                         m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; variant can simply call this 2x2 = 4 times (and that's why we access
; rsp+gprsize everywhere: it is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
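
; sse16 below sums squared differences over 16-pixel-wide rows.  Rough C
; model (orientation only, not compiled here):
;
;   int sum = 0;
;   for (y = 0; y < h; y++) {
;       for (x = 0; x < 16; x++) {
;           int d = pix1[x] - pix2[x];
;           sum  += d * d;
;       }
;       pix1 += line_size;
;       pix2 += line_size;
;   }
;   return sum;
;
; The SIMD version gets |pix1 - pix2| per byte as
; (pix1 -sat pix2) | (pix2 -sat pix1) via psubusb/por (see the "algo"
; comment in the loop), then squares and pairwise-adds with pmaddwd.
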
INIT_XMM sse2
; sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
cglobal sse16, 5, 5, 8
    shr      r4d, 1
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1, mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3, mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

    dec       r4
    jnz .next2lines

    mova      m1, m7
    psrldq    m7, 8          ; shift hi qword to lo
    paddd     m7, m1
    mova      m1, m7
    psrldq    m7, 4          ; shift hi dword to lo
    paddd     m7, m1
    movd     eax, m7         ; return value
    RET

INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
    movsxdifnidn r2, r2d
    add          r0, 128
    mov          r3, -128
    pxor         m7, m7
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    lea          r1, [r1+r2*2]
    add          r3, 32
    js .loop
    REP_RET

INIT_XMM sse2
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d
    lea          r3, [r2*3]
    pxor         m4, m4
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET
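
; get_pixels above and diff_pixels below both fill a 64-entry int16_t block.
; Rough C models (orientation only, not compiled here):
;
;   // get_pixels: widen an 8x8 block of pixels to 16 bits
;   for (i = 0; i < 8; i++, pixels += line_size)
;       for (j = 0; j < 8; j++)
;           block[i*8 + j] = pixels[j];
;
;   // diff_pixels: 16-bit difference of two 8x8 blocks
;   for (i = 0; i < 8; i++, s1 += stride, s2 += stride)
;       for (j = 0; j < 8; j++)
;           block[i*8 + j] = s1[j] - s2[j];
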
INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
cglobal diff_pixels, 4,5
    movsxdifnidn r3, r3d
    pxor         m7, m7
    add          r0, 128
    mov          r4, -128
.loop:
    mova         m0, [r1]
    mova         m2, [r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    psubw        m0, m2
    psubw        m1, m3
    mova [r0+r4+0], m0
    mova [r0+r4+8], m1
    add          r1, r3
    add          r2, r3
    add          r4, 16
    jne .loop
    REP_RET

INIT_MMX mmx
; pix_sum16_mmx(uint8_t *pix, int line_size)
cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    mov          r2, r1
    neg          r2
    shl          r2, 4
    sub          r0, r2
    pxor         m7, m7
    pxor         m6, m6
.loop:
    mova         m0, [r0+r2+0]
    mova         m1, [r0+r2+0]
    mova         m2, [r0+r2+8]
    mova         m3, [r0+r2+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m6, m3
    add          r2, r1
    js .loop
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd        eax, m6
    and         eax, 0xffff
    RET

INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
.loop:
    mova         m2, [r0+0]
    mova         m3, [r0+8]
    mova         m1, m2
    punpckhbw    m1, m0
    punpcklbw    m2, m0
    mova         m4, m3
    punpckhbw    m3, m0
    punpcklbw    m4, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec          r2
    jne .loop
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd        eax, m1
    RET
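
; For reference, rough C models of the two helpers above (orientation only,
; not compiled here): pix_sum16 returns the sum of a 16x16 block of pixels,
; pix_norm1 the sum of their squares.
;
;   // pix_sum16
;   for (sum = 0, i = 0; i < 16; i++, pix += line_size)
;       for (j = 0; j < 16; j++)
;           sum += pix[j];
;
;   // pix_norm1
;   for (sum = 0, i = 0; i < 16; i++, pix += line_size)
;       for (j = 0; j < 16; j++)
;           sum += pix[j] * pix[j];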