;******************************************************************************
;* x86 optimizations for PNG decoding
;*
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Ronald S. Bultje
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_255

SECTION_TEXT

;------------------------------------------------------------------------------
; void add_bytes_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
;
; dst[i] = src1[i] + src2[i] (bytewise, wrapping) for i in [0, w).
; Processed in three stages: a 2*mmsize-wide vector loop, then (SSE2 build
; only) an 8-byte MMX loop, then a scalar byte loop for the remainder.
;
; %1 = nr. of xmm registers used
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd          waq, wad            ; sign-extend 32-bit width argument
%endif
    xor              iq, iq             ; i = 0

    ; vector loop: handle w rounded down to a multiple of 2*mmsize
    mov              wq, waq            ; keep full width for the later loops
    and             waq, ~(mmsize*2-1)  ; vector-loop bound
    jmp .end_v
.loop_v:
    mova             m0, [src1q+iq]
    mova             m1, [src1q+iq+mmsize]
    paddb            m0, [src2q+iq]
    paddb            m1, [src2q+iq+mmsize]
    mova  [dstq+iq       ], m0
    mova  [dstq+iq+mmsize], m1
    add              iq, mmsize*2
.end_v:
    cmp              iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte MMX loop: shrink the leftover before the scalar loop
    mov             waq, wq
    and             waq, ~7             ; qword-loop bound
    jmp .end_l
.loop_l:
    movq            mm0, [src1q+iq]
    paddb           mm0, [src2q+iq]
    movq  [dstq+iq      ], mm0
    add              iq, 8
.end_l:
    cmp              iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover
    jmp .end_s
.loop_s:
    mov             wab, [src1q+iq]
    add             wab, [src2q+iq]
    mov       [dstq+iq], wab
    inc              iq
.end_s:
    cmp              iq, wq
    jl .loop_s
    REP_RET                             ; x86inc: rep-ret to avoid AMD branch penalty
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

INIT_XMM sse2
ADD_BYTES_FN 2

;------------------------------------------------------------------------------
; void add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top,
;                               int w, int bpp)
;
; Undo PNG Paeth filtering: for each byte, pick the predictor among
; left (dst[-bpp]), above (top[0]) and upper-left (top[-bpp]) whose value is
; closest (per the Paeth tie-break order), add src, and store mod 256.
; Works on mmsize/2 bytes per iteration, unpacked to words; for bpp larger
; than one iteration's width, .bpp_loop re-runs the row once per lane group.
;
; %1 = nr. of xmm registers used
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd         bppq, bppd           ; sign-extend 32-bit int args
    movsxd           wq, wd
%endif
    lea            endq, [dstq+wq-(mmsize/2-1)]
    ; make top/src offsets relative to dst so dstq alone walks all 3 buffers
    sub            topq, dstq
    sub            srcq, dstq
    sub            dstq, bppq           ; start one pixel left: left-predictor load
    pxor             m7, m7             ; zero reg for byte->word unpacking

    PUSH           dstq                 ; row start, reloaded for each bpp pass
    lea           cntrq, [bppq-1]
    shr           cntrq, 2 + mmsize/16  ; extra passes = (bpp-1) / (mmsize/2)
.bpp_loop:
    lea            dstq, [dstq+cntrq*(mmsize/2)]
    movh             m0, [dstq]         ; m0 = left neighbour
    movh             m1, [topq+dstq]    ; m1 = upper-left (becomes "c" next iter)
    punpcklbw        m0, m7
    punpcklbw        m1, m7
    add            dstq, bppq
.loop:
    mova             m2, m1             ; m2 = c (upper-left)
    movh             m1, [topq+dstq]    ; m1 = b (above)
    mova             m3, m2
    punpcklbw        m1, m7
    mova             m4, m2
    psubw            m3, m1             ; m3 = c - b   -> |..| = pa
    psubw            m4, m0             ; m4 = c - a   -> |..| = pb
    mova             m5, m3
    paddw            m5, m4             ; m5 = (c-b)+(c-a) -> |..| = pc
%if cpuflag(ssse3)
    pabsw            m3, m3             ; pa = |c - b|
    pabsw            m4, m4             ; pb = |c - a|
    pabsw            m5, m5             ; pc = |c-b + c-a|
%else ; !cpuflag(ssse3)
    ; abs(x) = max(x, -x) without SSSE3; m7 is re-zeroed afterwards
    psubw            m7, m5
    pmaxsw           m5, m7
    pxor             m6, m6
    pxor             m7, m7
    psubw            m6, m3
    psubw            m7, m4
    pmaxsw           m3, m6
    pmaxsw           m4, m7
    pxor             m7, m7
%endif ; cpuflag(ssse3)
    ; select predictor: a if pa<=pb && pa<=pc, else b if pb<=pc, else c
    mova             m6, m4
    pminsw           m6, m5             ; min(pb, pc)
    pcmpgtw          m3, m6             ; pa > min(pb, pc) -> not a
    pcmpgtw          m4, m5             ; pb > pc          -> not b
    mova             m6, m4
    pand             m4, m3             ; mask: pick c
    pandn            m6, m3             ; mask: pick b
    pandn            m3, m0             ; keep a where mask says a
    movh             m0, [srcq+dstq]    ; load filtered source bytes
    pand             m6, m1             ; keep b
    pand             m2, m4             ; keep c
    punpcklbw        m0, m7
    paddw            m0, m6
    paddw            m3, m2
    paddw            m0, m3             ; src + selected predictor
    pand             m0, [pw_255]       ; mod 256 before packing
    mova             m3, m0
    packuswb         m3, m3
    movh         [dstq], m3
    add            dstq, bppq
    cmp            dstq, endq
    jle .loop

    mov            dstq, [rsp]          ; rewind to row start for next lane group
    dec           cntrq
    jge .bpp_loop
    POP            dstq
    RET
%endmacro

INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0