/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark
 * Copyright (c) 2011 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ 4x4 butterfly transform of 16 int16 values (luma DC WHT, per the name).
@   r0: output; one int16 is written every 32 bytes, 16 writes in total
@   r1: input, 16 int16 values, 128-bit aligned; cleared to zero on the way
@ Two butterfly passes separated by a transpose; 3 is added to the first
@ row before the second pass and the result is shifted right by 3.
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vst1.16         {q15}, [r1,:128]!       @ zero first half of the input
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vst1.16         {q15}, [r1,:128]        @ zero second half of the input
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vmov.i16        q8, #3

        vtrn.32         d0, d2
        vtrn.32         d1, d3
        vtrn.16         d0, d1
        vtrn.16         d2, d3

        vadd.i16        d0, d0, d16             @ rounding term, propagates to all outputs

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vshr.s16        q0, q0, #3
        vshr.s16        q1, q1, #3

        mov             r3, #32                 @ output stride in bytes
        vst1.16         {d0[0]}, [r0,:16], r3
        vst1.16         {d1[0]}, [r0,:16], r3
        vst1.16         {d2[0]}, [r0,:16], r3
        vst1.16         {d3[0]}, [r0,:16], r3
        vst1.16         {d0[1]}, [r0,:16], r3
        vst1.16         {d1[1]}, [r0,:16], r3
        vst1.16         {d2[1]}, [r0,:16], r3
        vst1.16         {d3[1]}, [r0,:16], r3
        vst1.16         {d0[2]}, [r0,:16], r3
        vst1.16         {d1[2]}, [r0,:16], r3
        vst1.16         {d2[2]}, [r0,:16], r3
        vst1.16         {d3[2]}, [r0,:16], r3
        vst1.16         {d0[3]}, [r0,:16], r3
        vst1.16         {d1[3]}, [r0,:16], r3
        vst1.16         {d2[3]}, [r0,:16], r3
        vst1.16         {d3[3]}, [r0,:16], r3

        bx              lr
endfunc

@ 4x4 inverse transform of one coefficient block, added to the destination.
@   r0: destination pixels, 4 bytes per row, 32-bit aligned
@   r1: 16 int16 coefficients, 128-bit aligned; cleared to zero on the way
@   r2: destination stride in bytes
@ d4[0] = 20091 is applied as ((x * 20091) >> 16) + x.
@ d4[1] = 35468/2 is applied with vqdmulh, whose built-in doubling restores
@ the full 35468 multiplier (constants as in the VP8 IDCT, cf. RFC 6386).
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        movw            r3, #20091
        movt            r3, #35468/2
        vdup.32         d4, r3

        @ first pass
        vmull.s16       q12, d1, d4[0]
        vmull.s16       q13, d3, d4[0]
        vqdmulh.s16     d20, d1, d4[1]
        vqdmulh.s16     d23, d3, d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0, d2
        vsub.s16        d17, d0, d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0, q8, q9
        vsub.s16        q1, q8, q9

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        @ second pass, interleaved with clearing the coefficients and
        @ loading the four destination rows
        vmov.i16        q15, #0
        vmull.s16       q12, d1, d4[0]
        vst1.16         {q15}, [r1,:128]!
        vmull.s16       q13, d2, d4[0]
        vst1.16         {q15}, [r1,:128]
        vqdmulh.s16     d21, d1, d4[1]
        vqdmulh.s16     d23, d2, d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0, d3
        vsub.i16        d17, d0, d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]}, [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]}, [r0,:32], r2
        vadd.s16        q0, q8, q9
        vld1.32         {d23[]}, [r0,:32], r2
        vsub.s16        q1, q8, q9
        vld1.32         {d21[]}, [r0,:32], r2
        vrshr.s16       q0, q0, #3
        vtrn.32         q10, q11                @ d20 = rows 0,1  d21 = rows 3,2
        vrshr.s16       q1, q1, #3

        sub             r0, r0, r2, lsl #2      @ rewind the four rows just read

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vaddw.u8        q0, q0, d20
        vaddw.u8        q1, q1, d21
        vqmovun.s16     d0, q0
        vqmovun.s16     d1, q1

        @ d1 holds row 3 in lane 0 and row 2 in lane 1, hence the store order
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2

        bx              lr
endfunc

@ DC-only variant: adds (dc + 4) >> 3 to a 4x4 block of pixels with
@ unsigned saturation.
@   r0: destination pixels, 4 bytes per row, 32-bit aligned
@   r1: pointer to the int16 DC coefficient; cleared to zero
@   r2: destination stride in bytes
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3, #0
        ldrsh           r12, [r1]
        strh            r3, [r1]
        vdup.16         q1, r12
        vrshr.s16       q1, q1, #3
        vld1.32         {d0[]}, [r0,:32], r2
        vld1.32         {d1[]}, [r0,:32], r2
        vld1.32         {d0[1]}, [r0,:32], r2
        vld1.32         {d1[1]}, [r0,:32], r2
        vaddw.u8        q2, q1, d0
        vaddw.u8        q3, q1, d1
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d0, q2
        vqmovun.s16     d1, q3
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
endfunc

@ DC-only add for four 4x4 blocks laid out 2x2 inside an 8x8 pixel area.
@   r0: destination pixels, 8 bytes per row, 64-bit aligned, 8 rows
@   r1: first DC coefficient; four are read 32 bytes apart and each is
@       cleared to zero
@   r2: destination stride in bytes
@ q8 carries DC 0 and 1 (rows 0-3), q9 carries DC 2 and 3 (rows 4-7).
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        mov             r3, r0                  @ r3 = store pointer, r0 = load pointer
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {d0}, [r0,:64], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {d1}, [r0,:64], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {d2}, [r0,:64], r2
        vaddw.u8        q0, q8, d1
        vld1.8          {d3}, [r0,:64], r2
        vaddw.u8        q11, q8, d2
        vld1.8          {d4}, [r0,:64], r2
        vaddw.u8        q1, q8, d3
        vld1.8          {d5}, [r0,:64], r2
        vaddw.u8        q12, q9, d4
        vld1.8          {d6}, [r0,:64], r2
        vaddw.u8        q2, q9, d5
        vld1.8          {d7}, [r0,:64], r2
        vaddw.u8        q13, q9, d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3, q9, d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20}, [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21}, [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22}, [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23}, [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24}, [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25}, [r3,:64], r2
        vst1.8          {d26}, [r3,:64], r2
        vst1.8          {d27}, [r3,:64], r2

        bx              lr
endfunc

@ DC-only add for four 4x4 blocks laid out side by side in a 16x4 pixel area.
@   r0: destination pixels, 16 bytes per row, 128-bit aligned, 4 rows
@   r1: first DC coefficient; four are read 32 bytes apart and each is
@       cleared to zero
@   r2: destination stride in bytes
@ q8 carries DC 0 and 1 (left 8 columns), q9 carries DC 2 and 3 (right 8).
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {q0}, [r0,:128], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {q1}, [r0,:128], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {q2}, [r0,:128], r2
        vaddw.u8        q0, q9, d1
        vld1.8          {q3}, [r0,:128], r2
        vaddw.u8        q11, q8, d2
        vaddw.u8        q1, q9, d3
        vaddw.u8        q12, q8, d4
        vaddw.u8        q2, q9, d5
        vaddw.u8        q13, q8, d6
        vaddw.u8        q3, q9, d7
        sub             r0, r0, r2, lsl #2      @ rewind the four rows just read
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10}, [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11}, [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12}, [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13}, [r0,:128], r2

        bx              lr
endfunc

@ Core loop filter, operating on 16 pixel positions at once.
@
@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
@ Variants:
@   simple=1: only P1..Q1 (q2-q5) are read; only the edge limit in q14 is
@             tested and only P0/Q0 change value
@   inner=1:  full limit test; P1, P0, Q0, Q1 are modified
@   default:  full limit test; P2..Q2 are modified (filter_mbedge)
@ q0 and q7 are never written. q8-q15 are scratch.
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8, q11, q14            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2, q3             @ abs(P1-P0)
        vabd.u8         q13, q5, q4             @ abs(Q1-Q0)
        vabd.u8         q10, q0, q1             @ abs(P3-P2)
        vabd.u8         q11, q1, q2             @ abs(P2-P1)
        vcle.u8         q8, q12, q15            @ abs(P1-P0) <= flim_I
        vcle.u8         q9, q13, q15            @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8, q8, q9
        vabd.u8         q9, q7, q6              @ abs(Q3-Q2)
        vand            q8, q8, q11
        vabd.u8         q11, q6, q5             @ abs(Q2-Q1)
        vand            q8, q8, q10
        vcle.u8         q10, q9, q15            @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vand            q8, q8, q10
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vand            q8, q8, q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8, q8, q11
        vmov.i8         q13, #0x80
        vorr            q9, q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev (not computed when simple=1)

        @ convert to signed value:
        veor            q3, q3, q13             @ PS0 = P0 ^ 0x80
        veor            q4, q4, q13             @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8, d6             @ QS0 - PS0
        vsubl.s8        q11, d9, d7             @   (widened to 16bit)
        veor            q2, q2, q13             @ PS1 = P1 ^ 0x80
        veor            q5, q5, q13             @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2, q5             @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1, q1, q13             @ PS2 = P2 ^ 0x80
        veor            q6, q6, q13             @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+c3)
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9, #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8, q9, q14
        vadd.s16        q9, q9, q15             @  9*w + 63
        vadd.s16        q11, q8, q14
        vadd.s16        q12, q9, q15            @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8, #7
        vqshrn.s16      d17, q9, #7             @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1, q1, q8              @ PS2 = clamp(PS2+a)
        vqsub.s8        q6, q6, q8              @ QS2 = clamp(QS2-a)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+a)
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-a)
        vqadd.s8        q3, q3, q14             @ PS0 = clamp(PS0+a)
        vqsub.s8        q4, q4, q14             @ QS0 = clamp(QS0-a)
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q1, q1, q13             @ P2 = PS2 ^ 0x80
        veor            q6, q6, q13             @ Q2 = QS2 ^ 0x80
    .endif
.endm

@ Generates ff_vp8_v_loop_filter16<name>_neon: filters the horizontal edge
@ that lies just above the row r0 points to, 16 pixels wide.
@   r0: first row below the edge (Q0), 128-bit aligned
@   r1: stride in bytes
@   r2: flim_E
@   r3: flim_I            (unused when simple=1)
@   first stack argument: hev_thresh (unused when simple=1)
@ q4-q7 are saved and restored around the body.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r1, lsl #1+!\simple     @ 2 rows up if simple, else 4

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (64 = size of the vpush above)
        vld1.8          {q0}, [r0,:128], r1     @ P3
        vld1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vld1.8          {q2}, [r0,:128], r1     @ P1
        vld1.8          {q3}, [r0,:128], r1     @ P0
        vld1.8          {q4}, [r0,:128], r1     @ Q0
        vld1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vld1.8          {q6}, [r0,:128], r1     @ Q2
        vld1.8          {q7}, [r0,:128]         @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to the first row to store:
        @   simple: dst -= stride * 4 (P1), otherwise dst -= stride * 6 (P2)
        sub             r0, r0, r1, lsl #2
    .if !\simple
        sub             r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vst1.8          {q2}, [r0,:128], r1     @ P1
        vst1.8          {q3}, [r0,:128], r1     @ P0
        vst1.8          {q4}, [r0,:128], r1     @ Q0
        vst1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vst1.8          {q6}, [r0,:128]         @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner, inner=1
vp8_v_loop_filter16 _simple, simple=1

@ Generates ff_vp8_v_loop_filter8uv<name>_neon: filters a horizontal edge
@ in two 8-pixel-wide planes at once (low half of each q register from r0,
@ high half from r1).
@   r0: first plane, row just below the edge, 64-bit aligned
@   r1: second plane, same position
@   r2: stride in bytes (shared by both planes)
@   r3: flim_E
@   first stack argument:  flim_I
@   second stack argument: hev_thresh
@ q4-q7 are saved and restored around the body.
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0,:64], r2      @ P3
        vld1.8          {d1}, [r1,:64], r2      @ P3
        vld1.8          {d2}, [r0,:64], r2      @ P2
        vld1.8          {d3}, [r1,:64], r2      @ P2
        vld1.8          {d4}, [r0,:64], r2      @ P1
        vld1.8          {d5}, [r1,:64], r2      @ P1
        vld1.8          {d6}, [r0,:64], r2      @ P0
        vld1.8          {d7}, [r1,:64], r2      @ P0
        vld1.8          {d8}, [r0,:64], r2      @ Q0
        vld1.8          {d9}, [r1,:64], r2      @ Q0
        vld1.8          {d10}, [r0,:64], r2     @ Q1
        vld1.8          {d11}, [r1,:64], r2     @ Q1
        vld1.8          {d12}, [r0,:64], r2     @ Q2
        vld1.8          {d13}, [r1,:64], r2     @ Q2
        vld1.8          {d14}, [r0,:64]         @ Q3
        vld1.8          {d15}, [r1,:64]         @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        sub             r0, r0, r2, lsl #1
        sub             r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8          {d2}, [r0,:64], r2      @ P2
        vst1.8          {d3}, [r1,:64], r2      @ P2
        vst1.8          {d4}, [r0,:64], r2      @ P1
        vst1.8          {d5}, [r1,:64], r2      @ P1
        vst1.8          {d6}, [r0,:64], r2      @ P0
        vst1.8          {d7}, [r1,:64], r2      @ P0
        vst1.8          {d8}, [r0,:64], r2      @ Q0
        vst1.8          {d9}, [r1,:64], r2      @ Q0
        vst1.8          {d10}, [r0,:64], r2     @ Q1
        vst1.8          {d11}, [r1,:64], r2     @ Q1
        vst1.8          {d12}, [r0,:64]         @ Q2
        vst1.8          {d13}, [r1,:64]         @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

@ Generates ff_vp8_h_loop_filter16<name>_neon: filters the vertical edge
@ immediately left of the column r0 points to, over 16 rows.
@   r0: first pixel right of the edge in the top row
@   r1: stride in bytes
@   r2: flim_E
@   r3: flim_I            (unused when simple=1)
@   first stack argument: hev_thresh (unused when simple=1)
@ Eight bytes per row (four on each side of the edge) are loaded, transposed
@ so that q0..q7 hold P3..Q3, filtered, transposed back and stored.
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0}, [r0], r1          @ load first 8-line src data
        vld1.8          {d2}, [r0], r1
        vld1.8          {d4}, [r0], r1
        vld1.8          {d6}, [r0], r1
        vld1.8          {d8}, [r0], r1
        vld1.8          {d10}, [r0], r1
        vld1.8          {d12}, [r0], r1
        vld1.8          {d14}, [r0], r1
        vld1.8          {d1}, [r0], r1          @ load second 8-line src data
        vld1.8          {d3}, [r0], r1
        vld1.8          {d5}, [r0], r1
        vld1.8          {d7}, [r0], r1
        vld1.8          {d9}, [r0], r1
        vld1.8          {d11}, [r0], r1
        vld1.8          {d13}, [r0], r1
        vld1.8          {d15}, [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0, r0, r1, lsl #4      @ backup 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r1
        vst1.8          {d2}, [r0], r1
        vst1.8          {d4}, [r0], r1
        vst1.8          {d6}, [r0], r1
        vst1.8          {d8}, [r0], r1
        vst1.8          {d10}, [r0], r1
        vst1.8          {d12}, [r0], r1
        vst1.8          {d14}, [r0], r1
        vst1.8          {d1}, [r0], r1
        vst1.8          {d3}, [r0], r1
        vst1.8          {d5}, [r0], r1
        vst1.8          {d7}, [r0], r1
        vst1.8          {d9}, [r0], r1
        vst1.8          {d11}, [r0], r1
        vst1.8          {d13}, [r0], r1
        vst1.8          {d15}, [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner, inner=1
vp8_h_loop_filter16 _simple, simple=1

@ Generates ff_vp8_h_loop_filter8uv<name>_neon: filters a vertical edge in
@ two planes at once, 8 rows each.
@   r0: first plane, first pixel right of the edge in the top row
@   r1: second plane, same position
@   r2: stride in bytes (shared by both planes)
@   r3: flim_E
@   first stack argument:  flim_I
@   second stack argument: hev_thresh
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
        sub             r1, r1, #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0], r2          @ load u
        vld1.8          {d1}, [r1], r2          @ load v
        vld1.8          {d2}, [r0], r2
        vld1.8          {d3}, [r1], r2
        vld1.8          {d4}, [r0], r2
        vld1.8          {d5}, [r1], r2
        vld1.8          {d6}, [r0], r2
        vld1.8          {d7}, [r1], r2
        vld1.8          {d8}, [r0], r2
        vld1.8          {d9}, [r1], r2
        vld1.8          {d10}, [r0], r2
        vld1.8          {d11}, [r1], r2
        vld1.8          {d12}, [r0], r2
        vld1.8          {d13}, [r1], r2
        vld1.8          {d14}, [r0], r2
        vld1.8          {d15}, [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0, r0, r2, lsl #3      @ backup u 8 rows
        sub             r1, r1, r2, lsl #3      @ backup v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r2
        vst1.8          {d1}, [r1], r2
        vst1.8          {d2}, [r0], r2
        vst1.8          {d3}, [r1], r2
        vst1.8          {d4}, [r0], r2
        vst1.8          {d5}, [r1], r2
        vst1.8          {d6}, [r0], r2
        vst1.8          {d7}, [r1], r2
        vst1.8          {d8}, [r0], r2
        vst1.8          {d9}, [r1], r2
        vst1.8          {d10}, [r0], r2
        vst1.8          {d11}, [r1], r2
        vst1.8          {d12}, [r0], r2
        vst1.8          {d13}, [r1], r2
        vst1.8          {d14}, [r0]
        vst1.8          {d15}, [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

@ Copies a 16-byte-wide block, four rows per iteration.
@   r0: dst (128-bit aligned)   r1: dst stride
@   r2: src (no alignment hint) r3: src stride
@   first stack argument: h, the row count
@ The loop exits once h - 4*k <= 0, so rows are always written in groups
@ of four.
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [r2], r3
        vld1.8          {q2}, [r2], r3
        vld1.8          {q3}, [r2], r3
        vst1.8          {q0}, [r0,:128], r1
        vst1.8          {q1}, [r0,:128], r1
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

@ Copies an 8-byte-wide block, four rows per iteration.
@   r0: dst (64-bit aligned)    r1: dst stride
@   r2: src (no alignment hint) r3: src stride
@   first stack argument: h, the row count
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vst1.8          {d0}, [r0,:64], r1
        vst1.8          {d1}, [r0,:64], r1
        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

@ All filter macros below expect the six 16-bit tap magnitudes in
@ d0[0..3], d1[0..1] (q0). Taps 1 and 4 are subtracted (vmls), the others
@ are added. The two partial sums are combined with a saturating add and
@ narrowed with rounding: (sum + 64) >> 7, saturated to u8.

@ Horizontal 6-tap filter for 8 output pixels.
@   \d: destination d register
@   \a, \b: 16 consecutive source bytes, starting 2 pixels left of output 0
@ Clobbers q8-q15.
.macro  vp8_epel8_h6    d, a, b
        vext.8          d27, \a, \b, #1
        vmovl.u8        q8, \a
        vext.8          d28, \a, \b, #2
        vmovl.u8        q9, d27
        vext.8          d29, \a, \b, #3
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #4
        vmovl.u8        q11, d29
        vext.8          d31, \a, \b, #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

@ Horizontal 6-tap filter for 16 output pixels.
@   \d0, \d1: destination d registers (left and right 8 pixels)
@   \s0, \s1: d registers holding source bytes 0-7 and 8-15
@   \s2:      accepted but not referenced in the body
@   \q0, \q1: q registers covering the source bytes (\q1 supplies the tail)
@ Clobbers q1-q3 and q8-q15.
.macro  vp8_epel16_h6   d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3, \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8, \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3, d7
        vext.8          q2, \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2, d5
        vmovl.u8        q9, d16
        vmovl.u8        q8, d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3, q3, d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1, \s1
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q3, q8, d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3, q1, d0[0]
        vmla.u16        q14, q2, d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3, q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

@ Vertical 6-tap filter, one output row of 8 pixels.
@   \d0: destination d register
@   \s0..\s5: six consecutive source rows
@ Clobbers q8-q13.
.macro  vp8_epel8_v6    d0, s0, s1, s2, s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

@ Vertical 6-tap filter, two output rows of 8 pixels from seven source rows.
@   \d0: row built from \s0..\s5     \d1: row built from \s1..\s6
@ Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8, d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8, d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9, d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

@ Horizontal 4-tap filter for 8 output pixels; only taps d0[1], d0[2],
@ d0[3] and d1[0] are used.
@   \d: destination d register
@   \a, \b: 16 consecutive source bytes, starting 1 pixel left of output 0
@ Clobbers q9-q15.
.macro  vp8_epel8_h4    d, a, b
        vext.8          d28, \a, \b, #1
        vmovl.u8        q9, \a
        vext.8          d29, \a, \b, #2
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

@ Vertical 4-tap filter, two output rows of 8 pixels from five source rows.
@   \d0: row built from \s0..\s3     \d1: row built from \s1..\s4
@ Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9, \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8, q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8, q9, d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8, q8, q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8, #7
        vqrshrun.s16    \d1, q11, #7
.endm

@ Common conventions for the ff_put_vp8_epel* functions below:
@   r0: dst   r1: dst stride   r2: src   r3: src stride
@   stack arguments, in order: h, mx, my
@ The filter is fetched from subpel_filters + (index - 1) * 16, so index 1
@ selects the first 16-byte entry. subpel_filters is defined outside this
@ section; it is loaded with a 128-bit alignment hint.

@ 16 pixels wide, vertical 6-tap, two rows per iteration (h must be even
@ and non-zero for the loop to terminate).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2, r2, r3, lsl #1      @ start two rows above
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4, [sp, #80]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d3}, [r2], r3
        vld1.8          {d4-d5}, [r2], r3
        vld1.8          {d6-d7}, [r2], r3
        vld1.8          {d8-d9}, [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2, r2, r3, lsl #2      @ net advance: two rows

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3}, [r0,:128], r1
        vst1.8          {d4-d5}, [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

@ 16 pixels wide, horizontal 6-tap, one row per iteration.
@ Each iteration reads 24 source bytes starting 2 pixels left of the block.
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

@ 16 pixels wide, horizontal then vertical 6-tap.
@ The first pass filters h+5 rows into a 16-byte-aligned scratch buffer on
@ the stack (336+16 bytes reserved: 336 = 21 rows of 16 bytes, plus slack
@ for alignment), the second pass filters that buffer vertically.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4, [sp, #28]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #336+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15             @ lr = 16-byte aligned scratch
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #336+16+32]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d9}, [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr, lr, #48             @ net advance: one 16-byte row

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp, sp, #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

@ 8 pixels wide, vertical 6-tap, two rows per iteration.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2], r3
        vld1.8          {d7}, [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2, r2, r3, lsl #2      @ net advance: two rows

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

@ 8 pixels wide, horizontal 6-tap, one row per iteration.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

@ 8 pixels wide, horizontal then vertical 6-tap.
@ The first pass filters h+5 rows into a 16-byte-aligned scratch buffer on
@ the stack (168+16 bytes reserved: 168 = 21 rows of 8 bytes, plus slack
@ for alignment), the second pass produces two output rows per iteration.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15             @ lr = 16-byte aligned scratch
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32             @ net advance: two 8-byte rows

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

@ 8 pixels wide, vertical 4-tap, two rows per iteration.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2, r2, r3              @ start one row above
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2, r2, r3, lsl #1      @ net advance: two rows

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

@ 8 pixels wide, horizontal 4-tap, one row per iteration.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2, r2, #1              @ start one pixel to the left
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
yading@10: sub sp, sp, #168+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #3 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {d2,d3}, [r2], r3 yading@10: yading@10: vp8_epel8_h4 d2, d2, d3 yading@10: yading@10: vst1.8 {d2}, [lr,:64]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: @ second pass (vertical): yading@10: ldr r4, [sp, #168+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #168+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d5}, [lr,:128]! yading@10: vld1.8 {d6}, [lr,:64] yading@10: sub lr, lr, #16 yading@10: yading@10: vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 yading@10: yading@10: vst1.8 {d2}, [r0,:64], r1 yading@10: vst1.8 {d3}, [r0,:64], r1 yading@10: subs r12, r12, #2 yading@10: bne 2b yading@10: yading@10: add sp, sp, #168+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel8_h6v4_neon, export=1 yading@10: sub r2, r2, r3 yading@10: sub r2, r2, #2 yading@10: push {r4,lr} yading@10: yading@10: @ first pass (horizontal): yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #168+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #3 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {d2,d3}, [r2], r3 yading@10: yading@10: vp8_epel8_h6 d2, d2, d3 yading@10: yading@10: vst1.8 {d2}, [lr,:64]! 
yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: @ second pass (vertical): yading@10: ldr r4, [sp, #168+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #168+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d5}, [lr,:128]! yading@10: vld1.8 {d6}, [lr,:64] yading@10: sub lr, lr, #16 yading@10: yading@10: vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 yading@10: yading@10: vst1.8 {d2}, [r0,:64], r1 yading@10: vst1.8 {d3}, [r0,:64], r1 yading@10: subs r12, r12, #2 yading@10: bne 2b yading@10: yading@10: add sp, sp, #168+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel8_h4v6_neon, export=1 yading@10: sub r2, r2, r3, lsl #1 yading@10: sub r2, r2, #1 yading@10: push {r4,lr} yading@10: yading@10: @ first pass (horizontal): yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #168+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #5 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {d2,d3}, [r2], r3 yading@10: yading@10: vp8_epel8_h4 d2, d2, d3 yading@10: yading@10: vst1.8 {d2}, [lr,:64]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: @ second pass (vertical): yading@10: ldr r4, [sp, #168+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #168+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d5}, [lr,:128]! yading@10: vld1.8 {d6-d7}, [lr,:128]! 
yading@10: vld1.8 {d30}, [lr,:64] yading@10: sub lr, lr, #32 yading@10: yading@10: vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30 yading@10: yading@10: vst1.8 {d2}, [r0,:64], r1 yading@10: vst1.8 {d3}, [r0,:64], r1 yading@10: subs r12, r12, #2 yading@10: bne 2b yading@10: yading@10: add sp, sp, #168+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: .ltorg yading@10: yading@10: function ff_put_vp8_epel4_v6_neon, export=1 yading@10: sub r2, r2, r3, lsl #1 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: vld1.16 {q0}, [r4,:128] yading@10: 1: yading@10: vld1.32 {d2[]}, [r2], r3 yading@10: vld1.32 {d3[]}, [r2], r3 yading@10: vld1.32 {d4[]}, [r2], r3 yading@10: vld1.32 {d5[]}, [r2], r3 yading@10: vld1.32 {d6[]}, [r2], r3 yading@10: vld1.32 {d7[]}, [r2], r3 yading@10: vld1.32 {d28[]}, [r2] yading@10: sub r2, r2, r3, lsl #2 yading@10: vld1.32 {d2[1]}, [r2], r3 yading@10: vld1.32 {d3[1]}, [r2], r3 yading@10: vld1.32 {d4[1]}, [r2], r3 yading@10: vld1.32 {d5[1]}, [r2], r3 yading@10: vld1.32 {d6[1]}, [r2], r3 yading@10: vld1.32 {d7[1]}, [r2], r3 yading@10: vld1.32 {d28[1]}, [r2] yading@10: sub r2, r2, r3, lsl #2 yading@10: yading@10: vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28 yading@10: yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 1b yading@10: yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h6_neon, export=1 yading@10: sub r2, r2, #2 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: vld1.16 {q0}, [r4,:128] yading@10: 1: yading@10: vld1.8 {q1}, [r2], r3 yading@10: 
vp8_epel8_h6 d2, d2, d3 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h6v6_neon, export=1 yading@10: sub r2, r2, r3, lsl #1 yading@10: sub r2, r2, #2 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #52+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #5 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {q1}, [r2], r3 yading@10: vp8_epel8_h6 d2, d2, d3 yading@10: vst1.32 {d2[0]}, [lr,:32]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: ldr r4, [sp, #52+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #52+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d3}, [lr,:128]! yading@10: vld1.8 {d6}, [lr,:64]! yading@10: vld1.32 {d28[]}, [lr,:32] yading@10: sub lr, lr, #16 yading@10: vld1.8 {d4-d5}, [lr]! yading@10: vld1.8 {d7}, [lr,:64]! 
yading@10: vld1.32 {d28[1]}, [lr,:32] yading@10: sub lr, lr, #16 yading@10: vtrn.32 q1, q2 yading@10: vtrn.32 d6, d7 yading@10: vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 2b yading@10: yading@10: add sp, sp, #52+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h4v6_neon, export=1 yading@10: sub r2, r2, r3, lsl #1 yading@10: sub r2, r2, #1 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #52+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #5 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {d2}, [r2], r3 yading@10: vp8_epel8_h4 d2, d2, d2 yading@10: vst1.32 {d2[0]}, [lr,:32]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: ldr r4, [sp, #52+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #52+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d3}, [lr,:128]! yading@10: vld1.8 {d6}, [lr,:64]! yading@10: vld1.32 {d28[]}, [lr,:32] yading@10: sub lr, lr, #16 yading@10: vld1.8 {d4-d5}, [lr]! yading@10: vld1.8 {d7}, [lr,:64]! 
yading@10: vld1.32 {d28[1]}, [lr,:32] yading@10: sub lr, lr, #16 yading@10: vtrn.32 q1, q2 yading@10: vtrn.32 d6, d7 yading@10: vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 2b yading@10: yading@10: add sp, sp, #52+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h6v4_neon, export=1 yading@10: sub r2, r2, r3 yading@10: sub r2, r2, #2 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #44+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #3 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {q1}, [r2], r3 yading@10: vp8_epel8_h6 d2, d2, d3 yading@10: vst1.32 {d2[0]}, [lr,:32]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: ldr r4, [sp, #44+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #44+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d3}, [lr,:128]! yading@10: vld1.32 {d6[]}, [lr,:32] yading@10: sub lr, lr, #8 yading@10: vld1.8 {d4-d5}, [lr]! 
yading@10: vld1.32 {d6[1]}, [lr,:32] yading@10: sub lr, lr, #8 yading@10: vtrn.32 q1, q2 yading@10: vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 2b yading@10: yading@10: add sp, sp, #44+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h4_neon, export=1 yading@10: sub r2, r2, #1 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: vld1.16 {q0}, [r4,:128] yading@10: 1: yading@10: vld1.8 {d2}, [r2], r3 yading@10: vp8_epel8_h4 d2, d2, d2 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_v4_neon, export=1 yading@10: sub r2, r2, r3 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: vld1.16 {q0}, [r4,:128] yading@10: 1: yading@10: vld1.32 {d2[]}, [r2], r3 yading@10: vld1.32 {d3[]}, [r2], r3 yading@10: vld1.32 {d4[]}, [r2], r3 yading@10: vld1.32 {d5[]}, [r2], r3 yading@10: vld1.32 {d6[]}, [r2] yading@10: sub r2, r2, r3, lsl #1 yading@10: vld1.32 {d2[1]}, [r2], r3 yading@10: vld1.32 {d3[1]}, [r2], r3 yading@10: vld1.32 {d4[1]}, [r2], r3 yading@10: vld1.32 {d5[1]}, [r2], r3 yading@10: vld1.32 {d6[1]}, [r2] yading@10: sub r2, r2, r3, lsl #1 yading@10: yading@10: vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 yading@10: yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 1b 
yading@10: yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: function ff_put_vp8_epel4_h4v4_neon, export=1 yading@10: sub r2, r2, r3 yading@10: sub r2, r2, #1 yading@10: push {r4,lr} yading@10: yading@10: ldr r4, [sp, #12] @ mx yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: sub sp, sp, #44+16 yading@10: vld1.16 {q0}, [r4,:128] yading@10: add lr, sp, #15 yading@10: add r12, r12, #3 yading@10: bic lr, lr, #15 yading@10: 1: yading@10: vld1.8 {d2}, [r2], r3 yading@10: vp8_epel8_h4 d2, d2, d3 yading@10: vst1.32 {d2[0]}, [lr,:32]! yading@10: subs r12, r12, #1 yading@10: bne 1b yading@10: yading@10: ldr r4, [sp, #44+16+16] @ my yading@10: movrel lr, subpel_filters-16 yading@10: ldr r12, [sp, #44+16+8] @ h yading@10: add r4, lr, r4, lsl #4 yading@10: add lr, sp, #15 yading@10: vld1.16 {q0}, [r4,:128] yading@10: bic lr, lr, #15 yading@10: 2: yading@10: vld1.8 {d2-d3}, [lr,:128]! yading@10: vld1.32 {d6[]}, [lr,:32] yading@10: sub lr, lr, #8 yading@10: vld1.8 {d4-d5}, [lr]! 
yading@10: vld1.32 {d6[1]}, [lr,:32] yading@10: sub lr, lr, #8 yading@10: vtrn.32 q1, q2 yading@10: vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 yading@10: vst1.32 {d2[0]}, [r0,:32], r1 yading@10: vst1.32 {d3[0]}, [r0,:32], r1 yading@10: vst1.32 {d2[1]}, [r0,:32], r1 yading@10: vst1.32 {d3[1]}, [r0,:32], r1 yading@10: subs r12, r12, #4 yading@10: bne 2b yading@10: yading@10: add sp, sp, #44+16 yading@10: pop {r4,pc} yading@10: endfunc yading@10: yading@10: @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit yading@10: @ arithmetic can be used to apply filters yading@10: /* Sub-pixel filter table: one 16-byte row per filter index, laid out as 8 halfwords (six taps followed by two zero pads). The epel functions in this file form the row address as subpel_filters-16 + 16*index and load the whole row into q0 with a 128-bit-aligned vld1.16, so index 1 selects the first row. NOTE(review): the entries look like tap magnitudes whose signs are applied inside the vp8_epel* macros (not visible here) - confirm. */ const subpel_filters, align=4 yading@10: .short 0, 6, 123, 12, 1, 0, 0, 0 yading@10: .short 2, 11, 108, 36, 8, 1, 0, 0 yading@10: .short 0, 9, 93, 50, 6, 0, 0, 0 yading@10: .short 3, 16, 77, 77, 16, 3, 0, 0 yading@10: .short 0, 6, 50, 93, 9, 0, 0, 0 yading@10: .short 1, 8, 36, 108, 11, 2, 0, 0 yading@10: .short 0, 1, 12, 123, 6, 0, 0, 0 yading@10: endconst yading@10: yading@10: /* Bilinear MC */ yading@10: yading@10: /* 16-pixel-wide horizontal bilinear filter, two rows per loop pass. Reads mx from [sp, #4] and h from [sp]; d0 holds mx and d1 holds 8 - mx in every byte lane. Per row: load 24 bytes from r2, use vext to build the same row shifted left by one byte, accumulate p[x] x (8 - mx) + p[x+1] x mx in 16 bits, narrow with a rounding shift right by 3 (vrshrn), and store 16 bytes to r0 with a 128-bit alignment hint. Both r2 and r0 advance by r1; the incoming r3 is overwritten with mx. r12 is decremented by 2 at the top of the loop and tested by bgt at the bottom. */ function ff_put_vp8_bilin16_h_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {d2-d4}, [r2], r1 yading@10: vext.8 q2, q1, q2, #1 yading@10: vmull.u8 q8, d2, d1 yading@10: vmlal.u8 q8, d4, d0 yading@10: vld1.8 {d18-d20},[r2], r1 yading@10: vmull.u8 q3, d3, d1 yading@10: vmlal.u8 q3, d5, d0 yading@10: vext.8 q10, q9, q10, #1 yading@10: vmull.u8 q11, d18, d1 yading@10: vmlal.u8 q11, d20, d0 yading@10: vmull.u8 q12, d19, d1 yading@10: vmlal.u8 q12, d21, d0 yading@10: vrshrn.u16 d4, q8, #3 yading@10: vrshrn.u16 d5, q3, #3 yading@10: vrshrn.u16 d6, q11, #3 yading@10: vrshrn.u16 d7, q12, #3 yading@10: vst1.8 {q2}, [r0,:128], r1 yading@10: vst1.8 {q3}, [r0,:128], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin16_v_neon, export=1 yading@10: 
ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: vld1.8 {q1}, [r2], r1 yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {q2}, [r2], r1 yading@10: vmull.u8 q3, d2, d1 yading@10: vmlal.u8 q3, d4, d0 yading@10: vmull.u8 q8, d3, d1 yading@10: vmlal.u8 q8, d5, d0 yading@10: vld1.8 {q1}, [r2], r1 yading@10: vmull.u8 q9, d4, d1 yading@10: vmlal.u8 q9, d2, d0 yading@10: vmull.u8 q10, d5, d1 yading@10: vmlal.u8 q10, d3, d0 yading@10: vrshrn.u16 d4, q3, #3 yading@10: vrshrn.u16 d5, q8, #3 yading@10: vrshrn.u16 d6, q9, #3 yading@10: vrshrn.u16 d7, q10, #3 yading@10: vst1.8 {q2}, [r0,:128], r1 yading@10: vst1.8 {q3}, [r0,:128], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin16_hv_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d2, r3 yading@10: vdup.8 d3, r12 yading@10: ldr r12, [sp] @ h yading@10: yading@10: vld1.8 {d4-d6}, [r2], r1 yading@10: vext.8 q3, q2, q3, #1 yading@10: vmull.u8 q8, d4, d1 yading@10: vmlal.u8 q8, d6, d0 yading@10: vmull.u8 q9, d5, d1 yading@10: vmlal.u8 q9, d7, d0 yading@10: vrshrn.u16 d4, q8, #3 yading@10: vrshrn.u16 d5, q9, #3 yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {d18-d20},[r2], r1 yading@10: vext.8 q10, q9, q10, #1 yading@10: vmull.u8 q11, d18, d1 yading@10: vmlal.u8 q11, d20, d0 yading@10: vld1.8 {d26-d28},[r2], r1 yading@10: vmull.u8 q12, d19, d1 yading@10: vmlal.u8 q12, d21, d0 yading@10: vext.8 q14, q13, q14, #1 yading@10: vmull.u8 q8, d26, d1 yading@10: vmlal.u8 q8, d28, d0 yading@10: vmull.u8 q9, d27, d1 yading@10: vmlal.u8 q9, d29, d0 yading@10: vrshrn.u16 d6, q11, #3 yading@10: vrshrn.u16 d7, q12, #3 yading@10: vmull.u8 q12, d4, d3 yading@10: vmlal.u8 q12, d6, d2 yading@10: vmull.u8 q15, d5, d3 
yading@10: vmlal.u8 q15, d7, d2 yading@10: vrshrn.u16 d4, q8, #3 yading@10: vrshrn.u16 d5, q9, #3 yading@10: vmull.u8 q10, d6, d3 yading@10: vmlal.u8 q10, d4, d2 yading@10: vmull.u8 q11, d7, d3 yading@10: vmlal.u8 q11, d5, d2 yading@10: vrshrn.u16 d24, q12, #3 yading@10: vrshrn.u16 d25, q15, #3 yading@10: vst1.8 {q12}, [r0,:128], r1 yading@10: vrshrn.u16 d20, q10, #3 yading@10: vrshrn.u16 d21, q11, #3 yading@10: vst1.8 {q10}, [r0,:128], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin8_h_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {q1}, [r2], r1 yading@10: vext.8 d3, d2, d3, #1 yading@10: vmull.u8 q2, d2, d1 yading@10: vmlal.u8 q2, d3, d0 yading@10: vld1.8 {q3}, [r2], r1 yading@10: vext.8 d7, d6, d7, #1 yading@10: vmull.u8 q8, d6, d1 yading@10: vmlal.u8 q8, d7, d0 yading@10: vrshrn.u16 d4, q2, #3 yading@10: vrshrn.u16 d16, q8, #3 yading@10: vst1.8 {d4}, [r0,:64], r1 yading@10: vst1.8 {d16}, [r0,:64], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin8_v_neon, export=1 yading@10: ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: vld1.8 {d2}, [r2], r1 yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {d3}, [r2], r1 yading@10: vmull.u8 q2, d2, d1 yading@10: vmlal.u8 q2, d3, d0 yading@10: vld1.8 {d2}, [r2], r1 yading@10: vmull.u8 q3, d3, d1 yading@10: vmlal.u8 q3, d2, d0 yading@10: vrshrn.u16 d4, q2, #3 yading@10: vrshrn.u16 d6, q3, #3 yading@10: vst1.8 {d4}, [r0,:64], r1 yading@10: vst1.8 {d6}, [r0,:64], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin8_hv_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx 
yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d2, r3 yading@10: vdup.8 d3, r12 yading@10: ldr r12, [sp] @ h yading@10: yading@10: vld1.8 {q2}, [r2], r1 yading@10: vext.8 d5, d4, d5, #1 yading@10: vmull.u8 q9, d4, d1 yading@10: vmlal.u8 q9, d5, d0 yading@10: vrshrn.u16 d22, q9, #3 yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {q3}, [r2], r1 yading@10: vext.8 d7, d6, d7, #1 yading@10: vmull.u8 q8, d6, d1 yading@10: vmlal.u8 q8, d7, d0 yading@10: vld1.8 {q2}, [r2], r1 yading@10: vext.8 d5, d4, d5, #1 yading@10: vmull.u8 q9, d4, d1 yading@10: vmlal.u8 q9, d5, d0 yading@10: vrshrn.u16 d16, q8, #3 yading@10: vmull.u8 q10, d22, d3 yading@10: vmlal.u8 q10, d16, d2 yading@10: vrshrn.u16 d22, q9, #3 yading@10: vmull.u8 q12, d16, d3 yading@10: vmlal.u8 q12, d22, d2 yading@10: vrshrn.u16 d20, q10, #3 yading@10: vst1.8 {d20}, [r0,:64], r1 yading@10: vrshrn.u16 d23, q12, #3 yading@10: vst1.8 {d23}, [r0,:64], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin4_h_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {d2}, [r2], r1 yading@10: vext.8 d3, d2, d3, #1 yading@10: vld1.8 {d6}, [r2], r1 yading@10: vext.8 d7, d6, d7, #1 yading@10: vtrn.32 q1, q3 yading@10: vmull.u8 q2, d2, d1 yading@10: vmlal.u8 q2, d3, d0 yading@10: vrshrn.u16 d4, q2, #3 yading@10: vst1.32 {d4[0]}, [r0,:32], r1 yading@10: vst1.32 {d4[1]}, [r0,:32], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin4_v_neon, export=1 yading@10: ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r12, [sp] @ h yading@10: vld1.32 {d2[]}, [r2], r1 
yading@10: 1: yading@10: vld1.32 {d3[]}, [r2] yading@10: vld1.32 {d2[1]}, [r2], r1 yading@10: vld1.32 {d3[1]}, [r2], r1 yading@10: vmull.u8 q2, d2, d1 yading@10: vmlal.u8 q2, d3, d0 yading@10: vtrn.32 d3, d2 yading@10: vrshrn.u16 d4, q2, #3 yading@10: vst1.32 {d4[0]}, [r0,:32], r1 yading@10: vst1.32 {d4[1]}, [r0,:32], r1 yading@10: subs r12, r12, #2 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc yading@10: yading@10: function ff_put_vp8_bilin4_hv_neon, export=1 yading@10: ldr r3, [sp, #4] @ mx yading@10: rsb r12, r3, #8 yading@10: vdup.8 d0, r3 yading@10: vdup.8 d1, r12 yading@10: ldr r3, [sp, #8] @ my yading@10: rsb r12, r3, #8 yading@10: vdup.8 d2, r3 yading@10: vdup.8 d3, r12 yading@10: ldr r12, [sp] @ h yading@10: yading@10: vld1.8 {d4}, [r2], r1 yading@10: vext.8 d5, d4, d4, #1 yading@10: vmull.u8 q9, d4, d1 yading@10: vmlal.u8 q9, d5, d0 yading@10: vrshrn.u16 d22, q9, #3 yading@10: 1: yading@10: subs r12, r12, #2 yading@10: vld1.8 {d6}, [r2], r1 yading@10: vext.8 d7, d6, d6, #1 yading@10: vld1.8 {d4}, [r2], r1 yading@10: vext.8 d5, d4, d4, #1 yading@10: vtrn.32 q3, q2 yading@10: vmull.u8 q8, d6, d1 yading@10: vmlal.u8 q8, d7, d0 yading@10: vrshrn.u16 d16, q8, #3 yading@10: vmull.u8 q10, d16, d2 yading@10: vtrn.32 d22, d16 yading@10: vmlal.u8 q10, d22, d3 yading@10: vrev64.32 d22, d16 yading@10: vrshrn.u16 d20, q10, #3 yading@10: vst1.32 {d20[0]}, [r0,:32], r1 yading@10: vst1.32 {d20[1]}, [r0,:32], r1 yading@10: bgt 1b yading@10: yading@10: bx lr yading@10: endfunc