/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vst1.16         {q15}, [r1,:128]!
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vst1.16         {q15}, [r1,:128]
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vmov.i16        q8, #3

        vtrn.32         d0, d2
        vtrn.32         d1, d3
        vtrn.16         d0, d1
        vtrn.16         d2, d3

        vadd.i16        d0, d0, d16

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vshr.s16        q0, q0, #3
        vshr.s16        q1, q1, #3

        mov             r3, #32
        vst1.16         {d0[0]}, [r0,:16], r3
        vst1.16         {d1[0]}, [r0,:16], r3
        vst1.16         {d2[0]}, [r0,:16], r3
        vst1.16         {d3[0]}, [r0,:16], r3
        vst1.16         {d0[1]}, [r0,:16], r3
        vst1.16         {d1[1]}, [r0,:16], r3
        vst1.16         {d2[1]}, [r0,:16], r3
        vst1.16         {d3[1]}, [r0,:16], r3
        vst1.16         {d0[2]}, [r0,:16], r3
        vst1.16         {d1[2]}, [r0,:16], r3
        vst1.16         {d2[2]}, [r0,:16], r3
        vst1.16         {d3[2]}, [r0,:16], r3
        vst1.16         {d0[3]}, [r0,:16], r3
        vst1.16         {d1[3]}, [r0,:16], r3
        vst1.16         {d2[3]}, [r0,:16], r3
        vst1.16         {d3[3]}, [r0,:16], r3

        bx              lr
endfunc

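/*
 * For reference, a scalar sketch (not lifted verbatim from FFmpeg) of the
 * inverse Walsh-Hadamard transform above: dc[] is the 4x4 luma DC plane
 * (cleared on the way out, like the q15 stores above), and each result
 * lands in the DC slot of one 16-coefficient sub-block, hence the 32-byte
 * store stride.
 *
 *     static void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
 *     {
 *         for (int i = 0; i < 4; i++) {               // down the columns
 *             int t0 = dc[0*4 + i] + dc[3*4 + i];
 *             int t1 = dc[1*4 + i] + dc[2*4 + i];
 *             int t2 = dc[1*4 + i] - dc[2*4 + i];
 *             int t3 = dc[0*4 + i] - dc[3*4 + i];
 *             dc[0*4 + i] = t0 + t1;
 *             dc[1*4 + i] = t3 + t2;
 *             dc[2*4 + i] = t0 - t1;
 *             dc[3*4 + i] = t3 - t2;
 *         }
 *         for (int i = 0; i < 4; i++) {               // along the rows
 *             int t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3; // +3 is the rounding,
 *             int t1 = dc[i*4 + 1] + dc[i*4 + 2];     // folded in before the
 *             int t2 = dc[i*4 + 1] - dc[i*4 + 2];     // final >> 3 (the
 *             int t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3; // vadd of q8 = #3)
 *             block[i][0][0] = (t0 + t1) >> 3;
 *             block[i][1][0] = (t3 + t2) >> 3;
 *             block[i][2][0] = (t0 - t1) >> 3;
 *             block[i][3][0] = (t3 - t2) >> 3;
 *             dc[i*4 + 0] = dc[i*4 + 1] = dc[i*4 + 2] = dc[i*4 + 3] = 0;
 *         }
 *     }
 */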
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        movw            r3, #20091
        movt            r3, #35468/2
        vdup.32         d4, r3

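        @ 20091/65536 = sqrt(2)*cos(pi/8) - 1 and 35468/65536 =
        @ sqrt(2)*sin(pi/8), i.e. libvpx's cospi8sqrt2minus1 and
        @ sinpi8sqrt2. The upper halfword is loaded as 35468/2 so it stays
        @ positive in a signed 16-bit lane; vqdmulh doubles the product,
        @ cancelling the halving. The vmull/vshrn #16 pair plus the
        @ following vadd applies the full 1 + 20091/65536 factor.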
        vmull.s16       q12, d1, d4[0]
        vmull.s16       q13, d3, d4[0]
        vqdmulh.s16     d20, d1, d4[1]
        vqdmulh.s16     d23, d3, d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0, d2
        vsub.s16        d17, d0, d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0, q8, q9
        vsub.s16        q1, q8, q9

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1, d4[0]
        vst1.16         {q15}, [r1,:128]!
        vmull.s16       q13, d2, d4[0]
        vst1.16         {q15}, [r1,:128]
        vqdmulh.s16     d21, d1, d4[1]
        vqdmulh.s16     d23, d2, d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0, d3
        vsub.i16        d17, d0, d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]}, [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]}, [r0,:32], r2
        vadd.s16        q0, q8, q9
        vld1.32         {d23[]}, [r0,:32], r2
        vsub.s16        q1, q8, q9
        vld1.32         {d21[]}, [r0,:32], r2
        vrshr.s16       q0, q0, #3
        vtrn.32         q10, q11
        vrshr.s16       q1, q1, #3

        sub             r0, r0, r2, lsl #2

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vaddw.u8        q0, q0, d20
        vaddw.u8        q1, q1, d21
        vqmovun.s16     d0, q0
        vqmovun.s16     d1, q1

        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3, #0
        ldrsh           r12, [r1]
        strh            r3, [r1]
        vdup.16         q1, r12
        vrshr.s16       q1, q1, #3
        vld1.32         {d0[]}, [r0,:32], r2
        vld1.32         {d1[]}, [r0,:32], r2
        vld1.32         {d0[1]}, [r0,:32], r2
        vld1.32         {d1[1]}, [r0,:32], r2
        vaddw.u8        q2, q1, d0
        vaddw.u8        q3, q1, d1
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d0, q2
        vqmovun.s16     d1, q3
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
endfunc

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        mov             r3, r0
        vrshr.s16       q8, q8, #3      @ dc >>= 3
        vld1.8          {d0}, [r0,:64], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {d1}, [r0,:64], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {d2}, [r0,:64], r2
        vaddw.u8        q0, q8, d1
        vld1.8          {d3}, [r0,:64], r2
        vaddw.u8        q11, q8, d2
        vld1.8          {d4}, [r0,:64], r2
        vaddw.u8        q1, q8, d3
        vld1.8          {d5}, [r0,:64], r2
        vaddw.u8        q12, q9, d4
        vld1.8          {d6}, [r0,:64], r2
        vaddw.u8        q2, q9, d5
        vld1.8          {d7}, [r0,:64], r2
        vaddw.u8        q13, q9, d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3, q9, d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20}, [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21}, [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22}, [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23}, [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24}, [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25}, [r3,:64], r2
        vst1.8          {d26}, [r3,:64], r2
        vst1.8          {d27}, [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vrshr.s16       q8, q8, #3      @ dc >>= 3
        vld1.8          {q0}, [r0,:128], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {q1}, [r0,:128], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {q2}, [r0,:128], r2
        vaddw.u8        q0, q9, d1
        vld1.8          {q3}, [r0,:128], r2
        vaddw.u8        q11, q8, d2
        vaddw.u8        q1, q9, d3
        vaddw.u8        q12, q8, d4
        vaddw.u8        q2, q9, d5
        vaddw.u8        q13, q8, d6
        vaddw.u8        q3, q9, d7
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10}, [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11}, [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12}, [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13}, [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3     -> q0..q7
@   flim_E     -> q14
@   flim_I     -> q15
@   hev_thresh -> r12
@
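@ Throughout the macro, s2u(x) in the comments means x ^ 0x80 (re-biasing
@ a signed value back to unsigned) and clamp() saturates to the signed
@ 8-bit range [-128,127], which vqadd.s8/vqsub.s8 provide directly.
@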
.macro vp8_loop_filter, inner=0, simple=0
.if \simple
        vabd.u8         q9, q3, q4      @ abs(P0-Q0)
        vabd.u8         q15, q2, q5     @ abs(P1-Q1)
        vqadd.u8        q9, q9, q9      @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1    @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9, q10    @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8, q11, q14    @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2, q3     @ abs(P1-P0)
        vabd.u8         q13, q5, q4     @ abs(Q1-Q0)
        vabd.u8         q10, q0, q1     @ abs(P3-P2)
        vabd.u8         q11, q1, q2     @ abs(P2-P1)
        vcle.u8         q8, q12, q15    @ abs(P1-P0) <= flim_I
        vcle.u8         q9, q13, q15    @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15   @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15   @ abs(P2-P1) <= flim_I
        vand            q8, q8, q9
        vabd.u8         q9, q7, q6      @ abs(Q3-Q2)
        vand            q8, q8, q11
        vabd.u8         q11, q6, q5     @ abs(Q2-Q1)
        vand            q8, q8, q10
        vcle.u8         q10, q9, q15    @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15   @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9, q3, q4      @ abs(P0-Q0)
        vabd.u8         q15, q2, q5     @ abs(P1-Q1)
        vand            q8, q8, q10
        vqadd.u8        q9, q9, q9      @ abs(P0-Q0) * 2
        vand            q8, q8, q11
        vshr.u8         q10, q15, #1    @ abs(P1-Q1) / 2
        vdup.8          q15, r12        @ hev_thresh
        vqadd.u8        q11, q9, q10    @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15   @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14   @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15   @ abs(Q1-Q0) > hev_thresh
        vand            q8, q8, q11
        vmov.i8         q13, #0x80
        vorr            q9, q12, q14
.endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3, q3, q13     @ PS0 = P0 ^ 0x80
        veor            q4, q4, q13     @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8, d6     @ QS0 - PS0
        vsubl.s8        q11, d9, d7     @   (widened to 16bit)
        veor            q2, q2, q13     @ PS1 = P1 ^ 0x80
        veor            q5, q5, q13     @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12   @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2, q5     @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
.if \inner
        vand            q12, q12, q9    @ if(hev) w += clamp(PS1-QS1)
.endif
        vaddw.s8        q10, q10, d24   @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10        @ narrow result back into q10
        vqmovn.s16      d21, q11
.if !\inner && !\simple
        veor            q1, q1, q13     @ PS2 = P2 ^ 0x80
        veor            q6, q6, q13     @ QS2 = Q2 ^ 0x80
.endif
        vand            q10, q10, q8    @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

.if \simple
        vqadd.s8        q11, q10, q14   @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15   @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3    @ c1 >>= 3
        vshr.s8         q12, q12, #3    @ c2 >>= 3
        vqsub.s8        q4, q4, q11     @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12     @ PS0 = clamp(PS0+c2)
        veor            q4, q4, q13     @ Q0 = QS0 ^ 0x80
        veor            q3, q3, q13     @ P0 = PS0 ^ 0x80
        veor            q5, q5, q13     @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13     @ P1 = PS1 ^ 0x80
.elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14   @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15   @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3    @ c1 >>= 3
        vshr.s8         q12, q12, #3    @ c2 >>= 3
        vqsub.s8        q4, q4, q11     @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12     @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9    @ c1 & ~hev
        veor            q4, q4, q13     @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1    @ c3 >>= 1
        veor            q3, q3, q13     @ P0 = PS0 ^ 0x80
        vqsub.s8        q5, q5, q11     @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2, q2, q11     @ PS1 = clamp(PS1+c3)
        veor            q5, q5, q13     @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13     @ P1 = PS1 ^ 0x80
.else
        vand            q12, q10, q9    @ w & hev
        vqadd.s8        q11, q12, q14   @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15   @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3    @ c1 >>= 3
        vshr.s8         q12, q12, #3    @ c2 >>= 3
        vbic            q10, q10, q9    @ w &= ~hev
        vqsub.s8        q4, q4, q11     @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12     @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9, #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8, q9, q14
        vadd.s16        q9, q9, q15     @ 9*w + 63
        vadd.s16        q11, q8, q14
        vadd.s16        q12, q9, q15    @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15   @ 27*w + 63
        vqshrn.s16      d16, q8, #7
        vqshrn.s16      d17, q9, #7     @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7    @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7    @ clamp((27*w + 63)>>7)
        vqadd.s8        q1, q1, q8      @ PS2 = clamp(PS2+a)
        vqsub.s8        q6, q6, q8      @ QS2 = clamp(QS2-a)
        vqadd.s8        q2, q2, q11     @ PS1 = clamp(PS1+a)
        vqsub.s8        q5, q5, q11     @ QS1 = clamp(QS1-a)
        vqadd.s8        q3, q3, q14     @ PS0 = clamp(PS0+a)
        vqsub.s8        q4, q4, q14     @ QS0 = clamp(QS0-a)
        veor            q3, q3, q13     @ P0 = PS0 ^ 0x80
        veor            q4, q4, q13     @ Q0 = QS0 ^ 0x80
        veor            q2, q2, q13     @ P1 = PS1 ^ 0x80
        veor            q5, q5, q13     @ Q1 = QS1 ^ 0x80
        veor            q1, q1, q13     @ P2 = PS2 ^ 0x80
        veor            q6, q6, q13     @ Q2 = QS2 ^ 0x80
.endif
.endm

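/*
 * For reference, a scalar sketch (not FFmpeg's own code; assumes the usual
 * <stdint.h>/<stdlib.h>) of the \simple path of the vp8_loop_filter macro
 * above, applied to one set of pixels p1 p0 | q0 q1 across the edge:
 *
 *     static int clamp8(int x) { return x < -128 ? -128 : x > 127 ? 127 : x; }
 *
 *     static void simple_filter(uint8_t *p1, uint8_t *p0,
 *                               uint8_t *q0, uint8_t *q1, int flim)
 *     {
 *         if (2 * abs(*p0 - *q0) + abs(*p1 - *q1) / 2 <= flim) {
 *             int ps1 = *p1 - 128, ps0 = *p0 - 128;   // u2s(): same as ^0x80
 *             int qs0 = *q0 - 128, qs1 = *q1 - 128;
 *             int w   = clamp8(3 * (qs0 - ps0) + clamp8(ps1 - qs1));
 *             *q0 = clamp8(qs0 - (clamp8(w + 4) >> 3)) + 128;  // s2u()
 *             *p0 = clamp8(ps0 + (clamp8(w + 3) >> 3)) + 128;
 *         }
 *     }
 *
 * The NEON version does the same thing for 16 edges at once, with the
 * comparison result kept as the q8 mask instead of a branch.
 */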
.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r1, lsl #1+!\simple

        @ Load pixels:
.if !\simple
        ldr             r12, [sp, #64]      @ hev_thresh
        vld1.8          {q0}, [r0,:128], r1 @ P3
        vld1.8          {q1}, [r0,:128], r1 @ P2
.endif
        vld1.8          {q2}, [r0,:128], r1 @ P1
        vld1.8          {q3}, [r0,:128], r1 @ P0
        vld1.8          {q4}, [r0,:128], r1 @ Q0
        vld1.8          {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vld1.8          {q6}, [r0,:128], r1 @ Q2
        vld1.8          {q7}, [r0,:128]     @ Q3
        vdup.8          q15, r3             @ flim_I
.endif
        vdup.8          q14, r2             @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0, r0, r1, lsl #2
.if !\simple
        sub             r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8          {q1}, [r0,:128], r1 @ P2
.endif
        vst1.8          {q2}, [r0,:128], r1 @ P1
        vst1.8          {q3}, [r0,:128], r1 @ P0
        vst1.8          {q4}, [r0,:128], r1 @ Q0
        vst1.8          {q5}, [r0,:128], r1 @ Q1
.if !\simple
        vst1.8          {q6}, [r0,:128]     @ Q2
.endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        ldr             r12, [sp, #64]      @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0,:64], r2  @ P3
        vld1.8          {d1}, [r1,:64], r2  @ P3
        vld1.8          {d2}, [r0,:64], r2  @ P2
        vld1.8          {d3}, [r1,:64], r2  @ P2
        vld1.8          {d4}, [r0,:64], r2  @ P1
        vld1.8          {d5}, [r1,:64], r2  @ P1
        vld1.8          {d6}, [r0,:64], r2  @ P0
        vld1.8          {d7}, [r1,:64], r2  @ P0
        vld1.8          {d8}, [r0,:64], r2  @ Q0
        vld1.8          {d9}, [r1,:64], r2  @ Q0
        vld1.8          {d10}, [r0,:64], r2 @ Q1
        vld1.8          {d11}, [r1,:64], r2 @ Q1
        vld1.8          {d12}, [r0,:64], r2 @ Q2
        vld1.8          {d13}, [r1,:64], r2 @ Q2
        vld1.8          {d14}, [r0,:64]     @ Q3
        vld1.8          {d15}, [r1,:64]     @ Q3

        vdup.8          q14, r3             @ flim_E
        vdup.8          q15, r12            @ flim_I
        ldr             r12, [sp, #68]      @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        sub             r0, r0, r2, lsl #1
        sub             r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8          {d2}, [r0,:64], r2  @ P2
        vst1.8          {d3}, [r1,:64], r2  @ P2
        vst1.8          {d4}, [r0,:64], r2  @ P1
        vst1.8          {d5}, [r1,:64], r2  @ P1
        vst1.8          {d6}, [r0,:64], r2  @ P0
        vst1.8          {d7}, [r1,:64], r2  @ P0
        vst1.8          {d8}, [r0,:64], r2  @ Q0
        vst1.8          {d9}, [r1,:64], r2  @ Q0
        vst1.8          {d10}, [r0,:64], r2 @ Q1
        vst1.8          {d11}, [r1,:64], r2 @ Q1
        vst1.8          {d12}, [r0,:64]     @ Q2
        vst1.8          {d13}, [r1,:64]     @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

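@ The horizontal loop filters below handle vertical edges: each row
@ contributes the 8 pixels straddling the edge (P3..Q3), a transpose turns
@ those rows into the q0..q7 vectors the vp8_loop_filter macro expects,
@ and a second transpose puts the filtered pixels back before the stores.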
.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
.if !\simple
        ldr             r12, [sp, #64]      @ hev_thresh
.endif

        @ Load pixels:
        vld1.8          {d0}, [r0], r1      @ load first 8-line src data
        vld1.8          {d2}, [r0], r1
        vld1.8          {d4}, [r0], r1
        vld1.8          {d6}, [r0], r1
        vld1.8          {d8}, [r0], r1
        vld1.8          {d10}, [r0], r1
        vld1.8          {d12}, [r0], r1
        vld1.8          {d14}, [r0], r1
        vld1.8          {d1}, [r0], r1      @ load second 8-line src data
        vld1.8          {d3}, [r0], r1
        vld1.8          {d5}, [r0], r1
        vld1.8          {d7}, [r0], r1
        vld1.8          {d9}, [r0], r1
        vld1.8          {d11}, [r0], r1
        vld1.8          {d13}, [r0], r1
        vld1.8          {d15}, [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2             @ flim_E
.if !\simple
        vdup.8          q15, r3             @ flim_I
.endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0, r0, r1, lsl #4  @ back up 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r1
        vst1.8          {d2}, [r0], r1
        vst1.8          {d4}, [r0], r1
        vst1.8          {d6}, [r0], r1
        vst1.8          {d8}, [r0], r1
        vst1.8          {d10}, [r0], r1
        vst1.8          {d12}, [r0], r1
        vst1.8          {d14}, [r0], r1
        vst1.8          {d1}, [r0], r1
        vst1.8          {d3}, [r0], r1
        vst1.8          {d5}, [r0], r1
        vst1.8          {d7}, [r0], r1
        vst1.8          {d9}, [r0], r1
        vst1.8          {d11}, [r0], r1
        vst1.8          {d13}, [r0], r1
        vst1.8          {d15}, [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
        sub             r1, r1, #4
        ldr             r12, [sp, #64]      @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0], r2      @ load u
        vld1.8          {d1}, [r1], r2      @ load v
        vld1.8          {d2}, [r0], r2
        vld1.8          {d3}, [r1], r2
        vld1.8          {d4}, [r0], r2
        vld1.8          {d5}, [r1], r2
        vld1.8          {d6}, [r0], r2
        vld1.8          {d7}, [r1], r2
        vld1.8          {d8}, [r0], r2
        vld1.8          {d9}, [r1], r2
        vld1.8          {d10}, [r0], r2
        vld1.8          {d11}, [r1], r2
        vld1.8          {d12}, [r0], r2
        vld1.8          {d13}, [r1], r2
        vld1.8          {d14}, [r0], r2
        vld1.8          {d15}, [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3             @ flim_E
        vdup.8          q15, r12            @ flim_I
        ldr             r12, [sp, #68]      @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0, r0, r2, lsl #3  @ back up u 8 rows
        sub             r1, r1, r2, lsl #3  @ back up v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r2
        vst1.8          {d1}, [r1], r2
        vst1.8          {d2}, [r0], r2
        vst1.8          {d3}, [r1], r2
        vst1.8          {d4}, [r0], r2
        vst1.8          {d5}, [r1], r2
        vst1.8          {d6}, [r0], r2
        vst1.8          {d7}, [r1], r2
        vst1.8          {d8}, [r0], r2
        vst1.8          {d9}, [r1], r2
        vst1.8          {d10}, [r0], r2
        vst1.8          {d11}, [r1], r2
        vst1.8          {d12}, [r0], r2
        vst1.8          {d13}, [r1], r2
        vst1.8          {d14}, [r0]
        vst1.8          {d15}, [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]       @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [r2], r3
        vld1.8          {q2}, [r2], r3
        vld1.8          {q3}, [r2], r3
        vst1.8          {q0}, [r0,:128], r1
        vst1.8          {q1}, [r0,:128], r1
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]       @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vst1.8          {d0}, [r0,:64], r1
        vst1.8          {d1}, [r0,:64], r1
        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

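@ A scalar sketch (not FFmpeg's own code) of what each 6-tap macro below
@ computes per output pixel, with F[] one row of subpel_filters (the table
@ stores magnitudes; taps 1 and 4 enter negatively via vmls):
@
@     dst[x] = av_clip_uint8((F[0]*src[x-2] - F[1]*src[x-1] + F[2]*src[x] +
@                             F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3] +
@                             64) >> 7);
@
@ vqadd.s16 merges the two accumulator halves with saturation and
@ vqrshrun.s16 #7 performs the rounded shift plus the final unsigned clip.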
.macro vp8_epel8_h6 d, a, b
        vext.8          d27, \a, \b, #1
        vmovl.u8        q8, \a
        vext.8          d28, \a, \b, #2
        vmovl.u8        q9, d27
        vext.8          d29, \a, \b, #3
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #4
        vmovl.u8        q11, d29
        vext.8          d31, \a, \b, #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3, \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8, \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3, d7
        vext.8          q2, \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2, d5
        vmovl.u8        q9, d16
        vmovl.u8        q8, d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3, q3, d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1, \s1
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q3, q8, d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3, q1, d0[0]
        vmla.u16        q14, q2, d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3, q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8, d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8, d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9, d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8          d28, \a, \b, #1
        vmovl.u8        q9, \a
        vext.8          d29, \a, \b, #2
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9, \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8, q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8, q9, d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8, q8, q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8, #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4, [sp, #80]       @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #72]      @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d3}, [r2], r3
        vld1.8          {d4-d5}, [r2], r3
        vld1.8          {d6-d7}, [r2], r3
        vld1.8          {d8-d9}, [r2], r3
        vld1.8          {d10-d11}, [r2], r3
        vld1.8          {d12-d13}, [r2], r3
        vld1.8          {d14-d15}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3}, [r0,:128], r1
        vst1.8          {d4-d5}, [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

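@ The two-pass (hNvM) functions below run the horizontal filter into a
@ 16-byte-aligned scratch buffer on the stack, then filter that buffer
@ vertically: e.g. 336 = 21 rows * 16 bytes covers the maximum h = 16 plus
@ the 5 extra source rows a 6-tap second pass needs; 168, 52 and 44 are
@ the 8- and 4-pixel-wide (and 4-tap) equivalents.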
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4, [sp, #28]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #24]      @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #336+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #336+16+32]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d9}, [lr,:128]!
        vld1.8          {d28-d31}, [lr,:128]
        sub             lr, lr, #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp, sp, #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]       @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2], r3
        vld1.8          {d7}, [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]       @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

.ltorg

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]       @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2], r3
        vld1.32         {d7[]}, [r2], r3
        vld1.32         {d28[]}, [r2]
        sub             r2, r2, r3, lsl #2
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2], r3
        vld1.32         {d7[1]}, [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]       @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2]
        sub             r2, r2, r3, lsl #1
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]       @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]       @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so
@ 16-bit arithmetic can be used to apply the filters
const subpel_filters, align=4
        .short          0,  6, 123,  12,  1, 0, 0, 0
        .short          2, 11, 108,  36,  8, 1, 0, 0
        .short          0,  9,  93,  50,  6, 0, 0, 0
        .short          3, 16,  77,  77, 16, 3, 0, 0
        .short          0,  6,  50,  93,  9, 0, 0, 0
        .short          1,  8,  36, 108, 11, 2, 0, 0
        .short          0,  1,  12, 123,  6, 0, 0, 0
endconst
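
@ Counting the vmls taps (1 and 4) as negative, each row above sums to
@ 128 (e.g. 0 - 6 + 123 + 12 - 1 + 0 = 128), which is where the
@ 0x7f80 = 128 * 255 bound in the note comes from.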

/* Bilinear MC */

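@ Each bilinear stage computes, with mx (my for the vertical passes) in
@ 0..7 and "next" being the next pixel horizontally or the next row:
@
@     dst[x] = ((8 - mx) * src[x] + mx * next[x] + 4) >> 3
@
@ vmull/vmlal.u8 do the weighting in 16 bits and vrshrn.u16 #3 is the
@ rounded downshift back to 8 bits.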
yading@10 1578 function ff_put_vp8_bilin16_h_neon, export=1
yading@10 1579 ldr r3, [sp, #4] @ mx
yading@10 1580 rsb r12, r3, #8
yading@10 1581 vdup.8 d0, r3
yading@10 1582 vdup.8 d1, r12
yading@10 1583 ldr r12, [sp] @ h
yading@10 1584 1:
yading@10 1585 subs r12, r12, #2
yading@10 1586 vld1.8 {d2-d4}, [r2], r1
yading@10 1587 vext.8 q2, q1, q2, #1
yading@10 1588 vmull.u8 q8, d2, d1
yading@10 1589 vmlal.u8 q8, d4, d0
yading@10 1590 vld1.8 {d18-d20},[r2], r1
yading@10 1591 vmull.u8 q3, d3, d1
yading@10 1592 vmlal.u8 q3, d5, d0
yading@10 1593 vext.8 q10, q9, q10, #1
yading@10 1594 vmull.u8 q11, d18, d1
yading@10 1595 vmlal.u8 q11, d20, d0
yading@10 1596 vmull.u8 q12, d19, d1
yading@10 1597 vmlal.u8 q12, d21, d0
yading@10 1598 vrshrn.u16 d4, q8, #3
yading@10 1599 vrshrn.u16 d5, q3, #3
yading@10 1600 vrshrn.u16 d6, q11, #3
yading@10 1601 vrshrn.u16 d7, q12, #3
yading@10 1602 vst1.8 {q2}, [r0,:128], r1
yading@10 1603 vst1.8 {q3}, [r0,:128], r1
yading@10 1604 bgt 1b
yading@10 1605
yading@10 1606 bx lr
yading@10 1607 endfunc
yading@10 1608
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r1
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

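@ 16-wide horizontal + vertical: the prologue filters the first row
@ horizontally into q2; the loop filters two further rows horizontally
@ and blends each vertically with its predecessor, keeping the
@ intermediate rows in registers throughout.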
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

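@ 8-wide horizontal: one 16-byte load per row covers the 9 input pixels;
@ vext forms the x+1 vector and two rows are emitted per iteration.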
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

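@ 8-wide vertical: same single-load-per-row scheme as the 16-wide
@ version, with d2 carrying the previous row across iterations.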
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

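@ 8-wide horizontal + vertical: d22 carries the horizontally filtered
@ previous row between iterations, so each source row is filtered
@ horizontally exactly once.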
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

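@ 4-wide horizontal: two rows are packed into one d register with
@ vtrn.32 so a single vmull/vmlal pair filters both at once.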
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

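@ 4-wide vertical: rows n and n+1 are paired in d2, rows n+1 and n+2
@ in d3; the vtrn.32 shuffles the newest row back into d2[0] as the
@ carry for the next iteration.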
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r1
        vld1.32         {d3[1]},  [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

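@ 4-wide horizontal + vertical: two rows are packed per register as in
@ bilin4_h; the vtrn.32/vrev64.32 pair rotates the newest horizontally
@ filtered row into d22[0] as the vertical carry for the next iteration.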
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc