annotate ffmpeg/libavcodec/arm/h264pred_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
yading@10 3 *
yading@10 4 * This file is part of FFmpeg.
yading@10 5 *
yading@10 6 * FFmpeg is free software; you can redistribute it and/or
yading@10 7 * modify it under the terms of the GNU Lesser General Public
yading@10 8 * License as published by the Free Software Foundation; either
yading@10 9 * version 2.1 of the License, or (at your option) any later version.
yading@10 10 *
yading@10 11 * FFmpeg is distributed in the hope that it will be useful,
yading@10 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 14 * Lesser General Public License for more details.
yading@10 15 *
yading@10 16 * You should have received a copy of the GNU Lesser General Public
yading@10 17 * License along with FFmpeg; if not, write to the Free Software
yading@10 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 19 */
yading@10 20
yading@10 21 #include "libavutil/arm/asm.S"
yading@10 22
/*
 * ldcol.8: gather single bytes into lanes of one D register.
 *   rd = destination D register
 *   rs = address register; post-incremented by rt after every byte loaded
 *   rt = register holding the step between consecutive bytes
 *   n  = 8 (default) loads all eight lanes; with any other value only the
 *        half selected by hi is loaded
 *   hi = 0 (default) selects lanes 0-3, 1 selects lanes 4-7
 * Lanes that are not selected are not written by this macro.
 */
yading@10 23 .macro ldcol.8 rd, rs, rt, n=8, hi=0
yading@10 24 .if \n == 8 || \hi == 0
yading@10 25 vld1.8 {\rd[0]}, [\rs], \rt
yading@10 26 vld1.8 {\rd[1]}, [\rs], \rt
yading@10 27 vld1.8 {\rd[2]}, [\rs], \rt
yading@10 28 vld1.8 {\rd[3]}, [\rs], \rt
yading@10 29 .endif
yading@10 30 .if \n == 8 || \hi == 1
yading@10 31 vld1.8 {\rd[4]}, [\rs], \rt
yading@10 32 vld1.8 {\rd[5]}, [\rs], \rt
yading@10 33 vld1.8 {\rd[6]}, [\rs], \rt
yading@10 34 vld1.8 {\rd[7]}, [\rs], \rt
yading@10 35 .endif
yading@10 36 .endm
yading@10 37
/*
 * add16x8: sum sixteen unsigned bytes held in two D registers.
 *   dq     = Q register receiving the widened (u16) pairwise sums rl + rh
 *   dl, dh = D registers that are added together and then reduced; every
 *            use in this file passes the low and high halves of dq
 *   rl, rh = the two 8-byte inputs
 * After the two pairwise adds every 16-bit lane of dl holds the total.
 */
yading@10 38 .macro add16x8 dq, dl, dh, rl, rh
yading@10 39 vaddl.u8 \dq, \rl, \rh
yading@10 40 vadd.u16 \dl, \dl, \dh
yading@10 41 vpadd.u16 \dl, \dl, \dl
yading@10 42 vpadd.u16 \dl, \dl, \dl
yading@10 43 .endm
yading@10 44
/*
 * Fill a 16x16 block with the constant byte 128.
 * Sets every byte of q0 to 128 and branches to the shared store loop at
 * .L_pred16x16_dc_end (inside ff_pred16x16_dc_neon), which writes q0 through
 * r0 with row step r1 and returns to the caller.
 */
yading@10 45 function ff_pred16x16_128_dc_neon, export=1
yading@10 46 vmov.i8 q0, #128
yading@10 47 b .L_pred16x16_dc_end
yading@10 48 endfunc
yading@10 49
/*
 * 16x16 DC fill using only the row above the block.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * The 16 bytes at r0 - r1 are summed, divided by 16 with rounding, and the
 * resulting byte is replicated across q0 before branching to the shared
 * store loop at .L_pred16x16_dc_end.
 */
yading@10 50 function ff_pred16x16_top_dc_neon, export=1
/* r2 = row above the block (load carries a 128-bit alignment hint) */
yading@10 51 sub r2, r0, r1
yading@10 52 vld1.8 {q0}, [r2,:128]
/* every u16 lane of d0 = sum of the 16 bytes */
yading@10 53 add16x8 q0, d0, d1, d0, d1
/* rounding shift right by 4 and narrow to bytes, then broadcast byte 0 */
yading@10 54 vrshrn.u16 d0, q0, #4
yading@10 55 vdup.8 q0, d0[0]
yading@10 56 b .L_pred16x16_dc_end
yading@10 57 endfunc
yading@10 58
/*
 * 16x16 DC fill using only the column left of the block.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * The 16 bytes at r0 - 1, r0 - 1 + r1, ... are summed, divided by 16 with
 * rounding, replicated across q0, and stored by the shared loop at
 * .L_pred16x16_dc_end.
 */
yading@10 59 function ff_pred16x16_left_dc_neon, export=1
/* r2 walks down the column one byte to the left of the block */
yading@10 60 sub r2, r0, #1
yading@10 61 ldcol.8 d0, r2, r1
yading@10 62 ldcol.8 d1, r2, r1
/* every u16 lane of d0 = sum of the 16 bytes */
yading@10 63 add16x8 q0, d0, d1, d0, d1
/* rounding shift right by 4 and narrow to bytes, then broadcast byte 0 */
yading@10 64 vrshrn.u16 d0, q0, #4
yading@10 65 vdup.8 q0, d0[0]
yading@10 66 b .L_pred16x16_dc_end
yading@10 67 endfunc
yading@10 68
/*
 * 16x16 DC fill using both the row above and the column to the left.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * The 32 neighbouring bytes are summed, divided by 32 with rounding, and the
 * result is written to all 256 bytes of the block.
 * Writes r2, r3, q0, q1 and the flags; r0 is advanced by 16 rows.
 *
 * The tail starting at .L_pred16x16_dc_end is shared: the other 16x16 DC
 * entry points in this file branch to it with the fill value already in q0.
 */
yading@10 69 function ff_pred16x16_dc_neon, export=1
/* q0 = the 16 bytes of the row above */
yading@10 70 sub r2, r0, r1
yading@10 71 vld1.8 {q0}, [r2,:128]
/* q1 = the 16 bytes of the column to the left */
yading@10 72 sub r2, r0, #1
yading@10 73 ldcol.8 d2, r2, r1
yading@10 74 ldcol.8 d3, r2, r1
/* widen to u16 and reduce until every lane of d0 holds the 32-byte total */
yading@10 75 vaddl.u8 q0, d0, d1
yading@10 76 vaddl.u8 q1, d2, d3
yading@10 77 vadd.u16 q0, q0, q1
yading@10 78 vadd.u16 d0, d0, d1
yading@10 79 vpadd.u16 d0, d0, d0
yading@10 80 vpadd.u16 d0, d0, d0
/* rounding shift right by 5 and narrow to bytes, then broadcast byte 0 */
yading@10 81 vrshrn.u16 d0, q0, #5
yading@10 82 vdup.8 q0, d0[0]
/* Shared store loop: 8 iterations x 2 rows = 16 rows of 16 bytes from q0. */
yading@10 83 .L_pred16x16_dc_end:
yading@10 84 mov r3, #8
yading@10 85 6: vst1.8 {q0}, [r0,:128], r1
yading@10 86 vst1.8 {q0}, [r0,:128], r1
yading@10 87 subs r3, r3, #1
yading@10 88 bne 6b
yading@10 89 bx lr
yading@10 90 endfunc
yading@10 91
/*
 * 16x16 horizontal fill: each row is filled with the byte immediately to
 * its left.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, q0 and the flags; r0 is advanced by 16 rows.
 */
yading@10 92 function ff_pred16x16_hor_neon, export=1
/* r2 = left-neighbour byte of the current row, r3 = rows remaining */
yading@10 93 sub r2, r0, #1
yading@10 94 mov r3, #16
/* load one byte replicated into all 16 lanes of q0, then store the row */
yading@10 95 1: vld1.8 {d0[],d1[]},[r2], r1
yading@10 96 vst1.8 {q0}, [r0,:128], r1
yading@10 97 subs r3, r3, #1
yading@10 98 bne 1b
yading@10 99 bx lr
yading@10 100 endfunc
yading@10 101
/*
 * 16x16 vertical fill: every row becomes a copy of the 16 bytes above the
 * block.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r3, q0 and the flags; r0 ends up 16 rows past its entry value.
 */
yading@10 102 function ff_pred16x16_vert_neon, export=1
/* step back one row, load it, and let the post-increment restore r0 */
yading@10 103 sub r0, r0, r1
yading@10 104 vld1.8 {q0}, [r0,:128], r1
/* 8 iterations x 2 stores = 16 rows */
yading@10 105 mov r3, #8
yading@10 106 1: vst1.8 {q0}, [r0,:128], r1
yading@10 107 vst1.8 {q0}, [r0,:128], r1
yading@10 108 subs r3, r3, #1
yading@10 109 bne 1b
yading@10 110 bx lr
yading@10 111 endfunc
yading@10 112
/*
 * 16x16 plane fill: writes a linear gradient derived from the row above and
 * the column to the left of the block, shifted right by 5 and saturated to
 * unsigned bytes.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, q0-q3, q8 and the flags; r0 is advanced by 16 rows.
 *
 * Notation used below: above[k] is the byte at column k of the row above,
 * left[k] the byte at row k of the column to the left (k = -1 is the shared
 * corner). H, V, b, c are names chosen for these comments only.
 */
yading@10 113 function ff_pred16x16_plane_neon, export=1
/* r3 = address of the corner byte, r2 = address of above[8] */
yading@10 114 sub r3, r0, r1
yading@10 115 add r2, r3, #8
yading@10 116 sub r3, r3, #1
/* d0 = above[-1..6], d2 = above[8..15] */
yading@10 117 vld1.8 {d0}, [r3]
yading@10 118 vld1.8 {d2}, [r2,:64], r1
/* d1 = left[-1..6]; the extra add skips row 7; d3 = left[8..15] */
yading@10 119 ldcol.8 d1, r3, r1
yading@10 120 add r3, r3, r1
yading@10 121 ldcol.8 d3, r3, r1
/* byte-reverse d0 and d1 so that lane i holds sample (6 - i) */
yading@10 122 vrev64.8 q0, q0
/* q8 = above[8+i] + left[8+i]; only the lane for i = 7 is used later */
yading@10 123 vaddl.u8 q8, d2, d3
/* q2 = above[8+i] - above[6-i], q3 = left[8+i] - left[6-i], i = 0..7 */
yading@10 124 vsubl.u8 q2, d2, d0
yading@10 125 vsubl.u8 q3, d3, d1
/* q0 = weights 1..8 from p16weight (r3 is used as the load address) */
yading@10 126 movrel r3, p16weight
yading@10 127 vld1.8 {q0}, [r3,:128]
/* weight the differences and reduce: d4 = [H, V, H, V] */
yading@10 128 vmul.s16 q2, q2, q0
yading@10 129 vmul.s16 q3, q3, q0
yading@10 130 vadd.i16 d4, d4, d5
yading@10 131 vadd.i16 d5, d6, d7
yading@10 132 vpadd.i16 d4, d4, d5
yading@10 133 vpadd.i16 d4, d4, d4
/* q2 = 5 * [H, V, H, V] as s32; d4 = that with a rounding shift right by 6, */
/* giving b in lane 0 and c in lane 1 */
yading@10 134 vshll.s16 q3, d4, #2
yading@10 135 vaddw.s16 q2, q3, d4
yading@10 136 vrshrn.s32 d4, q2, #6
yading@10 137 mov r3, #0
/* after the transpose d4[0] = b and d5[0] = c; only lane 0 of the following */
/* D-register results is consumed (by the vdup instructions below) */
yading@10 138 vtrn.16 d4, d5
yading@10 139 vadd.i16 d2, d4, d5
yading@10 140 vshl.i16 d3, d2, #3
/* d16[0] = above[15] + left[15] */
yading@10 141 vrev64.16 d16, d17
/* d3[0] = 7 * (b + c) */
yading@10 142 vsub.i16 d3, d3, d2
/* d0 still holds weights 1,2,3,4, so lane 0 adds the constant 1 */
yading@10 143 vadd.i16 d16, d16, d0
/* d2[0] = 16 * (above[15] + left[15] + 1) - 7 * (b + c) */
yading@10 144 vshl.i16 d2, d16, #4
yading@10 145 vsub.i16 d2, d2, d3
yading@10 146 vshl.i16 d3, d4, #4
/* rotate the weights to 8,1,..,7 and zero lane 0 (r3 = 0): q0 = 0..7 */
yading@10 147 vext.16 q0, q0, q0, #7
/* d6[0] = c - 16 * b */
yading@10 148 vsub.i16 d6, d5, d3
yading@10 149 vmov.16 d0[0], r3
/* q0 = b * [0..7]: the per-column offsets inside an 8-pixel half row */
yading@10 150 vmul.i16 q0, q0, d4[0]
yading@10 151 vdup.16 q1, d2[0]
yading@10 152 vdup.16 q2, d4[0]
yading@10 153 vdup.16 q3, d6[0]
/* q2 = 8 * b, q1 = values for columns 0..7 of row 0, q3 = c - 8 * b */
yading@10 154 vshl.i16 q2, q2, #3
yading@10 155 vadd.i16 q1, q1, q0
yading@10 156 vadd.i16 q3, q3, q2
yading@10 157 mov r3, #16
/* Per row: emit columns 0..7, step q1 by 8*b, emit columns 8..15, then step */
/* q1 by c - 8*b so that it has advanced by c in total for the next row. */
yading@10 158 1:
yading@10 159 vqshrun.s16 d0, q1, #5
yading@10 160 vadd.i16 q1, q1, q2
yading@10 161 vqshrun.s16 d1, q1, #5
yading@10 162 vadd.i16 q1, q1, q3
yading@10 163 vst1.8 {q0}, [r0,:128], r1
yading@10 164 subs r3, r3, #1
yading@10 165 bne 1b
yading@10 166 bx lr
yading@10 167 endfunc
yading@10 168
/*
 * Eight 16-bit weights 1..8. Both plane functions in this file load them
 * with a 128-bit aligned vector load; the 8x8 variant only multiplies by the
 * first four. The exact meaning of align=4 is defined by the const macro
 * from the included asm.S (presumably a power-of-two exponent - confirm).
 */
yading@10 169 const p16weight, align=4
yading@10 170 .short 1,2,3,4,5,6,7,8
yading@10 171 endconst
yading@10 172
/*
 * 8x8 horizontal fill: each row is filled with the byte immediately to its
 * left.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, d0 and the flags; r0 is advanced by 8 rows.
 */
yading@10 173 function ff_pred8x8_hor_neon, export=1
/* r2 = left-neighbour byte of the current row, r3 = rows remaining */
yading@10 174 sub r2, r0, #1
yading@10 175 mov r3, #8
/* load one byte replicated into all 8 lanes of d0, then store the row */
yading@10 176 1: vld1.8 {d0[]}, [r2], r1
yading@10 177 vst1.8 {d0}, [r0,:64], r1
yading@10 178 subs r3, r3, #1
yading@10 179 bne 1b
yading@10 180 bx lr
yading@10 181 endfunc
yading@10 182
/*
 * 8x8 vertical fill: every row becomes a copy of the 8 bytes above the
 * block.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r3, d0 and the flags; r0 ends up 8 rows past its entry value.
 */
yading@10 183 function ff_pred8x8_vert_neon, export=1
/* step back one row, load it, and let the post-increment restore r0 */
yading@10 184 sub r0, r0, r1
yading@10 185 vld1.8 {d0}, [r0,:64], r1
/* 4 iterations x 2 stores = 8 rows */
yading@10 186 mov r3, #4
yading@10 187 1: vst1.8 {d0}, [r0,:64], r1
yading@10 188 vst1.8 {d0}, [r0,:64], r1
yading@10 189 subs r3, r3, #1
yading@10 190 bne 1b
yading@10 191 bx lr
yading@10 192 endfunc
yading@10 193
/*
 * 8x8 plane fill: writes a linear gradient derived from the row above and
 * the column to the left of the block, shifted right by 5 and saturated to
 * unsigned bytes.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Writes r2, r3, q0-q3, q8 and the flags; r0 is advanced by 8 rows.
 *
 * Notation used below: above[k] is the byte at column k of the row above,
 * left[k] the byte at row k of the column to the left (k = -1 is the shared
 * corner). H, V, b, c are names chosen for these comments only.
 */
yading@10 194 function ff_pred8x8_plane_neon, export=1
/* r3 = address of the corner byte, r2 = address of above[4] */
yading@10 195 sub r3, r0, r1
yading@10 196 add r2, r3, #4
yading@10 197 sub r3, r3, #1
/* low word of d0 = above[-1..2], low word of d2 = above[4..7] */
yading@10 198 vld1.32 {d0[0]}, [r3]
yading@10 199 vld1.32 {d2[0]}, [r2,:32], r1
/* high word of d0 = left[-1..2]; skip row 3; low word of d3 = left[4..7] */
yading@10 200 ldcol.8 d0, r3, r1, 4, hi=1
yading@10 201 add r3, r3, r1
yading@10 202 ldcol.8 d3, r3, r1, 4
/* d16 lane i = above[4+i] + left[4+i]; only the lane for i = 3 is used later */
yading@10 203 vaddl.u8 q8, d2, d3
/* byte-reverse each word of d0 so that lane i holds sample (2 - i) */
yading@10 204 vrev32.8 d0, d0
/* d2 = [above[4..7], left[4..7]] */
yading@10 205 vtrn.32 d2, d3
/* d4 = above[4+i] - above[2-i], d5 = left[4+i] - left[2-i], i = 0..3 */
yading@10 206 vsubl.u8 q2, d2, d0
/* q0 = weights 1..8 from p16weight; d0 = 1,2,3,4 is what gets multiplied */
yading@10 207 movrel r3, p16weight
yading@10 208 vld1.16 {q0}, [r3,:128]
yading@10 209 vmul.s16 d4, d4, d0
yading@10 210 vmul.s16 d5, d5, d0
/* reduce to two s32 lanes: d4 = [H, V] */
yading@10 211 vpadd.i16 d4, d4, d5
yading@10 212 vpaddl.s16 d4, d4
/* d4 = 17 * [H, V]; the narrowing then applies a rounding shift right by 5, */
/* leaving b in 16-bit lane 0 and c in lane 1 of d4 */
yading@10 213 vshl.i32 d5, d4, #4
yading@10 214 vadd.s32 d4, d4, d5
yading@10 215 vrshrn.s32 d4, q2, #5
yading@10 216 mov r3, #0
/* after the transpose d4[0] = b and d5[0] = c; only lane 0 of the following */
/* D-register results is consumed (by the vdup instructions below) */
yading@10 217 vtrn.16 d4, d5
yading@10 218 vadd.i16 d2, d4, d5
yading@10 219 vshl.i16 d3, d2, #2
/* d16[0] = above[7] + left[7] */
yading@10 220 vrev64.16 d16, d16
/* d3[0] = 3 * (b + c) */
yading@10 221 vsub.i16 d3, d3, d2
/* d0 still holds weights 1,2,3,4, so lane 0 adds the constant 1 */
yading@10 222 vadd.i16 d16, d16, d0
/* d2[0] = 16 * (above[7] + left[7] + 1) - 3 * (b + c) */
yading@10 223 vshl.i16 d2, d16, #4
yading@10 224 vsub.i16 d2, d2, d3
yading@10 225 vshl.i16 d3, d4, #3
/* rotate the weights to 8,1,..,7 and zero lane 0 (r3 = 0): q0 = 0..7 */
yading@10 226 vext.16 q0, q0, q0, #7
/* d6[0] = c - 8 * b */
yading@10 227 vsub.i16 d6, d5, d3
yading@10 228 vmov.16 d0[0], r3
/* q0 = b * [0..7]: the per-column offsets within a row */
yading@10 229 vmul.i16 q0, q0, d4[0]
yading@10 230 vdup.16 q1, d2[0]
yading@10 231 vdup.16 q2, d4[0]
yading@10 232 vdup.16 q3, d6[0]
/* q2 = 8 * b, q1 = values for row 0, q3 = (c - 8*b) + 8*b = c */
yading@10 233 vshl.i16 q2, q2, #3
yading@10 234 vadd.i16 q1, q1, q0
yading@10 235 vadd.i16 q3, q3, q2
yading@10 236 mov r3, #8
/* Per row: shift, saturate and store 8 pixels, then advance q1 by c. */
yading@10 237 1:
yading@10 238 vqshrun.s16 d0, q1, #5
yading@10 239 vadd.i16 q1, q1, q3
yading@10 240 vst1.8 {d0}, [r0,:64], r1
yading@10 241 subs r3, r3, #1
yading@10 242 bne 1b
yading@10 243 bx lr
yading@10 244 endfunc
yading@10 245
/*
 * Fill an 8x8 block with the constant byte 128.
 * Sets every byte of q0 (d0 and d1) to 128 and branches to the shared store
 * loop at .L_pred8x8_dc_end (inside ff_pred8x8_dc_neon), which writes d0 to
 * rows 0-3 and d1 to rows 4-7 and returns to the caller.
 */
yading@10 246 function ff_pred8x8_128_dc_neon, export=1
yading@10 247 vmov.i8 q0, #128
yading@10 248 b .L_pred8x8_dc_end
yading@10 249 endfunc
yading@10 250
/*
 * 8x8 DC fill using only the row above the block, computed separately for
 * the left and right 4-column halves.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Each half gets the rounded average of the four bytes above it; the same
 * 8-byte pattern is used for all eight rows. Stored by the shared loop at
 * .L_pred8x8_dc_end.
 */
yading@10 251 function ff_pred8x8_top_dc_neon, export=1
/* d0 = the 8 bytes of the row above */
yading@10 252 sub r2, r0, r1
yading@10 253 vld1.8 {d0}, [r2,:64]
/* u16 lanes of d0 = [sum of above[0..3], sum of above[4..7], repeated] */
yading@10 254 vpaddl.u8 d0, d0
yading@10 255 vpadd.u16 d0, d0, d0
/* rounding shift right by 2: byte 0 = left-half DC, byte 1 = right-half DC */
yading@10 256 vrshrn.u16 d0, q0, #2
yading@10 257 vdup.8 d1, d0[1]
yading@10 258 vdup.8 d0, d0[0]
/* interleave words so d0 and d1 both read: 4 x left DC, 4 x right DC */
yading@10 259 vtrn.32 d0, d1
yading@10 260 b .L_pred8x8_dc_end
yading@10 261 endfunc
yading@10 262
/*
 * 8x8 DC fill using only the column left of the block, computed separately
 * for the upper and lower 4-row halves.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Rows 0-3 are filled with the rounded average of left[0..3], rows 4-7 with
 * the rounded average of left[4..7]. Stored by the shared loop at
 * .L_pred8x8_dc_end.
 */
yading@10 263 function ff_pred8x8_left_dc_neon, export=1
/* d0 = the 8 bytes of the column to the left */
yading@10 264 sub r2, r0, #1
yading@10 265 ldcol.8 d0, r2, r1
/* u16 lanes of d0 = [sum of left[0..3], sum of left[4..7], repeated] */
yading@10 266 vpaddl.u8 d0, d0
yading@10 267 vpadd.u16 d0, d0, d0
/* rounding shift right by 2: byte 0 = upper DC, byte 1 = lower DC */
yading@10 268 vrshrn.u16 d0, q0, #2
/* d1 (rows 4-7) = lower DC, d0 (rows 0-3) = upper DC */
yading@10 269 vdup.8 d1, d0[1]
yading@10 270 vdup.8 d0, d0[0]
yading@10 271 b .L_pred8x8_dc_end
yading@10 272 endfunc
yading@10 273
/*
 * 8x8 DC fill using the row above and the column to the left, with one DC
 * value per 4x4 quadrant.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * With T0/T1 = sums of above[0..3]/above[4..7] and L0/L1 = sums of
 * left[0..3]/left[4..7] (names used in these comments only):
 *   top-left     = (T0 + L0) with a rounding shift right by 3
 *   top-right    = T1 with a rounding shift right by 2
 *   bottom-left  = L1 with a rounding shift right by 2
 *   bottom-right = (T1 + L1) with a rounding shift right by 3
 * Writes r2, r3, q0-q2 and the flags; r0 is advanced by 4 rows.
 *
 * The tail starting at .L_pred8x8_dc_end is shared: the other 8x8 DC entry
 * points in this file branch to it with rows 0-3 in d0 and rows 4-7 in d1.
 */
yading@10 274 function ff_pred8x8_dc_neon, export=1
/* d0 = row above, d1 = column to the left */
yading@10 275 sub r2, r0, r1
yading@10 276 vld1.8 {d0}, [r2,:64]
yading@10 277 sub r2, r0, #1
yading@10 278 ldcol.8 d1, r2, r1
/* d0 = [above[0..3], left[0..3]], d1 = [above[4..7], left[4..7]] */
yading@10 279 vtrn.32 d0, d1
/* reduce: d0 = [T0, L0, T1, L1], d1 = [T0+L0, T1+L1, T0+L0, T1+L1] (u16) */
yading@10 280 vpaddl.u8 q0, q0
yading@10 281 vpadd.u16 d0, d0, d1
yading@10 282 vpadd.u16 d1, d0, d0
/* d2 = every lane with rounding shift by 3, d3 = same with shift by 2 */
yading@10 283 vrshrn.u16 d2, q0, #3
yading@10 284 vrshrn.u16 d3, q0, #2
/* d0 = top-left, d1 = bottom-left, d4 = top-right, d5 = bottom-right */
yading@10 285 vdup.8 d0, d2[4]
yading@10 286 vdup.8 d1, d3[3]
yading@10 287 vdup.8 d4, d3[2]
yading@10 288 vdup.8 d5, d2[5]
/* merge words: d0 = [top-left, top-right], d1 = [bottom-left, bottom-right] */
yading@10 289 vtrn.32 q0, q2
/* Shared store loop: r0 walks rows 0-3 (d0), r2 walks rows 4-7 (d1). */
yading@10 290 .L_pred8x8_dc_end:
yading@10 291 mov r3, #4
yading@10 292 add r2, r0, r1, lsl #2
yading@10 293 6: vst1.8 {d0}, [r0,:64], r1
yading@10 294 vst1.8 {d1}, [r2,:64], r1
yading@10 295 subs r3, r3, #1
yading@10 296 bne 6b
yading@10 297 bx lr
yading@10 298 endfunc
yading@10 299
/*
 * 8x8 DC fill variant that reads the row above and only rows 0-3 of the
 * left column.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * With T0/T1 = sums of above[0..3]/above[4..7] and L0 = sum of left[0..3]
 * (names used in these comments only):
 *   top-left     = (T0 + L0) with a rounding shift right by 3
 *   bottom-left  = T0 with a rounding shift right by 2
 *   both right quadrants = T1 with a rounding shift right by 2
 * Stored by the shared loop at .L_pred8x8_dc_end.
 */
yading@10 300 function ff_pred8x8_l0t_dc_neon, export=1
/* d0 = row above; lanes 0-3 of d1 = left[0..3] (lanes 4-7 are not loaded) */
yading@10 301 sub r2, r0, r1
yading@10 302 vld1.8 {d0}, [r2,:64]
yading@10 303 sub r2, r0, #1
yading@10 304 ldcol.8 d1, r2, r1, 4
/* d0 = [above[0..3], left[0..3]], d1 = [above[4..7], unloaded] */
yading@10 305 vtrn.32 d0, d1
/* u16 lanes: d0 = [T0, L0, T1, x], d1[0] = T0 + L0; x lanes are never used */
yading@10 306 vpaddl.u8 q0, q0
yading@10 307 vpadd.u16 d0, d0, d1
yading@10 308 vpadd.u16 d1, d0, d0
/* d2 = every lane with rounding shift by 3, d3 = same with shift by 2 */
yading@10 309 vrshrn.u16 d2, q0, #3
yading@10 310 vrshrn.u16 d3, q0, #2
/* d0 = top-left, d1 = bottom-left, q2 = value for both right quadrants */
yading@10 311 vdup.8 d0, d2[4]
yading@10 312 vdup.8 d1, d3[0]
yading@10 313 vdup.8 q2, d3[2]
/* merge words: d0 = [top-left, top-right], d1 = [bottom-left, bottom-right] */
yading@10 314 vtrn.32 q0, q2
yading@10 315 b .L_pred8x8_dc_end
yading@10 316 endfunc
yading@10 317
/*
 * 8x8 DC fill variant that reads only rows 0-3 of the left column.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Rows 0-3 are filled with the rounded average of left[0..3]; rows 4-7 are
 * filled with the constant 128. Stored by the shared loop at
 * .L_pred8x8_dc_end.
 */
yading@10 318 function ff_pred8x8_l00_dc_neon, export=1
/* lanes 0-3 of d0 = left[0..3] (lanes 4-7 are not loaded) */
yading@10 319 sub r2, r0, #1
yading@10 320 ldcol.8 d0, r2, r1, 4
/* u16 lane 0 of d0 = sum of left[0..3] */
yading@10 321 vpaddl.u8 d0, d0
yading@10 322 vpadd.u16 d0, d0, d0
/* rounding shift right by 2; only byte 0 of the result is used */
yading@10 323 vrshrn.u16 d0, q0, #2
/* d1 (rows 4-7) = 128, d0 (rows 0-3) = the DC byte */
yading@10 324 vmov.i8 d1, #128
yading@10 325 vdup.8 d0, d0[0]
yading@10 326 b .L_pred8x8_dc_end
yading@10 327 endfunc
yading@10 328
/*
 * 8x8 DC fill variant that reads the row above and only rows 4-7 of the
 * left column.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * With T0/T1 = sums of above[0..3]/above[4..7] and L1 = sum of left[4..7]
 * (names used in these comments only):
 *   top-left     = T0 with a rounding shift right by 2
 *   top-right    = T1 with a rounding shift right by 2
 *   bottom-left  = L1 with a rounding shift right by 2
 *   bottom-right = (T1 + L1) with a rounding shift right by 3
 * Stored by the shared loop at .L_pred8x8_dc_end.
 */
yading@10 329 function ff_pred8x8_0lt_dc_neon, export=1
/* d0 = row above */
yading@10 330 sub r2, r0, r1
yading@10 331 vld1.8 {d0}, [r2,:64]
/* r2 = left neighbour of row 4; lanes 4-7 of d1 = left[4..7] */
yading@10 332 add r2, r0, r1, lsl #2
yading@10 333 sub r2, r2, #1
yading@10 334 ldcol.8 d1, r2, r1, 4, hi=1
/* d0 = [above[0..3], unloaded], d1 = [above[4..7], left[4..7]] */
yading@10 335 vtrn.32 d0, d1
/* u16 lanes: d0 = [T0, x, T1, L1], d1[1] = T1 + L1; x lanes are never used */
yading@10 336 vpaddl.u8 q0, q0
yading@10 337 vpadd.u16 d0, d0, d1
yading@10 338 vpadd.u16 d1, d0, d0
/* d3 = every lane with rounding shift by 2, d2 = same with shift by 3 */
yading@10 339 vrshrn.u16 d3, q0, #2
yading@10 340 vrshrn.u16 d2, q0, #3
/* d0 = top-left, d1 = bottom-left, d4 = top-right, d5 = bottom-right */
yading@10 341 vdup.8 d0, d3[0]
yading@10 342 vdup.8 d1, d3[3]
yading@10 343 vdup.8 d4, d3[2]
yading@10 344 vdup.8 d5, d2[5]
/* merge words: d0 = [top-left, top-right], d1 = [bottom-left, bottom-right] */
yading@10 345 vtrn.32 q0, q2
yading@10 346 b .L_pred8x8_dc_end
yading@10 347 endfunc
yading@10 348
/*
 * 8x8 DC fill variant that reads only rows 4-7 of the left column.
 * In: r0 = address of the first output row, r1 = byte step between rows.
 * Rows 0-3 are filled with the constant 128; rows 4-7 are filled with the
 * rounded average of left[4..7]. Stored by the shared loop at
 * .L_pred8x8_dc_end.
 */
yading@10 349 function ff_pred8x8_0l0_dc_neon, export=1
/* r2 = left neighbour of row 4; lanes 0-3 of d1 = left[4..7] */
yading@10 350 add r2, r0, r1, lsl #2
yading@10 351 sub r2, r2, #1
yading@10 352 ldcol.8 d1, r2, r1, 4
/* u16 lane 0 of d2 = sum of left[4..7] */
yading@10 353 vpaddl.u8 d2, d1
yading@10 354 vpadd.u16 d2, d2, d2
/* rounding shift right by 2; only byte 0 of the result is used */
yading@10 355 vrshrn.u16 d1, q1, #2
/* d0 (rows 0-3) = 128, d1 (rows 4-7) = the DC byte */
yading@10 356 vmov.i8 d0, #128
yading@10 357 vdup.8 d1, d1[0]
yading@10 358 b .L_pred8x8_dc_end
yading@10 359 endfunc
yading@10 359 endfunc