annotate ffmpeg/libavcodec/arm/h264cmc_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
yading@10 3 *
yading@10 4 * This file is part of Libav.
yading@10 5 *
yading@10 6 * Libav is free software; you can redistribute it and/or
yading@10 7 * modify it under the terms of the GNU Lesser General Public
yading@10 8 * License as published by the Free Software Foundation; either
yading@10 9 * version 2.1 of the License, or (at your option) any later version.
yading@10 10 *
yading@10 11 * Libav is distributed in the hope that it will be useful,
yading@10 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 14 * Lesser General Public License for more details.
yading@10 15 *
yading@10 16 * You should have received a copy of the GNU Lesser General Public
yading@10 17 * License along with Libav; if not, write to the Free Software
yading@10 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 19 */
yading@10 20
yading@10 21 #include "libavutil/arm/asm.S"
yading@10 22
yading@10 23 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
yading@10 24 .macro h264_chroma_mc8 type, codec=h264
@ 8-pixel-wide chroma motion compensation, bilinear interpolation.
@ Weights (from x = r4, y = r5, each 0..7):
@   A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
@ Each output byte = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1]) >> 6,
@ round-to-nearest for h264 (vrshrn), or bias-from-table + truncate for rv40.
@ Args: r0 = dst, r1 = src, r2 = stride, r3 = h; stack: x, y.
@ \type is put or avg; avg additionally averages the result into dst.
yading@10 25 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
yading@10 26 push {r4-r7, lr}
yading@10 27 ldrd r4, r5, [sp, #20] @ r4 = x, r5 = y (stacked args)
yading@10 28 .ifc \type,avg
yading@10 29 mov lr, r0 @ second dst pointer for the averaging loads
yading@10 30 .endif
yading@10 31 pld [r1]
yading@10 32 pld [r1, r2]
yading@10 33
yading@10 34 .ifc \codec,rv40
@ rv40 rounding bias: rv40bias[(y>>1)*4 + (x>>1)] (16-bit entries,
@ 8 bytes per table row), broadcast into q11 for use before vshrn.
yading@10 35 movrel r6, rv40bias
yading@10 36 lsr r7, r5, #1
yading@10 37 add r6, r6, r7, lsl #3 @ + (y>>1) * 8 bytes (one row)
yading@10 38 lsr r7, r4, #1
yading@10 39 add r6, r6, r7, lsl #1 @ + (x>>1) * 2 bytes (one entry)
yading@10 40 vld1.16 {d22[],d23[]}, [r6,:16]
yading@10 41 .endif
yading@10 42
yading@10 43 A muls r7, r4, r5 @ r7 = D = x*y; ARM mode: flags from muls
yading@10 44 T mul r7, r4, r5 @ Thumb mode: mul + explicit cmp instead
yading@10 45 T cmp r7, #0
yading@10 46 rsb r6, r7, r5, lsl #3 @ r6 = 8y - xy = C = (8-x)*y
yading@10 47 rsb r12, r7, r4, lsl #3 @ r12 = 8x - xy = B = x*(8-y)
yading@10 48 sub r4, r7, r4, lsl #3 @ r4 = xy - 8x
yading@10 49 sub r4, r4, r5, lsl #3 @ r4 = xy - 8x - 8y
yading@10 50 add r4, r4, #64 @ r4 = A = (8-x)*(8-y)
yading@10 51
yading@10 52 beq 2f @ x*y == 0: fall back to 2-tap / copy paths
yading@10 53
@ Full 2-D path: 4-tap per pixel, two output rows per iteration.
yading@10 54 vdup.8 d0, r4 @ d0 = A in all lanes
yading@10 55 vdup.8 d1, r12 @ d1 = B
yading@10 56 vld1.8 {d4, d5}, [r1], r2
yading@10 57 vdup.8 d2, r6 @ d2 = C
yading@10 58 vdup.8 d3, r7 @ d3 = D
yading@10 59 vext.8 d5, d4, d5, #1 @ d5 = same row shifted by one (src+1)
yading@10 60
yading@10 61 1: vld1.8 {d6, d7}, [r1], r2
yading@10 62 vmull.u8 q8, d4, d0 @ q8 = A*row[n]
yading@10 63 vmlal.u8 q8, d5, d1 @ + B*row[n]+1
yading@10 64 vext.8 d7, d6, d7, #1
yading@10 65 vld1.8 {d4, d5}, [r1], r2
yading@10 66 vmlal.u8 q8, d6, d2 @ + C*row[n+1]
yading@10 67 pld [r1]
yading@10 68 vext.8 d5, d4, d5, #1
yading@10 69 vmlal.u8 q8, d7, d3 @ + D*row[n+1]+1
yading@10 70 vmull.u8 q9, d6, d0 @ q9 = same accumulation for row n+1
yading@10 71 subs r3, r3, #2 @ two output rows per pass
yading@10 72 vmlal.u8 q9, d7, d1
yading@10 73 vmlal.u8 q9, d4, d2
yading@10 74 vmlal.u8 q9, d5, d3
yading@10 75 pld [r1, r2]
yading@10 76 .ifc \codec,h264
yading@10 77 vrshrn.u16 d16, q8, #6 @ h264: round-to-nearest then >>6
yading@10 78 vrshrn.u16 d17, q9, #6
yading@10 79 .else
yading@10 80 vadd.u16 q8, q8, q11 @ rv40: add table bias, truncating shift
yading@10 81 vadd.u16 q9, q9, q11
yading@10 82 vshrn.u16 d16, q8, #6
yading@10 83 vshrn.u16 d17, q9, #6
yading@10 84 .endif
yading@10 85 .ifc \type,avg
yading@10 86 vld1.8 {d20}, [lr,:64], r2 @ avg: rounding-halving add with dst
yading@10 87 vld1.8 {d21}, [lr,:64], r2
yading@10 88 vrhadd.u8 q8, q8, q10
yading@10 89 .endif
yading@10 90 vst1.8 {d16}, [r0,:64], r2
yading@10 91 vst1.8 {d17}, [r0,:64], r2
yading@10 92 bgt 1b
yading@10 93
yading@10 94 pop {r4-r7, pc}
yading@10 95
@ x*y == 0: at most one of B/C is non-zero, so a 2-tap filter is
@ enough. Fold B and C into one secondary weight in r12.
yading@10 96 2: tst r6, r6 @ C == 0 iff y == 0 (horizontal-only case)
yading@10 97 add r12, r12, r6
yading@10 98 vdup.8 d0, r4 @ d0 = A
yading@10 99 vdup.8 d1, r12 @ d1 = B + C (one of them is zero)
yading@10 100
yading@10 101 beq 4f
yading@10 102
@ Vertical-only loop (x == 0): taps on src[0] and src[stride].
yading@10 103 vld1.8 {d4}, [r1], r2
yading@10 104
yading@10 105 3: vld1.8 {d6}, [r1], r2
yading@10 106 vmull.u8 q8, d4, d0
yading@10 107 vmlal.u8 q8, d6, d1
yading@10 108 vld1.8 {d4}, [r1], r2 @ d4 reused: becomes top tap of next pass
yading@10 109 vmull.u8 q9, d6, d0
yading@10 110 vmlal.u8 q9, d4, d1
yading@10 111 pld [r1]
yading@10 112 .ifc \codec,h264
yading@10 113 vrshrn.u16 d16, q8, #6
yading@10 114 vrshrn.u16 d17, q9, #6
yading@10 115 .else
yading@10 116 vadd.u16 q8, q8, q11
yading@10 117 vadd.u16 q9, q9, q11
yading@10 118 vshrn.u16 d16, q8, #6
yading@10 119 vshrn.u16 d17, q9, #6
yading@10 120 .endif
yading@10 121 pld [r1, r2]
yading@10 122 .ifc \type,avg
yading@10 123 vld1.8 {d20}, [lr,:64], r2
yading@10 124 vld1.8 {d21}, [lr,:64], r2
yading@10 125 vrhadd.u8 q8, q8, q10
yading@10 126 .endif
yading@10 127 subs r3, r3, #2
yading@10 128 vst1.8 {d16}, [r0,:64], r2
yading@10 129 vst1.8 {d17}, [r0,:64], r2
yading@10 130 bgt 3b
yading@10 131
yading@10 132 pop {r4-r7, pc}
yading@10 133
@ Horizontal-only loop (y == 0): taps on src[0] and src[1]. Also
@ covers x == y == 0, where A == 64 and B == 0 (plain copy/round).
yading@10 134 4: vld1.8 {d4, d5}, [r1], r2
yading@10 135 vld1.8 {d6, d7}, [r1], r2
yading@10 136 vext.8 d5, d4, d5, #1
yading@10 137 vext.8 d7, d6, d7, #1
yading@10 138 pld [r1]
yading@10 139 subs r3, r3, #2
yading@10 140 vmull.u8 q8, d4, d0
yading@10 141 vmlal.u8 q8, d5, d1
yading@10 142 vmull.u8 q9, d6, d0
yading@10 143 vmlal.u8 q9, d7, d1
yading@10 144 pld [r1, r2]
yading@10 145 .ifc \codec,h264
yading@10 146 vrshrn.u16 d16, q8, #6
yading@10 147 vrshrn.u16 d17, q9, #6
yading@10 148 .else
yading@10 149 vadd.u16 q8, q8, q11
yading@10 150 vadd.u16 q9, q9, q11
yading@10 151 vshrn.u16 d16, q8, #6
yading@10 152 vshrn.u16 d17, q9, #6
yading@10 153 .endif
yading@10 154 .ifc \type,avg
yading@10 155 vld1.8 {d20}, [lr,:64], r2
yading@10 156 vld1.8 {d21}, [lr,:64], r2
yading@10 157 vrhadd.u8 q8, q8, q10
yading@10 158 .endif
yading@10 159 vst1.8 {d16}, [r0,:64], r2
yading@10 160 vst1.8 {d17}, [r0,:64], r2
yading@10 161 bgt 4b
yading@10 162
yading@10 163 pop {r4-r7, pc}
yading@10 164 endfunc
yading@10 165 .endm
yading@10 166
yading@10 167 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
yading@10 168 .macro h264_chroma_mc4 type, codec=h264
@ 4-pixel-wide chroma MC, same bilinear weights as mc8 (A, B, C, D
@ derived identically from x = r4, y = r5). To keep all 8 multiply
@ lanes busy, each d register packs a 4-pixel group and its +1-shifted
@ copy side by side (vtrn.32), with the weights interleaved to match;
@ vadd.i16 then folds the two halves back into one 4-pixel result.
@ Args: r0 = dst, r1 = src, r2 = stride, r3 = h; stack: x, y.
yading@10 169 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
yading@10 170 push {r4-r7, lr}
yading@10 171 ldrd r4, r5, [sp, #20] @ r4 = x, r5 = y
yading@10 172 .ifc \type,avg
yading@10 173 mov lr, r0 @ second dst pointer for averaging loads
yading@10 174 .endif
yading@10 175 pld [r1]
yading@10 176 pld [r1, r2]
yading@10 177
yading@10 178 .ifc \codec,rv40
@ rv40 bias: rv40bias[(y>>1)*4 + (x>>1)], broadcast into q11.
yading@10 179 movrel r6, rv40bias
yading@10 180 lsr r7, r5, #1
yading@10 181 add r6, r6, r7, lsl #3
yading@10 182 lsr r7, r4, #1
yading@10 183 add r6, r6, r7, lsl #1
yading@10 184 vld1.16 {d22[],d23[]}, [r6,:16]
yading@10 185 .endif
yading@10 186
yading@10 187 A muls r7, r4, r5 @ r7 = D = x*y (ARM: sets flags)
yading@10 188 T mul r7, r4, r5 @ Thumb: mul + explicit cmp
yading@10 189 T cmp r7, #0
yading@10 190 rsb r6, r7, r5, lsl #3 @ r6 = C = (8-x)*y
yading@10 191 rsb r12, r7, r4, lsl #3 @ r12 = B = x*(8-y)
yading@10 192 sub r4, r7, r4, lsl #3
yading@10 193 sub r4, r4, r5, lsl #3
yading@10 194 add r4, r4, #64 @ r4 = A = (8-x)*(8-y)
yading@10 195
yading@10 196 beq 2f @ x*y == 0: 2-tap / copy paths
yading@10 197
yading@10 198 vdup.8 d0, r4 @ d0 = A
yading@10 199 vdup.8 d1, r12 @ d1 = B
yading@10 200 vld1.8 {d4}, [r1], r2
yading@10 201 vdup.8 d2, r6 @ d2 = C
yading@10 202 vdup.8 d3, r7 @ d3 = D
yading@10 203
yading@10 204 vext.8 d5, d4, d5, #1 @ d5 = row shifted by one byte
yading@10 205 vtrn.32 d4, d5 @ d4 = {src[0..3], src[1..4]} packed
yading@10 206
@ Interleave the weights the same way: d0 = {A...,B...}, d2 = {C...,D...}
yading@10 207 vtrn.32 d0, d1
yading@10 208 vtrn.32 d2, d3
yading@10 209
yading@10 210 1: vld1.8 {d6}, [r1], r2
yading@10 211 vext.8 d7, d6, d7, #1
yading@10 212 vtrn.32 d6, d7 @ next row packed the same way
yading@10 213 vmull.u8 q8, d4, d0 @ A,B taps of row n
yading@10 214 vmlal.u8 q8, d6, d2 @ + C,D taps of row n+1
yading@10 215 vld1.8 {d4}, [r1], r2
yading@10 216 vext.8 d5, d4, d5, #1
yading@10 217 vtrn.32 d4, d5
yading@10 218 pld [r1]
yading@10 219 vmull.u8 q9, d6, d0 @ same for the second output row
yading@10 220 vmlal.u8 q9, d4, d2
yading@10 221 vadd.i16 d16, d16, d17 @ fold packed halves: row n sums in d16
yading@10 222 vadd.i16 d17, d18, d19 @ row n+1 sums in d17
yading@10 223 .ifc \codec,h264
yading@10 224 vrshrn.u16 d16, q8, #6 @ h264: round-to-nearest >>6
yading@10 225 .else
yading@10 226 vadd.u16 q8, q8, q11 @ rv40: bias + truncating shift
yading@10 227 vshrn.u16 d16, q8, #6
yading@10 228 .endif
yading@10 229 subs r3, r3, #2 @ two output rows per pass
yading@10 230 pld [r1, r2]
yading@10 231 .ifc \type,avg
yading@10 232 vld1.32 {d20[0]}, [lr,:32], r2
yading@10 233 vld1.32 {d20[1]}, [lr,:32], r2
yading@10 234 vrhadd.u8 d16, d16, d20 @ avg: rounding average with dst
yading@10 235 .endif
yading@10 236 vst1.32 {d16[0]}, [r0,:32], r2
yading@10 237 vst1.32 {d16[1]}, [r0,:32], r2
yading@10 238 bgt 1b
yading@10 239
yading@10 240 pop {r4-r7, pc}
yading@10 241
@ x*y == 0: 2-tap filter; fold B and C into one weight (one is zero).
yading@10 242 2: tst r6, r6 @ C == 0 iff y == 0 (horizontal-only)
yading@10 243 add r12, r12, r6
yading@10 244 vdup.8 d0, r4
yading@10 245 vdup.8 d1, r12
yading@10 246 vtrn.32 d0, d1 @ pack the two weights per register
yading@10 247
yading@10 248 beq 4f
yading@10 249
@ Vertical-only loop (x == 0): two rows ride in one d register.
yading@10 250 vext.32 d1, d0, d1, #1 @ d1 = weights swapped for the odd rows
yading@10 251 vld1.32 {d4[0]}, [r1], r2
yading@10 252
yading@10 253 3: vld1.32 {d4[1]}, [r1], r2
yading@10 254 vmull.u8 q8, d4, d0
yading@10 255 vld1.32 {d4[0]}, [r1], r2
yading@10 256 vmull.u8 q9, d4, d1
yading@10 257 vadd.i16 d16, d16, d17 @ combine the two taps of each row
yading@10 258 vadd.i16 d17, d18, d19
yading@10 259 pld [r1]
yading@10 260 .ifc \codec,h264
yading@10 261 vrshrn.u16 d16, q8, #6
yading@10 262 .else
yading@10 263 vadd.u16 q8, q8, q11
yading@10 264 vshrn.u16 d16, q8, #6
yading@10 265 .endif
yading@10 266 .ifc \type,avg
yading@10 267 vld1.32 {d20[0]}, [lr,:32], r2
yading@10 268 vld1.32 {d20[1]}, [lr,:32], r2
yading@10 269 vrhadd.u8 d16, d16, d20
yading@10 270 .endif
yading@10 271 subs r3, r3, #2
yading@10 272 pld [r1, r2]
yading@10 273 vst1.32 {d16[0]}, [r0,:32], r2
yading@10 274 vst1.32 {d16[1]}, [r0,:32], r2
yading@10 275 bgt 3b
yading@10 276
yading@10 277 pop {r4-r7, pc}
yading@10 278
@ Horizontal-only loop (y == 0); also the x == y == 0 copy case.
yading@10 279 4: vld1.8 {d4}, [r1], r2
yading@10 280 vld1.8 {d6}, [r1], r2
yading@10 281 vext.8 d5, d4, d5, #1
yading@10 282 vext.8 d7, d6, d7, #1
yading@10 283 vtrn.32 d4, d5 @ pack {src[0..3], src[1..4]}
yading@10 284 vtrn.32 d6, d7
yading@10 285 vmull.u8 q8, d4, d0
yading@10 286 vmull.u8 q9, d6, d0
yading@10 287 subs r3, r3, #2
yading@10 288 vadd.i16 d16, d16, d17 @ fold packed halves per row
yading@10 289 vadd.i16 d17, d18, d19
yading@10 290 pld [r1]
yading@10 291 .ifc \codec,h264
yading@10 292 vrshrn.u16 d16, q8, #6
yading@10 293 .else
yading@10 294 vadd.u16 q8, q8, q11
yading@10 295 vshrn.u16 d16, q8, #6
yading@10 296 .endif
yading@10 297 .ifc \type,avg
yading@10 298 vld1.32 {d20[0]}, [lr,:32], r2
yading@10 299 vld1.32 {d20[1]}, [lr,:32], r2
yading@10 300 vrhadd.u8 d16, d16, d20
yading@10 301 .endif
yading@10 302 pld [r1]
yading@10 303 vst1.32 {d16[0]}, [r0,:32], r2
yading@10 304 vst1.32 {d16[1]}, [r0,:32], r2
yading@10 305 bgt 4b
yading@10 306
yading@10 307 pop {r4-r7, pc}
yading@10 308 endfunc
yading@10 309 .endm
yading@10 310
yading@10 311 .macro h264_chroma_mc2 type
@ 2-pixel-wide chroma MC (H.264 only — no rv40 variant). Uses the
@ same bilinear weights A, B, C, D as mc8; weights and pixels are
@ interleaved (vtrn) so a single vmull/vmlal pair covers two output
@ rows of two pixels each. The zero-vector case short-circuits to a
@ plain copy (put) or a rounding average with dst (avg).
@ Args: r0 = dst, r1 = src, r2 = stride, r3 = h; stack: x, y.
yading@10 312 function ff_\type\()_h264_chroma_mc2_neon, export=1
yading@10 313 push {r4-r6, lr}
yading@10 314 ldr r4, [sp, #16] @ r4 = x
yading@10 315 ldr lr, [sp, #20] @ lr = y
yading@10 316 pld [r1]
yading@10 317 pld [r1, r2]
yading@10 318 orrs r5, r4, lr
yading@10 319 beq 2f @ x == y == 0: no filtering needed
yading@10 320
yading@10 321 mul r5, r4, lr @ r5 = D = x*y
yading@10 322 rsb r6, r5, lr, lsl #3 @ r6 = C = (8-x)*y
yading@10 323 rsb r12, r5, r4, lsl #3 @ r12 = B = x*(8-y)
yading@10 324 sub r4, r5, r4, lsl #3
yading@10 325 sub r4, r4, lr, lsl #3
yading@10 326 add r4, r4, #64 @ r4 = A = (8-x)*(8-y)
yading@10 327 vdup.8 d0, r4 @ q0 = {A, C} halves before interleave
yading@10 328 vdup.8 d2, r12 @ q1 = {B, D} halves
yading@10 329 vdup.8 d1, r6
yading@10 330 vdup.8 d3, r5
yading@10 331 vtrn.16 q0, q1 @ interleave weights to match pixel layout
yading@10 332 1:
yading@10 333 vld1.32 {d4[0]}, [r1], r2 @ d4 = {row n, row n+1}
yading@10 334 vld1.32 {d4[1]}, [r1], r2
yading@10 335 vrev64.32 d5, d4 @ d5 starts as {row n+1, row n}
yading@10 336 vld1.32 {d5[1]}, [r1] @ d5 = {row n+1, row n+2}
yading@10 337 vext.8 q3, q2, q2, #1 @ q3 = rows shifted by one pixel
yading@10 338 vtrn.16 q2, q3 @ pair each pixel with its neighbour
yading@10 339 vmull.u8 q8, d4, d0 @ A,B taps (top rows)
yading@10 340 vmlal.u8 q8, d5, d1 @ + C,D taps (rows below)
yading@10 341 .ifc \type,avg
yading@10 342 vld1.16 {d18[0]}, [r0,:16], r2
yading@10 343 vld1.16 {d18[1]}, [r0,:16]
yading@10 344 sub r0, r0, r2 @ rewind dst after the averaging loads
yading@10 345 .endif
yading@10 346 vtrn.32 d16, d17 @ regroup partial products per output pixel
yading@10 347 vadd.i16 d16, d16, d17
yading@10 348 vrshrn.u16 d16, q8, #6 @ round-to-nearest >>6
yading@10 349 .ifc \type,avg
yading@10 350 vrhadd.u8 d16, d16, d18
yading@10 351 .endif
yading@10 352 vst1.16 {d16[0]}, [r0,:16], r2
yading@10 353 vst1.16 {d16[1]}, [r0,:16], r2
yading@10 354 subs r3, r3, #2 @ two output rows per pass
yading@10 355 bgt 1b
yading@10 356 pop {r4-r6, pc}
yading@10 357 2:
yading@10 358 .ifc \type,put
@ Zero-vector put: plain 16-bit copies through core registers.
yading@10 359 ldrh_post r5, r1, r2
yading@10 360 strh_post r5, r0, r2
yading@10 361 ldrh_post r6, r1, r2
yading@10 362 strh_post r6, r0, r2
yading@10 363 .else
@ Zero-vector avg: rounding average of src and dst, two rows at once.
yading@10 364 vld1.16 {d16[0]}, [r1], r2
yading@10 365 vld1.16 {d16[1]}, [r1], r2
yading@10 366 vld1.16 {d18[0]}, [r0,:16], r2
yading@10 367 vld1.16 {d18[1]}, [r0,:16]
yading@10 368 sub r0, r0, r2
yading@10 369 vrhadd.u8 d16, d16, d18
yading@10 370 vst1.16 {d16[0]}, [r0,:16], r2
yading@10 371 vst1.16 {d16[1]}, [r0,:16], r2
yading@10 372 .endif
yading@10 373 subs r3, r3, #2
yading@10 374 bgt 2b
yading@10 375 pop {r4-r6, pc}
yading@10 376 endfunc
yading@10 377 .endm
yading@10 378
yading@10 379 #if CONFIG_H264_DECODER
@ H.264 variants: default round-to-nearest behaviour, no bias table.
yading@10 380 h264_chroma_mc8 put
yading@10 381 h264_chroma_mc8 avg
yading@10 382 h264_chroma_mc4 put
yading@10 383 h264_chroma_mc4 avg
yading@10 384 h264_chroma_mc2 put
yading@10 385 h264_chroma_mc2 avg
yading@10 386 #endif
yading@10 387
yading@10 388 #if CONFIG_RV40_DECODER
@ RV40 rounding-bias table: 4 rows x 4 halfwords, indexed as
@ rv40bias[(y>>1)*4 + (x>>1)] by the mc8/mc4 macros. The chosen value
@ is added to the accumulator before the truncating >>6, replacing
@ H.264's uniform round-to-nearest. No mc2 variant exists for rv40.
yading@10 389 const rv40bias
yading@10 390 .short 0, 16, 32, 16
yading@10 391 .short 32, 28, 32, 28
yading@10 392 .short 0, 32, 16, 32
yading@10 393 .short 32, 28, 32, 28
yading@10 394 endconst
yading@10 395
yading@10 396 h264_chroma_mc8 put, rv40
yading@10 397 h264_chroma_mc8 avg, rv40
yading@10 398 h264_chroma_mc4 put, rv40
yading@10 399 h264_chroma_mc4 avg, rv40
yading@10 400 #endif