annotate ffmpeg/libavcodec/arm/dsputil_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * ARM NEON optimised DSP functions
yading@10 3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
yading@10 4 *
yading@10 5 * This file is part of FFmpeg.
yading@10 6 *
yading@10 7 * FFmpeg is free software; you can redistribute it and/or
yading@10 8 * modify it under the terms of the GNU Lesser General Public
yading@10 9 * License as published by the Free Software Foundation; either
yading@10 10 * version 2.1 of the License, or (at your option) any later version.
yading@10 11 *
yading@10 12 * FFmpeg is distributed in the hope that it will be useful,
yading@10 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 * Lesser General Public License for more details.
yading@10 16 *
yading@10 17 * You should have received a copy of the GNU Lesser General Public
yading@10 18 * License along with FFmpeg; if not, write to the Free Software
yading@10 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 */
yading@10 21
yading@10 22 #include "libavutil/arm/asm.S"
yading@10 23
/*
 * ff_clear_block_neon
 *
 * Writes 128 zero bytes (64 16-bit elements) starting at the address in r0.
 *
 *   r0  destination; every store carries a :128 alignment hint, so the
 *       address must be 16-byte aligned
 *
 * Clobbers q0.  r0 is left pointing 128 bytes past its initial value.
 */
function ff_clear_block_neon, export=1
        vmov.i8         q0,  #0                 @ 128-bit vector of zeros
        .rept           8                       @ 8 stores x 16 bytes = 128 bytes
        vst1.16         {d0-d1},  [r0,:128]!
        .endr
        bx              lr
endfunc
yading@10 31
/*
 * ff_clear_blocks_neon
 *
 * Writes 768 zero bytes starting at the address in r0, i.e. six
 * consecutive 128-byte blocks (6 * 8 stores of 16 bytes each).
 *
 *   r0  destination; every store carries a :128 alignment hint, so the
 *       address must be 16-byte aligned
 *
 * Clobbers q0.  r0 is left pointing 768 bytes past its initial value.
 */
function ff_clear_blocks_neon, export=1
        vmov.i8         q0,  #0                 @ 128-bit vector of zeros
        .rept           6*8                     @ 48 stores x 16 bytes = 768 bytes
        vst1.16         {d0-d1},  [r0,:128]!
        .endr
        bx              lr
endfunc
yading@10 39
/*
 * ff_put_pixels_clamped_neon
 *
 * Reads 64 signed 16-bit values (8 rows of 8) from r0, saturates each one
 * to the unsigned 8-bit range 0..255 and writes the result as 8 rows of
 * 8 bytes.
 *
 *   r0  source coefficients, 128 bytes, 16-byte aligned (:128 hint)
 *   r1  destination pixels, each row 8-byte aligned (:64 hint)
 *   r2  byte distance between destination rows
 *
 * Clobbers q0-q3 and q8-q15; r0 and r1 are advanced past the data.
 * Loads, narrowing and stores are interleaved; row n of the source is
 * narrowed into d<n>.
 */
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {q8-q9},   [r0,:128]!   @ source rows 0-1
        vqmovun.s16     d0,  q8
        vld1.16         {q10-q11}, [r0,:128]!   @ source rows 2-3
        vqmovun.s16     d1,  q9
        vld1.16         {q12-q13}, [r0,:128]!   @ source rows 4-5
        vqmovun.s16     d2,  q10
        vld1.16         {q14-q15}, [r0,:128]!   @ source rows 6-7
        vqmovun.s16     d3,  q11
        vst1.8          {d0},      [r1,:64], r2 @ row 0
        vqmovun.s16     d4,  q12
        vst1.8          {d1},      [r1,:64], r2 @ row 1
        vqmovun.s16     d5,  q13
        vst1.8          {d2},      [r1,:64], r2 @ row 2
        vqmovun.s16     d6,  q14
        vst1.8          {d3},      [r1,:64], r2 @ row 3
        vqmovun.s16     d7,  q15
        vst1.8          {d4},      [r1,:64], r2 @ row 4
        vst1.8          {d5},      [r1,:64], r2 @ row 5
        vst1.8          {d6},      [r1,:64], r2 @ row 6
        vst1.8          {d7},      [r1,:64], r2 @ row 7
        bx              lr
endfunc
yading@10 63
/*
 * ff_put_signed_pixels_clamped_neon
 *
 * Reads 64 signed 16-bit values (8 rows of 8) from r0, saturates each one
 * to the signed 8-bit range -128..127, adds a bias of 128 (modulo 256) so
 * the result lands in 0..255, and writes it as 8 rows of 8 bytes.
 *
 *   r0  source coefficients, 128 bytes, 16-byte aligned (:128 hint)
 *   r1  destination pixels, each row 8-byte aligned (:64 hint)
 *   r2  byte distance between destination rows
 *
 * Clobbers q0-q3, q8-q13 and d31; r0 and r1 are advanced past the data.
 * q8/q9 are reused for rows 2/3 once rows 0/1 have been narrowed.
 */
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.i8         d31, #0x80              @ bias added to every byte
        vld1.16         {q8},     [r0,:128]!    @ source row 0
        vqmovn.s16      d0,  q8
        vld1.16         {q9},     [r0,:128]!    @ source row 1
        vqmovn.s16      d1,  q9
        vld1.16         {q8},     [r0,:128]!    @ source row 2
        vqmovn.s16      d2,  q8
        vld1.16         {q9},     [r0,:128]!    @ source row 3
        vadd.i8         d0,  d0,  d31
        vld1.16         {q10},    [r0,:128]!    @ source row 4
        vadd.i8         d1,  d1,  d31
        vld1.16         {q11},    [r0,:128]!    @ source row 5
        vadd.i8         d2,  d2,  d31
        vst1.8          {d0},     [r1,:64], r2  @ row 0
        vqmovn.s16      d3,  q9
        vst1.8          {d1},     [r1,:64], r2  @ row 1
        vqmovn.s16      d4,  q10
        vst1.8          {d2},     [r1,:64], r2  @ row 2
        vqmovn.s16      d5,  q11
        vld1.16         {q12},    [r0,:128]!    @ source row 6
        vadd.i8         d3,  d3,  d31
        vld1.16         {q13},    [r0,:128]!    @ source row 7
        vadd.i8         d4,  d4,  d31
        vadd.i8         d5,  d5,  d31
        vst1.8          {d3},     [r1,:64], r2  @ row 3
        vqmovn.s16      d6,  q12
        vst1.8          {d4},     [r1,:64], r2  @ row 4
        vqmovn.s16      d7,  q13
        vst1.8          {d5},     [r1,:64], r2  @ row 5
        vadd.i8         d6,  d6,  d31
        vadd.i8         d7,  d7,  d31
        vst1.8          {d6},     [r1,:64], r2  @ row 6
        vst1.8          {d7},     [r1,:64], r2  @ row 7
        bx              lr
endfunc
yading@10 100
/*
 * ff_add_pixels_clamped_neon
 *
 * For 8 rows of 8 pixels: widens each existing 8-bit pixel, adds it to the
 * matching 16-bit coefficient, saturates the sum to 0..255 and writes it
 * back over the pixel.
 *
 *   r0  coefficients, 64 x 16-bit, 16-byte aligned (:128 hint)
 *   r1  pixels, read and then overwritten; rows 8-byte aligned (:64 hint)
 *   r2  byte distance between pixel rows
 *
 * Clobbers r3, q0-q3 and q8-q9; r0 and r1 are advanced past the data.
 * r1 runs ahead as the read pointer while r3 trails as the write pointer.
 * Rows n and n+4 share registers: coefficients in q<n>, pixels in d<16+n>.
 */
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1                 @ r3 = write pointer
        vld1.8          {d16},    [r1,:64], r2  @ pixels row 0
        vld1.16         {q0},     [r0,:128]!    @ coeffs row 0
        vaddw.u8        q0,  q0,  d16
        vld1.8          {d17},    [r1,:64], r2  @ pixels row 1
        vld1.16         {q1},     [r0,:128]!    @ coeffs row 1
        vqmovun.s16     d0,  q0
        vld1.8          {d18},    [r1,:64], r2  @ pixels row 2
        vaddw.u8        q1,  q1,  d17
        vld1.16         {q2},     [r0,:128]!    @ coeffs row 2
        vaddw.u8        q2,  q2,  d18
        vst1.8          {d0},     [r3,:64], r2  @ store row 0
        vqmovun.s16     d2,  q1
        vld1.8          {d19},    [r1,:64], r2  @ pixels row 3
        vld1.16         {q3},     [r0,:128]!    @ coeffs row 3
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.8          {d2},     [r3,:64], r2  @ store row 1
        vld1.8          {d16},    [r1,:64], r2  @ pixels row 4
        vqmovun.s16     d6,  q3
        vld1.16         {q0},     [r0,:128]!    @ coeffs row 4
        vaddw.u8        q0,  q0,  d16
        vst1.8          {d4},     [r3,:64], r2  @ store row 2
        vld1.8          {d17},    [r1,:64], r2  @ pixels row 5
        vld1.16         {q1},     [r0,:128]!    @ coeffs row 5
        vaddw.u8        q1,  q1,  d17
        vst1.8          {d6},     [r3,:64], r2  @ store row 3
        vqmovun.s16     d0,  q0
        vld1.8          {d18},    [r1,:64], r2  @ pixels row 6
        vld1.16         {q2},     [r0,:128]!    @ coeffs row 6
        vaddw.u8        q2,  q2,  d18
        vst1.8          {d0},     [r3,:64], r2  @ store row 4
        vqmovun.s16     d2,  q1
        vld1.8          {d19},    [r1,:64], r2  @ pixels row 7
        vqmovun.s16     d4,  q2
        vld1.16         {q3},     [r0,:128]!    @ coeffs row 7
        vaddw.u8        q3,  q3,  d19
        vst1.8          {d2},     [r3,:64], r2  @ store row 5
        vqmovun.s16     d6,  q3
        vst1.8          {d4},     [r3,:64], r2  @ store row 6
        vst1.8          {d6},     [r3,:64], r2  @ store row 7
        bx              lr
endfunc
yading@10 145
/*
 * ff_vector_clipf_neon
 *
 * Clamps 32-bit floats read from r1 to [lower, upper] and writes them to
 * r0, 8 elements per loop iteration.
 *
 *   r0  destination, 16-byte aligned (:128 hint)
 *   r1  source, 16-byte aligned (:128 hint)
 *   VFP variant:    lower bound in d0[0], upper bound in d0[1], count in r2
 *   NOVFP variant:  lower bound in r2, upper bound in r3, count at [sp]
 *   (VFP/NOVFP are macros defined outside this file; presumably they select
 *   the hard-float / soft-float argument passing variant - confirm in asm.S)
 *
 * Clobbers r2, q0-q3 and q8-q11; r0 and r1 are advanced past the data.
 *
 * The loop is software pipelined: the upper-bound clamp of the next 8
 * elements is computed while the previous 8 are finished and stored.  The
 * first 8 source elements are always read and 8 results always written,
 * whatever the count.
 *
 * The exit test is "ble" rather than "beq": with "beq" a count that is not
 * a positive multiple of 8 never reaches exactly zero and the loop runs on
 * through memory.  With "ble" the count is effectively rounded up to the
 * next multiple of 8, matching the "bgt"-terminated loops elsewhere in this
 * file.  Behaviour for positive multiples of 8 is unchanged.
 */
function ff_vector_clipf_neon, export=1
        @ VFP: d0[1] must be read first, because q0 overlaps d0.
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
        @ NOVFP: r2 is reused for the count once the lower bound is copied.
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!         @ prologue: first 8 elements
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0            @ finish clamp of current 8
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        ble             2f                      @ nothing left: store and return
        vld1.f32        {q2},[r1,:128]!         @ start the next 8 elements
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!         @ epilogue: last 8 results
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc
yading@10 171
/*
 * ff_apply_window_int16_neon
 *
 * Multiplies a buffer of 16-bit samples by a symmetric window of which
 * only the first half is read: each window value is applied once to a
 * sample counted from the start and once to the mirrored sample counted
 * from the end.  The product is a saturating, rounding, doubling
 * multiply-high (vqrdmulh.s16).
 *
 *   r0  output, 16-byte aligned (:128 hint)
 *   r1  input, 16-byte aligned (:128 hint)
 *   r2  window (first half), 16-byte aligned (:128 hint)
 *   r3  total number of samples; each iteration consumes 16 of them
 *       (8 from the front, 8 from the back) and the loop runs while the
 *       remaining count is > 0
 *
 * Clobbers r12 and q0-q3; r4 and lr are saved and restored on the stack.
 */
function ff_apply_window_int16_neon, export=1
        push            {r4, lr}
        add             r4,  r1,  r3,  lsl #1   @ r4 = input + 2*len bytes ...
        sub             r4,  r4,  #16           @ ... minus 16: last 8 input samples
        add             lr,  r0,  r3,  lsl #1   @ lr = output + 2*len bytes ...
        sub             lr,  lr,  #16           @ ... minus 16: last 8 output samples
        mvn             r12, #15                @ r12 = -16, backward step in bytes
1:
        vld1.16         {d0-d1},  [r1,:128]!    @ 8 samples from the front
        vld1.16         {d4-d5},  [r2,:128]!    @ 8 window values
        vld1.16         {d2-d3},  [r4,:128], r12 @ 8 samples from the back
        vrev64.16       q3,  q2                 @ reverse within each 64-bit half
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7            @ crossing d6/d7 completes the
        vqrdmulh.s16    d3,  d3,  d6            @ 8-element reversal of the window
        vst1.16         {d0-d1},  [r0,:128]!
        vst1.16         {d2-d3},  [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4, pc}
endfunc
yading@10 194
/*
 * ff_vector_clip_int32_neon
 *
 * Clamps signed 32-bit integers read from r1 to [lower, upper] and writes
 * them to r0, 8 elements per loop iteration.
 *
 *   r0    destination, 16-byte aligned (:128 hint)
 *   r1    source, 16-byte aligned (:128 hint)
 *   r2    lower bound
 *   r3    upper bound
 *   [sp]  element count (presumably the fifth, stack-passed argument)
 *
 * Clobbers r2 and q0-q3; r0 and r1 are advanced past the data.
 * The count is tested after the body, so at least 8 elements are always
 * processed and the count is effectively rounded up to a multiple of 8.
 */
function ff_vector_clip_int32_neon, export=1
        vdup.32         q1,  r3                 @ q1 = upper bound in all lanes
        vdup.32         q0,  r2                 @ q0 = lower bound in all lanes
        ldr             r2,  [sp]               @ r2 now holds the element count
1:
        vld1.32         {d4-d7},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {d4-d7},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc