Chris@0
|
1 /*****************************************************************************
|
Chris@0
|
2 * Copyright (C) 2000-2001 Andre McCurdy <armccurdy@yahoo.co.uk>
|
Chris@0
|
3 *
|
Chris@0
|
4 * This program is free software; you can redistribute it and/or modify
|
Chris@0
|
5 * it under the terms of the GNU General Public License as published by
|
Chris@0
|
6 * the Free Software Foundation; either version 2 of the License, or
|
Chris@0
|
7 * (at your option) any later version.
|
Chris@0
|
8 *
|
Chris@0
|
9 * This program is distributed in the hope that it will be useful,
|
Chris@0
|
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Chris@0
|
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Chris@0
|
12 * GNU General Public License for more details.
|
Chris@0
|
13 *
|
Chris@0
|
14 * You should have received a copy of the GNU General Public License
|
Chris@0
|
15 * along with this program; if not, write to the Free Software
|
Chris@0
|
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
Chris@0
|
17 *
|
Chris@0
|
18 *****************************************************************************
|
Chris@0
|
19 *
|
Chris@0
|
20 * Notes:
|
Chris@0
|
21 *
|
Chris@0
|
22 *
|
Chris@0
|
23 *****************************************************************************
|
Chris@0
|
24 *
|
Chris@0
|
25 * $Id: imdct_l_arm.S,v 1.7 2001/03/25 20:03:34 rob Rel $
|
Chris@0
|
26 *
|
Chris@0
|
27 * 2001/03/24: Andre McCurdy <armccurdy@yahoo.co.uk>
|
Chris@0
|
28 * - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
|
Chris@0
|
29 *
|
Chris@0
|
30 * 2000/09/20: Robert Leslie <rob@mars.org>
|
Chris@0
|
31 * - Added a global symbol with leading underscore per suggestion of
|
Chris@0
|
32 * Simon Burge to support linking with the a.out format.
|
Chris@0
|
33 *
|
Chris@0
|
34 * 2000/09/15: Robert Leslie <rob@mars.org>
|
Chris@0
|
35 * - Fixed a small bug where flags were changed before a conditional branch.
|
Chris@0
|
36 *
|
Chris@0
|
37 * 2000/09/15: Andre McCurdy <armccurdy@yahoo.co.uk>
|
Chris@0
|
38 * - Applied Nicolas Pitre's rounding optimisation in all remaining places.
|
Chris@0
|
39 *
|
Chris@0
|
40 * 2000/09/09: Nicolas Pitre <nico@cam.org>
|
Chris@0
|
41 * - Optimized rounding + scaling operations.
|
Chris@0
|
42 *
|
Chris@0
|
43 * 2000/08/09: Andre McCurdy <armccurdy@yahoo.co.uk>
|
Chris@0
|
44 * - Original created.
|
Chris@0
|
45 *
|
Chris@0
|
46 ****************************************************************************/
|
Chris@0
|
47
|
Chris@0
|
48
|
Chris@0
|
49 /*
|
Chris@0
|
50 On entry:
|
Chris@0
|
51
|
Chris@0
|
52 r0 = pointer to 18 element input array
|
Chris@0
|
53 r1 = pointer to 36 element output array
|
Chris@0
|
54 r2 = windowing block type
|
Chris@0
|
55
|
Chris@0
|
56
|
Chris@0
|
57 Stack frame created during execution of the function:
|
Chris@0
|
58
|
Chris@0
|
59 Initial Holds:
|
Chris@0
|
60 Stack
|
Chris@0
|
61 pointer
|
Chris@0
|
62 minus:
|
Chris@0
|
63
|
Chris@0
|
64 0
|
Chris@0
|
65 4 lr
|
Chris@0
|
66 8 r11
|
Chris@0
|
67 12 r10
|
Chris@0
|
68 16 r9
|
Chris@0
|
69 20 r8
|
Chris@0
|
70 24 r7
|
Chris@0
|
71 28 r6
|
Chris@0
|
72 32 r5
|
Chris@0
|
73 36 r4
|
Chris@0
|
74
|
Chris@0
|
75 40 r2 : windowing block type
|
Chris@0
|
76
|
Chris@0
|
77 44 ct00 high
|
Chris@0
|
78 48 ct00 low
|
Chris@0
|
79 52 ct01 high
|
Chris@0
|
80 56 ct01 low
|
Chris@0
|
81 60 ct04 high
|
Chris@0
|
82 64 ct04 low
|
Chris@0
|
83 68 ct06 high
|
Chris@0
|
84 72 ct06 low
|
Chris@0
|
85 76 ct05 high
|
Chris@0
|
86 80 ct05 low
|
Chris@0
|
87 84 ct03 high
|
Chris@0
|
88 88 ct03 low
|
Chris@0
|
89 92 -ct05 high
|
Chris@0
|
90 96 -ct05 low
|
Chris@0
|
91 100 -ct07 high
|
Chris@0
|
92 104 -ct07 low
|
Chris@0
|
93 108 ct07 high
|
Chris@0
|
94 112 ct07 low
|
Chris@0
|
95 116 ct02 high
|
Chris@0
|
96 120 ct02 low
|
Chris@0
|
97 */
|
Chris@0
|
98
|
Chris@0
|
/*
 * Windowing block type codes, i.e. the values of the third argument
 * (r2 on entry).  The windowing stage reloads the stacked copy of r2 and
 * compares it against BLOCK_MODE_STOP and BLOCK_MODE_START; in the code
 * visible here BLOCK_MODE_NORMAL is never compared explicitly (it is the
 * fall-through case).
 */
99 #define BLOCK_MODE_NORMAL 0
|
Chris@0
|
100 #define BLOCK_MODE_START 1
|
Chris@0
|
101 #define BLOCK_MODE_STOP 3
|
Chris@0
|
102
|
Chris@0
|
103
|
Chris@0
|
/*
 * Byte offsets of the 18 input words X[0]..X[17], used as
 * "ldr rN, [r0, #Xn]" where r0 points to the 18 element input array.
 * Each element is one 32-bit word, hence the stride of 4.
 */
104 #define X0 0x00
|
Chris@0
|
105 #define X1 0x04
|
Chris@0
|
106 #define X2 0x08
|
Chris@0
|
107 #define X3 0x0C
|
Chris@0
|
108 #define X4 0x10
|
Chris@0
|
109 #define X5 0x14
|
Chris@0
|
110 #define X6 0x18
|
Chris@0
|
111 #define X7 0x1c
|
Chris@0
|
112 #define X8 0x20
|
Chris@0
|
113 #define X9 0x24
|
Chris@0
|
114 #define X10 0x28
|
Chris@0
|
115 #define X11 0x2c
|
Chris@0
|
116 #define X12 0x30
|
Chris@0
|
117 #define X13 0x34
|
Chris@0
|
118 #define X14 0x38
|
Chris@0
|
119 #define X15 0x3c
|
Chris@0
|
120 #define X16 0x40
|
Chris@0
|
121 #define X17 0x44
|
Chris@0
|
122
|
Chris@0
|
/*
 * Byte offsets of the 36 output words x[0]..x[35], used as
 * "str rN, [r1, #xn]" where r1 points to the 36 element output array.
 * Each element is one 32-bit word, hence the stride of 4.  Note that the
 * lower-case names are distinct from the upper-case input offsets.
 */
123 #define x0 0x00
|
Chris@0
|
124 #define x1 0x04
|
Chris@0
|
125 #define x2 0x08
|
Chris@0
|
126 #define x3 0x0C
|
Chris@0
|
127 #define x4 0x10
|
Chris@0
|
128 #define x5 0x14
|
Chris@0
|
129 #define x6 0x18
|
Chris@0
|
130 #define x7 0x1c
|
Chris@0
|
131 #define x8 0x20
|
Chris@0
|
132 #define x9 0x24
|
Chris@0
|
133 #define x10 0x28
|
Chris@0
|
134 #define x11 0x2c
|
Chris@0
|
135 #define x12 0x30
|
Chris@0
|
136 #define x13 0x34
|
Chris@0
|
137 #define x14 0x38
|
Chris@0
|
138 #define x15 0x3c
|
Chris@0
|
139 #define x16 0x40
|
Chris@0
|
140 #define x17 0x44
|
Chris@0
|
141 #define x18 0x48
|
Chris@0
|
142 #define x19 0x4c
|
Chris@0
|
143 #define x20 0x50
|
Chris@0
|
144 #define x21 0x54
|
Chris@0
|
145 #define x22 0x58
|
Chris@0
|
146 #define x23 0x5c
|
Chris@0
|
147 #define x24 0x60
|
Chris@0
|
148 #define x25 0x64
|
Chris@0
|
149 #define x26 0x68
|
Chris@0
|
150 #define x27 0x6c
|
Chris@0
|
151 #define x28 0x70
|
Chris@0
|
152 #define x29 0x74
|
Chris@0
|
153 #define x30 0x78
|
Chris@0
|
154 #define x31 0x7c
|
Chris@0
|
155 #define x32 0x80
|
Chris@0
|
156 #define x33 0x84
|
Chris@0
|
157 #define x34 0x88
|
Chris@0
|
158 #define x35 0x8c
|
Chris@0
|
159
|
Chris@0
|
/*
 * 32-bit multiplier constants K00..K17.  They are loaded with "ldr rN, =Knn"
 * or fetched from imdct36_long_karray and fed to smull/smlal; each result
 * is then taken from bits[59..28] of the 64-bit accumulator, so a product
 * is effectively scaled by 2^-28.
 * NOTE(review): presumably these are the IMDCT cosine coefficients in a
 * 28-fractional-bit fixed-point format -- confirm against the C reference.
 */
160 #define K00 0x0ffc19fd
|
Chris@0
|
161 #define K01 0x00b2aa3e
|
Chris@0
|
162 #define K02 0x0fdcf549
|
Chris@0
|
163 #define K03 0x0216a2a2
|
Chris@0
|
164 #define K04 0x0f9ee890
|
Chris@0
|
165 #define K05 0x03768962
|
Chris@0
|
166 #define K06 0x0f426cb5
|
Chris@0
|
167 #define K07 0x04cfb0e2
|
Chris@0
|
168 #define K08 0x0ec835e8
|
Chris@0
|
169 #define K09 0x061f78aa
|
Chris@0
|
170 #define K10 0x0e313245
|
Chris@0
|
171 #define K11 0x07635284
|
Chris@0
|
172 #define K12 0x0d7e8807
|
Chris@0
|
173 #define K13 0x0898c779
|
Chris@0
|
174 #define K14 0x0cb19346
|
Chris@0
|
175 #define K15 0x09bd7ca0
|
Chris@0
|
176 #define K16 0x0bcbe352
|
Chris@0
|
177 #define K17 0x0acf37ad
|
Chris@0
|
178
|
Chris@0
|
/*
 * Two's-complement negation of K02 (0x0fdcf549 + 0xf0230ab7 == 2^32),
 * so that -K02 can be loaded directly with "ldr r11, =minus_K02".
 */
179 #define minus_K02 0xf0230ab7
|
Chris@0
|
180
|
Chris@0
|
/*
 * Long-block window coefficients: WLn is loaded wherever the code comments
 * refer to window_l[n].  Each is multiplied with an output sample via smull
 * and the windowed value is taken from bits[59..28] of the product.
 * The values are the same set as the Knn constants under a different
 * ordering (e.g. WL0 == K01, WL1 == K03, WL9 == K16, WL17 == K00).
 */
181 #define WL0 0x00b2aa3e
|
Chris@0
|
182 #define WL1 0x0216a2a2
|
Chris@0
|
183 #define WL2 0x03768962
|
Chris@0
|
184 #define WL3 0x04cfb0e2
|
Chris@0
|
185 #define WL4 0x061f78aa
|
Chris@0
|
186 #define WL5 0x07635284
|
Chris@0
|
187 #define WL6 0x0898c779
|
Chris@0
|
188 #define WL7 0x09bd7ca0
|
Chris@0
|
189 #define WL8 0x0acf37ad
|
Chris@0
|
190 #define WL9 0x0bcbe352
|
Chris@0
|
191 #define WL10 0x0cb19346
|
Chris@0
|
192 #define WL11 0x0d7e8807
|
Chris@0
|
193 #define WL12 0x0e313245
|
Chris@0
|
194 #define WL13 0x0ec835e8
|
Chris@0
|
195 #define WL14 0x0f426cb5
|
Chris@0
|
196 #define WL15 0x0f9ee890
|
Chris@0
|
197 #define WL16 0x0fdcf549
|
Chris@0
|
198 #define WL17 0x0ffc19fd
|
Chris@0
|
199
|
Chris@0
|
200
|
Chris@0
|
201 @*****************************************************************************
|
Chris@0
|
202
|
Chris@0
|
203
|
Chris@0
|
204 .text
|
Chris@0
|
205 .align
|
Chris@0
|
206
|
Chris@0
|
207 .global III_imdct_l
|
Chris@0
|
208 .global _III_imdct_l
|
Chris@0
|
209
|
Chris@0
|
210 III_imdct_l:
|
Chris@0
|
211 _III_imdct_l:
|
Chris@0
|
212
|
Chris@0
|
213 stmdb sp!, { r2, r4 - r11, lr } @ all callee saved regs, plus arg3
|
Chris@0
|
214
|
Chris@0
|
215 ldr r4, =K08 @ r4 = K08
|
Chris@0
|
216 ldr r5, =K09 @ r5 = K09
|
Chris@0
|
217 ldr r8, [r0, #X4] @ r8 = X4
|
Chris@0
|
218 ldr r9, [r0, #X13] @ r9 = X13
|
Chris@0
|
219 rsb r6, r4, #0 @ r6 = -K08
|
Chris@0
|
220 rsb r7, r5, #0 @ r7 = -K09
|
Chris@0
|
221
|
Chris@0
|
222 smull r2, r3, r4, r8 @ r2..r3 = (X4 * K08)
|
Chris@0
|
223 smlal r2, r3, r5, r9 @ r2..r3 = (X4 * K08) + (X13 * K09) = ct01
|
Chris@0
|
224
|
Chris@0
|
225 smull r10, lr, r8, r5 @ r10..lr = (X4 * K09)
|
Chris@0
|
226 smlal r10, lr, r9, r6 @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
|
Chris@0
|
227
|
Chris@0
|
228 ldr r8, [r0, #X7] @ r8 = X7
|
Chris@0
|
229 ldr r9, [r0, #X16] @ r9 = X16
|
Chris@0
|
230
|
Chris@0
|
231 stmdb sp!, { r2, r3, r10, lr } @ stack ct00_h, ct00_l, ct01_h, ct01_l
|
Chris@0
|
232
|
Chris@0
|
233 add r8, r8, r9 @ r8 = (X7 + X16)
|
Chris@0
|
234 ldr r9, [r0, #X1] @ r9 = X1
|
Chris@0
|
235
|
Chris@0
|
236 smlal r2, r3, r6, r8 @ r2..r3 = ct01 + ((X7 + X16) * -K08)
|
Chris@0
|
237 smlal r2, r3, r7, r9 @ r2..r3 += (X1 * -K09)
|
Chris@0
|
238
|
Chris@0
|
239 ldr r7, [r0, #X10] @ r7 = X10
|
Chris@0
|
240
|
Chris@0
|
241 rsbs r10, r10, #0
|
Chris@0
|
242 rsc lr, lr, #0 @ r10..lr = -ct00
|
Chris@0
|
243
|
Chris@0
|
244 smlal r2, r3, r5, r7 @ r2..r3 += (X10 * K09) = ct06
|
Chris@0
|
245
|
Chris@0
|
246 smlal r10, lr, r9, r6 @ r10..lr = -ct00 + ( X1 * -K08)
|
Chris@0
|
247 smlal r10, lr, r8, r5 @ r10..lr += ((X7 + X16) * K09)
|
Chris@0
|
248 smlal r10, lr, r7, r4 @ r10..lr += ( X10 * K08) = ct04
|
Chris@0
|
249
|
Chris@0
|
250 stmdb sp!, { r2, r3, r10, lr } @ stack ct04_h, ct04_l, ct06_h, ct06_l
|
Chris@0
|
251
|
Chris@0
|
252 @----
|
Chris@0
|
253
|
Chris@0
|
254 ldr r7, [r0, #X0]
|
Chris@0
|
255 ldr r8, [r0, #X11]
|
Chris@0
|
256 ldr r9, [r0, #X12]
|
Chris@0
|
257 sub r7, r7, r8
|
Chris@0
|
258 sub r7, r7, r9 @ r7 = (X0 - X11 -X12) = ct14
|
Chris@0
|
259
|
Chris@0
|
260 ldr r9, [r0, #X3]
|
Chris@0
|
261 ldr r8, [r0, #X8]
|
Chris@0
|
262 ldr r11, [r0, #X15]
|
Chris@0
|
263 sub r8, r8, r9
|
Chris@0
|
264 add r8, r8, r11 @ r8 = (X8 - X3 + X15) = ct16
|
Chris@0
|
265
|
Chris@0
|
266 add r11, r7, r8 @ r11 = ct14 + ct16 = ct18
|
Chris@0
|
267
|
Chris@0
|
268 smlal r2, r3, r6, r11 @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
|
Chris@0
|
269
|
Chris@0
|
270 ldr r6, [r0, #X2]
|
Chris@0
|
271 ldr r9, [r0, #X9]
|
Chris@0
|
272 ldr r12, [r0, #X14]
|
Chris@0
|
273 sub r6, r6, r9
|
Chris@0
|
274 sub r6, r6, r12 @ r6 = (X2 - X9 - X14) = ct15
|
Chris@0
|
275
|
Chris@0
|
276 ldr r9, [r0, #X5]
|
Chris@0
|
277 ldr r12, [r0, #X6]
|
Chris@0
|
278 sub r9, r9, r12
|
Chris@0
|
279 ldr r12, [r0, #X17]
|
Chris@0
|
280 sub r9, r9, r12 @ r9 = (X5 - X6 - X17) = ct17
|
Chris@0
|
281
|
Chris@0
|
282 add r12, r9, r6 @ r12 = ct15 + ct17 = ct19
|
Chris@0
|
283
|
Chris@0
|
284 smlal r2, r3, r5, r12 @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
|
Chris@0
|
285
|
Chris@0
|
286 smlal r10, lr, r11, r5 @ r10..lr = ct04 + (ct18 * K09)
|
Chris@0
|
287 smlal r10, lr, r12, r4 @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
|
Chris@0
|
288
|
Chris@0
|
289 movs r2, r2, lsr #28
|
Chris@0
|
290 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
|
Chris@0
|
291 str r2, [r1, #x22] @ store result x22
|
Chris@0
|
292
|
Chris@0
|
293 movs r10, r10, lsr #28
|
Chris@0
|
294 adc r10, r10, lr, lsl #4 @ r10 = bits[59..28] of r10..lr
|
Chris@0
|
295 str r10, [r1, #x4] @ store result x4
|
Chris@0
|
296
|
Chris@0
|
297 @----
|
Chris@0
|
298
|
Chris@0
|
299 ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
|
Chris@0
|
300
|
Chris@0
|
301 @ r2..r3 = ct06
|
Chris@0
|
302 @ r4..r5 = ct04
|
Chris@0
|
303 @ r6 = ct15
|
Chris@0
|
304 @ r7 = ct14
|
Chris@0
|
305 @ r8 = ct16
|
Chris@0
|
306 @ r9 = ct17
|
Chris@0
|
307 @ r10 = .
|
Chris@0
|
308 @ r11 = .
|
Chris@0
|
309 @ r12 = .
|
Chris@0
|
310 @ lr = .
|
Chris@0
|
311
|
Chris@0
|
312 ldr r10, =K03 @ r10 = K03
|
Chris@0
|
313 ldr lr, =K15 @ lr = K15
|
Chris@0
|
314
|
Chris@0
|
315 smlal r2, r3, r10, r7 @ r2..r3 = ct06 + (ct14 * K03)
|
Chris@0
|
316 smlal r4, r5, lr, r7 @ r4..r5 = ct04 + (ct14 * K15)
|
Chris@0
|
317
|
Chris@0
|
318 ldr r12, =K14 @ r12 = K14
|
Chris@0
|
319 rsb r10, r10, #0 @ r10 = -K03
|
Chris@0
|
320
|
Chris@0
|
321 smlal r2, r3, lr, r6 @ r2..r3 += (ct15 * K15)
|
Chris@0
|
322 smlal r4, r5, r10, r6 @ r4..r5 += (ct15 * -K03)
|
Chris@0
|
323 smlal r2, r3, r12, r8 @ r2..r3 += (ct16 * K14)
|
Chris@0
|
324
|
Chris@0
|
325 ldr r11, =minus_K02 @ r11 = -K02
|
Chris@0
|
326 rsb r12, r12, #0 @ r12 = -K14
|
Chris@0
|
327
|
Chris@0
|
328 smlal r4, r5, r12, r9 @ r4..r5 += (ct17 * -K14)
|
Chris@0
|
329 smlal r2, r3, r11, r9 @ r2..r3 += (ct17 * -K02)
|
Chris@0
|
330 smlal r4, r5, r11, r8 @ r4..r5 += (ct16 * -K02)
|
Chris@0
|
331
|
Chris@0
|
332 movs r2, r2, lsr #28
|
Chris@0
|
333 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
|
Chris@0
|
334 str r2, [r1, #x7] @ store result x7
|
Chris@0
|
335
|
Chris@0
|
336 movs r4, r4, lsr #28
|
Chris@0
|
337 adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
|
Chris@0
|
338 str r4, [r1, #x1] @ store result x1
|
Chris@0
|
339
|
Chris@0
|
340 @----
|
Chris@0
|
341
|
Chris@0
|
342 ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
|
Chris@0
|
343
|
Chris@0
|
344 @ r2..r3 = ct06
|
Chris@0
|
345 @ r4..r5 = ct04
|
Chris@0
|
346 @ r6 = ct15
|
Chris@0
|
347 @ r7 = ct14
|
Chris@0
|
348 @ r8 = ct16
|
Chris@0
|
349 @ r9 = ct17
|
Chris@0
|
350 @ r10 = -K03
|
Chris@0
|
351 @ r11 = -K02
|
Chris@0
|
352 @ r12 = -K14
|
Chris@0
|
353 @ lr = K15
|
Chris@0
|
354
|
Chris@0
|
355 rsbs r2, r2, #0
|
Chris@0
|
356 rsc r3, r3, #0 @ r2..r3 = -ct06
|
Chris@0
|
357
|
Chris@0
|
358 smlal r2, r3, r12, r7 @ r2..r3 = -ct06 + (ct14 * -K14)
|
Chris@0
|
359 smlal r2, r3, r10, r8 @ r2..r3 += (ct16 * -K03)
|
Chris@0
|
360
|
Chris@0
|
361 smlal r4, r5, r12, r6 @ r4..r5 = ct04 + (ct15 * -K14)
|
Chris@0
|
362 smlal r4, r5, r10, r9 @ r4..r5 += (ct17 * -K03)
|
Chris@0
|
363 smlal r4, r5, lr, r8 @ r4..r5 += (ct16 * K15)
|
Chris@0
|
364 smlal r4, r5, r11, r7 @ r4..r5 += (ct14 * -K02)
|
Chris@0
|
365
|
Chris@0
|
366 rsb lr, lr, #0 @ lr = -K15
|
Chris@0
|
367 rsb r11, r11, #0 @ r11 = K02
|
Chris@0
|
368
|
Chris@0
|
369 smlal r2, r3, lr, r9 @ r2..r3 += (ct17 * -K15)
|
Chris@0
|
370 smlal r2, r3, r11, r6 @ r2..r3 += (ct15 * K02)
|
Chris@0
|
371
|
Chris@0
|
372 movs r4, r4, lsr #28
|
Chris@0
|
373 adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
|
Chris@0
|
374 str r4, [r1, #x25] @ store result x25
|
Chris@0
|
375
|
Chris@0
|
376 movs r2, r2, lsr #28
|
Chris@0
|
377 adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
|
Chris@0
|
378 str r2, [r1, #x19] @ store result x19
|
Chris@0
|
379
|
Chris@0
|
380 @----
|
Chris@0
|
381
|
Chris@0
|
382 ldr r2, [sp, #16] @ r2 = ct01_l
|
Chris@0
|
383 ldr r3, [sp, #20] @ r3 = ct01_h
|
Chris@0
|
384
|
Chris@0
|
385 ldr r6, [r0, #X1]
|
Chris@0
|
386 ldr r8, [r0, #X7]
|
Chris@0
|
387 ldr r9, [r0, #X10]
|
Chris@0
|
388 ldr r7, [r0, #X16]
|
Chris@0
|
389
|
Chris@0
|
390 rsbs r2, r2, #0
|
Chris@0
|
391 rsc r3, r3, #0 @ r2..r3 = -ct01
|
Chris@0
|
392
|
Chris@0
|
393 mov r4, r2
|
Chris@0
|
394 mov r5, r3 @ r4..r5 = -ct01
|
Chris@0
|
395
|
Chris@0
|
396 @ r2..r3 = -ct01
|
Chris@0
|
397 @ r4..r5 = -ct01
|
Chris@0
|
398 @ r6 = X1
|
Chris@0
|
399 @ r7 = X16
|
Chris@0
|
400 @ r8 = X7
|
Chris@0
|
401 @ r9 = X10
|
Chris@0
|
402 @ r10 = -K03
|
Chris@0
|
403 @ r11 = K02
|
Chris@0
|
404 @ r12 = -K14
|
Chris@0
|
405 @ lr = -K15
|
Chris@0
|
406
|
Chris@0
|
407 smlal r4, r5, r12, r7 @ r4..r5 = -ct01 + (X16 * -K14)
|
Chris@0
|
408 smlal r2, r3, lr, r9 @ r2..r3 = -ct01 + (X10 * -K15)
|
Chris@0
|
409
|
Chris@0
|
410 smlal r4, r5, r10, r8 @ r4..r5 += (X7 * -K03)
|
Chris@0
|
411 smlal r2, r3, r10, r7 @ r2..r3 += (X16 * -K03)
|
Chris@0
|
412
|
Chris@0
|
413 smlal r4, r5, r11, r9 @ r4..r5 += (X10 * K02)
|
Chris@0
|
414 smlal r2, r3, r12, r8 @ r2..r3 += (X7 * -K14)
|
Chris@0
|
415
|
Chris@0
|
416 rsb lr, lr, #0 @ lr = K15
|
Chris@0
|
417 rsb r11, r11, #0 @ r11 = -K02
|
Chris@0
|
418
|
Chris@0
|
419 smlal r4, r5, lr, r6 @ r4..r5 += (X1 * K15) = ct05
|
Chris@0
|
420 smlal r2, r3, r11, r6 @ r2..r3 += (X1 * -K02) = ct03
|
Chris@0
|
421
|
Chris@0
|
422 stmdb sp!, { r2, r3, r4, r5 } @ stack ct05_h, ct05_l, ct03_h, ct03_l
|
Chris@0
|
423
|
Chris@0
|
424 rsbs r4, r4, #0
|
Chris@0
|
425 rsc r5, r5, #0 @ r4..r5 = -ct05
|
Chris@0
|
426
|
Chris@0
|
427 stmdb sp!, { r4, r5 } @ stack -ct05_h, -ct05_l
|
Chris@0
|
428
|
Chris@0
|
429 ldr r2, [sp, #48] @ r2 = ct00_l
|
Chris@0
|
430 ldr r3, [sp, #52] @ r3 = ct00_h
|
Chris@0
|
431
|
Chris@0
|
432 rsb r10, r10, #0 @ r10 = K03
|
Chris@0
|
433
|
Chris@0
|
434 rsbs r4, r2, #0
|
Chris@0
|
435 rsc r5, r3, #0 @ r4..r5 = -ct00
|
Chris@0
|
436
|
Chris@0
|
437 @ r2..r3 = ct00
|
Chris@0
|
438 @ r4..r5 = -ct00
|
Chris@0
|
439 @ r6 = X1
|
Chris@0
|
440 @ r7 = X16
|
Chris@0
|
441 @ r8 = X7
|
Chris@0
|
442 @ r9 = X10
|
Chris@0
|
443 @ r10 = K03
|
Chris@0
|
444 @ r11 = -K02
|
Chris@0
|
445 @ r12 = -K14
|
Chris@0
|
446 @ lr = K15
|
Chris@0
|
447
|
Chris@0
|
448 smlal r4, r5, r10, r6 @ r4..r5 = -ct00 + (X1 * K03)
|
Chris@0
|
449 smlal r2, r3, r10, r9 @ r2..r3 = ct00 + (X10 * K03)
|
Chris@0
|
450
|
Chris@0
|
451 smlal r4, r5, r12, r9 @ r4..r5 += (X10 * -K14)
|
Chris@0
|
452 smlal r2, r3, r12, r6 @ r2..r3 += (X1 * -K14)
|
Chris@0
|
453
|
Chris@0
|
454 smlal r4, r5, r11, r7 @ r4..r5 += (X16 * -K02)
|
Chris@0
|
455 smlal r4, r5, lr, r8 @ r4..r5 += (X7 * K15) = ct07
|
Chris@0
|
456
|
Chris@0
|
457 rsb lr, lr, #0 @ lr = -K15
|
Chris@0
|
458 rsb r11, r11, #0 @ r11 = K02
|
Chris@0
|
459
|
Chris@0
|
460 smlal r2, r3, r11, r8 @ r2..r3 += (X7 * K02)
|
Chris@0
|
461 smlal r2, r3, lr, r7 @ r2..r3 += (X16 * -K15) = ct02
|
Chris@0
|
462
|
Chris@0
|
463 rsbs r6, r4, #0
|
Chris@0
|
464 rsc r7, r5, #0 @ r6..r7 = -ct07
|
Chris@0
|
465
|
Chris@0
|
466 stmdb sp!, { r2 - r7 } @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
|
Chris@0
|
467
|
Chris@0
|
468
|
Chris@0
|
469 @----
|
Chris@0
|
470
|
Chris@0
|
471 add r2, pc, #(imdct36_long_karray-.-8) @ r2 = base address of Knn array (PIC safe ?)
|
Chris@0
|
472
|
Chris@0
|
473
|
Chris@0
|
474 loop:
|
Chris@0
|
475 ldr r12, [r0, #X0]
|
Chris@0
|
476
|
Chris@0
|
477 ldmia r2!, { r5 - r11 } @ first 7 words from Karray element
|
Chris@0
|
478
|
Chris@0
|
479 smull r3, r4, r5, r12 @ sum = (Kxx * X0)
|
Chris@0
|
480 ldr r12, [r0, #X2]
|
Chris@0
|
481 ldr r5, [r0, #X3]
|
Chris@0
|
482 smlal r3, r4, r6, r12 @ sum += (Kxx * X2)
|
Chris@0
|
483 ldr r12, [r0, #X5]
|
Chris@0
|
484 ldr r6, [r0, #X6]
|
Chris@0
|
485 smlal r3, r4, r7, r5 @ sum += (Kxx * X3)
|
Chris@0
|
486 smlal r3, r4, r8, r12 @ sum += (Kxx * X5)
|
Chris@0
|
487 ldr r12, [r0, #X8]
|
Chris@0
|
488 ldr r5, [r0, #X9]
|
Chris@0
|
489 smlal r3, r4, r9, r6 @ sum += (Kxx * X6)
|
Chris@0
|
490 smlal r3, r4, r10, r12 @ sum += (Kxx * X8)
|
Chris@0
|
491 smlal r3, r4, r11, r5 @ sum += (Kxx * X9)
|
Chris@0
|
492
|
Chris@0
|
493 ldmia r2!, { r5 - r10 } @ final 6 words from Karray element
|
Chris@0
|
494
|
Chris@0
|
495 ldr r11, [r0, #X11]
|
Chris@0
|
496 ldr r12, [r0, #X12]
|
Chris@0
|
497 smlal r3, r4, r5, r11 @ sum += (Kxx * X11)
|
Chris@0
|
498 ldr r11, [r0, #X14]
|
Chris@0
|
499 ldr r5, [r0, #X15]
|
Chris@0
|
500 smlal r3, r4, r6, r12 @ sum += (Kxx * X12)
|
Chris@0
|
501 smlal r3, r4, r7, r11 @ sum += (Kxx * X14)
|
Chris@0
|
502 ldr r11, [r0, #X17]
|
Chris@0
|
503 smlal r3, r4, r8, r5 @ sum += (Kxx * X15)
|
Chris@0
|
504 smlal r3, r4, r9, r11 @ sum += (Kxx * X17)
|
Chris@0
|
505
|
Chris@0
|
506 add r5, sp, r10, lsr #16 @ create index back into stack for required ctxx
|
Chris@0
|
507
|
Chris@0
|
508 ldmia r5, { r6, r7 } @ r6..r7 = ctxx
|
Chris@0
|
509
|
Chris@0
|
510 mov r8, r10, lsl #16 @ push ctxx index off the top end
|
Chris@0
|
511
|
Chris@0
|
512 adds r3, r3, r6 @ add low words
|
Chris@0
|
513 adc r4, r4, r7 @ add high words, with carry
|
Chris@0
|
514 movs r3, r3, lsr #28
|
Chris@0
|
515 adc r3, r3, r4, lsl #4 @ r3 = bits[59..28] of r3..r4
|
Chris@0
|
516
|
Chris@0
|
517 str r3, [r1, r8, lsr #24] @ push completion flag off the bottom end
|
Chris@0
|
518
|
Chris@0
|
519 movs r8, r8, lsl #8 @ push result location index off the top end
|
Chris@0
|
520 beq loop @ loop back if completion flag not set
|
Chris@0
|
521 b imdct_l_windowing @ branch to windowing stage if looping finished
|
Chris@0
|
522
|
Chris@0
|
523 imdct36_long_karray:
|
Chris@0
|
524
|
Chris@0
|
525 .word K17, -K13, K10, -K06, -K05, K01, -K00, K04, -K07, K11, K12, -K16, 0x00000000
|
Chris@0
|
526 .word K13, K07, K16, K01, K10, -K05, K04, -K11, K00, -K17, K06, -K12, 0x00200800
|
Chris@0
|
527 .word K11, K17, K05, K12, -K01, K06, -K07, K00, -K13, K04, -K16, K10, 0x00200c00
|
Chris@0
|
528 .word K07, K00, -K12, K05, -K16, -K10, K11, -K17, K04, K13, K01, K06, 0x00001400
|
Chris@0
|
529 .word K05, K10, -K00, -K17, K07, -K13, K12, K06, -K16, K01, -K11, -K04, 0x00181800
|
Chris@0
|
530 .word K01, K05, -K07, -K11, K13, K17, -K16, -K12, K10, K06, -K04, -K00, 0x00102000
|
Chris@0
|
531 .word -K16, K12, -K11, K07, K04, -K00, -K01, K05, -K06, K10, K13, -K17, 0x00284800
|
Chris@0
|
532 .word -K12, K06, K17, -K00, -K11, K04, K05, -K10, K01, K16, -K07, -K13, 0x00085000
|
Chris@0
|
533 .word -K10, K16, K04, -K13, -K00, K07, K06, -K01, -K12, -K05, K17, K11, 0x00105400
|
Chris@0
|
534 .word -K06, -K01, K13, K04, K17, -K11, -K10, -K16, -K05, K12, K00, K07, 0x00185c00
|
Chris@0
|
535 .word -K04, -K11, -K01, K16, K06, K12, K13, -K07, -K17, -K00, -K10, -K05, 0x00006000
|
Chris@0
|
536 .word -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801
|
Chris@0
|
537
|
Chris@0
|
538
|
Chris@0
|
539 @----
|
Chris@0
|
540 @-------------------------------------------------------------------------
|
Chris@0
|
541 @----
|
Chris@0
|
542
|
Chris@0
|
543 imdct_l_windowing:
|
Chris@0
|
544
|
Chris@0
|
545 ldr r11, [sp, #80] @ fetch function parameter 3 from out of the stack
|
Chris@0
|
546 ldmia r1!, { r0, r2 - r9 } @ load 9 words from x0, update pointer
|
Chris@0
|
547
|
Chris@0
|
548 @ r0 = x0
|
Chris@0
|
549 @ r1 = &x[9]
|
Chris@0
|
550 @ r2 = x1
|
Chris@0
|
551 @ r3 = x2
|
Chris@0
|
552 @ r4 = x3
|
Chris@0
|
553 @ r5 = x4
|
Chris@0
|
554 @ r6 = x5
|
Chris@0
|
555 @ r7 = x6
|
Chris@0
|
556 @ r8 = x7
|
Chris@0
|
557 @ r9 = x8
|
Chris@0
|
558 @ r10 = .
|
Chris@0
|
559 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
|
Chris@0
|
560 @ r12 = .
|
Chris@0
|
561 @ lr = .
|
Chris@0
|
562
|
Chris@0
|
563 cmp r11, #BLOCK_MODE_STOP @ setup flags
|
Chris@0
|
564 rsb r10, r0, #0 @ r10 = -x0 (DONT change flags !!)
|
Chris@0
|
565 beq stop_block_x0_to_x17
|
Chris@0
|
566
|
Chris@0
|
567
|
Chris@0
|
568 @ start and normal blocks are treated the same for x[0]..x[17]
|
Chris@0
|
569
|
Chris@0
|
570 normal_block_x0_to_x17:
|
Chris@0
|
571
|
Chris@0
|
572 ldr r12, =WL9 @ r12 = window_l[9]
|
Chris@0
|
573
|
Chris@0
|
574 rsb r0, r9, #0 @ r0 = -x8
|
Chris@0
|
575 rsb r9, r2, #0 @ r9 = -x1
|
Chris@0
|
576 rsb r2, r8, #0 @ r2 = -x7
|
Chris@0
|
577 rsb r8, r3, #0 @ r8 = -x2
|
Chris@0
|
578 rsb r3, r7, #0 @ r3 = -x6
|
Chris@0
|
579 rsb r7, r4, #0 @ r7 = -x3
|
Chris@0
|
580 rsb r4, r6, #0 @ r4 = -x5
|
Chris@0
|
581 rsb r6, r5, #0 @ r6 = -x4
|
Chris@0
|
582
|
Chris@0
|
583 @ r0 = -x8
|
Chris@0
|
584 @ r1 = &x[9]
|
Chris@0
|
585 @ r2 = -x7
|
Chris@0
|
586 @ r3 = -x6
|
Chris@0
|
587 @ r4 = -x5
|
Chris@0
|
588 @ r5 = .
|
Chris@0
|
589 @ r6 = -x4
|
Chris@0
|
590 @ r7 = -x3
|
Chris@0
|
591 @ r8 = -x2
|
Chris@0
|
592 @ r9 = -x1
|
Chris@0
|
593 @ r10 = -x0
|
Chris@0
|
594 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
|
Chris@0
|
595 @ r12 = window_l[9]
|
Chris@0
|
596 @ lr = .
|
Chris@0
|
597
|
Chris@0
|
598 smull r5, lr, r12, r0 @ r5..lr = (window_l[9] * (x[9] == -x[8]))
|
Chris@0
|
599 ldr r12, =WL10 @ r12 = window_l[10]
|
Chris@0
|
600 movs r5, r5, lsr #28
|
Chris@0
|
601 adc r0, r5, lr, lsl #4 @ r0 = bits[59..28] of windowed x9
|
Chris@0
|
602
|
Chris@0
|
603 smull r5, lr, r12, r2 @ r5..lr = (window_l[10] * (x[10] == -x[7]))
|
Chris@0
|
604 ldr r12, =WL11 @ r12 = window_l[11]
|
Chris@0
|
605 movs r5, r5, lsr #28
|
Chris@0
|
606 adc r2, r5, lr, lsl #4 @ r2 = bits[59..28] of windowed x10
|
Chris@0
|
607
|
Chris@0
|
608 smull r5, lr, r12, r3 @ r5..lr = (window_l[11] * (x[11] == -x[6]))
|
Chris@0
|
609 ldr r12, =WL12 @ r12 = window_l[12]
|
Chris@0
|
610 movs r5, r5, lsr #28
|
Chris@0
|
611 adc r3, r5, lr, lsl #4 @ r3 = bits[59..28] of windowed x11
|
Chris@0
|
612
|
Chris@0
|
613 smull r5, lr, r12, r4 @ r5..lr = (window_l[12] * (x[12] == -x[5]))
|
Chris@0
|
614 ldr r12, =WL13 @ r12 = window_l[13]
|
Chris@0
|
615 movs r5, r5, lsr #28
|
Chris@0
|
616 adc r4, r5, lr, lsl #4 @ r4 = bits[59..28] of windowed x12
|
Chris@0
|
617
|
Chris@0
|
618 smull r5, lr, r12, r6 @ r5..lr = (window_l[13] * (x[13] == -x[4]))
|
Chris@0
|
619 ldr r12, =WL14 @ r12 = window_l[14]
|
Chris@0
|
620 movs r5, r5, lsr #28
|
Chris@0
|
621 adc r6, r5, lr, lsl #4 @ r6 = bits[59..28] of windowed x13
|
Chris@0
|
622
|
Chris@0
|
623 smull r5, lr, r12, r7 @ r5..lr = (window_l[14] * (x[14] == -x[3]))
|
Chris@0
|
624 ldr r12, =WL15 @ r12 = window_l[15]
|
Chris@0
|
625 movs r5, r5, lsr #28
|
Chris@0
|
626 adc r7, r5, lr, lsl #4 @ r7 = bits[59..28] of windowed x14
|
Chris@0
|
627
|
Chris@0
|
628 smull r5, lr, r12, r8 @ r5..lr = (window_l[15] * (x[15] == -x[2]))
|
Chris@0
|
629 ldr r12, =WL16 @ r12 = window_l[16]
|
Chris@0
|
630 movs r5, r5, lsr #28
|
Chris@0
|
631 adc r8, r5, lr, lsl #4 @ r8 = bits[59..28] of windowed x15
|
Chris@0
|
632
|
Chris@0
|
633 smull r5, lr, r12, r9 @ r5..lr = (window_l[16] * (x[16] == -x[1]))
|
Chris@0
|
634 ldr r12, =WL17 @ r12 = window_l[17]
|
Chris@0
|
635 movs r5, r5, lsr #28
|
Chris@0
|
636 adc r9, r5, lr, lsl #4 @ r9 = bits[59..28] of windowed x16
|
Chris@0
|
637
|
Chris@0
|
638 smull r5, lr, r12, r10 @ r5..lr = (window_l[17] * (x[17] == -x[0]))
|
Chris@0
|
639 ldr r12, =WL0 @ r12 = window_l[0]
|
Chris@0
|
640 movs r5, r5, lsr #28
|
Chris@0
|
641 adc r10, r5, lr, lsl #4 @ r10 = bits[59..28] of windowed x17
|
Chris@0
|
642
|
Chris@0
|
643
|
Chris@0
|
644 stmia r1, { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
|
Chris@0
|
645 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x0
|
Chris@0
|
646
|
Chris@0
|
647
|
Chris@0
|
648 smull r10, lr, r12, r0 @ r10..lr = (window_l[0] * x[0])
|
Chris@0
|
649 ldr r12, =WL1 @ r12 = window_l[1]
|
Chris@0
|
650 movs r10, r10, lsr #28
|
Chris@0
|
651 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
|
Chris@0
|
652
|
Chris@0
|
653 smull r10, lr, r12, r2 @ r10..lr = (window_l[1] * x[1])
|
Chris@0
|
654 ldr r12, =WL2 @ r12 = window_l[2]
|
Chris@0
|
655 movs r10, r10, lsr #28
|
Chris@0
|
656 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x1
|
Chris@0
|
657
|
Chris@0
|
658 smull r10, lr, r12, r3 @ r10..lr = (window_l[2] * x[2])
|
Chris@0
|
659 ldr r12, =WL3 @ r12 = window_l[3]
|
Chris@0
|
660 movs r10, r10, lsr #28
|
Chris@0
|
661 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x2
|
Chris@0
|
662
|
Chris@0
|
663 smull r10, lr, r12, r4 @ r10..lr = (window_l[3] * x[3])
|
Chris@0
|
664 ldr r12, =WL4 @ r12 = window_l[4]
|
Chris@0
|
665 movs r10, r10, lsr #28
|
Chris@0
|
666 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x3
|
Chris@0
|
667
|
Chris@0
|
668 smull r10, lr, r12, r5 @ r10..lr = (window_l[4] * x[4])
|
Chris@0
|
669 ldr r12, =WL5 @ r12 = window_l[5]
|
Chris@0
|
670 movs r10, r10, lsr #28
|
Chris@0
|
671 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x4
|
Chris@0
|
672
|
Chris@0
|
673 smull r10, lr, r12, r6 @ r10..lr = (window_l[5] * x[5])
|
Chris@0
|
674 ldr r12, =WL6 @ r12 = window_l[6]
|
Chris@0
|
675 movs r10, r10, lsr #28
|
Chris@0
|
676 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x5
|
Chris@0
|
677
|
Chris@0
|
678 smull r10, lr, r12, r7 @ r10..lr = (window_l[6] * x[6])
|
Chris@0
|
679 ldr r12, =WL7 @ r12 = window_l[7]
|
Chris@0
|
680 movs r10, r10, lsr #28
|
Chris@0
|
681 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x6
|
Chris@0
|
682
|
Chris@0
|
683 smull r10, lr, r12, r8 @ r10..lr = (window_l[7] * x[7])
|
Chris@0
|
684 ldr r12, =WL8 @ r12 = window_l[8]
|
Chris@0
|
685 movs r10, r10, lsr #28
|
Chris@0
|
686 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x7
|
Chris@0
|
687
|
Chris@0
|
688 smull r10, lr, r12, r9 @ r10..lr = (window_l[8] * x[8])
|
Chris@0
|
689 movs r10, r10, lsr #28
|
Chris@0
|
690 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x8
|
Chris@0
|
691
|
Chris@0
|
692 stmia r1, { r0, r2 - r9 } @ store windowed x[0] .. x[8]
|
Chris@0
|
693
|
Chris@0
|
694 cmp r11, #BLOCK_MODE_START
|
Chris@0
|
695 beq start_block_x18_to_x35
|
Chris@0
|
696
|
Chris@0
|
697
|
Chris@0
|
698 @----
|
Chris@0
|
699
|
Chris@0
|
700
|
Chris@0
|
701 normal_block_x18_to_x35:
|
Chris@0
|
702
|
Chris@0
|
703 ldr r11, =WL3 @ r11 = window_l[3]
|
Chris@0
|
704 ldr r12, =WL4 @ r12 = window_l[4]
|
Chris@0
|
705
|
Chris@0
|
706 add r1, r1, #(18*4) @ r1 = &x[18]
|
Chris@0
|
707
|
Chris@0
|
708 ldmia r1!, { r0, r2 - r4, r6 - r10 } @ load 9 words from x18, update pointer
|
Chris@0
|
709
|
Chris@0
|
710 @ r0 = x18
|
Chris@0
|
711 @ r1 = &x[27]
|
Chris@0
|
712 @ r2 = x19
|
Chris@0
|
713 @ r3 = x20
|
Chris@0
|
714 @ r4 = x21
|
Chris@0
|
715 @ r5 = .
|
Chris@0
|
716 @ r6 = x22
|
Chris@0
|
717 @ r7 = x23
|
Chris@0
|
718 @ r8 = x24
|
Chris@0
|
719 @ r9 = x25
|
Chris@0
|
720 @ r10 = x26
|
Chris@0
|
721 @ r11 = window_l[3]
|
Chris@0
|
722 @ r12 = window_l[4]
|
Chris@0
|
723 @ lr = .
|
Chris@0
|
724
|
Chris@0
|
725 smull r5, lr, r12, r6 @ r5..lr = (window_l[4] * (x[22] == x[31]))
|
Chris@0
|
726 movs r5, r5, lsr #28
|
Chris@0
|
727 adc r5, r5, lr, lsl #4 @ r5 = bits[59..28] of windowed x31
|
Chris@0
|
728
|
Chris@0
|
729 smull r6, lr, r11, r4 @ r5..lr = (window_l[3] * (x[21] == x[32]))
|
Chris@0
|
730 ldr r12, =WL5 @ r12 = window_l[5]
|
Chris@0
|
731 movs r6, r6, lsr #28
|
Chris@0
|
732 adc r6, r6, lr, lsl #4 @ r6 = bits[59..28] of windowed x32
|
Chris@0
|
733
|
Chris@0
|
734 smull r4, lr, r12, r7 @ r4..lr = (window_l[5] * (x[23] == x[30]))
|
Chris@0
|
735 ldr r11, =WL1 @ r11 = window_l[1]
|
Chris@0
|
736 ldr r12, =WL2 @ r12 = window_l[2]
|
Chris@0
|
737 movs r4, r4, lsr #28
|
Chris@0
|
738 adc r4, r4, lr, lsl #4 @ r4 = bits[59..28] of windowed x30
|
Chris@0
|
739
|
Chris@0
|
740 smull r7, lr, r12, r3 @ r7..lr = (window_l[2] * (x[20] == x[33]))
|
Chris@0
|
741 ldr r12, =WL6 @ r12 = window_l[6]
|
Chris@0
|
742 movs r7, r7, lsr #28
|
Chris@0
|
743 adc r7, r7, lr, lsl #4 @ r7 = bits[59..28] of windowed x33
|
Chris@0
|
744
|
Chris@0
|
745 smull r3, lr, r12, r8 @ r3..lr = (window_l[6] * (x[24] == x[29]))
|
Chris@0
|
746 movs r3, r3, lsr #28
|
Chris@0
|
747 adc r3, r3, lr, lsl #4 @ r3 = bits[59..28] of windowed x29
|
Chris@0
|
748
|
Chris@0
|
749 smull r8, lr, r11, r2 @ r7..lr = (window_l[1] * (x[19] == x[34]))
|
Chris@0
|
750 ldr r12, =WL7 @ r12 = window_l[7]
|
Chris@0
|
751 ldr r11, =WL8 @ r11 = window_l[8]
|
Chris@0
|
752 movs r8, r8, lsr #28
|
Chris@0
|
753 adc r8, r8, lr, lsl #4 @ r8 = bits[59..28] of windowed x34
|
Chris@0
|
754
|
Chris@0
|
755 smull r2, lr, r12, r9 @ r7..lr = (window_l[7] * (x[25] == x[28]))
|
Chris@0
|
756 ldr r12, =WL0 @ r12 = window_l[0]
|
Chris@0
|
757 movs r2, r2, lsr #28
|
Chris@0
|
758 adc r2, r2, lr, lsl #4 @ r2 = bits[59..28] of windowed x28
|
Chris@0
|
759
|
Chris@0
|
760 smull r9, lr, r12, r0 @ r3..lr = (window_l[0] * (x[18] == x[35]))
|
Chris@0
|
761 movs r9, r9, lsr #28
|
Chris@0
|
762 adc r9, r9, lr, lsl #4 @ r9 = bits[59..28] of windowed x35
|
Chris@0
|
763
|
Chris@0
|
764 smull r0, lr, r11, r10 @ r7..lr = (window_l[8] * (x[26] == x[27]))
|
Chris@0
|
765 ldr r11, =WL16 @ r11 = window_l[16]
|
Chris@0
|
766 ldr r12, =WL17 @ r12 = window_l[17]
|
Chris@0
|
767 movs r0, r0, lsr #28
|
Chris@0
|
768 adc r0, r0, lr, lsl #4 @ r0 = bits[59..28] of windowed x27
|
Chris@0
|
769
|
Chris@0
|
770
|
Chris@0
|
771 stmia r1, { r0, r2 - r9 } @ store windowed x[27] .. x[35]
|
Chris@0
|
772 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x18
|
Chris@0
|
773
|
Chris@0
|
774
|
Chris@0
|
775 smull r10, lr, r12, r0 @ r10..lr = (window_l[17] * x[18])
|
Chris@0
|
776 movs r10, r10, lsr #28
|
Chris@0
|
777 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
|
Chris@0
|
778
|
Chris@0
|
779 smull r10, lr, r11, r2 @ r10..lr = (window_l[16] * x[19])
|
Chris@0
|
780 ldr r11, =WL14 @ r11 = window_l[14]
|
Chris@0
|
781 ldr r12, =WL15 @ r12 = window_l[15]
|
Chris@0
|
782 movs r10, r10, lsr #28
|
Chris@0
|
783 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x19
|
Chris@0
|
784
|
Chris@0
|
785 smull r10, lr, r12, r3 @ r10..lr = (window_l[15] * x[20])
|
Chris@0
|
786 movs r10, r10, lsr #28
|
Chris@0
|
787 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x20
|
Chris@0
|
788
|
Chris@0
|
789 smull r10, lr, r11, r4 @ r10..lr = (window_l[14] * x[21])
|
Chris@0
|
790 ldr r11, =WL12 @ r11 = window_l[12]
|
Chris@0
|
791 ldr r12, =WL13 @ r12 = window_l[13]
|
Chris@0
|
792 movs r10, r10, lsr #28
|
Chris@0
|
793 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x21
|
Chris@0
|
794
|
Chris@0
|
795 smull r10, lr, r12, r5 @ r10..lr = (window_l[13] * x[22])
|
Chris@0
|
796 movs r10, r10, lsr #28
|
Chris@0
|
797 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x22
|
Chris@0
|
798
|
Chris@0
|
799 smull r10, lr, r11, r6 @ r10..lr = (window_l[12] * x[23])
|
Chris@0
|
800 ldr r11, =WL10 @ r11 = window_l[10]
|
Chris@0
|
801 ldr r12, =WL11 @ r12 = window_l[11]
|
Chris@0
|
802 movs r10, r10, lsr #28
|
Chris@0
|
803 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x23
|
Chris@0
|
804
|
Chris@0
|
805 smull r10, lr, r12, r7 @ r10..lr = (window_l[11] * x[24])
|
Chris@0
|
806 movs r10, r10, lsr #28
|
Chris@0
|
807 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x24
|
Chris@0
|
808
|
Chris@0
|
809 smull r10, lr, r11, r8 @ r10..lr = (window_l[10] * x[25])
|
Chris@0
|
810 ldr r12, =WL9 @ r12 = window_l[9]
|
Chris@0
|
811 movs r10, r10, lsr #28
|
Chris@0
|
812 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x25
|
Chris@0
|
813
|
Chris@0
|
814 smull r10, lr, r12, r9 @ r10..lr = (window_l[9] * x[26])
|
Chris@0
|
815
|
Chris@0
|
816 movs r10, r10, lsr #28
|
Chris@0
|
817 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x26
|
Chris@0
|
818
|
Chris@0
|
819 stmia r1, { r0, r2 - r9 } @ store windowed x[18] .. x[26]
|
Chris@0
|
820
|
Chris@0
|
821 @----
|
Chris@0
|
822 @ NB there are 2 possible exits from this function - this is only one of them
|
Chris@0
|
823 @----
|
Chris@0
|
824
|
Chris@0
|
825 add sp, sp, #(21*4) @ return stack frame
|
Chris@0
|
826 ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
|
Chris@0
|
827
|
Chris@0
|
828 @----
|
Chris@0
|
829
|
Chris@0
|
830
|
Chris@0
|
831 stop_block_x0_to_x17:
|
Chris@0
|
832 @ stop block: x[12..17] stored unscaled, x[6..11] scaled by window_s[0..5], x[0..5] zeroed
|
Chris@0
|
833 @ r0 = x0
|
Chris@0
|
834 @ r1 = &x[9]
|
Chris@0
|
835 @ r2 = x1
|
Chris@0
|
836 @ r3 = x2
|
Chris@0
|
837 @ r4 = x3
|
Chris@0
|
838 @ r5 = x4
|
Chris@0
|
839 @ r6 = x5
|
Chris@0
|
840 @ r7 = x6
|
Chris@0
|
841 @ r8 = x7
|
Chris@0
|
842 @ r9 = x8
|
Chris@0
|
843 @ r10 = -x0
|
Chris@0
|
844 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
|
Chris@0
|
845 @ r12 = .
|
Chris@0
|
846 @ lr = .
|
Chris@0
|
847
|
Chris@0
|
848 rsb r0, r6, #0 @ r0 = -x5
|
Chris@0
|
849 rsb r6, r2, #0 @ r6 = -x1
|
Chris@0
|
850 rsb r2, r5, #0 @ r2 = -x4
|
Chris@0
|
851 rsb r5, r3, #0 @ r5 = -x2
|
Chris@0
|
852 rsb r3, r4, #0 @ r3 = -x3
|
Chris@0
|
853
|
Chris@0
|
854 add r1, r1, #(3*4) @ r1 = &x[12]
|
Chris@0
|
855 stmia r1, { r0, r2, r3, r5, r6, r10 } @ store unchanged x[12] .. x[17] (= -x5 .. -x0, no window applied)
|
Chris@0
|
856
|
Chris@0
|
857 ldr r0, =WL1 @ r0 = window_l[1] == window_s[0]
|
Chris@0
|
858
|
Chris@0
|
859 rsb r10, r9, #0 @ r10 = -x8
|
Chris@0
|
860 rsb r12, r8, #0 @ r12 = -x7
|
Chris@0
|
861 rsb lr, r7, #0 @ lr = -x6
|
Chris@0
|
862
|
Chris@0
|
863 @ r0 = WL1
|
Chris@0
|
864 @ r1 = &x[12]
|
Chris@0
|
865 @ r2 = .
|
Chris@0
|
866 @ r3 = .
|
Chris@0
|
867 @ r4 = .
|
Chris@0
|
868 @ r5 = .
|
Chris@0
|
869 @ r6 = .
|
Chris@0
|
870 @ r7 = x6
|
Chris@0
|
871 @ r8 = x7
|
Chris@0
|
872 @ r9 = x8
|
Chris@0
|
873 @ r10 = -x8
|
Chris@0
|
874 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
|
Chris@0
|
875 @ r12 = -x7
|
Chris@0
|
876 @ lr = -x6
|
Chris@0
|
877 @ each product below: movs leaves bit 27 of the low word in C, adc adds it to round bits[59..28]
|
Chris@0
|
878 smull r5, r6, r0, r7 @ r5..r6 = (window_l[1] * x[6])
|
Chris@0
|
879 ldr r2, =WL4 @ r2 = window_l[4] == window_s[1]
|
Chris@0
|
880 movs r5, r5, lsr #28
|
Chris@0
|
881 adc r7, r5, r6, lsl #4 @ r7 = bits[59..28] of windowed x6
|
Chris@0
|
882
|
Chris@0
|
883 smull r5, r6, r2, r8 @ r5..r6 = (window_l[4] * x[7])
|
Chris@0
|
884 ldr r3, =WL7 @ r3 = window_l[7] == window_s[2]
|
Chris@0
|
885 movs r5, r5, lsr #28
|
Chris@0
|
886 adc r8, r5, r6, lsl #4 @ r8 = bits[59..28] of windowed x7
|
Chris@0
|
887
|
Chris@0
|
888 smull r5, r6, r3, r9 @ r5..r6 = (window_l[7] * x[8])
|
Chris@0
|
889 ldr r4, =WL10 @ r4 = window_l[10] == window_s[3]
|
Chris@0
|
890 movs r5, r5, lsr #28
|
Chris@0
|
891 adc r9, r5, r6, lsl #4 @ r9 = bits[59..28] of windowed x8
|
Chris@0
|
892
|
Chris@0
|
893 smull r5, r6, r4, r10 @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
|
Chris@0
|
894 ldr r0, =WL13 @ r0 = window_l[13] == window_s[4]
|
Chris@0
|
895 movs r5, r5, lsr #28
|
Chris@0
|
896 adc r10, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
|
Chris@0
|
897
|
Chris@0
|
898 smull r5, r6, r0, r12 @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
|
Chris@0
|
899 ldr r2, =WL16 @ r2 = window_l[16] == window_s[5]
|
Chris@0
|
900 movs r5, r5, lsr #28
|
Chris@0
|
901 adc r12, r5, r6, lsl #4 @ r12 = bits[59..28] of windowed x10
|
Chris@0
|
902
|
Chris@0
|
903 smull r5, r6, r2, lr @ r5..r6 = (window_l[16] * (x[11] == -x[6]))
|
Chris@0
|
904
|
Chris@0
|
905 ldr r0, =0x00 @ r0 = 0, stored to x[0] below (WL13 is no longer needed)
|
Chris@0
|
906
|
Chris@0
|
907 movs r5, r5, lsr #28
|
Chris@0
|
908 adc lr, r5, r6, lsl #4 @ lr = bits[59..28] of windowed x11
|
Chris@0
|
909
|
Chris@0
|
910 stmdb r1!, { r7 - r10, r12, lr } @ store windowed x[6] .. x[11], r1 = &x[6]
|
Chris@0
|
911 @ clear r2..r6 (r0 is already 0) so the next store zeroes x[0]..x[5]
|
Chris@0
|
912 ldr r5, =0x00
|
Chris@0
|
913 ldr r6, =0x00
|
Chris@0
|
914 ldr r2, =0x00
|
Chris@0
|
915 ldr r3, =0x00
|
Chris@0
|
916 ldr r4, =0x00
|
Chris@0
|
917
|
Chris@0
|
918 stmdb r1!, { r0, r2 - r6 } @ store windowed x[0] .. x[5] (all zero), r1 = &x[0]
|
Chris@0
|
919
|
Chris@0
|
920 b normal_block_x18_to_x35
|
Chris@0
|
921
|
Chris@0
|
922
|
Chris@0
|
923 @----
|
Chris@0
|
924
|
Chris@0
|
925
|
Chris@0
|
926 start_block_x18_to_x35:
|
Chris@0
|
927 @ start block: x[24..29] scaled by window_s[5..0], x[30..35] zeroed; nothing below x[24] is written here
|
Chris@0
|
928 ldr r4, =WL1 @ r4 = window_l[1] == window_s[0]
|
Chris@0
|
929
|
Chris@0
|
930 add r1, r1, #(24*4) @ r1 = &x[24]
|
Chris@0
|
931
|
Chris@0
|
932 ldmia r1, { r0, r2, r3 } @ load 3 words from x24, dont update pointer
|
Chris@0
|
933
|
Chris@0
|
934 @ r0 = x24
|
Chris@0
|
935 @ r1 = &x[24]
|
Chris@0
|
936 @ r2 = x25
|
Chris@0
|
937 @ r3 = x26
|
Chris@0
|
938 @ r4 = WL1
|
Chris@0
|
939 @ r5 = WL4 (loaded below)
|
Chris@0
|
940 @ r6 = WL7 (loaded below)
|
Chris@0
|
941 @ r7 = WL10 (loaded below)
|
Chris@0
|
942 @ r8 = WL13 (loaded below)
|
Chris@0
|
943 @ r9 = WL16 (loaded below)
|
Chris@0
|
944 @ r10 = .
|
Chris@0
|
945 @ r11 = .
|
Chris@0
|
946 @ r12 = .
|
Chris@0
|
947 @ lr = .
|
Chris@0
|
948
|
Chris@0
|
949 ldr r5, =WL4 @ r5 = window_l[4] == window_s[1]
|
Chris@0
|
950 @ each product below: movs leaves bit 27 of the low word in C, adc adds it to round bits[59..28]
|
Chris@0
|
951 smull r10, r11, r4, r0 @ r10..r11 = (window_l[1] * (x[24] == x[29]))
|
Chris@0
|
952 ldr r6, =WL7 @ r6 = window_l[7] == window_s[2]
|
Chris@0
|
953 movs r10, r10, lsr #28
|
Chris@0
|
954 adc lr, r10, r11, lsl #4 @ lr = bits[59..28] of windowed x29
|
Chris@0
|
955
|
Chris@0
|
956 smull r10, r11, r5, r2 @ r10..r11 = (window_l[4] * (x[25] == x[28]))
|
Chris@0
|
957 ldr r7, =WL10 @ r7 = window_l[10] == window_s[3]
|
Chris@0
|
958 movs r10, r10, lsr #28
|
Chris@0
|
959 adc r12, r10, r11, lsl #4 @ r12 = bits[59..28] of windowed x28
|
Chris@0
|
960
|
Chris@0
|
961 smull r10, r11, r6, r3 @ r10..r11 = (window_l[7] * (x[26] == x[27]))
|
Chris@0
|
962 ldr r8, =WL13 @ r8 = window_l[13] == window_s[4]
|
Chris@0
|
963 movs r10, r10, lsr #28
|
Chris@0
|
964 adc r4, r10, r11, lsl #4 @ r4 = bits[59..28] of windowed x27
|
Chris@0
|
965
|
Chris@0
|
966 smull r10, r11, r7, r3 @ r10..r11 = (window_l[10] * x[26])
|
Chris@0
|
967 ldr r9, =WL16 @ r9 = window_l[16] == window_s[5]
|
Chris@0
|
968 movs r10, r10, lsr #28
|
Chris@0
|
969 adc r3, r10, r11, lsl #4 @ r3 = bits[59..28] of windowed x26
|
Chris@0
|
970
|
Chris@0
|
971 smull r10, r11, r8, r2 @ r10..r11 = (window_l[13] * x[25])
|
Chris@0
|
972 ldr r5, =0x00 @ r5 = 0 (WL4 is no longer needed)
|
Chris@0
|
973 movs r10, r10, lsr #28
|
Chris@0
|
974 adc r2, r10, r11, lsl #4 @ r2 = bits[59..28] of windowed x25
|
Chris@0
|
975
|
Chris@0
|
976 smull r10, r11, r9, r0 @ r10..r11 = (window_l[16] * x[24])
|
Chris@0
|
977 ldr r6, =0x00 @ r6 = 0 (WL7 is no longer needed)
|
Chris@0
|
978 movs r10, r10, lsr #28
|
Chris@0
|
979 adc r0, r10, r11, lsl #4 @ r0 = bits[59..28] of windowed x24
|
Chris@0
|
980
|
Chris@0
|
981 stmia r1!, { r0, r2, r3, r4, r12, lr } @ store windowed x[24] .. x[29], r1 = &x[30]
|
Chris@0
|
982 @ clear r7..r10 (r5, r6 are already 0) so the next store zeroes x[30]..x[35]
|
Chris@0
|
983 ldr r7, =0x00
|
Chris@0
|
984 ldr r8, =0x00
|
Chris@0
|
985 ldr r9, =0x00
|
Chris@0
|
986 ldr r10, =0x00
|
Chris@0
|
987
|
Chris@0
|
988 stmia r1!, { r5 - r10 } @ store windowed x[30] .. x[35] (all zero)
|
Chris@0
|
989
|
Chris@0
|
990 @----
|
Chris@0
|
991 @ NB there are 2 possible exits from this function - this is only one of them
|
Chris@0
|
992 @----
|
Chris@0
|
993
|
Chris@0
|
994 add sp, sp, #(21*4) @ return stack frame
|
Chris@0
|
995 ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
|
Chris@0
|
996
|
Chris@0
|
997 @----
|
Chris@0
|
998 @END
|
Chris@0
|
999 @----
|
Chris@0
|
1000
|