;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
ps_neg          times 4 dd 1<<31
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
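; The ps_* constants above drive sign handling with xorps/pxor instead of
; multiplies: ps_mask flips the sign bit of lanes 0 and 2 (multiply by
; -1.0, 1.0), ps_mask2 flips lanes 1 and 3, and ps_neg flips all four lanes.
; The ps_noise* constants hold the per-lane phi_sign factors used by the
; sbr_hf_apply_noise_* entry points below.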
cextern sbr_noise_table

SECTION_TEXT

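; float sbr_sum_square(float (*x)[2], int n)
;
; Roughly, the C reference (in sbrdsp.c, not this file) returns
;     sum over i < n of x[i][0]*x[i][0] + x[i][1]*x[i][1]
; i.e. the energy of n complex samples. The SSE version below keeps two
; partial sums (m0, m1) and handles 16 floats (8 complex samples) per
; main-loop iteration.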
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0
    fld         dword r0m
%endif
    RET

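; void sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
;                    const float *g_filt, int m_max, intptr_t ixh)
;
; Approximate C reference (sbrdsp.c, for orientation):
;     for (m = 0; m < m_max; m++) {
;         Y[m][0] = X_high[m][ixh][0] * g_filt[m];
;         Y[m][1] = X_high[m][ixh][1] * g_filt[m];
;     }
; STEP below is the byte stride of one X_high row: 40 complex floats = 40*4*2.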
%define STEP  40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; elements 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
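; Approximate C reference (sbrdsp.c): a two-tap complex predictor, with the
; alpha coefficients pre-scaled by bw as in the comments below
; (alpha[0..1] = alpha1*bw*bw, alpha[2..3] = alpha0*bw):
;     for (i = start; i < end; i++) {
;         X_high[i][0] = X_low[i - 2][0] * alpha[0] -
;                        X_low[i - 2][1] * alpha[1] +
;                        X_low[i - 1][0] * alpha[2] -
;                        X_low[i - 1][1] * alpha[3] + X_low[i][0];
;         X_high[i][1] = X_low[i - 2][1] * alpha[0] +
;                        X_low[i - 2][0] * alpha[1] +
;                        X_low[i - 1][1] * alpha[2] +
;                        X_low[i - 1][0] * alpha[3] + X_low[i][1];
;     }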
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0
    mulps       m2, bw     ; (a1[0] a1[1])*bw
    mulps       m1, bw     ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw     ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end, the 6th and 7th args, are passed on the stack
    mov         r2d, Sm
    mov         r3d, Em
%define start r2q
%define end   r3q
%else
    ; BW is passed in an XMM register, so the named GPR args shift by one:
    ; BWq actually holds start and Sq holds end
%define start BWq
%define end   Sq
%endif
    sub         start, end          ; neg num of loops
    lea         X_highq, [X_highq + end*2*4]
    lea         X_lowq,  [X_lowq  + end*2*4 - 2*2*4]
    shl         start, 3            ; offset from num loops

    mova        m0, [X_lowq + start]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]        ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start + 16]       ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + start], m7
    add         start, 16
    jnz         .loop2
    RET

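; void sbr_sum64x5(float *z)
;
; Approximate C reference (sbrdsp.c): accumulate five consecutive 64-float
; blocks into the first one:
;     for (k = 0; k < 64; k++)
;         z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];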
cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq+ 256]
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq],    m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

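; void sbr_qmf_post_shuffle(float W[32][2], const float *z)
;
; Approximate C reference (sbrdsp.c): build 32 complex values from the two
; ends of the 64-float buffer z, negating the reversed half:
;     for (k = 0; k < 32; k++) {
;         W[k][0] = -z[63 - k];
;         W[k][1] =  z[k];
;     }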
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl          .loop
    REP_RET

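; void sbr_neg_odd_64(float *x)
;
; Approximate C reference (sbrdsp.c): flip the sign of the odd-indexed
; (imaginary) floats of 64 values by toggling their sign bit:
;     for (i = 1; i < 64; i += 2)
;         x[i] = -x[i];
; The SSE version below does this 16 floats at a time using ps_mask2.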
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
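;
; Approximate C reference (sbrdsp.c): butterfly deinterleave of two 64-float
; inputs into a 128-float output:
;     for (i = 0; i < 64; i++) {
;         v[      i] = src0[i] - src1[63 - i];
;         v[127 - i] = src0[i] + src1[63 - i];
;     }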
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

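; void sbr_qmf_pre_shuffle(float *z)
;
; Approximate C reference (sbrdsp.c): build the second 64-float half of z
; from the first, mirroring and negating one leg:
;     z[64] = z[0];
;     z[65] = z[1];
;     for (k = 1; k < 32; k++) {
;         z[64 + 2*k    ] = -z[64 - k];
;         z[64 + 2*k + 1] =  z[k + 1];
;     }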
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,7,z
%define OFFSET  (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m6, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]

    pxor        m2, m6
    pxor        m0, m6
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 5
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    mova        m2, [zq]
    movq        [r2q], m2
    REP_RET

%if WIN64
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%else
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%endif

%macro LOAD_NST 1
%if NREGS
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16

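; Shared loop for the four entry points above. Approximate scalar behaviour
; (sbrdsp.c, for orientation), with the (phi0, phi1) sign pair held in m0:
;     for (m = 0; m < m_max; m++) {
;         noise = (noise + 1) & 0x1ff;
;         if (s_m[m]) {
;             Y[m][0] += s_m[m] * phi0;
;             Y[m][1] += s_m[m] * phi1;
;         } else {
;             Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
;             Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
;         }
;     }
; The SIMD loop below adds both terms unconditionally and masks the noise
; contribution wherever s_m[m] != 0, which gives the same result because the
; s_m term is zero in the other branch.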
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
    dec         noiseq
    shl         count, 2
%if NREGS
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*count]
    add         s_mq, count
    add         q_filtq, count
    shl         noiseq, 3
    pxor        m5, m5
    neg         count
.loop:
    mova        m1, [q_filtq + count]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5 ; m6 = (s_m[m] == 0) mask
    pcmpeqd     m7, m4, m5 ; m7 = (s_m[m] == 0) mask
    mulps       m3, m0 ; s_m[m] * phi_sign
    mulps       m4, m0 ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*count]
    movu        m7, [Yq + 2*count + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*count], m6
    movu        [Yq + 2*count + mmsize], m7
    add         count, mmsize
    jl          .loop
    RET