annotate src/fftw-3.3.8/dft/simd/common/t2sv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:12 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include dft/simd/ts.h */
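/*
 * A gloss on the generator flags above, as understood from genfft's
 * conventions: -fma emits fused multiply/add operations; -simd targets the
 * vector macro layer in dft/simd; -twiddle-log3 stores only the twiddle
 * powers w^1, w^3, w^9, w^27 and reconstructs the rest by complex
 * multiplication; -precompute-twiddles hoists those reconstructions to the
 * top of the loop body; -n 32 selects the transform size. The remaining
 * flags (-compact, -variables, -pipeline-latency) are code-generation and
 * scheduling knobs.
 */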
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 488 FP additions, 350 FP multiplications,
Chris@82 32 * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
Chris@82 33 * 164 stack variables, 7 constants, and 128 memory accesses
Chris@82 34 */
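/*
 * Consistency check on the counts above: each fused multiply/add contributes
 * one addition and one multiplication, so 236 + 252 = 488 FP additions and
 * 98 + 252 = 350 FP multiplications, matching the quoted totals.
 */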
Chris@82 35 #include "dft/simd/ts.h"
Chris@82 36
Chris@82 37 static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 41 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
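/*
 * The constants above appear to be the usual size-32 twiddle values:
 * KP707106781 = cos(pi/4), KP414213562 = tan(pi/8), KP923879532 = cos(pi/8),
 * KP668178637 = tan(3 pi/16), KP198912367 = tan(pi/16),
 * KP831469612 = cos(3 pi/16), KP980785280 = cos(pi/16).
 */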
Chris@82 46 {
Chris@82 47 INT m;
Chris@82 48 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
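/*
 * Each pass of this loop computes 2*VL transforms in parallel. W carries
 * four precomputed complex twiddles per transform (w^1, w^3, w^9, w^27 in
 * the -twiddle-log3 layout), i.e. 8 reals, hence the mb * 8 starting offset
 * and the (2 * VL) * 8 advance per iteration.
 */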
Chris@82 49 V T2, T8, T3, T6, Te, Ti, T5, T7, TJ, Tb, TM, Tc, Ts, T23, T1w;
Chris@82 50 V T19, TA, TE, T1s, T1N, T1o, T1C, T1F, T1K, T15, T11, T2F, T31, T2J, T34;
Chris@82 51 V T3f, T3z, T3j, T3C, Tw, T3M, T3Q, T1z, T2s, T2w, T1d, T3n, T3r, T26, T2T;
Chris@82 52 V T2X, Th, TR, TP, Td, Tj, TW, Tn, TS, T1U, T2b, T29, T1R, T1V, T2g;
Chris@82 53 V T1Z, T2c;
Chris@82 54 {
Chris@82 55 V Tz, T1n, T10, TD, T1r, T14, T9, T1Q, Tv, T1c;
Chris@82 56 {
Chris@82 57 V T4, T18, Ta, Tr;
Chris@82 58 T2 = LDW(&(W[0]));
Chris@82 59 T8 = LDW(&(W[TWVL * 4]));
Chris@82 60 T3 = LDW(&(W[TWVL * 2]));
Chris@82 61 T6 = LDW(&(W[TWVL * 3]));
Chris@82 62 T4 = VMUL(T2, T3);
Chris@82 63 T18 = VMUL(T3, T8);
Chris@82 64 Ta = VMUL(T2, T6);
Chris@82 65 Tr = VMUL(T2, T8);
Chris@82 66 Te = LDW(&(W[TWVL * 6]));
Chris@82 67 Tz = VMUL(T3, Te);
Chris@82 68 T1n = VMUL(T8, Te);
Chris@82 69 T10 = VMUL(T2, Te);
Chris@82 70 Ti = LDW(&(W[TWVL * 7]));
Chris@82 71 TD = VMUL(T3, Ti);
Chris@82 72 T1r = VMUL(T8, Ti);
Chris@82 73 T14 = VMUL(T2, Ti);
Chris@82 74 T5 = LDW(&(W[TWVL * 1]));
Chris@82 75 T7 = VFMA(T5, T6, T4);
Chris@82 76 TJ = VFNMS(T5, T6, T4);
Chris@82 77 T9 = VMUL(T7, T8);
Chris@82 78 T1Q = VMUL(TJ, T8);
Chris@82 79 Tb = VFNMS(T5, T3, Ta);
Chris@82 80 TM = VFMA(T5, T3, Ta);
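/*
 * Given the w^1 = (T2,T5), w^3 = (T3,T6) layout (see the twinstr table at
 * the end of this branch), the values just computed are the first of the
 * -twiddle-log3 reconstructions: T7 + i*Tb = w^3 * conj(w^1) = w^2 and
 * TJ + i*TM = w^1 * w^3 = w^4. Higher powers are built the same way from
 * w^9 = (T8,Tc) and w^27 = (Te,Ti) below.
 */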
Chris@82 81 Tc = LDW(&(W[TWVL * 5]));
Chris@82 82 Tv = VMUL(T2, Tc);
Chris@82 83 T1c = VMUL(T3, Tc);
Chris@82 84 Ts = VFMA(T5, Tc, Tr);
Chris@82 85 T23 = VFMA(T6, Tc, T18);
Chris@82 86 T1w = VFNMS(T5, Tc, Tr);
Chris@82 87 T19 = VFNMS(T6, Tc, T18);
Chris@82 88 }
Chris@82 89 TA = VFMA(T6, Ti, Tz);
Chris@82 90 TE = VFNMS(T6, Te, TD);
Chris@82 91 T1s = VFNMS(Tc, Te, T1r);
Chris@82 92 T1N = VFMA(T6, Te, TD);
Chris@82 93 T1o = VFMA(Tc, Ti, T1n);
Chris@82 94 T1C = VFMA(T5, Ti, T10);
Chris@82 95 T1F = VFNMS(T5, Te, T14);
Chris@82 96 T1K = VFNMS(T6, Ti, Tz);
Chris@82 97 T15 = VFMA(T5, Te, T14);
Chris@82 98 T11 = VFNMS(T5, Ti, T10);
Chris@82 99 {
Chris@82 100 V T2E, T2I, T2S, T2W;
Chris@82 101 T2E = VMUL(T7, Te);
Chris@82 102 T2F = VFMA(Tb, Ti, T2E);
Chris@82 103 T31 = VFNMS(Tb, Ti, T2E);
Chris@82 104 T2I = VMUL(T7, Ti);
Chris@82 105 T2J = VFNMS(Tb, Te, T2I);
Chris@82 106 T34 = VFMA(Tb, Te, T2I);
Chris@82 107 {
Chris@82 108 V T3e, T3i, T3L, T3P;
Chris@82 109 T3e = VMUL(TJ, Te);
Chris@82 110 T3f = VFNMS(TM, Ti, T3e);
Chris@82 111 T3z = VFMA(TM, Ti, T3e);
Chris@82 112 T3i = VMUL(TJ, Ti);
Chris@82 113 T3j = VFMA(TM, Te, T3i);
Chris@82 114 T3C = VFNMS(TM, Te, T3i);
Chris@82 115 T3L = VMUL(Ts, Te);
Chris@82 116 T3P = VMUL(Ts, Ti);
Chris@82 117 Tw = VFNMS(T5, T8, Tv);
Chris@82 118 T3M = VFMA(Tw, Ti, T3L);
Chris@82 119 T3Q = VFNMS(Tw, Te, T3P);
Chris@82 120 }
Chris@82 121 {
Chris@82 122 V T2r, T2v, T3m, T3q;
Chris@82 123 T2r = VMUL(T1w, Te);
Chris@82 124 T2v = VMUL(T1w, Ti);
Chris@82 125 T1z = VFMA(T5, T8, Tv);
Chris@82 126 T2s = VFMA(T1z, Ti, T2r);
Chris@82 127 T2w = VFNMS(T1z, Te, T2v);
Chris@82 128 T3m = VMUL(T19, Te);
Chris@82 129 T3q = VMUL(T19, Ti);
Chris@82 130 T1d = VFMA(T6, T8, T1c);
Chris@82 131 T3n = VFMA(T1d, Ti, T3m);
Chris@82 132 T3r = VFNMS(T1d, Te, T3q);
Chris@82 133 }
Chris@82 134 T2S = VMUL(T23, Te);
Chris@82 135 T2W = VMUL(T23, Ti);
Chris@82 136 T26 = VFNMS(T6, T8, T1c);
Chris@82 137 T2T = VFMA(T26, Ti, T2S);
Chris@82 138 T2X = VFNMS(T26, Te, T2W);
Chris@82 139 {
Chris@82 140 V TQ, TV, Tf, Tm, Tg;
Chris@82 141 Tg = VMUL(T7, Tc);
Chris@82 142 Th = VFMA(Tb, T8, Tg);
Chris@82 143 TR = VFNMS(Tb, T8, Tg);
Chris@82 144 TP = VFMA(Tb, Tc, T9);
Chris@82 145 TQ = VMUL(TP, Te);
Chris@82 146 TV = VMUL(TP, Ti);
Chris@82 147 Td = VFNMS(Tb, Tc, T9);
Chris@82 148 Tf = VMUL(Td, Te);
Chris@82 149 Tm = VMUL(Td, Ti);
Chris@82 150 Tj = VFMA(Th, Ti, Tf);
Chris@82 151 TW = VFNMS(TR, Te, TV);
Chris@82 152 Tn = VFNMS(Th, Te, Tm);
Chris@82 153 TS = VFMA(TR, Ti, TQ);
Chris@82 154 }
Chris@82 155 {
Chris@82 156 V T2a, T2f, T1S, T1Y, T1T;
Chris@82 157 T1T = VMUL(TJ, Tc);
Chris@82 158 T1U = VFMA(TM, T8, T1T);
Chris@82 159 T2b = VFNMS(TM, T8, T1T);
Chris@82 160 T29 = VFMA(TM, Tc, T1Q);
Chris@82 161 T2a = VMUL(T29, Te);
Chris@82 162 T2f = VMUL(T29, Ti);
Chris@82 163 T1R = VFNMS(TM, Tc, T1Q);
Chris@82 164 T1S = VMUL(T1R, Te);
Chris@82 165 T1Y = VMUL(T1R, Ti);
Chris@82 166 T1V = VFMA(T1U, Ti, T1S);
Chris@82 167 T2g = VFNMS(T2b, Te, T2f);
Chris@82 168 T1Z = VFNMS(T1U, Te, T1Y);
Chris@82 169 T2c = VFMA(T2b, Ti, T2a);
Chris@82 170 }
Chris@82 171 }
Chris@82 172 }
Chris@82 173 {
Chris@82 174 V Tq, T46, T8H, T97, TH, T98, T4b, T8D, TZ, T7f, T4j, T6t, T1g, T7g, T4q;
Chris@82 175 V T6u, T1v, T1I, T7m, T7j, T7k, T7l, T4z, T6x, T4G, T6y, T22, T2j, T7o, T7p;
Chris@82 176 V T7q, T7r, T4O, T6A, T4V, T6B, T3G, T7L, T7I, T8n, T5E, T6P, T61, T6M, T2N;
Chris@82 177 V T7A, T7x, T8i, T55, T6I, T5s, T6F, T43, T7J, T7O, T8o, T5L, T62, T5S, T63;
Chris@82 178 V T3c, T7y, T7D, T8j, T5c, T5t, T5j, T5u;
Chris@82 179 {
Chris@82 180 V T1, T8G, Tk, Tl, To, T8E, Tp, T8F;
Chris@82 181 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@82 182 T8G = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 183 Tk = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@82 184 Tl = VMUL(Tj, Tk);
Chris@82 185 To = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@82 186 T8E = VMUL(Tj, To);
Chris@82 187 Tp = VFMA(Tn, To, Tl);
Chris@82 188 Tq = VADD(T1, Tp);
Chris@82 189 T46 = VSUB(T1, Tp);
Chris@82 190 T8F = VFNMS(Tn, Tk, T8E);
Chris@82 191 T8H = VADD(T8F, T8G);
Chris@82 192 T97 = VSUB(T8G, T8F);
Chris@82 193 }
Chris@82 194 {
Chris@82 195 V Tt, Tu, Tx, T47, TB, TC, TF, T49;
Chris@82 196 Tt = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@82 197 Tu = VMUL(Ts, Tt);
Chris@82 198 Tx = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@82 199 T47 = VMUL(Ts, Tx);
Chris@82 200 TB = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@82 201 TC = VMUL(TA, TB);
Chris@82 202 TF = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@82 203 T49 = VMUL(TA, TF);
Chris@82 204 {
Chris@82 205 V Ty, TG, T48, T4a;
Chris@82 206 Ty = VFMA(Tw, Tx, Tu);
Chris@82 207 TG = VFMA(TE, TF, TC);
Chris@82 208 TH = VADD(Ty, TG);
Chris@82 209 T98 = VSUB(Ty, TG);
Chris@82 210 T48 = VFNMS(Tw, Tt, T47);
Chris@82 211 T4a = VFNMS(TE, TB, T49);
Chris@82 212 T4b = VSUB(T48, T4a);
Chris@82 213 T8D = VADD(T48, T4a);
Chris@82 214 }
Chris@82 215 }
Chris@82 216 {
Chris@82 217 V TO, T4f, TY, T4h, T4d, T4i;
Chris@82 218 {
Chris@82 219 V TK, TL, TN, T4e;
Chris@82 220 TK = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@82 221 TL = VMUL(TJ, TK);
Chris@82 222 TN = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@82 223 T4e = VMUL(TJ, TN);
Chris@82 224 TO = VFMA(TM, TN, TL);
Chris@82 225 T4f = VFNMS(TM, TK, T4e);
Chris@82 226 }
Chris@82 227 {
Chris@82 228 V TT, TU, TX, T4g;
Chris@82 229 TT = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@82 230 TU = VMUL(TS, TT);
Chris@82 231 TX = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@82 232 T4g = VMUL(TS, TX);
Chris@82 233 TY = VFMA(TW, TX, TU);
Chris@82 234 T4h = VFNMS(TW, TT, T4g);
Chris@82 235 }
Chris@82 236 TZ = VADD(TO, TY);
Chris@82 237 T7f = VADD(T4f, T4h);
Chris@82 238 T4d = VSUB(TO, TY);
Chris@82 239 T4i = VSUB(T4f, T4h);
Chris@82 240 T4j = VADD(T4d, T4i);
Chris@82 241 T6t = VSUB(T4i, T4d);
Chris@82 242 }
Chris@82 243 {
Chris@82 244 V T17, T4m, T1f, T4o, T4k, T4p;
Chris@82 245 {
Chris@82 246 V T12, T13, T16, T4l;
Chris@82 247 T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@82 248 T13 = VMUL(T11, T12);
Chris@82 249 T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@82 250 T4l = VMUL(T11, T16);
Chris@82 251 T17 = VFMA(T15, T16, T13);
Chris@82 252 T4m = VFNMS(T15, T12, T4l);
Chris@82 253 }
Chris@82 254 {
Chris@82 255 V T1a, T1b, T1e, T4n;
Chris@82 256 T1a = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@82 257 T1b = VMUL(T19, T1a);
Chris@82 258 T1e = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@82 259 T4n = VMUL(T19, T1e);
Chris@82 260 T1f = VFMA(T1d, T1e, T1b);
Chris@82 261 T4o = VFNMS(T1d, T1a, T4n);
Chris@82 262 }
Chris@82 263 T1g = VADD(T17, T1f);
Chris@82 264 T7g = VADD(T4m, T4o);
Chris@82 265 T4k = VSUB(T17, T1f);
Chris@82 266 T4p = VSUB(T4m, T4o);
Chris@82 267 T4q = VSUB(T4k, T4p);
Chris@82 268 T6u = VADD(T4k, T4p);
Chris@82 269 }
Chris@82 270 {
Chris@82 271 V T1m, T4u, T1H, T4E, T1u, T4w, T1B, T4C;
Chris@82 272 {
Chris@82 273 V T1j, T1k, T1l, T4t;
Chris@82 274 T1j = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@82 275 T1k = VMUL(T7, T1j);
Chris@82 276 T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 277 T4t = VMUL(T7, T1l);
Chris@82 278 T1m = VFMA(Tb, T1l, T1k);
Chris@82 279 T4u = VFNMS(Tb, T1j, T4t);
Chris@82 280 }
Chris@82 281 {
Chris@82 282 V T1D, T1E, T1G, T4D;
Chris@82 283 T1D = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@82 284 T1E = VMUL(T1C, T1D);
Chris@82 285 T1G = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@82 286 T4D = VMUL(T1C, T1G);
Chris@82 287 T1H = VFMA(T1F, T1G, T1E);
Chris@82 288 T4E = VFNMS(T1F, T1D, T4D);
Chris@82 289 }
Chris@82 290 {
Chris@82 291 V T1p, T1q, T1t, T4v;
Chris@82 292 T1p = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@82 293 T1q = VMUL(T1o, T1p);
Chris@82 294 T1t = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@82 295 T4v = VMUL(T1o, T1t);
Chris@82 296 T1u = VFMA(T1s, T1t, T1q);
Chris@82 297 T4w = VFNMS(T1s, T1p, T4v);
Chris@82 298 }
Chris@82 299 {
Chris@82 300 V T1x, T1y, T1A, T4B;
Chris@82 301 T1x = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@82 302 T1y = VMUL(T1w, T1x);
Chris@82 303 T1A = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@82 304 T4B = VMUL(T1w, T1A);
Chris@82 305 T1B = VFMA(T1z, T1A, T1y);
Chris@82 306 T4C = VFNMS(T1z, T1x, T4B);
Chris@82 307 }
Chris@82 308 T1v = VADD(T1m, T1u);
Chris@82 309 T1I = VADD(T1B, T1H);
Chris@82 310 T7m = VSUB(T1v, T1I);
Chris@82 311 T7j = VADD(T4u, T4w);
Chris@82 312 T7k = VADD(T4C, T4E);
Chris@82 313 T7l = VSUB(T7j, T7k);
Chris@82 314 {
Chris@82 315 V T4x, T4y, T4A, T4F;
Chris@82 316 T4x = VSUB(T4u, T4w);
Chris@82 317 T4y = VSUB(T1B, T1H);
Chris@82 318 T4z = VSUB(T4x, T4y);
Chris@82 319 T6x = VADD(T4x, T4y);
Chris@82 320 T4A = VSUB(T1m, T1u);
Chris@82 321 T4F = VSUB(T4C, T4E);
Chris@82 322 T4G = VADD(T4A, T4F);
Chris@82 323 T6y = VSUB(T4A, T4F);
Chris@82 324 }
Chris@82 325 }
Chris@82 326 {
Chris@82 327 V T1P, T4J, T2i, T4T, T21, T4L, T28, T4R;
Chris@82 328 {
Chris@82 329 V T1L, T1M, T1O, T4I;
Chris@82 330 T1L = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@82 331 T1M = VMUL(T1K, T1L);
Chris@82 332 T1O = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@82 333 T4I = VMUL(T1K, T1O);
Chris@82 334 T1P = VFMA(T1N, T1O, T1M);
Chris@82 335 T4J = VFNMS(T1N, T1L, T4I);
Chris@82 336 }
Chris@82 337 {
Chris@82 338 V T2d, T2e, T2h, T4S;
Chris@82 339 T2d = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@82 340 T2e = VMUL(T2c, T2d);
Chris@82 341 T2h = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@82 342 T4S = VMUL(T2c, T2h);
Chris@82 343 T2i = VFMA(T2g, T2h, T2e);
Chris@82 344 T4T = VFNMS(T2g, T2d, T4S);
Chris@82 345 }
Chris@82 346 {
Chris@82 347 V T1W, T1X, T20, T4K;
Chris@82 348 T1W = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@82 349 T1X = VMUL(T1V, T1W);
Chris@82 350 T20 = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@82 351 T4K = VMUL(T1V, T20);
Chris@82 352 T21 = VFMA(T1Z, T20, T1X);
Chris@82 353 T4L = VFNMS(T1Z, T1W, T4K);
Chris@82 354 }
Chris@82 355 {
Chris@82 356 V T24, T25, T27, T4Q;
Chris@82 357 T24 = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@82 358 T25 = VMUL(T23, T24);
Chris@82 359 T27 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@82 360 T4Q = VMUL(T23, T27);
Chris@82 361 T28 = VFMA(T26, T27, T25);
Chris@82 362 T4R = VFNMS(T26, T24, T4Q);
Chris@82 363 }
Chris@82 364 T22 = VADD(T1P, T21);
Chris@82 365 T2j = VADD(T28, T2i);
Chris@82 366 T7o = VSUB(T22, T2j);
Chris@82 367 T7p = VADD(T4J, T4L);
Chris@82 368 T7q = VADD(T4R, T4T);
Chris@82 369 T7r = VSUB(T7p, T7q);
Chris@82 370 {
Chris@82 371 V T4M, T4N, T4P, T4U;
Chris@82 372 T4M = VSUB(T4J, T4L);
Chris@82 373 T4N = VSUB(T28, T2i);
Chris@82 374 T4O = VSUB(T4M, T4N);
Chris@82 375 T6A = VADD(T4M, T4N);
Chris@82 376 T4P = VSUB(T1P, T21);
Chris@82 377 T4U = VSUB(T4R, T4T);
Chris@82 378 T4V = VADD(T4P, T4U);
Chris@82 379 T6B = VSUB(T4P, T4U);
Chris@82 380 }
Chris@82 381 }
Chris@82 382 {
Chris@82 383 V T3l, T5z, T3E, T5Z, T3t, T5B, T3y, T5X;
Chris@82 384 {
Chris@82 385 V T3g, T3h, T3k, T5y;
Chris@82 386 T3g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@82 387 T3h = VMUL(T3f, T3g);
Chris@82 388 T3k = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@82 389 T5y = VMUL(T3f, T3k);
Chris@82 390 T3l = VFMA(T3j, T3k, T3h);
Chris@82 391 T5z = VFNMS(T3j, T3g, T5y);
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V T3A, T3B, T3D, T5Y;
Chris@82 395 T3A = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@82 396 T3B = VMUL(T3z, T3A);
Chris@82 397 T3D = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@82 398 T5Y = VMUL(T3z, T3D);
Chris@82 399 T3E = VFMA(T3C, T3D, T3B);
Chris@82 400 T5Z = VFNMS(T3C, T3A, T5Y);
Chris@82 401 }
Chris@82 402 {
Chris@82 403 V T3o, T3p, T3s, T5A;
Chris@82 404 T3o = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@82 405 T3p = VMUL(T3n, T3o);
Chris@82 406 T3s = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@82 407 T5A = VMUL(T3n, T3s);
Chris@82 408 T3t = VFMA(T3r, T3s, T3p);
Chris@82 409 T5B = VFNMS(T3r, T3o, T5A);
Chris@82 410 }
Chris@82 411 {
Chris@82 412 V T3v, T3w, T3x, T5W;
Chris@82 413 T3v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@82 414 T3w = VMUL(TP, T3v);
Chris@82 415 T3x = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@82 416 T5W = VMUL(TP, T3x);
Chris@82 417 T3y = VFMA(TR, T3x, T3w);
Chris@82 418 T5X = VFNMS(TR, T3v, T5W);
Chris@82 419 }
Chris@82 420 {
Chris@82 421 V T3u, T3F, T7G, T7H;
Chris@82 422 T3u = VADD(T3l, T3t);
Chris@82 423 T3F = VADD(T3y, T3E);
Chris@82 424 T3G = VADD(T3u, T3F);
Chris@82 425 T7L = VSUB(T3u, T3F);
Chris@82 426 T7G = VADD(T5z, T5B);
Chris@82 427 T7H = VADD(T5X, T5Z);
Chris@82 428 T7I = VSUB(T7G, T7H);
Chris@82 429 T8n = VADD(T7G, T7H);
Chris@82 430 }
Chris@82 431 {
Chris@82 432 V T5C, T5D, T5V, T60;
Chris@82 433 T5C = VSUB(T5z, T5B);
Chris@82 434 T5D = VSUB(T3y, T3E);
Chris@82 435 T5E = VSUB(T5C, T5D);
Chris@82 436 T6P = VADD(T5C, T5D);
Chris@82 437 T5V = VSUB(T3l, T3t);
Chris@82 438 T60 = VSUB(T5X, T5Z);
Chris@82 439 T61 = VADD(T5V, T60);
Chris@82 440 T6M = VSUB(T5V, T60);
Chris@82 441 }
Chris@82 442 }
Chris@82 443 {
Chris@82 444 V T2q, T50, T2L, T5q, T2y, T52, T2D, T5o;
Chris@82 445 {
Chris@82 446 V T2n, T2o, T2p, T4Z;
Chris@82 447 T2n = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@82 448 T2o = VMUL(T2, T2n);
Chris@82 449 T2p = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 450 T4Z = VMUL(T2, T2p);
Chris@82 451 T2q = VFMA(T5, T2p, T2o);
Chris@82 452 T50 = VFNMS(T5, T2n, T4Z);
Chris@82 453 }
Chris@82 454 {
Chris@82 455 V T2G, T2H, T2K, T5p;
Chris@82 456 T2G = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@82 457 T2H = VMUL(T2F, T2G);
Chris@82 458 T2K = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@82 459 T5p = VMUL(T2F, T2K);
Chris@82 460 T2L = VFMA(T2J, T2K, T2H);
Chris@82 461 T5q = VFNMS(T2J, T2G, T5p);
Chris@82 462 }
Chris@82 463 {
Chris@82 464 V T2t, T2u, T2x, T51;
Chris@82 465 T2t = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@82 466 T2u = VMUL(T2s, T2t);
Chris@82 467 T2x = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@82 468 T51 = VMUL(T2s, T2x);
Chris@82 469 T2y = VFMA(T2w, T2x, T2u);
Chris@82 470 T52 = VFNMS(T2w, T2t, T51);
Chris@82 471 }
Chris@82 472 {
Chris@82 473 V T2A, T2B, T2C, T5n;
Chris@82 474 T2A = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@82 475 T2B = VMUL(T8, T2A);
Chris@82 476 T2C = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@82 477 T5n = VMUL(T8, T2C);
Chris@82 478 T2D = VFMA(Tc, T2C, T2B);
Chris@82 479 T5o = VFNMS(Tc, T2A, T5n);
Chris@82 480 }
Chris@82 481 {
Chris@82 482 V T2z, T2M, T7v, T7w;
Chris@82 483 T2z = VADD(T2q, T2y);
Chris@82 484 T2M = VADD(T2D, T2L);
Chris@82 485 T2N = VADD(T2z, T2M);
Chris@82 486 T7A = VSUB(T2z, T2M);
Chris@82 487 T7v = VADD(T50, T52);
Chris@82 488 T7w = VADD(T5o, T5q);
Chris@82 489 T7x = VSUB(T7v, T7w);
Chris@82 490 T8i = VADD(T7v, T7w);
Chris@82 491 }
Chris@82 492 {
Chris@82 493 V T53, T54, T5m, T5r;
Chris@82 494 T53 = VSUB(T50, T52);
Chris@82 495 T54 = VSUB(T2D, T2L);
Chris@82 496 T55 = VSUB(T53, T54);
Chris@82 497 T6I = VADD(T53, T54);
Chris@82 498 T5m = VSUB(T2q, T2y);
Chris@82 499 T5r = VSUB(T5o, T5q);
Chris@82 500 T5s = VADD(T5m, T5r);
Chris@82 501 T6F = VSUB(T5m, T5r);
Chris@82 502 }
Chris@82 503 }
Chris@82 504 {
Chris@82 505 V T3K, T5G, T41, T5Q, T3S, T5I, T3X, T5O;
Chris@82 506 {
Chris@82 507 V T3H, T3I, T3J, T5F;
Chris@82 508 T3H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@82 509 T3I = VMUL(T3, T3H);
Chris@82 510 T3J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 511 T5F = VMUL(T3, T3J);
Chris@82 512 T3K = VFMA(T6, T3J, T3I);
Chris@82 513 T5G = VFNMS(T6, T3H, T5F);
Chris@82 514 }
Chris@82 515 {
Chris@82 516 V T3Y, T3Z, T40, T5P;
Chris@82 517 T3Y = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@82 518 T3Z = VMUL(Td, T3Y);
Chris@82 519 T40 = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@82 520 T5P = VMUL(Td, T40);
Chris@82 521 T41 = VFMA(Th, T40, T3Z);
Chris@82 522 T5Q = VFNMS(Th, T3Y, T5P);
Chris@82 523 }
Chris@82 524 {
Chris@82 525 V T3N, T3O, T3R, T5H;
Chris@82 526 T3N = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@82 527 T3O = VMUL(T3M, T3N);
Chris@82 528 T3R = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@82 529 T5H = VMUL(T3M, T3R);
Chris@82 530 T3S = VFMA(T3Q, T3R, T3O);
Chris@82 531 T5I = VFNMS(T3Q, T3N, T5H);
Chris@82 532 }
Chris@82 533 {
Chris@82 534 V T3U, T3V, T3W, T5N;
Chris@82 535 T3U = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@82 536 T3V = VMUL(Te, T3U);
Chris@82 537 T3W = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@82 538 T5N = VMUL(Te, T3W);
Chris@82 539 T3X = VFMA(Ti, T3W, T3V);
Chris@82 540 T5O = VFNMS(Ti, T3U, T5N);
Chris@82 541 }
Chris@82 542 {
Chris@82 543 V T3T, T42, T7M, T7N;
Chris@82 544 T3T = VADD(T3K, T3S);
Chris@82 545 T42 = VADD(T3X, T41);
Chris@82 546 T43 = VADD(T3T, T42);
Chris@82 547 T7J = VSUB(T42, T3T);
Chris@82 548 T7M = VADD(T5G, T5I);
Chris@82 549 T7N = VADD(T5O, T5Q);
Chris@82 550 T7O = VSUB(T7M, T7N);
Chris@82 551 T8o = VADD(T7M, T7N);
Chris@82 552 }
Chris@82 553 {
Chris@82 554 V T5J, T5K, T5M, T5R;
Chris@82 555 T5J = VSUB(T5G, T5I);
Chris@82 556 T5K = VSUB(T3K, T3S);
Chris@82 557 T5L = VSUB(T5J, T5K);
Chris@82 558 T62 = VADD(T5K, T5J);
Chris@82 559 T5M = VSUB(T3X, T41);
Chris@82 560 T5R = VSUB(T5O, T5Q);
Chris@82 561 T5S = VADD(T5M, T5R);
Chris@82 562 T63 = VSUB(T5M, T5R);
Chris@82 563 }
Chris@82 564 }
Chris@82 565 {
Chris@82 566 V T2R, T57, T3a, T5h, T2Z, T59, T36, T5f;
Chris@82 567 {
Chris@82 568 V T2O, T2P, T2Q, T56;
Chris@82 569 T2O = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@82 570 T2P = VMUL(T29, T2O);
Chris@82 571 T2Q = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@82 572 T56 = VMUL(T29, T2Q);
Chris@82 573 T2R = VFMA(T2b, T2Q, T2P);
Chris@82 574 T57 = VFNMS(T2b, T2O, T56);
Chris@82 575 }
Chris@82 576 {
Chris@82 577 V T37, T38, T39, T5g;
Chris@82 578 T37 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@82 579 T38 = VMUL(T1R, T37);
Chris@82 580 T39 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@82 581 T5g = VMUL(T1R, T39);
Chris@82 582 T3a = VFMA(T1U, T39, T38);
Chris@82 583 T5h = VFNMS(T1U, T37, T5g);
Chris@82 584 }
Chris@82 585 {
Chris@82 586 V T2U, T2V, T2Y, T58;
Chris@82 587 T2U = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@82 588 T2V = VMUL(T2T, T2U);
Chris@82 589 T2Y = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@82 590 T58 = VMUL(T2T, T2Y);
Chris@82 591 T2Z = VFMA(T2X, T2Y, T2V);
Chris@82 592 T59 = VFNMS(T2X, T2U, T58);
Chris@82 593 }
Chris@82 594 {
Chris@82 595 V T32, T33, T35, T5e;
Chris@82 596 T32 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@82 597 T33 = VMUL(T31, T32);
Chris@82 598 T35 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@82 599 T5e = VMUL(T31, T35);
Chris@82 600 T36 = VFMA(T34, T35, T33);
Chris@82 601 T5f = VFNMS(T34, T32, T5e);
Chris@82 602 }
Chris@82 603 {
Chris@82 604 V T30, T3b, T7B, T7C;
Chris@82 605 T30 = VADD(T2R, T2Z);
Chris@82 606 T3b = VADD(T36, T3a);
Chris@82 607 T3c = VADD(T30, T3b);
Chris@82 608 T7y = VSUB(T3b, T30);
Chris@82 609 T7B = VADD(T57, T59);
Chris@82 610 T7C = VADD(T5f, T5h);
Chris@82 611 T7D = VSUB(T7B, T7C);
Chris@82 612 T8j = VADD(T7B, T7C);
Chris@82 613 }
Chris@82 614 {
Chris@82 615 V T5a, T5b, T5d, T5i;
Chris@82 616 T5a = VSUB(T57, T59);
Chris@82 617 T5b = VSUB(T2R, T2Z);
Chris@82 618 T5c = VSUB(T5a, T5b);
Chris@82 619 T5t = VADD(T5b, T5a);
Chris@82 620 T5d = VSUB(T36, T3a);
Chris@82 621 T5i = VSUB(T5f, T5h);
Chris@82 622 T5j = VADD(T5d, T5i);
Chris@82 623 T5u = VSUB(T5d, T5i);
Chris@82 624 }
Chris@82 625 }
Chris@82 626 {
Chris@82 627 V T1i, T8c, T8z, T8A, T8J, T8O, T2l, T8N, T45, T8L, T8l, T8t, T8q, T8u, T8f;
Chris@82 628 V T8B;
Chris@82 629 {
Chris@82 630 V TI, T1h, T8x, T8y;
Chris@82 631 TI = VADD(Tq, TH);
Chris@82 632 T1h = VADD(TZ, T1g);
Chris@82 633 T1i = VADD(TI, T1h);
Chris@82 634 T8c = VSUB(TI, T1h);
Chris@82 635 T8x = VADD(T8i, T8j);
Chris@82 636 T8y = VADD(T8n, T8o);
Chris@82 637 T8z = VSUB(T8x, T8y);
Chris@82 638 T8A = VADD(T8x, T8y);
Chris@82 639 }
Chris@82 640 {
Chris@82 641 V T8C, T8I, T1J, T2k;
Chris@82 642 T8C = VADD(T7f, T7g);
Chris@82 643 T8I = VADD(T8D, T8H);
Chris@82 644 T8J = VADD(T8C, T8I);
Chris@82 645 T8O = VSUB(T8I, T8C);
Chris@82 646 T1J = VADD(T1v, T1I);
Chris@82 647 T2k = VADD(T22, T2j);
Chris@82 648 T2l = VADD(T1J, T2k);
Chris@82 649 T8N = VSUB(T2k, T1J);
Chris@82 650 }
Chris@82 651 {
Chris@82 652 V T3d, T44, T8h, T8k;
Chris@82 653 T3d = VADD(T2N, T3c);
Chris@82 654 T44 = VADD(T3G, T43);
Chris@82 655 T45 = VADD(T3d, T44);
Chris@82 656 T8L = VSUB(T44, T3d);
Chris@82 657 T8h = VSUB(T2N, T3c);
Chris@82 658 T8k = VSUB(T8i, T8j);
Chris@82 659 T8l = VADD(T8h, T8k);
Chris@82 660 T8t = VSUB(T8k, T8h);
Chris@82 661 }
Chris@82 662 {
Chris@82 663 V T8m, T8p, T8d, T8e;
Chris@82 664 T8m = VSUB(T3G, T43);
Chris@82 665 T8p = VSUB(T8n, T8o);
Chris@82 666 T8q = VSUB(T8m, T8p);
Chris@82 667 T8u = VADD(T8m, T8p);
Chris@82 668 T8d = VADD(T7j, T7k);
Chris@82 669 T8e = VADD(T7p, T7q);
Chris@82 670 T8f = VSUB(T8d, T8e);
Chris@82 671 T8B = VADD(T8d, T8e);
Chris@82 672 }
Chris@82 673 {
Chris@82 674 V T2m, T8K, T8w, T8M;
Chris@82 675 T2m = VADD(T1i, T2l);
Chris@82 676 ST(&(ri[WS(rs, 16)]), VSUB(T2m, T45), ms, &(ri[0]));
Chris@82 677 ST(&(ri[0]), VADD(T2m, T45), ms, &(ri[0]));
Chris@82 678 T8K = VADD(T8B, T8J);
Chris@82 679 ST(&(ii[0]), VADD(T8A, T8K), ms, &(ii[0]));
Chris@82 680 ST(&(ii[WS(rs, 16)]), VSUB(T8K, T8A), ms, &(ii[0]));
Chris@82 681 T8w = VSUB(T1i, T2l);
Chris@82 682 ST(&(ri[WS(rs, 24)]), VSUB(T8w, T8z), ms, &(ri[0]));
Chris@82 683 ST(&(ri[WS(rs, 8)]), VADD(T8w, T8z), ms, &(ri[0]));
Chris@82 684 T8M = VSUB(T8J, T8B);
Chris@82 685 ST(&(ii[WS(rs, 8)]), VADD(T8L, T8M), ms, &(ii[0]));
Chris@82 686 ST(&(ii[WS(rs, 24)]), VSUB(T8M, T8L), ms, &(ii[0]));
Chris@82 687 }
Chris@82 688 {
Chris@82 689 V T8g, T8r, T8P, T8Q;
Chris@82 690 T8g = VADD(T8c, T8f);
Chris@82 691 T8r = VADD(T8l, T8q);
Chris@82 692 ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
Chris@82 693 ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
Chris@82 694 T8P = VADD(T8N, T8O);
Chris@82 695 T8Q = VADD(T8t, T8u);
Chris@82 696 ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
Chris@82 697 ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
Chris@82 698 }
Chris@82 699 {
Chris@82 700 V T8s, T8v, T8R, T8S;
Chris@82 701 T8s = VSUB(T8c, T8f);
Chris@82 702 T8v = VSUB(T8t, T8u);
Chris@82 703 ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
Chris@82 704 ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
Chris@82 705 T8R = VSUB(T8O, T8N);
Chris@82 706 T8S = VSUB(T8q, T8l);
Chris@82 707 ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
Chris@82 708 ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
Chris@82 709 }
Chris@82 710 }
Chris@82 711 {
Chris@82 712 V T7i, T7W, T86, T8a, T8V, T91, T7t, T8W, T7F, T7T, T7Z, T92, T83, T89, T7Q;
Chris@82 713 V T7U;
Chris@82 714 {
Chris@82 715 V T7e, T7h, T84, T85;
Chris@82 716 T7e = VSUB(Tq, TH);
Chris@82 717 T7h = VSUB(T7f, T7g);
Chris@82 718 T7i = VSUB(T7e, T7h);
Chris@82 719 T7W = VADD(T7e, T7h);
Chris@82 720 T84 = VADD(T7L, T7O);
Chris@82 721 T85 = VADD(T7I, T7J);
Chris@82 722 T86 = VFNMS(LDK(KP414213562), T85, T84);
Chris@82 723 T8a = VFMA(LDK(KP414213562), T84, T85);
Chris@82 724 }
Chris@82 725 {
Chris@82 726 V T8T, T8U, T7n, T7s;
Chris@82 727 T8T = VSUB(T1g, TZ);
Chris@82 728 T8U = VSUB(T8H, T8D);
Chris@82 729 T8V = VADD(T8T, T8U);
Chris@82 730 T91 = VSUB(T8U, T8T);
Chris@82 731 T7n = VSUB(T7l, T7m);
Chris@82 732 T7s = VADD(T7o, T7r);
Chris@82 733 T7t = VSUB(T7n, T7s);
Chris@82 734 T8W = VADD(T7n, T7s);
Chris@82 735 }
Chris@82 736 {
Chris@82 737 V T7z, T7E, T7X, T7Y;
Chris@82 738 T7z = VSUB(T7x, T7y);
Chris@82 739 T7E = VSUB(T7A, T7D);
Chris@82 740 T7F = VFMA(LDK(KP414213562), T7E, T7z);
Chris@82 741 T7T = VFNMS(LDK(KP414213562), T7z, T7E);
Chris@82 742 T7X = VADD(T7m, T7l);
Chris@82 743 T7Y = VSUB(T7o, T7r);
Chris@82 744 T7Z = VADD(T7X, T7Y);
Chris@82 745 T92 = VSUB(T7Y, T7X);
Chris@82 746 }
Chris@82 747 {
Chris@82 748 V T81, T82, T7K, T7P;
Chris@82 749 T81 = VADD(T7A, T7D);
Chris@82 750 T82 = VADD(T7x, T7y);
Chris@82 751 T83 = VFMA(LDK(KP414213562), T82, T81);
Chris@82 752 T89 = VFNMS(LDK(KP414213562), T81, T82);
Chris@82 753 T7K = VSUB(T7I, T7J);
Chris@82 754 T7P = VSUB(T7L, T7O);
Chris@82 755 T7Q = VFNMS(LDK(KP414213562), T7P, T7K);
Chris@82 756 T7U = VFMA(LDK(KP414213562), T7K, T7P);
Chris@82 757 }
Chris@82 758 {
Chris@82 759 V T7u, T7R, T93, T94;
Chris@82 760 T7u = VFMA(LDK(KP707106781), T7t, T7i);
Chris@82 761 T7R = VSUB(T7F, T7Q);
Chris@82 762 ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
Chris@82 763 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
Chris@82 764 T93 = VFMA(LDK(KP707106781), T92, T91);
Chris@82 765 T94 = VSUB(T7U, T7T);
Chris@82 766 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T94, T93), ms, &(ii[0]));
Chris@82 767 ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T94, T93), ms, &(ii[0]));
Chris@82 768 }
Chris@82 769 {
Chris@82 770 V T7S, T7V, T95, T96;
Chris@82 771 T7S = VFNMS(LDK(KP707106781), T7t, T7i);
Chris@82 772 T7V = VADD(T7T, T7U);
Chris@82 773 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
Chris@82 774 ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
Chris@82 775 T95 = VFNMS(LDK(KP707106781), T92, T91);
Chris@82 776 T96 = VADD(T7F, T7Q);
Chris@82 777 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T96, T95), ms, &(ii[0]));
Chris@82 778 ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T96, T95), ms, &(ii[0]));
Chris@82 779 }
Chris@82 780 {
Chris@82 781 V T80, T87, T8X, T8Y;
Chris@82 782 T80 = VFMA(LDK(KP707106781), T7Z, T7W);
Chris@82 783 T87 = VADD(T83, T86);
Chris@82 784 ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T87, T80), ms, &(ri[0]));
Chris@82 785 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T87, T80), ms, &(ri[0]));
Chris@82 786 T8X = VFMA(LDK(KP707106781), T8W, T8V);
Chris@82 787 T8Y = VADD(T89, T8a);
Chris@82 788 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
Chris@82 789 ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
Chris@82 790 }
Chris@82 791 {
Chris@82 792 V T88, T8b, T8Z, T90;
Chris@82 793 T88 = VFNMS(LDK(KP707106781), T7Z, T7W);
Chris@82 794 T8b = VSUB(T89, T8a);
Chris@82 795 ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
Chris@82 796 ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
Chris@82 797 T8Z = VFNMS(LDK(KP707106781), T8W, T8V);
Chris@82 798 T90 = VSUB(T86, T83);
Chris@82 799 ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
Chris@82 800 ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
Chris@82 801 }
Chris@82 802 }
Chris@82 803 {
Chris@82 804 V T4s, T6c, T4X, T9c, T9b, T9h, T6f, T9i, T66, T6q, T6a, T6m, T5x, T6p, T69;
Chris@82 805 V T6j;
Chris@82 806 {
Chris@82 807 V T4c, T4r, T6d, T6e;
Chris@82 808 T4c = VADD(T46, T4b);
Chris@82 809 T4r = VADD(T4j, T4q);
Chris@82 810 T4s = VFNMS(LDK(KP707106781), T4r, T4c);
Chris@82 811 T6c = VFMA(LDK(KP707106781), T4r, T4c);
Chris@82 812 {
Chris@82 813 V T4H, T4W, T99, T9a;
Chris@82 814 T4H = VFNMS(LDK(KP414213562), T4G, T4z);
Chris@82 815 T4W = VFMA(LDK(KP414213562), T4V, T4O);
Chris@82 816 T4X = VSUB(T4H, T4W);
Chris@82 817 T9c = VADD(T4H, T4W);
Chris@82 818 T99 = VSUB(T97, T98);
Chris@82 819 T9a = VADD(T6t, T6u);
Chris@82 820 T9b = VFMA(LDK(KP707106781), T9a, T99);
Chris@82 821 T9h = VFNMS(LDK(KP707106781), T9a, T99);
Chris@82 822 }
Chris@82 823 T6d = VFMA(LDK(KP414213562), T4z, T4G);
Chris@82 824 T6e = VFNMS(LDK(KP414213562), T4O, T4V);
Chris@82 825 T6f = VADD(T6d, T6e);
Chris@82 826 T9i = VSUB(T6e, T6d);
Chris@82 827 {
Chris@82 828 V T5U, T6l, T65, T6k, T5T, T64;
Chris@82 829 T5T = VADD(T5L, T5S);
Chris@82 830 T5U = VFNMS(LDK(KP707106781), T5T, T5E);
Chris@82 831 T6l = VFMA(LDK(KP707106781), T5T, T5E);
Chris@82 832 T64 = VADD(T62, T63);
Chris@82 833 T65 = VFNMS(LDK(KP707106781), T64, T61);
Chris@82 834 T6k = VFMA(LDK(KP707106781), T64, T61);
Chris@82 835 T66 = VFNMS(LDK(KP668178637), T65, T5U);
Chris@82 836 T6q = VFMA(LDK(KP198912367), T6k, T6l);
Chris@82 837 T6a = VFMA(LDK(KP668178637), T5U, T65);
Chris@82 838 T6m = VFNMS(LDK(KP198912367), T6l, T6k);
Chris@82 839 }
Chris@82 840 {
Chris@82 841 V T5l, T6i, T5w, T6h, T5k, T5v;
Chris@82 842 T5k = VADD(T5c, T5j);
Chris@82 843 T5l = VFNMS(LDK(KP707106781), T5k, T55);
Chris@82 844 T6i = VFMA(LDK(KP707106781), T5k, T55);
Chris@82 845 T5v = VADD(T5t, T5u);
Chris@82 846 T5w = VFNMS(LDK(KP707106781), T5v, T5s);
Chris@82 847 T6h = VFMA(LDK(KP707106781), T5v, T5s);
Chris@82 848 T5x = VFMA(LDK(KP668178637), T5w, T5l);
Chris@82 849 T6p = VFNMS(LDK(KP198912367), T6h, T6i);
Chris@82 850 T69 = VFNMS(LDK(KP668178637), T5l, T5w);
Chris@82 851 T6j = VFMA(LDK(KP198912367), T6i, T6h);
Chris@82 852 }
Chris@82 853 }
Chris@82 854 {
Chris@82 855 V T4Y, T67, T9j, T9k;
Chris@82 856 T4Y = VFMA(LDK(KP923879532), T4X, T4s);
Chris@82 857 T67 = VSUB(T5x, T66);
Chris@82 858 ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
Chris@82 859 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
Chris@82 860 T9j = VFMA(LDK(KP923879532), T9i, T9h);
Chris@82 861 T9k = VSUB(T6a, T69);
Chris@82 862 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
Chris@82 863 ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
Chris@82 864 }
Chris@82 865 {
Chris@82 866 V T68, T6b, T9l, T9m;
Chris@82 867 T68 = VFNMS(LDK(KP923879532), T4X, T4s);
Chris@82 868 T6b = VADD(T69, T6a);
Chris@82 869 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
Chris@82 870 ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
Chris@82 871 T9l = VFNMS(LDK(KP923879532), T9i, T9h);
Chris@82 872 T9m = VADD(T5x, T66);
Chris@82 873 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
Chris@82 874 ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
Chris@82 875 }
Chris@82 876 {
Chris@82 877 V T6g, T6n, T9d, T9e;
Chris@82 878 T6g = VFMA(LDK(KP923879532), T6f, T6c);
Chris@82 879 T6n = VADD(T6j, T6m);
Chris@82 880 ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
Chris@82 881 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
Chris@82 882 T9d = VFMA(LDK(KP923879532), T9c, T9b);
Chris@82 883 T9e = VADD(T6p, T6q);
Chris@82 884 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
Chris@82 885 ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
Chris@82 886 }
Chris@82 887 {
Chris@82 888 V T6o, T6r, T9f, T9g;
Chris@82 889 T6o = VFNMS(LDK(KP923879532), T6f, T6c);
Chris@82 890 T6r = VSUB(T6p, T6q);
Chris@82 891 ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
Chris@82 892 ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
Chris@82 893 T9f = VFNMS(LDK(KP923879532), T9c, T9b);
Chris@82 894 T9g = VSUB(T6m, T6j);
Chris@82 895 ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
Chris@82 896 ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
Chris@82 897 }
Chris@82 898 }
Chris@82 899 {
Chris@82 900 V T6w, T6Y, T6D, T9w, T9p, T9v, T71, T9q, T6S, T7c, T6W, T78, T6L, T7b, T6V;
Chris@82 901 V T75;
Chris@82 902 {
Chris@82 903 V T6s, T6v, T6Z, T70;
Chris@82 904 T6s = VSUB(T46, T4b);
Chris@82 905 T6v = VSUB(T6t, T6u);
Chris@82 906 T6w = VFMA(LDK(KP707106781), T6v, T6s);
Chris@82 907 T6Y = VFNMS(LDK(KP707106781), T6v, T6s);
Chris@82 908 {
Chris@82 909 V T6z, T6C, T9n, T9o;
Chris@82 910 T6z = VFMA(LDK(KP414213562), T6y, T6x);
Chris@82 911 T6C = VFNMS(LDK(KP414213562), T6B, T6A);
Chris@82 912 T6D = VSUB(T6z, T6C);
Chris@82 913 T9w = VADD(T6z, T6C);
Chris@82 914 T9n = VADD(T98, T97);
Chris@82 915 T9o = VSUB(T4q, T4j);
Chris@82 916 T9p = VFMA(LDK(KP707106781), T9o, T9n);
Chris@82 917 T9v = VFNMS(LDK(KP707106781), T9o, T9n);
Chris@82 918 }
Chris@82 919 T6Z = VFNMS(LDK(KP414213562), T6x, T6y);
Chris@82 920 T70 = VFMA(LDK(KP414213562), T6A, T6B);
Chris@82 921 T71 = VADD(T6Z, T70);
Chris@82 922 T9q = VSUB(T70, T6Z);
Chris@82 923 {
Chris@82 924 V T6O, T77, T6R, T76, T6N, T6Q;
Chris@82 925 T6N = VSUB(T5S, T5L);
Chris@82 926 T6O = VFNMS(LDK(KP707106781), T6N, T6M);
Chris@82 927 T77 = VFMA(LDK(KP707106781), T6N, T6M);
Chris@82 928 T6Q = VSUB(T62, T63);
Chris@82 929 T6R = VFNMS(LDK(KP707106781), T6Q, T6P);
Chris@82 930 T76 = VFMA(LDK(KP707106781), T6Q, T6P);
Chris@82 931 T6S = VFNMS(LDK(KP668178637), T6R, T6O);
Chris@82 932 T7c = VFMA(LDK(KP198912367), T76, T77);
Chris@82 933 T6W = VFMA(LDK(KP668178637), T6O, T6R);
Chris@82 934 T78 = VFNMS(LDK(KP198912367), T77, T76);
Chris@82 935 }
Chris@82 936 {
Chris@82 937 V T6H, T74, T6K, T73, T6G, T6J;
Chris@82 938 T6G = VSUB(T5j, T5c);
Chris@82 939 T6H = VFNMS(LDK(KP707106781), T6G, T6F);
Chris@82 940 T74 = VFMA(LDK(KP707106781), T6G, T6F);
Chris@82 941 T6J = VSUB(T5t, T5u);
Chris@82 942 T6K = VFNMS(LDK(KP707106781), T6J, T6I);
Chris@82 943 T73 = VFMA(LDK(KP707106781), T6J, T6I);
Chris@82 944 T6L = VFMA(LDK(KP668178637), T6K, T6H);
Chris@82 945 T7b = VFNMS(LDK(KP198912367), T73, T74);
Chris@82 946 T6V = VFNMS(LDK(KP668178637), T6H, T6K);
Chris@82 947 T75 = VFMA(LDK(KP198912367), T74, T73);
Chris@82 948 }
Chris@82 949 }
Chris@82 950 {
Chris@82 951 V T6E, T6T, T9r, T9s;
Chris@82 952 T6E = VFMA(LDK(KP923879532), T6D, T6w);
Chris@82 953 T6T = VADD(T6L, T6S);
Chris@82 954 ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
Chris@82 955 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
Chris@82 956 T9r = VFMA(LDK(KP923879532), T9q, T9p);
Chris@82 957 T9s = VADD(T6V, T6W);
Chris@82 958 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
Chris@82 959 ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
Chris@82 960 }
Chris@82 961 {
Chris@82 962 V T6U, T6X, T9t, T9u;
Chris@82 963 T6U = VFNMS(LDK(KP923879532), T6D, T6w);
Chris@82 964 T6X = VSUB(T6V, T6W);
Chris@82 965 ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
Chris@82 966 ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
Chris@82 967 T9t = VFNMS(LDK(KP923879532), T9q, T9p);
Chris@82 968 T9u = VSUB(T6S, T6L);
Chris@82 969 ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
Chris@82 970 ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
Chris@82 971 }
Chris@82 972 {
Chris@82 973 V T72, T79, T9x, T9y;
Chris@82 974 T72 = VFNMS(LDK(KP923879532), T71, T6Y);
Chris@82 975 T79 = VSUB(T75, T78);
Chris@82 976 ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
Chris@82 977 ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
Chris@82 978 T9x = VFNMS(LDK(KP923879532), T9w, T9v);
Chris@82 979 T9y = VSUB(T7c, T7b);
Chris@82 980 ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
Chris@82 981 ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
Chris@82 982 }
Chris@82 983 {
Chris@82 984 V T7a, T7d, T9z, T9A;
Chris@82 985 T7a = VFMA(LDK(KP923879532), T71, T6Y);
Chris@82 986 T7d = VADD(T7b, T7c);
Chris@82 987 ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
Chris@82 988 ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
Chris@82 989 T9z = VFMA(LDK(KP923879532), T9w, T9v);
Chris@82 990 T9A = VADD(T75, T78);
Chris@82 991 ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
Chris@82 992 ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
Chris@82 993 }
Chris@82 994 }
Chris@82 995 }
Chris@82 996 }
Chris@82 997 }
Chris@82 998 VLEAVE();
Chris@82 999 }
Chris@82 1000
Chris@82 1001 static const tw_instr twinstr[] = {
Chris@82 1002 VTW(0, 1),
Chris@82 1003 VTW(0, 3),
Chris@82 1004 VTW(0, 9),
Chris@82 1005 VTW(0, 27),
Chris@82 1006 {TW_NEXT, (2 * VL), 0}
Chris@82 1007 };
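/*
 * This table is the twiddle layout implied by -twiddle-log3: only w^1, w^3,
 * w^9 and w^27 (powers of 3) are stored per output index, and TW_NEXT
 * advances by 2*VL transforms to match the loop stride. Every other twiddle
 * power used by the butterflies is reconstructed from these four by complex
 * multiplication, e.g. w^2 = w^3 * conj(w^1) and w^4 = w^1 * w^3.
 */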
Chris@82 1008
Chris@82 1009 static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };
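/* The {236, 98, 252, 0} opcount field restates the addition, multiplication
   and fused multiply/add totals from the header comment above. */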
Chris@82 1010
Chris@82 1011 void XSIMD(codelet_t2sv_32) (planner *p) {
Chris@82 1012 X(kdft_dit_register) (p, t2sv_32, &desc);
Chris@82 1013 }
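/*
 * Registration hook: calling this once at start-up makes t2sv_32 and its
 * descriptor available to the planner as a candidate DIT twiddle step of
 * radix 32 (a sketch of the flow; the actual call site lives in the codelet
 * tables, which are not part of this file).
 */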
Chris@82 1014 #else
Chris@82 1015
Chris@82 1016 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include dft/simd/ts.h */
Chris@82 1017
Chris@82 1018 /*
Chris@82 1019 * This function contains 488 FP additions, 280 FP multiplications,
Chris@82 1020 * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
Chris@82 1021 * 158 stack variables, 7 constants, and 128 memory accesses
Chris@82 1022 */
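/*
 * As in the FMA variant, the totals reconcile: 376 + 112 = 488 additions
 * and 168 + 112 = 280 multiplications, each fused multiply/add being
 * counted once in both totals.
 */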
Chris@82 1023 #include "dft/simd/ts.h"
Chris@82 1024
Chris@82 1025 static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 1026 {
Chris@82 1027 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 1028 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 1029 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 1030 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 1031 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 1032 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 1033 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
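/*
 * In this branch the twiddle constants come as cos/sin pairs rather than
 * the tangent form used by the FMA variant: KP195090322 = sin(pi/16),
 * KP980785280 = cos(pi/16), KP555570233 = sin(3 pi/16),
 * KP831469612 = cos(3 pi/16), KP382683432 = sin(pi/8),
 * KP923879532 = cos(pi/8), KP707106781 = cos(pi/4).
 */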
Chris@82 1034 {
Chris@82 1035 INT m;
Chris@82 1036 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 1037 V T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
Chris@82 1038 V T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
Chris@82 1039 V Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
Chris@82 1040 V T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
Chris@82 1041 V T1S, T23;
Chris@82 1042 {
Chris@82 1043 V Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
Chris@82 1044 V T10;
Chris@82 1045 {
Chris@82 1046 V T4, Tc, T7, Tb;
Chris@82 1047 T2 = LDW(&(W[0]));
Chris@82 1048 T5 = LDW(&(W[TWVL * 1]));
Chris@82 1049 T3 = LDW(&(W[TWVL * 2]));
Chris@82 1050 T6 = LDW(&(W[TWVL * 3]));
Chris@82 1051 T4 = VMUL(T2, T3);
Chris@82 1052 Tc = VMUL(T5, T3);
Chris@82 1053 T7 = VMUL(T5, T6);
Chris@82 1054 Tb = VMUL(T2, T6);
Chris@82 1055 T8 = VADD(T4, T7);
Chris@82 1056 TM = VSUB(T4, T7);
Chris@82 1057 TO = VADD(Tb, Tc);
Chris@82 1058 Td = VSUB(Tb, Tc);
Chris@82 1059 T9 = LDW(&(W[TWVL * 4]));
Chris@82 1060 Ts = VMUL(T2, T9);
Chris@82 1061 T1d = VMUL(T6, T9);
Chris@82 1062 Tx = VMUL(T5, T9);
Chris@82 1063 T18 = VMUL(T3, T9);
Chris@82 1064 Te = LDW(&(W[TWVL * 5]));
Chris@82 1065 Tt = VMUL(T5, Te);
Chris@82 1066 T1c = VMUL(T3, Te);
Chris@82 1067 Tw = VMUL(T2, Te);
Chris@82 1068 T19 = VMUL(T6, Te);
Chris@82 1069 Th = LDW(&(W[TWVL * 6]));
Chris@82 1070 TB = VMUL(T3, Th);
Chris@82 1071 T14 = VMUL(T5, Th);
Chris@82 1072 TG = VMUL(T6, Th);
Chris@82 1073 TZ = VMUL(T2, Th);
Chris@82 1074 Tl = LDW(&(W[TWVL * 7]));
Chris@82 1075 TC = VMUL(T6, Tl);
Chris@82 1076 T13 = VMUL(T2, Tl);
Chris@82 1077 TF = VMUL(T3, Tl);
Chris@82 1078 T10 = VMUL(T5, Tl);
Chris@82 1079 }
Chris@82 1080 TD = VADD(TB, TC);
Chris@82 1081 TH = VSUB(TF, TG);
Chris@82 1082 T1y = VADD(TZ, T10);
Chris@82 1083 T1H = VADD(TF, TG);
Chris@82 1084 T15 = VADD(T13, T14);
Chris@82 1085 T1A = VSUB(T13, T14);
Chris@82 1086 T11 = VSUB(TZ, T10);
Chris@82 1087 T1F = VSUB(TB, TC);
Chris@82 1088 T1n = VFMA(T9, Th, VMUL(Te, Tl));
Chris@82 1089 T1p = VFNMS(Te, Th, VMUL(T9, Tl));
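/*
 * Without -fma, complex products are expanded into a VMUL plus a VFMA/VFNMS
 * pair. Here, for example, T1n + i*T1p = w^27 * conj(w^9) = w^18, which
 * feeds the ri[WS(rs, 18)] / ii[WS(rs, 18)] butterfly further down.
 */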
Chris@82 1090 {
Chris@82 1091 V T2o, T2p, T2s, T2t;
Chris@82 1092 T2o = VMUL(T8, Th);
Chris@82 1093 T2p = VMUL(Td, Tl);
Chris@82 1094 T2q = VADD(T2o, T2p);
Chris@82 1095 T2I = VSUB(T2o, T2p);
Chris@82 1096 T2s = VMUL(T8, Tl);
Chris@82 1097 T2t = VMUL(Td, Th);
Chris@82 1098 T2u = VSUB(T2s, T2t);
Chris@82 1099 T2K = VADD(T2s, T2t);
Chris@82 1100 }
Chris@82 1101 {
Chris@82 1102 V T2T, T2U, T2X, T2Y;
Chris@82 1103 T2T = VMUL(TM, Th);
Chris@82 1104 T2U = VMUL(TO, Tl);
Chris@82 1105 T2V = VSUB(T2T, T2U);
Chris@82 1106 T3b = VADD(T2T, T2U);
Chris@82 1107 T2X = VMUL(TM, Tl);
Chris@82 1108 T2Y = VMUL(TO, Th);
Chris@82 1109 T2Z = VADD(T2X, T2Y);
Chris@82 1110 T3d = VSUB(T2X, T2Y);
Chris@82 1111 Tu = VADD(Ts, Tt);
Chris@82 1112 Ty = VSUB(Tw, Tx);
Chris@82 1113 T3l = VFMA(Tu, Th, VMUL(Ty, Tl));
Chris@82 1114 T3n = VFNMS(Ty, Th, VMUL(Tu, Tl));
Chris@82 1115 }
Chris@82 1116 T1t = VSUB(Ts, Tt);
Chris@82 1117 T1v = VADD(Tw, Tx);
Chris@82 1118 T2f = VFMA(T1t, Th, VMUL(T1v, Tl));
Chris@82 1119 T2h = VFNMS(T1v, Th, VMUL(T1t, Tl));
Chris@82 1120 T1a = VSUB(T18, T19);
Chris@82 1121 T1e = VADD(T1c, T1d);
Chris@82 1122 T32 = VFMA(T1a, Th, VMUL(T1e, Tl));
Chris@82 1123 T34 = VFNMS(T1e, Th, VMUL(T1a, Tl));
Chris@82 1124 T1W = VADD(T18, T19);
Chris@82 1125 T1Y = VSUB(T1c, T1d);
Chris@82 1126 T2C = VFMA(T1W, Th, VMUL(T1Y, Tl));
Chris@82 1127 T2E = VFNMS(T1Y, Th, VMUL(T1W, Tl));
Chris@82 1128 {
Chris@82 1129 V Ta, Tf, Ti, Tj;
Chris@82 1130 Ta = VMUL(T8, T9);
Chris@82 1131 Tf = VMUL(Td, Te);
Chris@82 1132 Tg = VSUB(Ta, Tf);
Chris@82 1133 TR = VADD(Ta, Tf);
Chris@82 1134 Ti = VMUL(T8, Te);
Chris@82 1135 Tj = VMUL(Td, T9);
Chris@82 1136 Tk = VADD(Ti, Tj);
Chris@82 1137 TS = VSUB(Ti, Tj);
Chris@82 1138 }
Chris@82 1139 Tm = VFMA(Tg, Th, VMUL(Tk, Tl));
Chris@82 1140 TV = VFNMS(TS, Th, VMUL(TR, Tl));
Chris@82 1141 To = VFNMS(Tk, Th, VMUL(Tg, Tl));
Chris@82 1142 TT = VFMA(TR, Th, VMUL(TS, Tl));
Chris@82 1143 {
Chris@82 1144 V T1K, T1L, T1N, T1O;
Chris@82 1145 T1K = VMUL(TM, T9);
Chris@82 1146 T1L = VMUL(TO, Te);
Chris@82 1147 T1M = VSUB(T1K, T1L);
Chris@82 1148 T21 = VADD(T1K, T1L);
Chris@82 1149 T1N = VMUL(TM, Te);
Chris@82 1150 T1O = VMUL(TO, T9);
Chris@82 1151 T1P = VADD(T1N, T1O);
Chris@82 1152 T22 = VSUB(T1N, T1O);
Chris@82 1153 }
Chris@82 1154 T1Q = VFMA(T1M, Th, VMUL(T1P, Tl));
Chris@82 1155 T25 = VFNMS(T22, Th, VMUL(T21, Tl));
Chris@82 1156 T1S = VFNMS(T1P, Th, VMUL(T1M, Tl));
Chris@82 1157 T23 = VFMA(T21, Th, VMUL(T22, Tl));
Chris@82 1158 }
Chris@82 1159 {
Chris@82 1160 V TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
Chris@82 1161 V T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
Chris@82 1162 V T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
Chris@82 1163 V T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
Chris@82 1164 V T4W, T5R, T55, T5O;
Chris@82 1165 {
Chris@82 1166 V T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
Chris@82 1167 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@82 1168 T7G = LD(&(ii[0]), ms, &(ii[0]));
Chris@82 1169 Tn = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@82 1170 Tp = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@82 1171 Tq = VFMA(Tm, Tn, VMUL(To, Tp));
Chris@82 1172 T7F = VFNMS(To, Tn, VMUL(Tm, Tp));
Chris@82 1173 {
Chris@82 1174 V Tv, Tz, TE, TI;
Chris@82 1175 Tv = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@82 1176 Tz = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@82 1177 TA = VFMA(Tu, Tv, VMUL(Ty, Tz));
Chris@82 1178 T3C = VFNMS(Ty, Tv, VMUL(Tu, Tz));
Chris@82 1179 TE = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@82 1180 TI = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@82 1181 TJ = VFMA(TD, TE, VMUL(TH, TI));
Chris@82 1182 T3D = VFNMS(TH, TE, VMUL(TD, TI));
Chris@82 1183 }
Chris@82 1184 {
Chris@82 1185 V Tr, TK, T8a, T8b;
Chris@82 1186 Tr = VADD(T1, Tq);
Chris@82 1187 TK = VADD(TA, TJ);
Chris@82 1188 TL = VADD(Tr, TK);
Chris@82 1189 T6f = VSUB(Tr, TK);
Chris@82 1190 T8a = VSUB(T7G, T7F);
Chris@82 1191 T8b = VSUB(TA, TJ);
Chris@82 1192 T8c = VSUB(T8a, T8b);
Chris@82 1193 T8q = VADD(T8b, T8a);
Chris@82 1194 }
Chris@82 1195 {
Chris@82 1196 V T3B, T3E, T7E, T7H;
Chris@82 1197 T3B = VSUB(T1, Tq);
Chris@82 1198 T3E = VSUB(T3C, T3D);
Chris@82 1199 T3F = VSUB(T3B, T3E);
Chris@82 1200 T5t = VADD(T3B, T3E);
Chris@82 1201 T7E = VADD(T3C, T3D);
Chris@82 1202 T7H = VADD(T7F, T7G);
Chris@82 1203 T7I = VADD(T7E, T7H);
Chris@82 1204 T7W = VSUB(T7H, T7E);
Chris@82 1205 }
Chris@82 1206 }
Chris@82 1207 {
Chris@82 1208 V T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
Chris@82 1209 {
Chris@82 1210 V T2c, T2d, T2r, T2v;
Chris@82 1211 T2c = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1212 T2d = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1213 T2e = VFMA(T2, T2c, VMUL(T5, T2d));
Chris@82 1214 T4g = VFNMS(T5, T2c, VMUL(T2, T2d));
Chris@82 1215 T2r = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1216 T2v = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1217 T2w = VFMA(T2q, T2r, VMUL(T2u, T2v));
Chris@82 1218 T4z = VFNMS(T2u, T2r, VMUL(T2q, T2v));
Chris@82 1219 }
Chris@82 1220 {
Chris@82 1221 V T2g, T2i, T2l, T2m;
Chris@82 1222 T2g = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1223 T2i = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1224 T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
Chris@82 1225 T4h = VFNMS(T2h, T2g, VMUL(T2f, T2i));
Chris@82 1226 T2l = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1227 T2m = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1228 T2n = VFMA(T9, T2l, VMUL(Te, T2m));
Chris@82 1229 T4y = VFNMS(Te, T2l, VMUL(T9, T2m));
Chris@82 1230 }
Chris@82 1231 {
Chris@82 1232 V T2k, T2x, T6w, T6x;
Chris@82 1233 T2k = VADD(T2e, T2j);
Chris@82 1234 T2x = VADD(T2n, T2w);
Chris@82 1235 T2y = VADD(T2k, T2x);
Chris@82 1236 T6B = VSUB(T2k, T2x);
Chris@82 1237 T6w = VADD(T4g, T4h);
Chris@82 1238 T6x = VADD(T4y, T4z);
Chris@82 1239 T6y = VSUB(T6w, T6x);
Chris@82 1240 T7j = VADD(T6w, T6x);
Chris@82 1241 }
Chris@82 1242 {
Chris@82 1243 V T4i, T4j, T4x, T4A;
Chris@82 1244 T4i = VSUB(T4g, T4h);
Chris@82 1245 T4j = VSUB(T2n, T2w);
Chris@82 1246 T4k = VADD(T4i, T4j);
Chris@82 1247 T5J = VSUB(T4i, T4j);
Chris@82 1248 T4x = VSUB(T2e, T2j);
Chris@82 1249 T4A = VSUB(T4y, T4z);
Chris@82 1250 T4B = VSUB(T4x, T4A);
Chris@82 1251 T5G = VADD(T4x, T4A);
Chris@82 1252 }
Chris@82 1253 }
Chris@82 1254 {
Chris@82 1255 V T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
Chris@82 1256 {
Chris@82 1257 V T2W, T30, T3c, T3e;
Chris@82 1258 T2W = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1259 T30 = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1260 T31 = VFMA(T2V, T2W, VMUL(T2Z, T30));
Chris@82 1261 T4Y = VFNMS(T2Z, T2W, VMUL(T2V, T30));
Chris@82 1262 T3c = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1263 T3e = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1264 T3f = VFMA(T3b, T3c, VMUL(T3d, T3e));
Chris@82 1265 T4J = VFNMS(T3d, T3c, VMUL(T3b, T3e));
Chris@82 1266 }
Chris@82 1267 {
Chris@82 1268 V T33, T35, T38, T39;
Chris@82 1269 T33 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1270 T35 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1271 T36 = VFMA(T32, T33, VMUL(T34, T35));
Chris@82 1272 T4Z = VFNMS(T34, T33, VMUL(T32, T35));
Chris@82 1273 T38 = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1274 T39 = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1275 T3a = VFMA(TR, T38, VMUL(TS, T39));
Chris@82 1276 T4I = VFNMS(TS, T38, VMUL(TR, T39));
Chris@82 1277 }
Chris@82 1278 {
Chris@82 1279 V T37, T3g, T6M, T6N;
Chris@82 1280 T37 = VADD(T31, T36);
Chris@82 1281 T3g = VADD(T3a, T3f);
Chris@82 1282 T3h = VADD(T37, T3g);
Chris@82 1283 T6H = VSUB(T37, T3g);
Chris@82 1284 T6M = VADD(T4Y, T4Z);
Chris@82 1285 T6N = VADD(T4I, T4J);
Chris@82 1286 T6O = VSUB(T6M, T6N);
Chris@82 1287 T7o = VADD(T6M, T6N);
Chris@82 1288 }
Chris@82 1289 {
Chris@82 1290 V T4H, T4K, T50, T51;
Chris@82 1291 T4H = VSUB(T31, T36);
Chris@82 1292 T4K = VSUB(T4I, T4J);
Chris@82 1293 T4L = VSUB(T4H, T4K);
Chris@82 1294 T5N = VADD(T4H, T4K);
Chris@82 1295 T50 = VSUB(T4Y, T4Z);
Chris@82 1296 T51 = VSUB(T3a, T3f);
Chris@82 1297 T52 = VADD(T50, T51);
Chris@82 1298 T5Q = VSUB(T50, T51);
Chris@82 1299 }
Chris@82 1300 }
Chris@82 1301 {
Chris@82 1302 V TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
Chris@82 1303 {
Chris@82 1304 V TN, TP, T1b, T1f;
Chris@82 1305 TN = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@82 1306 TP = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@82 1307 TQ = VFMA(TM, TN, VMUL(TO, TP));
Chris@82 1308 T3G = VFNMS(TO, TN, VMUL(TM, TP));
Chris@82 1309 T1b = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@82 1310 T1f = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@82 1311 T1g = VFMA(T1a, T1b, VMUL(T1e, T1f));
Chris@82 1312 T3N = VFNMS(T1e, T1b, VMUL(T1a, T1f));
Chris@82 1313 }
Chris@82 1314 {
Chris@82 1315 V TU, TW, T12, T16;
Chris@82 1316 TU = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@82 1317 TW = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@82 1318 TX = VFMA(TT, TU, VMUL(TV, TW));
Chris@82 1319 T3H = VFNMS(TV, TU, VMUL(TT, TW));
Chris@82 1320 T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@82 1321 T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@82 1322 T17 = VFMA(T11, T12, VMUL(T15, T16));
Chris@82 1323 T3M = VFNMS(T15, T12, VMUL(T11, T16));
Chris@82 1324 }
Chris@82 1325 {
Chris@82 1326 V TY, T1h, T6g, T6h;
Chris@82 1327 TY = VADD(TQ, TX);
Chris@82 1328 T1h = VADD(T17, T1g);
Chris@82 1329 T1i = VADD(TY, T1h);
Chris@82 1330 T7V = VSUB(T1h, TY);
Chris@82 1331 T6g = VADD(T3G, T3H);
Chris@82 1332 T6h = VADD(T3M, T3N);
Chris@82 1333 T6i = VSUB(T6g, T6h);
Chris@82 1334 T7D = VADD(T6g, T6h);
Chris@82 1335 }
Chris@82 1336 {
Chris@82 1337 V T3I, T3J, T3L, T3O;
Chris@82 1338 T3I = VSUB(T3G, T3H);
Chris@82 1339 T3J = VSUB(TQ, TX);
Chris@82 1340 T3K = VSUB(T3I, T3J);
Chris@82 1341 T5u = VADD(T3J, T3I);
Chris@82 1342 T3L = VSUB(T17, T1g);
Chris@82 1343 T3O = VSUB(T3M, T3N);
Chris@82 1344 T3P = VADD(T3L, T3O);
Chris@82 1345 T5v = VSUB(T3L, T3O);
Chris@82 1346 }
Chris@82 1347 }
Chris@82 1348 {
Chris@82 1349 V T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
Chris@82 1350 {
Chris@82 1351 V T1k, T1l, T1z, T1B;
Chris@82 1352 T1k = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@82 1353 T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@82 1354 T1m = VFMA(T8, T1k, VMUL(Td, T1l));
Chris@82 1355 T3S = VFNMS(Td, T1k, VMUL(T8, T1l));
Chris@82 1356 T1z = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@82 1357 T1B = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@82 1358 T1C = VFMA(T1y, T1z, VMUL(T1A, T1B));
Chris@82 1359 T3Z = VFNMS(T1A, T1z, VMUL(T1y, T1B));
Chris@82 1360 }
Chris@82 1361 {
Chris@82 1362 V T1o, T1q, T1u, T1w;
Chris@82 1363 T1o = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@82 1364 T1q = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@82 1365 T1r = VFMA(T1n, T1o, VMUL(T1p, T1q));
Chris@82 1366 T3T = VFNMS(T1p, T1o, VMUL(T1n, T1q));
Chris@82 1367 T1u = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@82 1368 T1w = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@82 1369 T1x = VFMA(T1t, T1u, VMUL(T1v, T1w));
Chris@82 1370 T3Y = VFNMS(T1v, T1u, VMUL(T1t, T1w));
Chris@82 1371 }
Chris@82 1372 {
Chris@82 1373 V T1s, T1D, T6k, T6l;
Chris@82 1374 T1s = VADD(T1m, T1r);
Chris@82 1375 T1D = VADD(T1x, T1C);
Chris@82 1376 T1E = VADD(T1s, T1D);
Chris@82 1377 T6n = VSUB(T1s, T1D);
Chris@82 1378 T6k = VADD(T3S, T3T);
Chris@82 1379 T6l = VADD(T3Y, T3Z);
Chris@82 1380 T6m = VSUB(T6k, T6l);
Chris@82 1381 T7e = VADD(T6k, T6l);
Chris@82 1382 }
Chris@82 1383 {
Chris@82 1384 V T3U, T3V, T3X, T40;
Chris@82 1385 T3U = VSUB(T3S, T3T);
Chris@82 1386 T3V = VSUB(T1x, T1C);
Chris@82 1387 T3W = VADD(T3U, T3V);
Chris@82 1388 T5y = VSUB(T3U, T3V);
Chris@82 1389 T3X = VSUB(T1m, T1r);
Chris@82 1390 T40 = VSUB(T3Y, T3Z);
Chris@82 1391 T41 = VSUB(T3X, T40);
Chris@82 1392 T5z = VADD(T3X, T40);
Chris@82 1393 }
Chris@82 1394 }
Chris@82 1395 {
Chris@82 1396 V T1J, T43, T27, T4a, T1U, T44, T20, T49;
Chris@82 1397 {
Chris@82 1398 V T1G, T1I, T24, T26;
Chris@82 1399 T1G = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@82 1400 T1I = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@82 1401 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
Chris@82 1402 T43 = VFNMS(T1H, T1G, VMUL(T1F, T1I));
Chris@82 1403 T24 = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@82 1404 T26 = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@82 1405 T27 = VFMA(T23, T24, VMUL(T25, T26));
Chris@82 1406 T4a = VFNMS(T25, T24, VMUL(T23, T26));
Chris@82 1407 }
Chris@82 1408 {
Chris@82 1409 V T1R, T1T, T1X, T1Z;
Chris@82 1410 T1R = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@82 1411 T1T = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@82 1412 T1U = VFMA(T1Q, T1R, VMUL(T1S, T1T));
Chris@82 1413 T44 = VFNMS(T1S, T1R, VMUL(T1Q, T1T));
Chris@82 1414 T1X = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@82 1415 T1Z = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@82 1416 T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
Chris@82 1417 T49 = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
Chris@82 1418 }
Chris@82 1419 {
Chris@82 1420 V T1V, T28, T6q, T6r;
Chris@82 1421 T1V = VADD(T1J, T1U);
Chris@82 1422 T28 = VADD(T20, T27);
Chris@82 1423 T29 = VADD(T1V, T28);
Chris@82 1424 T6p = VSUB(T1V, T28);
Chris@82 1425 T6q = VADD(T43, T44);
Chris@82 1426 T6r = VADD(T49, T4a);
Chris@82 1427 T6s = VSUB(T6q, T6r);
Chris@82 1428 T7f = VADD(T6q, T6r);
Chris@82 1429 }
Chris@82 1430 {
Chris@82 1431 V T45, T46, T48, T4b;
Chris@82 1432 T45 = VSUB(T43, T44);
Chris@82 1433 T46 = VSUB(T20, T27);
Chris@82 1434 T47 = VADD(T45, T46);
Chris@82 1435 T5B = VSUB(T45, T46);
Chris@82 1436 T48 = VSUB(T1J, T1U);
Chris@82 1437 T4b = VSUB(T49, T4a);
Chris@82 1438 T4c = VSUB(T48, T4b);
Chris@82 1439 T5C = VADD(T48, T4b);
Chris@82 1440 }
Chris@82 1441 }
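          /* Inputs 5, 21, 29, 13; the KP707106781 factors apply the cos(pi/4) = 1/sqrt(2) rotation. */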
Chris@82 1442 {
Chris@82 1443 V T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
Chris@82 1444 {
Chris@82 1445 V T2z, T2A, T2D, T2F;
Chris@82 1446 T2z = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1447 T2A = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1448 T2B = VFMA(T21, T2z, VMUL(T22, T2A));
Chris@82 1449 T4r = VFNMS(T22, T2z, VMUL(T21, T2A));
Chris@82 1450 T2D = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1451 T2F = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1452 T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
Chris@82 1453 T4s = VFNMS(T2E, T2D, VMUL(T2C, T2F));
Chris@82 1454 }
Chris@82 1455 T4q = VSUB(T2B, T2G);
Chris@82 1456 T4t = VSUB(T4r, T4s);
Chris@82 1457 {
Chris@82 1458 V T2J, T2L, T2N, T2O;
Chris@82 1459 T2J = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1460 T2L = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1461 T2M = VFMA(T2I, T2J, VMUL(T2K, T2L));
Chris@82 1462 T4m = VFNMS(T2K, T2J, VMUL(T2I, T2L));
Chris@82 1463 T2N = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1464 T2O = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1465 T2P = VFMA(T1M, T2N, VMUL(T1P, T2O));
Chris@82 1466 T4n = VFNMS(T1P, T2N, VMUL(T1M, T2O));
Chris@82 1467 }
Chris@82 1468 T4l = VSUB(T2M, T2P);
Chris@82 1469 T4o = VSUB(T4m, T4n);
Chris@82 1470 {
Chris@82 1471 V T2H, T2Q, T6C, T6D;
Chris@82 1472 T2H = VADD(T2B, T2G);
Chris@82 1473 T2Q = VADD(T2M, T2P);
Chris@82 1474 T2R = VADD(T2H, T2Q);
Chris@82 1475 T6z = VSUB(T2Q, T2H);
Chris@82 1476 T6C = VADD(T4r, T4s);
Chris@82 1477 T6D = VADD(T4m, T4n);
Chris@82 1478 T6E = VSUB(T6C, T6D);
Chris@82 1479 T7k = VADD(T6C, T6D);
Chris@82 1480 }
Chris@82 1481 {
Chris@82 1482 V T4p, T4u, T4C, T4D;
Chris@82 1483 T4p = VSUB(T4l, T4o);
Chris@82 1484 T4u = VADD(T4q, T4t);
Chris@82 1485 T4v = VMUL(LDK(KP707106781), VSUB(T4p, T4u));
Chris@82 1486 T5H = VMUL(LDK(KP707106781), VADD(T4u, T4p));
Chris@82 1487 T4C = VSUB(T4t, T4q);
Chris@82 1488 T4D = VADD(T4l, T4o);
Chris@82 1489 T4E = VMUL(LDK(KP707106781), VSUB(T4C, T4D));
Chris@82 1490 T5K = VMUL(LDK(KP707106781), VADD(T4C, T4D));
Chris@82 1491 }
Chris@82 1492 }
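          /* Inputs 3, 19, 27, 11, mirroring the previous block. */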
Chris@82 1493 {
Chris@82 1494 V T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
Chris@82 1495 {
Chris@82 1496 V T3i, T3j, T3m, T3o;
Chris@82 1497 T3i = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1498 T3j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1499 T3k = VFMA(T3, T3i, VMUL(T6, T3j));
Chris@82 1500 T4M = VFNMS(T6, T3i, VMUL(T3, T3j));
Chris@82 1501 T3m = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1502 T3o = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1503 T3p = VFMA(T3l, T3m, VMUL(T3n, T3o));
Chris@82 1504 T4N = VFNMS(T3n, T3m, VMUL(T3l, T3o));
Chris@82 1505 }
Chris@82 1506 T4O = VSUB(T4M, T4N);
Chris@82 1507 T4P = VSUB(T3k, T3p);
Chris@82 1508 {
Chris@82 1509 V T3r, T3s, T3u, T3v;
Chris@82 1510 T3r = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1511 T3s = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1512 T3t = VFMA(Th, T3r, VMUL(Tl, T3s));
Chris@82 1513 T4S = VFNMS(Tl, T3r, VMUL(Th, T3s));
Chris@82 1514 T3u = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@82 1515 T3v = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@82 1516 T3w = VFMA(Tg, T3u, VMUL(Tk, T3v));
Chris@82 1517 T4T = VFNMS(Tk, T3u, VMUL(Tg, T3v));
Chris@82 1518 }
Chris@82 1519 T4R = VSUB(T3t, T3w);
Chris@82 1520 T4U = VSUB(T4S, T4T);
Chris@82 1521 {
Chris@82 1522 V T3q, T3x, T6I, T6J;
Chris@82 1523 T3q = VADD(T3k, T3p);
Chris@82 1524 T3x = VADD(T3t, T3w);
Chris@82 1525 T3y = VADD(T3q, T3x);
Chris@82 1526 T6P = VSUB(T3x, T3q);
Chris@82 1527 T6I = VADD(T4M, T4N);
Chris@82 1528 T6J = VADD(T4S, T4T);
Chris@82 1529 T6K = VSUB(T6I, T6J);
Chris@82 1530 T7p = VADD(T6I, T6J);
Chris@82 1531 }
Chris@82 1532 {
Chris@82 1533 V T4Q, T4V, T53, T54;
Chris@82 1534 T4Q = VSUB(T4O, T4P);
Chris@82 1535 T4V = VADD(T4R, T4U);
Chris@82 1536 T4W = VMUL(LDK(KP707106781), VSUB(T4Q, T4V));
Chris@82 1537 T5R = VMUL(LDK(KP707106781), VADD(T4Q, T4V));
Chris@82 1538 T53 = VSUB(T4R, T4U);
Chris@82 1539 T54 = VADD(T4P, T4O);
Chris@82 1540 T55 = VMUL(LDK(KP707106781), VSUB(T53, T54));
Chris@82 1541 T5O = VMUL(LDK(KP707106781), VADD(T54, T53));
Chris@82 1542 }
Chris@82 1543 }
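          /* Final butterflies and stores for outputs 0, 8, 16, 24, which need no further rotation. */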
Chris@82 1544 {
Chris@82 1545 V T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
Chris@82 1546 {
Chris@82 1547 V T1j, T2a, T7C, T7J;
Chris@82 1548 T1j = VADD(TL, T1i);
Chris@82 1549 T2a = VADD(T1E, T29);
Chris@82 1550 T2b = VADD(T1j, T2a);
Chris@82 1551 T7x = VSUB(T1j, T2a);
Chris@82 1552 T7C = VADD(T7e, T7f);
Chris@82 1553 T7J = VADD(T7D, T7I);
Chris@82 1554 T7K = VADD(T7C, T7J);
Chris@82 1555 T7M = VSUB(T7J, T7C);
Chris@82 1556 }
Chris@82 1557 {
Chris@82 1558 V T2S, T3z, T7y, T7z;
Chris@82 1559 T2S = VADD(T2y, T2R);
Chris@82 1560 T3z = VADD(T3h, T3y);
Chris@82 1561 T3A = VADD(T2S, T3z);
Chris@82 1562 T7L = VSUB(T3z, T2S);
Chris@82 1563 T7y = VADD(T7j, T7k);
Chris@82 1564 T7z = VADD(T7o, T7p);
Chris@82 1565 T7A = VSUB(T7y, T7z);
Chris@82 1566 T7B = VADD(T7y, T7z);
Chris@82 1567 }
Chris@82 1568 ST(&(ri[WS(rs, 16)]), VSUB(T2b, T3A), ms, &(ri[0]));
Chris@82 1569 ST(&(ii[WS(rs, 16)]), VSUB(T7K, T7B), ms, &(ii[0]));
Chris@82 1570 ST(&(ri[0]), VADD(T2b, T3A), ms, &(ri[0]));
Chris@82 1571 ST(&(ii[0]), VADD(T7B, T7K), ms, &(ii[0]));
Chris@82 1572 ST(&(ri[WS(rs, 24)]), VSUB(T7x, T7A), ms, &(ri[0]));
Chris@82 1573 ST(&(ii[WS(rs, 24)]), VSUB(T7M, T7L), ms, &(ii[0]));
Chris@82 1574 ST(&(ri[WS(rs, 8)]), VADD(T7x, T7A), ms, &(ri[0]));
Chris@82 1575 ST(&(ii[WS(rs, 8)]), VADD(T7L, T7M), ms, &(ii[0]));
Chris@82 1576 }
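          /* Outputs 4, 12, 20, 28; the KP707106781 (= 1/sqrt(2)) factor completes their pi/4 rotation. */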
Chris@82 1577 {
Chris@82 1578 V T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
Chris@82 1579 {
Chris@82 1580 V T7d, T7g, T7O, T7P;
Chris@82 1581 T7d = VSUB(TL, T1i);
Chris@82 1582 T7g = VSUB(T7e, T7f);
Chris@82 1583 T7h = VADD(T7d, T7g);
Chris@82 1584 T7t = VSUB(T7d, T7g);
Chris@82 1585 T7O = VSUB(T29, T1E);
Chris@82 1586 T7P = VSUB(T7I, T7D);
Chris@82 1587 T7Q = VADD(T7O, T7P);
Chris@82 1588 T7S = VSUB(T7P, T7O);
Chris@82 1589 }
Chris@82 1590 {
Chris@82 1591 V T7i, T7l, T7n, T7q;
Chris@82 1592 T7i = VSUB(T2y, T2R);
Chris@82 1593 T7l = VSUB(T7j, T7k);
Chris@82 1594 T7m = VADD(T7i, T7l);
Chris@82 1595 T7u = VSUB(T7l, T7i);
Chris@82 1596 T7n = VSUB(T3h, T3y);
Chris@82 1597 T7q = VSUB(T7o, T7p);
Chris@82 1598 T7r = VSUB(T7n, T7q);
Chris@82 1599 T7v = VADD(T7n, T7q);
Chris@82 1600 }
Chris@82 1601 {
Chris@82 1602 V T7s, T7N, T7w, T7R;
Chris@82 1603 T7s = VMUL(LDK(KP707106781), VADD(T7m, T7r));
Chris@82 1604 ST(&(ri[WS(rs, 20)]), VSUB(T7h, T7s), ms, &(ri[0]));
Chris@82 1605 ST(&(ri[WS(rs, 4)]), VADD(T7h, T7s), ms, &(ri[0]));
Chris@82 1606 T7N = VMUL(LDK(KP707106781), VADD(T7u, T7v));
Chris@82 1607 ST(&(ii[WS(rs, 4)]), VADD(T7N, T7Q), ms, &(ii[0]));
Chris@82 1608 ST(&(ii[WS(rs, 20)]), VSUB(T7Q, T7N), ms, &(ii[0]));
Chris@82 1609 T7w = VMUL(LDK(KP707106781), VSUB(T7u, T7v));
Chris@82 1610 ST(&(ri[WS(rs, 28)]), VSUB(T7t, T7w), ms, &(ri[0]));
Chris@82 1611 ST(&(ri[WS(rs, 12)]), VADD(T7t, T7w), ms, &(ri[0]));
Chris@82 1612 T7R = VMUL(LDK(KP707106781), VSUB(T7r, T7m));
Chris@82 1613 ST(&(ii[WS(rs, 12)]), VADD(T7R, T7S), ms, &(ii[0]));
Chris@82 1614 ST(&(ii[WS(rs, 28)]), VSUB(T7S, T7R), ms, &(ii[0]));
Chris@82 1615 }
Chris@82 1616 }
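          /* Outputs 2, 6, 10, 14, 18, 22, 26, 30: pi/8 rotations (KP923879532 = cos(pi/8), KP382683432 = sin(pi/8)). */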
Chris@82 1617 {
Chris@82 1618 V T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
Chris@82 1619 V T6V;
Chris@82 1620 {
Chris@82 1621 V T6o, T6t, T6A, T6F;
Chris@82 1622 T6j = VSUB(T6f, T6i);
Chris@82 1623 T7X = VADD(T7V, T7W);
Chris@82 1624 T83 = VSUB(T7W, T7V);
Chris@82 1625 T6X = VADD(T6f, T6i);
Chris@82 1626 T6o = VSUB(T6m, T6n);
Chris@82 1627 T6t = VADD(T6p, T6s);
Chris@82 1628 T6u = VMUL(LDK(KP707106781), VSUB(T6o, T6t));
Chris@82 1629 T7U = VMUL(LDK(KP707106781), VADD(T6o, T6t));
Chris@82 1630 {
Chris@82 1631 V T75, T76, T6Y, T6Z;
Chris@82 1632 T75 = VADD(T6H, T6K);
Chris@82 1633 T76 = VADD(T6O, T6P);
Chris@82 1634 T77 = VFNMS(LDK(KP382683432), T76, VMUL(LDK(KP923879532), T75));
Chris@82 1635 T7b = VFMA(LDK(KP923879532), T76, VMUL(LDK(KP382683432), T75));
Chris@82 1636 T6Y = VADD(T6n, T6m);
Chris@82 1637 T6Z = VSUB(T6p, T6s);
Chris@82 1638 T70 = VMUL(LDK(KP707106781), VADD(T6Y, T6Z));
Chris@82 1639 T82 = VMUL(LDK(KP707106781), VSUB(T6Z, T6Y));
Chris@82 1640 }
Chris@82 1641 T6A = VSUB(T6y, T6z);
Chris@82 1642 T6F = VSUB(T6B, T6E);
Chris@82 1643 T6G = VFMA(LDK(KP923879532), T6A, VMUL(LDK(KP382683432), T6F));
Chris@82 1644 T6U = VFNMS(LDK(KP923879532), T6F, VMUL(LDK(KP382683432), T6A));
Chris@82 1645 {
Chris@82 1646 V T72, T73, T6L, T6Q;
Chris@82 1647 T72 = VADD(T6y, T6z);
Chris@82 1648 T73 = VADD(T6B, T6E);
Chris@82 1649 T74 = VFMA(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T73));
Chris@82 1650 T7a = VFNMS(LDK(KP382683432), T73, VMUL(LDK(KP923879532), T72));
Chris@82 1651 T6L = VSUB(T6H, T6K);
Chris@82 1652 T6Q = VSUB(T6O, T6P);
Chris@82 1653 T6R = VFNMS(LDK(KP923879532), T6Q, VMUL(LDK(KP382683432), T6L));
Chris@82 1654 T6V = VFMA(LDK(KP382683432), T6Q, VMUL(LDK(KP923879532), T6L));
Chris@82 1655 }
Chris@82 1656 }
Chris@82 1657 {
Chris@82 1658 V T6v, T6S, T81, T84;
Chris@82 1659 T6v = VADD(T6j, T6u);
Chris@82 1660 T6S = VADD(T6G, T6R);
Chris@82 1661 ST(&(ri[WS(rs, 22)]), VSUB(T6v, T6S), ms, &(ri[0]));
Chris@82 1662 ST(&(ri[WS(rs, 6)]), VADD(T6v, T6S), ms, &(ri[0]));
Chris@82 1663 T81 = VADD(T6U, T6V);
Chris@82 1664 T84 = VADD(T82, T83);
Chris@82 1665 ST(&(ii[WS(rs, 6)]), VADD(T81, T84), ms, &(ii[0]));
Chris@82 1666 ST(&(ii[WS(rs, 22)]), VSUB(T84, T81), ms, &(ii[0]));
Chris@82 1667 }
Chris@82 1668 {
Chris@82 1669 V T6T, T6W, T85, T86;
Chris@82 1670 T6T = VSUB(T6j, T6u);
Chris@82 1671 T6W = VSUB(T6U, T6V);
Chris@82 1672 ST(&(ri[WS(rs, 30)]), VSUB(T6T, T6W), ms, &(ri[0]));
Chris@82 1673 ST(&(ri[WS(rs, 14)]), VADD(T6T, T6W), ms, &(ri[0]));
Chris@82 1674 T85 = VSUB(T6R, T6G);
Chris@82 1675 T86 = VSUB(T83, T82);
Chris@82 1676 ST(&(ii[WS(rs, 14)]), VADD(T85, T86), ms, &(ii[0]));
Chris@82 1677 ST(&(ii[WS(rs, 30)]), VSUB(T86, T85), ms, &(ii[0]));
Chris@82 1678 }
Chris@82 1679 {
Chris@82 1680 V T71, T78, T7T, T7Y;
Chris@82 1681 T71 = VADD(T6X, T70);
Chris@82 1682 T78 = VADD(T74, T77);
Chris@82 1683 ST(&(ri[WS(rs, 18)]), VSUB(T71, T78), ms, &(ri[0]));
Chris@82 1684 ST(&(ri[WS(rs, 2)]), VADD(T71, T78), ms, &(ri[0]));
Chris@82 1685 T7T = VADD(T7a, T7b);
Chris@82 1686 T7Y = VADD(T7U, T7X);
Chris@82 1687 ST(&(ii[WS(rs, 2)]), VADD(T7T, T7Y), ms, &(ii[0]));
Chris@82 1688 ST(&(ii[WS(rs, 18)]), VSUB(T7Y, T7T), ms, &(ii[0]));
Chris@82 1689 }
Chris@82 1690 {
Chris@82 1691 V T79, T7c, T7Z, T80;
Chris@82 1692 T79 = VSUB(T6X, T70);
Chris@82 1693 T7c = VSUB(T7a, T7b);
Chris@82 1694 ST(&(ri[WS(rs, 26)]), VSUB(T79, T7c), ms, &(ri[0]));
Chris@82 1695 ST(&(ri[WS(rs, 10)]), VADD(T79, T7c), ms, &(ri[0]));
Chris@82 1696 T7Z = VSUB(T77, T74);
Chris@82 1697 T80 = VSUB(T7X, T7U);
Chris@82 1698 ST(&(ii[WS(rs, 10)]), VADD(T7Z, T80), ms, &(ii[0]));
Chris@82 1699 ST(&(ii[WS(rs, 26)]), VSUB(T80, T7Z), ms, &(ii[0]));
Chris@82 1700 }
Chris@82 1701 }
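          /* Outputs 3, 7, 11, 15, 19, 23, 27, 31: pi/16 and 3pi/16 rotations (KP980785280/KP195090322 = cos/sin(pi/16), KP831469612/KP555570233 = cos/sin(3pi/16)). */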
Chris@82 1702 {
Chris@82 1703 V T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
Chris@82 1704 V T5b, T3Q, T8p;
Chris@82 1705 T3Q = VMUL(LDK(KP707106781), VSUB(T3K, T3P));
Chris@82 1706 T3R = VSUB(T3F, T3Q);
Chris@82 1707 T5d = VADD(T3F, T3Q);
Chris@82 1708 T8p = VMUL(LDK(KP707106781), VSUB(T5v, T5u));
Chris@82 1709 T8r = VADD(T8p, T8q);
Chris@82 1710 T8x = VSUB(T8q, T8p);
Chris@82 1711 {
Chris@82 1712 V T42, T4d, T5l, T5m;
Chris@82 1713 T42 = VFNMS(LDK(KP923879532), T41, VMUL(LDK(KP382683432), T3W));
Chris@82 1714 T4d = VFMA(LDK(KP382683432), T47, VMUL(LDK(KP923879532), T4c));
Chris@82 1715 T4e = VSUB(T42, T4d);
Chris@82 1716 T8o = VADD(T42, T4d);
Chris@82 1717 T5l = VADD(T4L, T4W);
Chris@82 1718 T5m = VADD(T52, T55);
Chris@82 1719 T5n = VFNMS(LDK(KP555570233), T5m, VMUL(LDK(KP831469612), T5l));
Chris@82 1720 T5r = VFMA(LDK(KP831469612), T5m, VMUL(LDK(KP555570233), T5l));
Chris@82 1721 }
Chris@82 1722 {
Chris@82 1723 V T4w, T4F, T5e, T5f;
Chris@82 1724 T4w = VSUB(T4k, T4v);
Chris@82 1725 T4F = VSUB(T4B, T4E);
Chris@82 1726 T4G = VFMA(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4F));
Chris@82 1727 T5a = VFNMS(LDK(KP980785280), T4F, VMUL(LDK(KP195090322), T4w));
Chris@82 1728 T5e = VFMA(LDK(KP923879532), T3W, VMUL(LDK(KP382683432), T41));
Chris@82 1729 T5f = VFNMS(LDK(KP923879532), T47, VMUL(LDK(KP382683432), T4c));
Chris@82 1730 T5g = VADD(T5e, T5f);
Chris@82 1731 T8w = VSUB(T5f, T5e);
Chris@82 1732 }
Chris@82 1733 {
Chris@82 1734 V T5i, T5j, T4X, T56;
Chris@82 1735 T5i = VADD(T4k, T4v);
Chris@82 1736 T5j = VADD(T4B, T4E);
Chris@82 1737 T5k = VFMA(LDK(KP555570233), T5i, VMUL(LDK(KP831469612), T5j));
Chris@82 1738 T5q = VFNMS(LDK(KP555570233), T5j, VMUL(LDK(KP831469612), T5i));
Chris@82 1739 T4X = VSUB(T4L, T4W);
Chris@82 1740 T56 = VSUB(T52, T55);
Chris@82 1741 T57 = VFNMS(LDK(KP980785280), T56, VMUL(LDK(KP195090322), T4X));
Chris@82 1742 T5b = VFMA(LDK(KP195090322), T56, VMUL(LDK(KP980785280), T4X));
Chris@82 1743 }
Chris@82 1744 {
Chris@82 1745 V T4f, T58, T8v, T8y;
Chris@82 1746 T4f = VADD(T3R, T4e);
Chris@82 1747 T58 = VADD(T4G, T57);
Chris@82 1748 ST(&(ri[WS(rs, 23)]), VSUB(T4f, T58), ms, &(ri[WS(rs, 1)]));
Chris@82 1749 ST(&(ri[WS(rs, 7)]), VADD(T4f, T58), ms, &(ri[WS(rs, 1)]));
Chris@82 1750 T8v = VADD(T5a, T5b);
Chris@82 1751 T8y = VADD(T8w, T8x);
Chris@82 1752 ST(&(ii[WS(rs, 7)]), VADD(T8v, T8y), ms, &(ii[WS(rs, 1)]));
Chris@82 1753 ST(&(ii[WS(rs, 23)]), VSUB(T8y, T8v), ms, &(ii[WS(rs, 1)]));
Chris@82 1754 }
Chris@82 1755 {
Chris@82 1756 V T59, T5c, T8z, T8A;
Chris@82 1757 T59 = VSUB(T3R, T4e);
Chris@82 1758 T5c = VSUB(T5a, T5b);
Chris@82 1759 ST(&(ri[WS(rs, 31)]), VSUB(T59, T5c), ms, &(ri[WS(rs, 1)]));
Chris@82 1760 ST(&(ri[WS(rs, 15)]), VADD(T59, T5c), ms, &(ri[WS(rs, 1)]));
Chris@82 1761 T8z = VSUB(T57, T4G);
Chris@82 1762 T8A = VSUB(T8x, T8w);
Chris@82 1763 ST(&(ii[WS(rs, 15)]), VADD(T8z, T8A), ms, &(ii[WS(rs, 1)]));
Chris@82 1764 ST(&(ii[WS(rs, 31)]), VSUB(T8A, T8z), ms, &(ii[WS(rs, 1)]));
Chris@82 1765 }
Chris@82 1766 {
Chris@82 1767 V T5h, T5o, T8n, T8s;
Chris@82 1768 T5h = VADD(T5d, T5g);
Chris@82 1769 T5o = VADD(T5k, T5n);
Chris@82 1770 ST(&(ri[WS(rs, 19)]), VSUB(T5h, T5o), ms, &(ri[WS(rs, 1)]));
Chris@82 1771 ST(&(ri[WS(rs, 3)]), VADD(T5h, T5o), ms, &(ri[WS(rs, 1)]));
Chris@82 1772 T8n = VADD(T5q, T5r);
Chris@82 1773 T8s = VADD(T8o, T8r);
Chris@82 1774 ST(&(ii[WS(rs, 3)]), VADD(T8n, T8s), ms, &(ii[WS(rs, 1)]));
Chris@82 1775 ST(&(ii[WS(rs, 19)]), VSUB(T8s, T8n), ms, &(ii[WS(rs, 1)]));
Chris@82 1776 }
Chris@82 1777 {
Chris@82 1778 V T5p, T5s, T8t, T8u;
Chris@82 1779 T5p = VSUB(T5d, T5g);
Chris@82 1780 T5s = VSUB(T5q, T5r);
Chris@82 1781 ST(&(ri[WS(rs, 27)]), VSUB(T5p, T5s), ms, &(ri[WS(rs, 1)]));
Chris@82 1782 ST(&(ri[WS(rs, 11)]), VADD(T5p, T5s), ms, &(ri[WS(rs, 1)]));
Chris@82 1783 T8t = VSUB(T5n, T5k);
Chris@82 1784 T8u = VSUB(T8r, T8o);
Chris@82 1785 ST(&(ii[WS(rs, 11)]), VADD(T8t, T8u), ms, &(ii[WS(rs, 1)]));
Chris@82 1786 ST(&(ii[WS(rs, 27)]), VSUB(T8u, T8t), ms, &(ii[WS(rs, 1)]));
Chris@82 1787 }
Chris@82 1788 }
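          /* Outputs 1, 5, 9, 13, 17, 21, 25, 29: the same rotation set as the previous group. */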
Chris@82 1789 {
Chris@82 1790 V T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
Chris@82 1791 V T5X, T5w, T89;
Chris@82 1792 T5w = VMUL(LDK(KP707106781), VADD(T5u, T5v));
Chris@82 1793 T5x = VSUB(T5t, T5w);
Chris@82 1794 T5Z = VADD(T5t, T5w);
Chris@82 1795 T89 = VMUL(LDK(KP707106781), VADD(T3K, T3P));
Chris@82 1796 T8d = VADD(T89, T8c);
Chris@82 1797 T8j = VSUB(T8c, T89);
Chris@82 1798 {
Chris@82 1799 V T5A, T5D, T67, T68;
Chris@82 1800 T5A = VFNMS(LDK(KP382683432), T5z, VMUL(LDK(KP923879532), T5y));
Chris@82 1801 T5D = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5C));
Chris@82 1802 T5E = VSUB(T5A, T5D);
Chris@82 1803 T88 = VADD(T5A, T5D);
Chris@82 1804 T67 = VADD(T5N, T5O);
Chris@82 1805 T68 = VADD(T5Q, T5R);
Chris@82 1806 T69 = VFNMS(LDK(KP195090322), T68, VMUL(LDK(KP980785280), T67));
Chris@82 1807 T6d = VFMA(LDK(KP195090322), T67, VMUL(LDK(KP980785280), T68));
Chris@82 1808 }
Chris@82 1809 {
Chris@82 1810 V T5I, T5L, T60, T61;
Chris@82 1811 T5I = VSUB(T5G, T5H);
Chris@82 1812 T5L = VSUB(T5J, T5K);
Chris@82 1813 T5M = VFMA(LDK(KP555570233), T5I, VMUL(LDK(KP831469612), T5L));
Chris@82 1814 T5W = VFNMS(LDK(KP831469612), T5I, VMUL(LDK(KP555570233), T5L));
Chris@82 1815 T60 = VFMA(LDK(KP382683432), T5y, VMUL(LDK(KP923879532), T5z));
Chris@82 1816 T61 = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5C));
Chris@82 1817 T62 = VADD(T60, T61);
Chris@82 1818 T8i = VSUB(T61, T60);
Chris@82 1819 }
Chris@82 1820 {
Chris@82 1821 V T64, T65, T5P, T5S;
Chris@82 1822 T64 = VADD(T5G, T5H);
Chris@82 1823 T65 = VADD(T5J, T5K);
Chris@82 1824 T66 = VFMA(LDK(KP980785280), T64, VMUL(LDK(KP195090322), T65));
Chris@82 1825 T6c = VFNMS(LDK(KP195090322), T64, VMUL(LDK(KP980785280), T65));
Chris@82 1826 T5P = VSUB(T5N, T5O);
Chris@82 1827 T5S = VSUB(T5Q, T5R);
Chris@82 1828 T5T = VFNMS(LDK(KP831469612), T5S, VMUL(LDK(KP555570233), T5P));
Chris@82 1829 T5X = VFMA(LDK(KP831469612), T5P, VMUL(LDK(KP555570233), T5S));
Chris@82 1830 }
Chris@82 1831 {
Chris@82 1832 V T5F, T5U, T8h, T8k;
Chris@82 1833 T5F = VADD(T5x, T5E);
Chris@82 1834 T5U = VADD(T5M, T5T);
Chris@82 1835 ST(&(ri[WS(rs, 21)]), VSUB(T5F, T5U), ms, &(ri[WS(rs, 1)]));
Chris@82 1836 ST(&(ri[WS(rs, 5)]), VADD(T5F, T5U), ms, &(ri[WS(rs, 1)]));
Chris@82 1837 T8h = VADD(T5W, T5X);
Chris@82 1838 T8k = VADD(T8i, T8j);
Chris@82 1839 ST(&(ii[WS(rs, 5)]), VADD(T8h, T8k), ms, &(ii[WS(rs, 1)]));
Chris@82 1840 ST(&(ii[WS(rs, 21)]), VSUB(T8k, T8h), ms, &(ii[WS(rs, 1)]));
Chris@82 1841 }
Chris@82 1842 {
Chris@82 1843 V T5V, T5Y, T8l, T8m;
Chris@82 1844 T5V = VSUB(T5x, T5E);
Chris@82 1845 T5Y = VSUB(T5W, T5X);
Chris@82 1846 ST(&(ri[WS(rs, 29)]), VSUB(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
Chris@82 1847 ST(&(ri[WS(rs, 13)]), VADD(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
Chris@82 1848 T8l = VSUB(T5T, T5M);
Chris@82 1849 T8m = VSUB(T8j, T8i);
Chris@82 1850 ST(&(ii[WS(rs, 13)]), VADD(T8l, T8m), ms, &(ii[WS(rs, 1)]));
Chris@82 1851 ST(&(ii[WS(rs, 29)]), VSUB(T8m, T8l), ms, &(ii[WS(rs, 1)]));
Chris@82 1852 }
Chris@82 1853 {
Chris@82 1854 V T63, T6a, T87, T8e;
Chris@82 1855 T63 = VADD(T5Z, T62);
Chris@82 1856 T6a = VADD(T66, T69);
Chris@82 1857 ST(&(ri[WS(rs, 17)]), VSUB(T63, T6a), ms, &(ri[WS(rs, 1)]));
Chris@82 1858 ST(&(ri[WS(rs, 1)]), VADD(T63, T6a), ms, &(ri[WS(rs, 1)]));
Chris@82 1859 T87 = VADD(T6c, T6d);
Chris@82 1860 T8e = VADD(T88, T8d);
Chris@82 1861 ST(&(ii[WS(rs, 1)]), VADD(T87, T8e), ms, &(ii[WS(rs, 1)]));
Chris@82 1862 ST(&(ii[WS(rs, 17)]), VSUB(T8e, T87), ms, &(ii[WS(rs, 1)]));
Chris@82 1863 }
Chris@82 1864 {
Chris@82 1865 V T6b, T6e, T8f, T8g;
Chris@82 1866 T6b = VSUB(T5Z, T62);
Chris@82 1867 T6e = VSUB(T6c, T6d);
Chris@82 1868 ST(&(ri[WS(rs, 25)]), VSUB(T6b, T6e), ms, &(ri[WS(rs, 1)]));
Chris@82 1869 ST(&(ri[WS(rs, 9)]), VADD(T6b, T6e), ms, &(ri[WS(rs, 1)]));
Chris@82 1870 T8f = VSUB(T69, T66);
Chris@82 1871 T8g = VSUB(T8d, T88);
Chris@82 1872 ST(&(ii[WS(rs, 9)]), VADD(T8f, T8g), ms, &(ii[WS(rs, 1)]));
Chris@82 1873 ST(&(ii[WS(rs, 25)]), VSUB(T8g, T8f), ms, &(ii[WS(rs, 1)]));
Chris@82 1874 }
Chris@82 1875 }
Chris@82 1876 }
Chris@82 1877 }
Chris@82 1878 }
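     /* VLEAVE() is the SIMD exit hook (vzeroupper on AVX builds; typically a no-op on other ISAs). */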
Chris@82 1879 VLEAVE();
Chris@82 1880 }
Chris@82 1881
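/* Only twiddle powers w^1, w^3, w^9, w^27 are stored per transform (the generator's -twiddle-log3 scheme); the remaining factors are rebuilt by complex multiplication at the top of the m-loop. TW_NEXT advances the table by 2*VL transforms per iteration. */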
Chris@82 1882 static const tw_instr twinstr[] = {
Chris@82 1883 VTW(0, 1),
Chris@82 1884 VTW(0, 3),
Chris@82 1885 VTW(0, 9),
Chris@82 1886 VTW(0, 27),
Chris@82 1887 {TW_NEXT, (2 * VL), 0}
Chris@82 1888 };
Chris@82 1889
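/* Descriptor for the planner: size-32 DIT twiddle step; {376, 168, 112, 0} = additions, multiplications, fused multiply-adds, other ops. */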
Chris@82 1890 static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {376, 168, 112, 0}, 0, 0, 0 };
Chris@82 1891
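/* Registers t2sv_32 with the planner as a decimation-in-time (DIT) twiddle codelet. */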
Chris@82 1892 void XSIMD(codelet_t2sv_32) (planner *p) {
Chris@82 1893 X(kdft_dit_register) (p, t2sv_32, &desc);
Chris@82 1894 }
Chris@82 1895 #endif