annotate src/fftw-3.3.3/dft/simd/common/t2sv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:28 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 488 FP additions, 350 FP multiplications,
Chris@10 32 * (or, 236 additions, 98 multiplications, 252 fused multiply/add),
Chris@10 33 * 204 stack variables, 7 constants, and 128 memory accesses
Chris@10 34 */
Chris@10 35 #include "ts.h"
Chris@10 36
Chris@10 37 static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
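	  /* Editorial note (not part of the generated output): the DVK constants
	     above are the twiddle-factor trig values used by this size-32 codelet:
	     KP707106781 = cos(pi/4) = 1/sqrt(2), KP923879532 = cos(pi/8),
	     KP980785280 = cos(pi/16), KP831469612 = cos(3*pi/16),
	     KP414213562 = tan(pi/8), KP198912367 = tan(pi/16),
	     KP668178637 = tan(3*pi/16). */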
Chris@10 46 {
Chris@10 47 INT m;
Chris@10 48 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 49 V T6H, T74, T6U, T6E, T9r, T9t, T78, T7c, T6W, T6S, T73, T6K, T7a, T72, T9x;
Chris@10 50 V T9z;
Chris@10 51 {
Chris@10 52 V T2, T8, T3, T6, Te, Ti, T5, Tc;
Chris@10 53 T2 = LDW(&(W[0]));
Chris@10 54 T8 = LDW(&(W[TWVL * 4]));
Chris@10 55 T3 = LDW(&(W[TWVL * 2]));
Chris@10 56 T6 = LDW(&(W[TWVL * 3]));
Chris@10 57 Te = LDW(&(W[TWVL * 6]));
Chris@10 58 Ti = LDW(&(W[TWVL * 7]));
Chris@10 59 T5 = LDW(&(W[TWVL * 1]));
Chris@10 60 Tc = LDW(&(W[TWVL * 5]));
Chris@10 61 {
Chris@10 62 V T2X, T2T, T34, T31, Tq, T46, T97, T8H, TH, T98, T4b, T8D, TZ, T7f, T1g;
Chris@10 63 V T7g, T4j, T6t, T4q, T6u, T6x, T4z, T7m, T1J, T4G, T6y, T8d, T7l, T4O, T6A;
Chris@10 64 V T2k, T7o, T6B, T4V, T7r, T8e, T5E, T6P, T3G, T7L, T6M, T61, T8n, T7I, T55;
Chris@10 65 V T6I, T2N, T7A, T5s, T6F, T7x, T8i, T2R, T2U, T57, T3a, T5h, T62, T5L, T7J;
Chris@10 66 V T43, T63, T5S, T8o, T7O, T2V, T2Y, T32, T35;
Chris@10 67 {
Chris@10 68 V T1w, T23, T1K, T1F, T1s, T1N, T26, T1z, T2w, T2s, T3Q, T3M, T3r, T3n, T2b;
Chris@10 69 V T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J, T2F, Td, TP, T1Z, T1V;
Chris@10 70 V T2g, T2c, T1m, T4u, T1D, T1G, T1p, T1t, T1E, T4D, T1x, T1A, T1q, T4v;
Chris@10 71 {
Chris@10 72 V T1, Ts, T19, TJ, T7, TM, Tb, T11, T1C, T1o, TA, T15, TE, T1d, Tw;
Chris@10 73 V T8G, Tk, Tn, Tj, TW, TS, To, Tt, Tx, TB, TF, Tl;
Chris@10 74 {
Chris@10 75 V T1Y, T1S, T2f, T2a;
Chris@10 76 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 77 {
Chris@10 78 V Tr, T18, T4, Ta;
Chris@10 79 Tr = VMUL(T2, T8);
Chris@10 80 T18 = VMUL(T3, T8);
Chris@10 81 T4 = VMUL(T2, T3);
Chris@10 82 Ta = VMUL(T2, T6);
Chris@10 83 {
Chris@10 84 V T10, T1n, Tz, T14;
Chris@10 85 T10 = VMUL(T2, Te);
Chris@10 86 T1n = VMUL(T8, Te);
Chris@10 87 Tz = VMUL(T3, Te);
Chris@10 88 T14 = VMUL(T2, Ti);
Chris@10 89 {
Chris@10 90 V T1r, TD, T1c, Tv;
Chris@10 91 T1r = VMUL(T8, Ti);
Chris@10 92 TD = VMUL(T3, Ti);
Chris@10 93 T1c = VMUL(T3, Tc);
Chris@10 94 Tv = VMUL(T2, Tc);
Chris@10 95 T1w = VFNMS(T5, Tc, Tr);
Chris@10 96 Ts = VFMA(T5, Tc, Tr);
Chris@10 97 T19 = VFNMS(T6, Tc, T18);
Chris@10 98 T23 = VFMA(T6, Tc, T18);
Chris@10 99 TJ = VFNMS(T5, T6, T4);
Chris@10 100 T7 = VFMA(T5, T6, T4);
Chris@10 101 TM = VFMA(T5, T3, Ta);
Chris@10 102 Tb = VFNMS(T5, T3, Ta);
Chris@10 103 T11 = VFNMS(T5, Ti, T10);
Chris@10 104 T1C = VFMA(T5, Ti, T10);
Chris@10 105 T1o = VFMA(Tc, Ti, T1n);
Chris@10 106 TA = VFMA(T6, Ti, Tz);
Chris@10 107 T1K = VFNMS(T6, Ti, Tz);
Chris@10 108 T1F = VFNMS(T5, Te, T14);
Chris@10 109 T15 = VFMA(T5, Te, T14);
Chris@10 110 T1s = VFNMS(Tc, Te, T1r);
Chris@10 111 T1N = VFMA(T6, Te, TD);
Chris@10 112 TE = VFNMS(T6, Te, TD);
Chris@10 113 T26 = VFNMS(T6, T8, T1c);
Chris@10 114 T1d = VFMA(T6, T8, T1c);
Chris@10 115 T1z = VFMA(T5, T8, Tv);
Chris@10 116 Tw = VFNMS(T5, T8, Tv);
Chris@10 117 {
Chris@10 118 V T2v, T2r, T3P, T3L;
Chris@10 119 T2v = VMUL(T1w, Ti);
Chris@10 120 T2r = VMUL(T1w, Te);
Chris@10 121 T3P = VMUL(Ts, Ti);
Chris@10 122 T3L = VMUL(Ts, Te);
Chris@10 123 {
Chris@10 124 V T3q, T3m, T2W, T2S;
Chris@10 125 T3q = VMUL(T19, Ti);
Chris@10 126 T3m = VMUL(T19, Te);
Chris@10 127 T2W = VMUL(T23, Ti);
Chris@10 128 T2S = VMUL(T23, Te);
Chris@10 129 {
Chris@10 130 V T1T, T3i, T3e, T1Q;
Chris@10 131 T1T = VMUL(TJ, Tc);
Chris@10 132 T3i = VMUL(TJ, Ti);
Chris@10 133 T3e = VMUL(TJ, Te);
Chris@10 134 T1Q = VMUL(TJ, T8);
Chris@10 135 {
Chris@10 136 V Tg, T2I, T2E, T9;
Chris@10 137 Tg = VMUL(T7, Tc);
Chris@10 138 T2I = VMUL(T7, Ti);
Chris@10 139 T2E = VMUL(T7, Te);
Chris@10 140 T9 = VMUL(T7, T8);
Chris@10 141 T2w = VFNMS(T1z, Te, T2v);
Chris@10 142 T2s = VFMA(T1z, Ti, T2r);
Chris@10 143 T3Q = VFNMS(Tw, Te, T3P);
Chris@10 144 T3M = VFMA(Tw, Ti, T3L);
Chris@10 145 T3r = VFNMS(T1d, Te, T3q);
Chris@10 146 T3n = VFMA(T1d, Ti, T3m);
Chris@10 147 T2X = VFNMS(T26, Te, T2W);
Chris@10 148 T2T = VFMA(T26, Ti, T2S);
Chris@10 149 T2b = VFNMS(TM, T8, T1T);
Chris@10 150 T1U = VFMA(TM, T8, T1T);
Chris@10 151 T3C = VFNMS(TM, Te, T3i);
Chris@10 152 T3j = VFMA(TM, Te, T3i);
Chris@10 153 T3z = VFMA(TM, Ti, T3e);
Chris@10 154 T3f = VFNMS(TM, Ti, T3e);
Chris@10 155 T1R = VFNMS(TM, Tc, T1Q);
Chris@10 156 T29 = VFMA(TM, Tc, T1Q);
Chris@10 157 TR = VFNMS(Tb, T8, Tg);
Chris@10 158 Th = VFMA(Tb, T8, Tg);
Chris@10 159 T34 = VFMA(Tb, Te, T2I);
Chris@10 160 T2J = VFNMS(Tb, Te, T2I);
Chris@10 161 T31 = VFNMS(Tb, Ti, T2E);
Chris@10 162 T2F = VFMA(Tb, Ti, T2E);
Chris@10 163 Td = VFNMS(Tb, Tc, T9);
Chris@10 164 TP = VFMA(Tb, Tc, T9);
Chris@10 165 T1Y = VMUL(T1R, Ti);
Chris@10 166 T1S = VMUL(T1R, Te);
Chris@10 167 T2f = VMUL(T29, Ti);
Chris@10 168 T2a = VMUL(T29, Te);
Chris@10 169 T8G = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 170 }
Chris@10 171 }
Chris@10 172 }
Chris@10 173 }
Chris@10 174 }
Chris@10 175 }
Chris@10 176 }
Chris@10 177 Tk = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@10 178 {
Chris@10 179 V Tm, Tf, TV, TQ;
Chris@10 180 Tm = VMUL(Td, Ti);
Chris@10 181 Tf = VMUL(Td, Te);
Chris@10 182 TV = VMUL(TP, Ti);
Chris@10 183 TQ = VMUL(TP, Te);
Chris@10 184 T1Z = VFNMS(T1U, Te, T1Y);
Chris@10 185 T1V = VFMA(T1U, Ti, T1S);
Chris@10 186 T2g = VFNMS(T2b, Te, T2f);
Chris@10 187 T2c = VFMA(T2b, Ti, T2a);
Chris@10 188 Tn = VFNMS(Th, Te, Tm);
Chris@10 189 Tj = VFMA(Th, Ti, Tf);
Chris@10 190 TW = VFNMS(TR, Te, TV);
Chris@10 191 TS = VFMA(TR, Ti, TQ);
Chris@10 192 }
Chris@10 193 To = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@10 194 }
Chris@10 195 Tt = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 196 Tx = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 197 TB = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@10 198 TF = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@10 199 Tl = VMUL(Tj, Tk);
Chris@10 200 {
Chris@10 201 V TO, T4f, TT, TX;
Chris@10 202 {
Chris@10 203 V Ty, T48, TG, T4a;
Chris@10 204 {
Chris@10 205 V TK, TN, T8E, Tu, T47, TC, T49, Tp, TL, T4e, T8F;
Chris@10 206 TK = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 207 TN = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 208 T8E = VMUL(Tj, To);
Chris@10 209 Tu = VMUL(Ts, Tt);
Chris@10 210 T47 = VMUL(Ts, Tx);
Chris@10 211 TC = VMUL(TA, TB);
Chris@10 212 T49 = VMUL(TA, TF);
Chris@10 213 Tp = VFMA(Tn, To, Tl);
Chris@10 214 TL = VMUL(TJ, TK);
Chris@10 215 T4e = VMUL(TJ, TN);
Chris@10 216 T8F = VFNMS(Tn, Tk, T8E);
Chris@10 217 Ty = VFMA(Tw, Tx, Tu);
Chris@10 218 T48 = VFNMS(Tw, Tt, T47);
Chris@10 219 TG = VFMA(TE, TF, TC);
Chris@10 220 T4a = VFNMS(TE, TB, T49);
Chris@10 221 Tq = VADD(T1, Tp);
Chris@10 222 T46 = VSUB(T1, Tp);
Chris@10 223 TO = VFMA(TM, TN, TL);
Chris@10 224 T97 = VSUB(T8G, T8F);
Chris@10 225 T8H = VADD(T8F, T8G);
Chris@10 226 T4f = VFNMS(TM, TK, T4e);
Chris@10 227 }
Chris@10 228 TH = VADD(Ty, TG);
Chris@10 229 T98 = VSUB(Ty, TG);
Chris@10 230 T4b = VSUB(T48, T4a);
Chris@10 231 T8D = VADD(T48, T4a);
Chris@10 232 TT = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@10 233 TX = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@10 234 }
Chris@10 235 {
Chris@10 236 V T12, T16, T1a, T1e, T4k, T4p;
Chris@10 237 T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@10 238 T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@10 239 T1a = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 240 T1e = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 241 {
Chris@10 242 V TY, T4h, T17, T4m, T1f, T4o, T4d, T4i;
Chris@10 243 {
Chris@10 244 V T1j, T1l, TU, T4g, T13, T4l, T1b, T4n, T1k, T4t;
Chris@10 245 T1j = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 246 T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 247 TU = VMUL(TS, TT);
Chris@10 248 T4g = VMUL(TS, TX);
Chris@10 249 T13 = VMUL(T11, T12);
Chris@10 250 T4l = VMUL(T11, T16);
Chris@10 251 T1b = VMUL(T19, T1a);
Chris@10 252 T4n = VMUL(T19, T1e);
Chris@10 253 T1k = VMUL(T7, T1j);
Chris@10 254 T4t = VMUL(T7, T1l);
Chris@10 255 TY = VFMA(TW, TX, TU);
Chris@10 256 T4h = VFNMS(TW, TT, T4g);
Chris@10 257 T17 = VFMA(T15, T16, T13);
Chris@10 258 T4m = VFNMS(T15, T12, T4l);
Chris@10 259 T1f = VFMA(T1d, T1e, T1b);
Chris@10 260 T4o = VFNMS(T1d, T1a, T4n);
Chris@10 261 T1m = VFMA(Tb, T1l, T1k);
Chris@10 262 T4u = VFNMS(Tb, T1j, T4t);
Chris@10 263 }
Chris@10 264 TZ = VADD(TO, TY);
Chris@10 265 T4d = VSUB(TO, TY);
Chris@10 266 T7f = VADD(T4f, T4h);
Chris@10 267 T4i = VSUB(T4f, T4h);
Chris@10 268 T1g = VADD(T17, T1f);
Chris@10 269 T4k = VSUB(T17, T1f);
Chris@10 270 T7g = VADD(T4m, T4o);
Chris@10 271 T4p = VSUB(T4m, T4o);
Chris@10 272 T1D = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@10 273 T1G = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@10 274 T4j = VADD(T4d, T4i);
Chris@10 275 T6t = VSUB(T4i, T4d);
Chris@10 276 }
Chris@10 277 T1p = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@10 278 T1t = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@10 279 T4q = VSUB(T4k, T4p);
Chris@10 280 T6u = VADD(T4k, T4p);
Chris@10 281 T1E = VMUL(T1C, T1D);
Chris@10 282 T4D = VMUL(T1C, T1G);
Chris@10 283 T1x = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 284 T1A = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 285 T1q = VMUL(T1o, T1p);
Chris@10 286 T4v = VMUL(T1o, T1t);
Chris@10 287 }
Chris@10 288 }
Chris@10 289 }
Chris@10 290 {
Chris@10 291 V T3l, T5z, T3E, T5Z, T3v, T3x, T3w, T3t, T5B, T5W;
Chris@10 292 {
Chris@10 293 V T1P, T4J, T1W, T20, T2i, T4T, T1X, T4K, T24, T27;
Chris@10 294 {
Chris@10 295 V T2d, T2h, T1v, T4A, T7j, T4x, T2e, T4y, T1I, T4F, T7k, T4S;
Chris@10 296 {
Chris@10 297 V T1L, T1O, T1H, T4E, T1y, T4B, T1u, T4w, T1M, T4I, T1B, T4C;
Chris@10 298 T1L = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@10 299 T1O = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@10 300 T1H = VFMA(T1F, T1G, T1E);
Chris@10 301 T4E = VFNMS(T1F, T1D, T4D);
Chris@10 302 T1y = VMUL(T1w, T1x);
Chris@10 303 T4B = VMUL(T1w, T1A);
Chris@10 304 T1u = VFMA(T1s, T1t, T1q);
Chris@10 305 T4w = VFNMS(T1s, T1p, T4v);
Chris@10 306 T1M = VMUL(T1K, T1L);
Chris@10 307 T4I = VMUL(T1K, T1O);
Chris@10 308 T2d = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@10 309 T2h = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@10 310 T1B = VFMA(T1z, T1A, T1y);
Chris@10 311 T4C = VFNMS(T1z, T1x, T4B);
Chris@10 312 T1v = VADD(T1m, T1u);
Chris@10 313 T4A = VSUB(T1m, T1u);
Chris@10 314 T7j = VADD(T4u, T4w);
Chris@10 315 T4x = VSUB(T4u, T4w);
Chris@10 316 T1P = VFMA(T1N, T1O, T1M);
Chris@10 317 T4J = VFNMS(T1N, T1L, T4I);
Chris@10 318 T2e = VMUL(T2c, T2d);
Chris@10 319 T4y = VSUB(T1B, T1H);
Chris@10 320 T1I = VADD(T1B, T1H);
Chris@10 321 T4F = VSUB(T4C, T4E);
Chris@10 322 T7k = VADD(T4C, T4E);
Chris@10 323 T4S = VMUL(T2c, T2h);
Chris@10 324 }
Chris@10 325 T1W = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 326 T20 = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 327 T2i = VFMA(T2g, T2h, T2e);
Chris@10 328 T6x = VADD(T4x, T4y);
Chris@10 329 T4z = VSUB(T4x, T4y);
Chris@10 330 T7m = VSUB(T1v, T1I);
Chris@10 331 T1J = VADD(T1v, T1I);
Chris@10 332 T4G = VADD(T4A, T4F);
Chris@10 333 T6y = VSUB(T4A, T4F);
Chris@10 334 T8d = VADD(T7j, T7k);
Chris@10 335 T7l = VSUB(T7j, T7k);
Chris@10 336 T4T = VFNMS(T2g, T2d, T4S);
Chris@10 337 T1X = VMUL(T1V, T1W);
Chris@10 338 T4K = VMUL(T1V, T20);
Chris@10 339 T24 = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 340 T27 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 341 }
Chris@10 342 {
Chris@10 343 V T22, T4P, T7p, T4M, T28, T4R, T3g, T3k;
Chris@10 344 T3g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@10 345 T3k = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@10 346 {
Chris@10 347 V T3A, T3D, T21, T4L, T25, T4Q, T3h, T5y, T3B, T5Y;
Chris@10 348 T3A = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@10 349 T3D = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@10 350 T21 = VFMA(T1Z, T20, T1X);
Chris@10 351 T4L = VFNMS(T1Z, T1W, T4K);
Chris@10 352 T25 = VMUL(T23, T24);
Chris@10 353 T4Q = VMUL(T23, T27);
Chris@10 354 T3h = VMUL(T3f, T3g);
Chris@10 355 T5y = VMUL(T3f, T3k);
Chris@10 356 T3B = VMUL(T3z, T3A);
Chris@10 357 T5Y = VMUL(T3z, T3D);
Chris@10 358 T22 = VADD(T1P, T21);
Chris@10 359 T4P = VSUB(T1P, T21);
Chris@10 360 T7p = VADD(T4J, T4L);
Chris@10 361 T4M = VSUB(T4J, T4L);
Chris@10 362 T28 = VFMA(T26, T27, T25);
Chris@10 363 T4R = VFNMS(T26, T24, T4Q);
Chris@10 364 T3l = VFMA(T3j, T3k, T3h);
Chris@10 365 T5z = VFNMS(T3j, T3g, T5y);
Chris@10 366 T3E = VFMA(T3C, T3D, T3B);
Chris@10 367 T5Z = VFNMS(T3C, T3A, T5Y);
Chris@10 368 }
Chris@10 369 {
Chris@10 370 V T3o, T3s, T2j, T4N, T7q, T4U, T3p, T5A;
Chris@10 371 T3o = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 372 T3s = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 373 T2j = VADD(T28, T2i);
Chris@10 374 T4N = VSUB(T28, T2i);
Chris@10 375 T7q = VADD(T4R, T4T);
Chris@10 376 T4U = VSUB(T4R, T4T);
Chris@10 377 T3v = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 378 T3x = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 379 T3p = VMUL(T3n, T3o);
Chris@10 380 T5A = VMUL(T3n, T3s);
Chris@10 381 T4O = VSUB(T4M, T4N);
Chris@10 382 T6A = VADD(T4M, T4N);
Chris@10 383 T2k = VADD(T22, T2j);
Chris@10 384 T7o = VSUB(T22, T2j);
Chris@10 385 T6B = VSUB(T4P, T4U);
Chris@10 386 T4V = VADD(T4P, T4U);
Chris@10 387 T7r = VSUB(T7p, T7q);
Chris@10 388 T8e = VADD(T7p, T7q);
Chris@10 389 T3w = VMUL(TP, T3v);
Chris@10 390 T3t = VFMA(T3r, T3s, T3p);
Chris@10 391 T5B = VFNMS(T3r, T3o, T5A);
Chris@10 392 T5W = VMUL(TP, T3x);
Chris@10 393 }
Chris@10 394 }
Chris@10 395 }
Chris@10 396 {
Chris@10 397 V T2t, T2q, T50, T2L, T5q, T2u, T2x, T2A, T2C;
Chris@10 398 {
Chris@10 399 V T2n, T2p, T2G, T2K, T5V, T3u, T5C, T7G, T5X, T2o, T4Z, T2H, T5D, T3F, T5p;
Chris@10 400 V T3y, T60, T7H;
Chris@10 401 T2n = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 402 T2p = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 403 T2G = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@10 404 T2K = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@10 405 T3y = VFMA(TR, T3x, T3w);
Chris@10 406 T5V = VSUB(T3l, T3t);
Chris@10 407 T3u = VADD(T3l, T3t);
Chris@10 408 T5C = VSUB(T5z, T5B);
Chris@10 409 T7G = VADD(T5z, T5B);
Chris@10 410 T5X = VFNMS(TR, T3v, T5W);
Chris@10 411 T2o = VMUL(T2, T2n);
Chris@10 412 T4Z = VMUL(T2, T2p);
Chris@10 413 T2H = VMUL(T2F, T2G);
Chris@10 414 T5D = VSUB(T3y, T3E);
Chris@10 415 T3F = VADD(T3y, T3E);
Chris@10 416 T5p = VMUL(T2F, T2K);
Chris@10 417 T2t = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@10 418 T60 = VSUB(T5X, T5Z);
Chris@10 419 T7H = VADD(T5X, T5Z);
Chris@10 420 T2q = VFMA(T5, T2p, T2o);
Chris@10 421 T50 = VFNMS(T5, T2n, T4Z);
Chris@10 422 T2L = VFMA(T2J, T2K, T2H);
Chris@10 423 T5E = VSUB(T5C, T5D);
Chris@10 424 T6P = VADD(T5C, T5D);
Chris@10 425 T3G = VADD(T3u, T3F);
Chris@10 426 T7L = VSUB(T3u, T3F);
Chris@10 427 T5q = VFNMS(T2J, T2G, T5p);
Chris@10 428 T6M = VSUB(T5V, T60);
Chris@10 429 T61 = VADD(T5V, T60);
Chris@10 430 T8n = VADD(T7G, T7H);
Chris@10 431 T7I = VSUB(T7G, T7H);
Chris@10 432 T2u = VMUL(T2s, T2t);
Chris@10 433 T2x = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@10 434 T2A = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 435 T2C = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 436 }
Chris@10 437 {
Chris@10 438 V T3N, T2z, T5m, T3K, T5G, T41, T5Q, T3O, T7v, T53, T2M, T54, T7w, T5r, T3R;
Chris@10 439 V T3U, T3W;
Chris@10 440 {
Chris@10 441 V T3H, T3J, T3Y, T40, T52, T2D, T5o;
Chris@10 442 T3H = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 443 T3J = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 444 T3Y = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 445 T40 = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 446 T3N = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@10 447 {
Chris@10 448 V T2y, T51, T2B, T5n;
Chris@10 449 T2y = VFMA(T2w, T2x, T2u);
Chris@10 450 T51 = VMUL(T2s, T2x);
Chris@10 451 T2B = VMUL(T8, T2A);
Chris@10 452 T5n = VMUL(T8, T2C);
Chris@10 453 {
Chris@10 454 V T3I, T5F, T3Z, T5P;
Chris@10 455 T3I = VMUL(T3, T3H);
Chris@10 456 T5F = VMUL(T3, T3J);
Chris@10 457 T3Z = VMUL(Td, T3Y);
Chris@10 458 T5P = VMUL(Td, T40);
Chris@10 459 T2z = VADD(T2q, T2y);
Chris@10 460 T5m = VSUB(T2q, T2y);
Chris@10 461 T52 = VFNMS(T2w, T2t, T51);
Chris@10 462 T2D = VFMA(Tc, T2C, T2B);
Chris@10 463 T5o = VFNMS(Tc, T2A, T5n);
Chris@10 464 T3K = VFMA(T6, T3J, T3I);
Chris@10 465 T5G = VFNMS(T6, T3H, T5F);
Chris@10 466 T41 = VFMA(Th, T40, T3Z);
Chris@10 467 T5Q = VFNMS(Th, T3Y, T5P);
Chris@10 468 T3O = VMUL(T3M, T3N);
Chris@10 469 }
Chris@10 470 }
Chris@10 471 T7v = VADD(T50, T52);
Chris@10 472 T53 = VSUB(T50, T52);
Chris@10 473 T2M = VADD(T2D, T2L);
Chris@10 474 T54 = VSUB(T2D, T2L);
Chris@10 475 T7w = VADD(T5o, T5q);
Chris@10 476 T5r = VSUB(T5o, T5q);
Chris@10 477 T3R = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@10 478 T3U = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@10 479 T3W = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@10 480 }
Chris@10 481 {
Chris@10 482 V T2O, T37, T39, T3T, T5K, T5I, T3X, T5O, T56, T38, T5g, T7M, T5J;
Chris@10 483 {
Chris@10 484 V T3S, T5H, T3V, T5N, T2P, T2Q;
Chris@10 485 T2O = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 486 T55 = VSUB(T53, T54);
Chris@10 487 T6I = VADD(T53, T54);
Chris@10 488 T2N = VADD(T2z, T2M);
Chris@10 489 T7A = VSUB(T2z, T2M);
Chris@10 490 T5s = VADD(T5m, T5r);
Chris@10 491 T6F = VSUB(T5m, T5r);
Chris@10 492 T7x = VSUB(T7v, T7w);
Chris@10 493 T8i = VADD(T7v, T7w);
Chris@10 494 T3S = VFMA(T3Q, T3R, T3O);
Chris@10 495 T5H = VMUL(T3M, T3R);
Chris@10 496 T3V = VMUL(Te, T3U);
Chris@10 497 T5N = VMUL(Te, T3W);
Chris@10 498 T2P = VMUL(T29, T2O);
Chris@10 499 T2Q = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 500 T37 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 501 T39 = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 502 T3T = VADD(T3K, T3S);
Chris@10 503 T5K = VSUB(T3K, T3S);
Chris@10 504 T5I = VFNMS(T3Q, T3N, T5H);
Chris@10 505 T3X = VFMA(Ti, T3W, T3V);
Chris@10 506 T5O = VFNMS(Ti, T3U, T5N);
Chris@10 507 T2R = VFMA(T2b, T2Q, T2P);
Chris@10 508 T56 = VMUL(T29, T2Q);
Chris@10 509 T38 = VMUL(T1R, T37);
Chris@10 510 T5g = VMUL(T1R, T39);
Chris@10 511 }
Chris@10 512 T2U = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@10 513 T7M = VADD(T5G, T5I);
Chris@10 514 T5J = VSUB(T5G, T5I);
Chris@10 515 {
Chris@10 516 V T42, T5M, T7N, T5R;
Chris@10 517 T42 = VADD(T3X, T41);
Chris@10 518 T5M = VSUB(T3X, T41);
Chris@10 519 T7N = VADD(T5O, T5Q);
Chris@10 520 T5R = VSUB(T5O, T5Q);
Chris@10 521 T57 = VFNMS(T2b, T2O, T56);
Chris@10 522 T3a = VFMA(T1U, T39, T38);
Chris@10 523 T5h = VFNMS(T1U, T37, T5g);
Chris@10 524 T62 = VADD(T5K, T5J);
Chris@10 525 T5L = VSUB(T5J, T5K);
Chris@10 526 T7J = VSUB(T42, T3T);
Chris@10 527 T43 = VADD(T3T, T42);
Chris@10 528 T63 = VSUB(T5M, T5R);
Chris@10 529 T5S = VADD(T5M, T5R);
Chris@10 530 T8o = VADD(T7M, T7N);
Chris@10 531 T7O = VSUB(T7M, T7N);
Chris@10 532 T2V = VMUL(T2T, T2U);
Chris@10 533 }
Chris@10 534 T2Y = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@10 535 T32 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@10 536 T35 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@10 537 }
Chris@10 538 }
Chris@10 539 }
Chris@10 540 }
Chris@10 541 }
Chris@10 542 {
Chris@10 543 V T5t, T5c, T5u, T5j, T8Z, T90;
Chris@10 544 {
Chris@10 545 V T7e, T8T, T8y, T7h, T8U, T8c, T8J, T44, T8u, T8q, T7y, T7D, T8w, T2m, T3d;
Chris@10 546 V T8h, T8R, T8P, T8k, T8x, T8B, T8f;
Chris@10 547 {
Chris@10 548 V T1i, T8O, T8N, T2l, T3c, T8j;
Chris@10 549 {
Chris@10 550 V T8p, T5b, T30, T59, T36, T5f, TI, T1h, T8m, T5a, T7B;
Chris@10 551 TI = VADD(Tq, TH);
Chris@10 552 T7e = VSUB(Tq, TH);
Chris@10 553 T8T = VSUB(T1g, TZ);
Chris@10 554 T1h = VADD(TZ, T1g);
Chris@10 555 T8y = VADD(T8n, T8o);
Chris@10 556 T8p = VSUB(T8n, T8o);
Chris@10 557 {
Chris@10 558 V T8C, T8I, T2Z, T58, T33, T5e;
Chris@10 559 T7h = VSUB(T7f, T7g);
Chris@10 560 T8C = VADD(T7f, T7g);
Chris@10 561 T8I = VADD(T8D, T8H);
Chris@10 562 T8U = VSUB(T8H, T8D);
Chris@10 563 T2Z = VFMA(T2X, T2Y, T2V);
Chris@10 564 T58 = VMUL(T2T, T2Y);
Chris@10 565 T33 = VMUL(T31, T32);
Chris@10 566 T5e = VMUL(T31, T35);
Chris@10 567 T1i = VADD(TI, T1h);
Chris@10 568 T8c = VSUB(TI, T1h);
Chris@10 569 T8O = VSUB(T8I, T8C);
Chris@10 570 T8J = VADD(T8C, T8I);
Chris@10 571 T5b = VSUB(T2R, T2Z);
Chris@10 572 T30 = VADD(T2R, T2Z);
Chris@10 573 T59 = VFNMS(T2X, T2U, T58);
Chris@10 574 T36 = VFMA(T34, T35, T33);
Chris@10 575 T5f = VFNMS(T34, T32, T5e);
Chris@10 576 }
Chris@10 577 T44 = VADD(T3G, T43);
Chris@10 578 T8m = VSUB(T3G, T43);
Chris@10 579 T5a = VSUB(T57, T59);
Chris@10 580 T7B = VADD(T57, T59);
Chris@10 581 {
Chris@10 582 V T5d, T3b, T5i, T7C;
Chris@10 583 T5d = VSUB(T36, T3a);
Chris@10 584 T3b = VADD(T36, T3a);
Chris@10 585 T5i = VSUB(T5f, T5h);
Chris@10 586 T7C = VADD(T5f, T5h);
Chris@10 587 T8N = VSUB(T2k, T1J);
Chris@10 588 T2l = VADD(T1J, T2k);
Chris@10 589 T8u = VADD(T8m, T8p);
Chris@10 590 T8q = VSUB(T8m, T8p);
Chris@10 591 T5t = VADD(T5b, T5a);
Chris@10 592 T5c = VSUB(T5a, T5b);
Chris@10 593 T7y = VSUB(T3b, T30);
Chris@10 594 T3c = VADD(T30, T3b);
Chris@10 595 T5u = VSUB(T5d, T5i);
Chris@10 596 T5j = VADD(T5d, T5i);
Chris@10 597 T8j = VADD(T7B, T7C);
Chris@10 598 T7D = VSUB(T7B, T7C);
Chris@10 599 }
Chris@10 600 }
Chris@10 601 T8w = VSUB(T1i, T2l);
Chris@10 602 T2m = VADD(T1i, T2l);
Chris@10 603 T3d = VADD(T2N, T3c);
Chris@10 604 T8h = VSUB(T2N, T3c);
Chris@10 605 T8R = VSUB(T8O, T8N);
Chris@10 606 T8P = VADD(T8N, T8O);
Chris@10 607 T8k = VSUB(T8i, T8j);
Chris@10 608 T8x = VADD(T8i, T8j);
Chris@10 609 T8B = VADD(T8d, T8e);
Chris@10 610 T8f = VSUB(T8d, T8e);
Chris@10 611 }
Chris@10 612 {
Chris@10 613 V T7P, T7K, T7X, T7Y, T82, T7z, T7W, T7i, T8a, T86, T91, T8V, T8W, T7t, T7E;
Chris@10 614 V T81;
Chris@10 615 {
Chris@10 616 V T84, T85, T7n, T7s, T8L, T45;
Chris@10 617 T8L = VSUB(T44, T3d);
Chris@10 618 T45 = VADD(T3d, T44);
Chris@10 619 {
Chris@10 620 V T8t, T8l, T8A, T8z;
Chris@10 621 T8t = VSUB(T8k, T8h);
Chris@10 622 T8l = VADD(T8h, T8k);
Chris@10 623 T8A = VADD(T8x, T8y);
Chris@10 624 T8z = VSUB(T8x, T8y);
Chris@10 625 {
Chris@10 626 V T8M, T8K, T8s, T8g;
Chris@10 627 T8M = VSUB(T8J, T8B);
Chris@10 628 T8K = VADD(T8B, T8J);
Chris@10 629 T8s = VSUB(T8c, T8f);
Chris@10 630 T8g = VADD(T8c, T8f);
Chris@10 631 ST(&(ri[0]), VADD(T2m, T45), ms, &(ri[0]));
Chris@10 632 ST(&(ri[WS(rs, 16)]), VSUB(T2m, T45), ms, &(ri[0]));
Chris@10 633 {
Chris@10 634 V T8v, T8Q, T8S, T8r;
Chris@10 635 T8v = VSUB(T8t, T8u);
Chris@10 636 T8Q = VADD(T8t, T8u);
Chris@10 637 T8S = VSUB(T8q, T8l);
Chris@10 638 T8r = VADD(T8l, T8q);
Chris@10 639 ST(&(ri[WS(rs, 8)]), VADD(T8w, T8z), ms, &(ri[0]));
Chris@10 640 ST(&(ri[WS(rs, 24)]), VSUB(T8w, T8z), ms, &(ri[0]));
Chris@10 641 ST(&(ii[WS(rs, 24)]), VSUB(T8M, T8L), ms, &(ii[0]));
Chris@10 642 ST(&(ii[WS(rs, 8)]), VADD(T8L, T8M), ms, &(ii[0]));
Chris@10 643 ST(&(ii[WS(rs, 16)]), VSUB(T8K, T8A), ms, &(ii[0]));
Chris@10 644 ST(&(ii[0]), VADD(T8A, T8K), ms, &(ii[0]));
Chris@10 645 ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
Chris@10 646 ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8v, T8s), ms, &(ri[0]));
Chris@10 647 ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
Chris@10 648 ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8Q, T8P), ms, &(ii[0]));
Chris@10 649 ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
Chris@10 650 ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8S, T8R), ms, &(ii[0]));
Chris@10 651 ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
Chris@10 652 ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8r, T8g), ms, &(ri[0]));
Chris@10 653 }
Chris@10 654 }
Chris@10 655 }
Chris@10 656 T7P = VSUB(T7L, T7O);
Chris@10 657 T84 = VADD(T7L, T7O);
Chris@10 658 T85 = VADD(T7I, T7J);
Chris@10 659 T7K = VSUB(T7I, T7J);
Chris@10 660 T7X = VADD(T7m, T7l);
Chris@10 661 T7n = VSUB(T7l, T7m);
Chris@10 662 T7s = VADD(T7o, T7r);
Chris@10 663 T7Y = VSUB(T7o, T7r);
Chris@10 664 T82 = VADD(T7x, T7y);
Chris@10 665 T7z = VSUB(T7x, T7y);
Chris@10 666 T7W = VADD(T7e, T7h);
Chris@10 667 T7i = VSUB(T7e, T7h);
Chris@10 668 T8a = VFMA(LDK(KP414213562), T84, T85);
Chris@10 669 T86 = VFNMS(LDK(KP414213562), T85, T84);
Chris@10 670 T91 = VSUB(T8U, T8T);
Chris@10 671 T8V = VADD(T8T, T8U);
Chris@10 672 T8W = VADD(T7n, T7s);
Chris@10 673 T7t = VSUB(T7n, T7s);
Chris@10 674 T7E = VSUB(T7A, T7D);
Chris@10 675 T81 = VADD(T7A, T7D);
Chris@10 676 }
Chris@10 677 {
Chris@10 678 V T7S, T7u, T7T, T7F, T92, T7Z, T89, T83, T7U, T7Q;
Chris@10 679 T7S = VFNMS(LDK(KP707106781), T7t, T7i);
Chris@10 680 T7u = VFMA(LDK(KP707106781), T7t, T7i);
Chris@10 681 T7T = VFNMS(LDK(KP414213562), T7z, T7E);
Chris@10 682 T7F = VFMA(LDK(KP414213562), T7E, T7z);
Chris@10 683 T92 = VSUB(T7Y, T7X);
Chris@10 684 T7Z = VADD(T7X, T7Y);
Chris@10 685 T89 = VFNMS(LDK(KP414213562), T81, T82);
Chris@10 686 T83 = VFMA(LDK(KP414213562), T82, T81);
Chris@10 687 T7U = VFMA(LDK(KP414213562), T7K, T7P);
Chris@10 688 T7Q = VFNMS(LDK(KP414213562), T7P, T7K);
Chris@10 689 {
Chris@10 690 V T8X, T95, T93, T80, T88, T87, T7V, T94, T96, T7R, T8Y, T8b;
Chris@10 691 T8Z = VFNMS(LDK(KP707106781), T8W, T8V);
Chris@10 692 T8X = VFMA(LDK(KP707106781), T8W, T8V);
Chris@10 693 T95 = VFNMS(LDK(KP707106781), T92, T91);
Chris@10 694 T93 = VFMA(LDK(KP707106781), T92, T91);
Chris@10 695 T80 = VFMA(LDK(KP707106781), T7Z, T7W);
Chris@10 696 T88 = VFNMS(LDK(KP707106781), T7Z, T7W);
Chris@10 697 T90 = VSUB(T86, T83);
Chris@10 698 T87 = VADD(T83, T86);
Chris@10 699 T7V = VADD(T7T, T7U);
Chris@10 700 T94 = VSUB(T7U, T7T);
Chris@10 701 T96 = VADD(T7F, T7Q);
Chris@10 702 T7R = VSUB(T7F, T7Q);
Chris@10 703 T8Y = VADD(T89, T8a);
Chris@10 704 T8b = VSUB(T89, T8a);
Chris@10 705 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T87, T80), ms, &(ri[0]));
Chris@10 706 ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T87, T80), ms, &(ri[0]));
Chris@10 707 ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
Chris@10 708 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7V, T7S), ms, &(ri[0]));
Chris@10 709 ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T94, T93), ms, &(ii[0]));
Chris@10 710 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T94, T93), ms, &(ii[0]));
Chris@10 711 ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T96, T95), ms, &(ii[0]));
Chris@10 712 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T96, T95), ms, &(ii[0]));
Chris@10 713 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
Chris@10 714 ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7R, T7u), ms, &(ri[0]));
Chris@10 715 ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
Chris@10 716 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8Y, T8X), ms, &(ii[0]));
Chris@10 717 ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
Chris@10 718 ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T8b, T88), ms, &(ri[0]));
Chris@10 719 }
Chris@10 720 }
Chris@10 721 }
Chris@10 722 }
Chris@10 723 {
Chris@10 724 V T6s, T9o, T9n, T6v, T6N, T6Q, T6G, T6J, T68, T4Y, T9f, T9d, T9l, T9j, T6g;
Chris@10 725 V T6o, T6q, T6m, T66, T6a, T6p, T6j, T5x, T69;
Chris@10 726 {
Chris@10 727 V T6d, T6e, T6c, T4s, T9c, T4X, T9h, T9b, T5T, T64, T5k, T5v, T9i, T6f;
Chris@10 728 {
Chris@10 729 V T4c, T4r, T4H, T4W, T99, T9a;
Chris@10 730 T6s = VSUB(T46, T4b);
Chris@10 731 T4c = VADD(T46, T4b);
Chris@10 732 T4r = VADD(T4j, T4q);
Chris@10 733 T9o = VSUB(T4q, T4j);
Chris@10 734 T6d = VFMA(LDK(KP414213562), T4z, T4G);
Chris@10 735 T4H = VFNMS(LDK(KP414213562), T4G, T4z);
Chris@10 736 T4W = VFMA(LDK(KP414213562), T4V, T4O);
Chris@10 737 T6e = VFNMS(LDK(KP414213562), T4O, T4V);
Chris@10 738 T9n = VADD(T98, T97);
Chris@10 739 T99 = VSUB(T97, T98);
Chris@10 740 T9a = VADD(T6t, T6u);
Chris@10 741 T6v = VSUB(T6t, T6u);
Chris@10 742 ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
Chris@10 743 ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T90, T8Z), ms, &(ii[0]));
Chris@10 744 T6c = VFMA(LDK(KP707106781), T4r, T4c);
Chris@10 745 T4s = VFNMS(LDK(KP707106781), T4r, T4c);
Chris@10 746 T9c = VADD(T4H, T4W);
Chris@10 747 T4X = VSUB(T4H, T4W);
Chris@10 748 T9h = VFNMS(LDK(KP707106781), T9a, T99);
Chris@10 749 T9b = VFMA(LDK(KP707106781), T9a, T99);
Chris@10 750 T6N = VSUB(T5S, T5L);
Chris@10 751 T5T = VADD(T5L, T5S);
Chris@10 752 T64 = VADD(T62, T63);
Chris@10 753 T6Q = VSUB(T62, T63);
Chris@10 754 T6G = VSUB(T5j, T5c);
Chris@10 755 T5k = VADD(T5c, T5j);
Chris@10 756 T5v = VADD(T5t, T5u);
Chris@10 757 T6J = VSUB(T5t, T5u);
Chris@10 758 }
Chris@10 759 T68 = VFNMS(LDK(KP923879532), T4X, T4s);
Chris@10 760 T4Y = VFMA(LDK(KP923879532), T4X, T4s);
Chris@10 761 T9f = VFNMS(LDK(KP923879532), T9c, T9b);
Chris@10 762 T9d = VFMA(LDK(KP923879532), T9c, T9b);
Chris@10 763 T9i = VSUB(T6e, T6d);
Chris@10 764 T6f = VADD(T6d, T6e);
Chris@10 765 {
Chris@10 766 V T6l, T5U, T6k, T65;
Chris@10 767 T6l = VFMA(LDK(KP707106781), T5T, T5E);
Chris@10 768 T5U = VFNMS(LDK(KP707106781), T5T, T5E);
Chris@10 769 T6k = VFMA(LDK(KP707106781), T64, T61);
Chris@10 770 T65 = VFNMS(LDK(KP707106781), T64, T61);
Chris@10 771 {
Chris@10 772 V T6i, T5l, T6h, T5w;
Chris@10 773 T6i = VFMA(LDK(KP707106781), T5k, T55);
Chris@10 774 T5l = VFNMS(LDK(KP707106781), T5k, T55);
Chris@10 775 T6h = VFMA(LDK(KP707106781), T5v, T5s);
Chris@10 776 T5w = VFNMS(LDK(KP707106781), T5v, T5s);
Chris@10 777 T9l = VFNMS(LDK(KP923879532), T9i, T9h);
Chris@10 778 T9j = VFMA(LDK(KP923879532), T9i, T9h);
Chris@10 779 T6g = VFMA(LDK(KP923879532), T6f, T6c);
Chris@10 780 T6o = VFNMS(LDK(KP923879532), T6f, T6c);
Chris@10 781 T6q = VFMA(LDK(KP198912367), T6k, T6l);
Chris@10 782 T6m = VFNMS(LDK(KP198912367), T6l, T6k);
Chris@10 783 T66 = VFNMS(LDK(KP668178637), T65, T5U);
Chris@10 784 T6a = VFMA(LDK(KP668178637), T5U, T65);
Chris@10 785 T6p = VFNMS(LDK(KP198912367), T6h, T6i);
Chris@10 786 T6j = VFMA(LDK(KP198912367), T6i, T6h);
Chris@10 787 T5x = VFMA(LDK(KP668178637), T5w, T5l);
Chris@10 788 T69 = VFNMS(LDK(KP668178637), T5l, T5w);
Chris@10 789 }
Chris@10 790 }
Chris@10 791 }
Chris@10 792 {
Chris@10 793 V T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T77, T6O, T76, T6R;
Chris@10 794 {
Chris@10 795 V T6Z, T6z, T6C, T70;
Chris@10 796 {
Chris@10 797 V T6n, T9g, T9e, T6r;
Chris@10 798 T6n = VADD(T6j, T6m);
Chris@10 799 T9g = VSUB(T6m, T6j);
Chris@10 800 T9e = VADD(T6p, T6q);
Chris@10 801 T6r = VSUB(T6p, T6q);
Chris@10 802 {
Chris@10 803 V T9k, T6b, T67, T9m;
Chris@10 804 T9k = VSUB(T6a, T69);
Chris@10 805 T6b = VADD(T69, T6a);
Chris@10 806 T67 = VSUB(T5x, T66);
Chris@10 807 T9m = VADD(T5x, T66);
Chris@10 808 ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
Chris@10 809 ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)]));
Chris@10 810 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
Chris@10 811 ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)]));
Chris@10 812 ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
Chris@10 813 ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)]));
Chris@10 814 ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
Chris@10 815 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)]));
Chris@10 816 ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
Chris@10 817 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)]));
Chris@10 818 ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
Chris@10 819 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)]));
Chris@10 820 ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
Chris@10 821 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)]));
Chris@10 822 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
Chris@10 823 ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)]));
Chris@10 824 T6Y = VFNMS(LDK(KP707106781), T6v, T6s);
Chris@10 825 T6w = VFMA(LDK(KP707106781), T6v, T6s);
Chris@10 826 }
Chris@10 827 }
Chris@10 828 T6Z = VFNMS(LDK(KP414213562), T6x, T6y);
Chris@10 829 T6z = VFMA(LDK(KP414213562), T6y, T6x);
Chris@10 830 T6C = VFNMS(LDK(KP414213562), T6B, T6A);
Chris@10 831 T70 = VFMA(LDK(KP414213562), T6A, T6B);
Chris@10 832 T9w = VADD(T6z, T6C);
Chris@10 833 T6D = VSUB(T6z, T6C);
Chris@10 834 T9v = VFNMS(LDK(KP707106781), T9o, T9n);
Chris@10 835 T9p = VFMA(LDK(KP707106781), T9o, T9n);
Chris@10 836 T9q = VSUB(T70, T6Z);
Chris@10 837 T71 = VADD(T6Z, T70);
Chris@10 838 T77 = VFMA(LDK(KP707106781), T6N, T6M);
Chris@10 839 T6O = VFNMS(LDK(KP707106781), T6N, T6M);
Chris@10 840 T76 = VFMA(LDK(KP707106781), T6Q, T6P);
Chris@10 841 T6R = VFNMS(LDK(KP707106781), T6Q, T6P);
Chris@10 842 T6H = VFNMS(LDK(KP707106781), T6G, T6F);
Chris@10 843 T74 = VFMA(LDK(KP707106781), T6G, T6F);
Chris@10 844 }
Chris@10 845 T6U = VFNMS(LDK(KP923879532), T6D, T6w);
Chris@10 846 T6E = VFMA(LDK(KP923879532), T6D, T6w);
Chris@10 847 T9r = VFMA(LDK(KP923879532), T9q, T9p);
Chris@10 848 T9t = VFNMS(LDK(KP923879532), T9q, T9p);
Chris@10 849 T78 = VFNMS(LDK(KP198912367), T77, T76);
Chris@10 850 T7c = VFMA(LDK(KP198912367), T76, T77);
Chris@10 851 T6W = VFMA(LDK(KP668178637), T6O, T6R);
Chris@10 852 T6S = VFNMS(LDK(KP668178637), T6R, T6O);
Chris@10 853 T73 = VFMA(LDK(KP707106781), T6J, T6I);
Chris@10 854 T6K = VFNMS(LDK(KP707106781), T6J, T6I);
Chris@10 855 T7a = VFMA(LDK(KP923879532), T71, T6Y);
Chris@10 856 T72 = VFNMS(LDK(KP923879532), T71, T6Y);
Chris@10 857 T9x = VFNMS(LDK(KP923879532), T9w, T9v);
Chris@10 858 T9z = VFMA(LDK(KP923879532), T9w, T9v);
Chris@10 859 }
Chris@10 860 }
Chris@10 861 }
Chris@10 862 }
Chris@10 863 }
Chris@10 864 {
Chris@10 865 V T7b, T75, T6L, T6V;
Chris@10 866 T7b = VFNMS(LDK(KP198912367), T73, T74);
Chris@10 867 T75 = VFMA(LDK(KP198912367), T74, T73);
Chris@10 868 T6L = VFMA(LDK(KP668178637), T6K, T6H);
Chris@10 869 T6V = VFNMS(LDK(KP668178637), T6H, T6K);
Chris@10 870 {
Chris@10 871 V T79, T9A, T9y, T7d;
Chris@10 872 T79 = VSUB(T75, T78);
Chris@10 873 T9A = VADD(T75, T78);
Chris@10 874 T9y = VSUB(T7c, T7b);
Chris@10 875 T7d = VADD(T7b, T7c);
Chris@10 876 {
Chris@10 877 V T9s, T6X, T6T, T9u;
Chris@10 878 T9s = VADD(T6V, T6W);
Chris@10 879 T6X = VSUB(T6V, T6W);
Chris@10 880 T6T = VADD(T6L, T6S);
Chris@10 881 T9u = VSUB(T6S, T6L);
Chris@10 882 ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
Chris@10 883 ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)]));
Chris@10 884 ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
Chris@10 885 ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)]));
Chris@10 886 ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
Chris@10 887 ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)]));
Chris@10 888 ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
Chris@10 889 ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)]));
Chris@10 890 ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
Chris@10 891 ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)]));
Chris@10 892 ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
Chris@10 893 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)]));
Chris@10 894 ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
Chris@10 895 ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)]));
Chris@10 896 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
Chris@10 897 ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)]));
Chris@10 898 }
Chris@10 899 }
Chris@10 900 }
Chris@10 901 }
Chris@10 902 }
Chris@10 903 VLEAVE();
Chris@10 904 }
Chris@10 905
Chris@10 906 static const tw_instr twinstr[] = {
Chris@10 907 VTW(0, 1),
Chris@10 908 VTW(0, 3),
Chris@10 909 VTW(0, 9),
Chris@10 910 VTW(0, 27),
Chris@10 911 {TW_NEXT, (2 * VL), 0}
Chris@10 912 };
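/* Editorial note: consistent with the -twiddle-log3 -precompute-twiddles flags
   in the genfft command above, only the twiddle factors w^1, w^3, w^9 and w^27
   are stored per transform (the four VTW entries, eight reals in W); the other
   powers needed by the radix-32 pass are rebuilt at the top of t2sv_32 from
   products of these loaded values. */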
Chris@10 913
Chris@10 914 static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };
Chris@10 915
Chris@10 916 void XSIMD(codelet_t2sv_32) (planner *p) {
Chris@10 917 X(kdft_dit_register) (p, t2sv_32, &desc);
Chris@10 918 }
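/* Editorial note: a sketch of how this codelet is reached.  It is registered
   with the planner above via X(kdft_dit_register) and is never called directly;
   user code only sees the public interface, e.g. (assuming the standard
   double-precision FFTW3 API):

       fftw_plan p = fftw_plan_dft_1d(1024, in, out, FFTW_FORWARD, FFTW_MEASURE);
       fftw_execute(p);
       fftw_destroy_plan(p);

   The planner may then select t2sv_32 as the twiddle step of a
   decimation-in-time decomposition when SIMD (and, for this branch, FMA) is
   available. */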
Chris@10 919 #else /* HAVE_FMA */
Chris@10 920
Chris@10 921 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h */
Chris@10 922
Chris@10 923 /*
Chris@10 924 * This function contains 488 FP additions, 280 FP multiplications,
Chris@10 925 * (or, 376 additions, 168 multiplications, 112 fused multiply/add),
Chris@10 926 * 158 stack variables, 7 constants, and 128 memory accesses
Chris@10 927 */
Chris@10 928 #include "ts.h"
Chris@10 929
Chris@10 930 static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 931 {
Chris@10 932 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 933 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 934 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 935 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 936 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 937 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 938 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
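	  /* Editorial note: as in the FMA branch, these are twiddle trig values;
	     the ones specific to this branch are KP382683432 = sin(pi/8),
	     KP555570233 = sin(3*pi/16) and KP195090322 = sin(pi/16). */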
Chris@10 939 {
Chris@10 940 INT m;
Chris@10 941 for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 942 V T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y;
Chris@10 943 V T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d;
Chris@10 944 V Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C;
Chris@10 945 V T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25;
Chris@10 946 V T1S, T23;
Chris@10 947 {
Chris@10 948 V Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF;
Chris@10 949 V T10;
Chris@10 950 {
Chris@10 951 V T4, Tc, T7, Tb;
Chris@10 952 T2 = LDW(&(W[0]));
Chris@10 953 T5 = LDW(&(W[TWVL * 1]));
Chris@10 954 T3 = LDW(&(W[TWVL * 2]));
Chris@10 955 T6 = LDW(&(W[TWVL * 3]));
Chris@10 956 T4 = VMUL(T2, T3);
Chris@10 957 Tc = VMUL(T5, T3);
Chris@10 958 T7 = VMUL(T5, T6);
Chris@10 959 Tb = VMUL(T2, T6);
Chris@10 960 T8 = VADD(T4, T7);
Chris@10 961 TM = VSUB(T4, T7);
Chris@10 962 TO = VADD(Tb, Tc);
Chris@10 963 Td = VSUB(Tb, Tc);
Chris@10 964 T9 = LDW(&(W[TWVL * 4]));
Chris@10 965 Ts = VMUL(T2, T9);
Chris@10 966 T1d = VMUL(T6, T9);
Chris@10 967 Tx = VMUL(T5, T9);
Chris@10 968 T18 = VMUL(T3, T9);
Chris@10 969 Te = LDW(&(W[TWVL * 5]));
Chris@10 970 Tt = VMUL(T5, Te);
Chris@10 971 T1c = VMUL(T3, Te);
Chris@10 972 Tw = VMUL(T2, Te);
Chris@10 973 T19 = VMUL(T6, Te);
Chris@10 974 Th = LDW(&(W[TWVL * 6]));
Chris@10 975 TB = VMUL(T3, Th);
Chris@10 976 T14 = VMUL(T5, Th);
Chris@10 977 TG = VMUL(T6, Th);
Chris@10 978 TZ = VMUL(T2, Th);
Chris@10 979 Tl = LDW(&(W[TWVL * 7]));
Chris@10 980 TC = VMUL(T6, Tl);
Chris@10 981 T13 = VMUL(T2, Tl);
Chris@10 982 TF = VMUL(T3, Tl);
Chris@10 983 T10 = VMUL(T5, Tl);
Chris@10 984 }
Chris@10 985 TD = VADD(TB, TC);
Chris@10 986 TH = VSUB(TF, TG);
Chris@10 987 T1y = VADD(TZ, T10);
Chris@10 988 T1H = VADD(TF, TG);
Chris@10 989 T15 = VADD(T13, T14);
Chris@10 990 T1A = VSUB(T13, T14);
Chris@10 991 T11 = VSUB(TZ, T10);
Chris@10 992 T1F = VSUB(TB, TC);
Chris@10 993 T1n = VFMA(T9, Th, VMUL(Te, Tl));
Chris@10 994 T1p = VFNMS(Te, Th, VMUL(T9, Tl));
Chris@10 995 {
Chris@10 996 V T2o, T2p, T2s, T2t;
Chris@10 997 T2o = VMUL(T8, Th);
Chris@10 998 T2p = VMUL(Td, Tl);
Chris@10 999 T2q = VADD(T2o, T2p);
Chris@10 1000 T2I = VSUB(T2o, T2p);
Chris@10 1001 T2s = VMUL(T8, Tl);
Chris@10 1002 T2t = VMUL(Td, Th);
Chris@10 1003 T2u = VSUB(T2s, T2t);
Chris@10 1004 T2K = VADD(T2s, T2t);
Chris@10 1005 }
Chris@10 1006 {
Chris@10 1007 V T2T, T2U, T2X, T2Y;
Chris@10 1008 T2T = VMUL(TM, Th);
Chris@10 1009 T2U = VMUL(TO, Tl);
Chris@10 1010 T2V = VSUB(T2T, T2U);
Chris@10 1011 T3b = VADD(T2T, T2U);
Chris@10 1012 T2X = VMUL(TM, Tl);
Chris@10 1013 T2Y = VMUL(TO, Th);
Chris@10 1014 T2Z = VADD(T2X, T2Y);
Chris@10 1015 T3d = VSUB(T2X, T2Y);
Chris@10 1016 Tu = VADD(Ts, Tt);
Chris@10 1017 Ty = VSUB(Tw, Tx);
Chris@10 1018 T3l = VFMA(Tu, Th, VMUL(Ty, Tl));
Chris@10 1019 T3n = VFNMS(Ty, Th, VMUL(Tu, Tl));
Chris@10 1020 }
Chris@10 1021 T1t = VSUB(Ts, Tt);
Chris@10 1022 T1v = VADD(Tw, Tx);
Chris@10 1023 T2f = VFMA(T1t, Th, VMUL(T1v, Tl));
Chris@10 1024 T2h = VFNMS(T1v, Th, VMUL(T1t, Tl));
Chris@10 1025 T1a = VSUB(T18, T19);
Chris@10 1026 T1e = VADD(T1c, T1d);
Chris@10 1027 T32 = VFMA(T1a, Th, VMUL(T1e, Tl));
Chris@10 1028 T34 = VFNMS(T1e, Th, VMUL(T1a, Tl));
Chris@10 1029 T1W = VADD(T18, T19);
Chris@10 1030 T1Y = VSUB(T1c, T1d);
Chris@10 1031 T2C = VFMA(T1W, Th, VMUL(T1Y, Tl));
Chris@10 1032 T2E = VFNMS(T1Y, Th, VMUL(T1W, Tl));
Chris@10 1033 {
Chris@10 1034 V Ta, Tf, Ti, Tj;
Chris@10 1035 Ta = VMUL(T8, T9);
Chris@10 1036 Tf = VMUL(Td, Te);
Chris@10 1037 Tg = VSUB(Ta, Tf);
Chris@10 1038 TR = VADD(Ta, Tf);
Chris@10 1039 Ti = VMUL(T8, Te);
Chris@10 1040 Tj = VMUL(Td, T9);
Chris@10 1041 Tk = VADD(Ti, Tj);
Chris@10 1042 TS = VSUB(Ti, Tj);
Chris@10 1043 }
Chris@10 1044 Tm = VFMA(Tg, Th, VMUL(Tk, Tl));
Chris@10 1045 TV = VFNMS(TS, Th, VMUL(TR, Tl));
Chris@10 1046 To = VFNMS(Tk, Th, VMUL(Tg, Tl));
Chris@10 1047 TT = VFMA(TR, Th, VMUL(TS, Tl));
Chris@10 1048 {
Chris@10 1049 V T1K, T1L, T1N, T1O;
Chris@10 1050 T1K = VMUL(TM, T9);
Chris@10 1051 T1L = VMUL(TO, Te);
Chris@10 1052 T1M = VSUB(T1K, T1L);
Chris@10 1053 T21 = VADD(T1K, T1L);
Chris@10 1054 T1N = VMUL(TM, Te);
Chris@10 1055 T1O = VMUL(TO, T9);
Chris@10 1056 T1P = VADD(T1N, T1O);
Chris@10 1057 T22 = VSUB(T1N, T1O);
Chris@10 1058 }
Chris@10 1059 T1Q = VFMA(T1M, Th, VMUL(T1P, Tl));
Chris@10 1060 T25 = VFNMS(T22, Th, VMUL(T21, Tl));
Chris@10 1061 T1S = VFNMS(T1P, Th, VMUL(T1M, Tl));
Chris@10 1062 T23 = VFMA(T21, Th, VMUL(T22, Tl));
Chris@10 1063 }
Chris@10 1064 {
Chris@10 1065 V TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B;
Chris@10 1066 V T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u;
Chris@10 1067 V T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47;
Chris@10 1068 V T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p;
Chris@10 1069 V T4W, T5R, T55, T5O;
Chris@10 1070 {
Chris@10 1071 V T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp;
Chris@10 1072 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 1073 T7G = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 1074 Tn = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@10 1075 Tp = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@10 1076 Tq = VFMA(Tm, Tn, VMUL(To, Tp));
Chris@10 1077 T7F = VFNMS(To, Tn, VMUL(Tm, Tp));
Chris@10 1078 {
Chris@10 1079 V Tv, Tz, TE, TI;
Chris@10 1080 Tv = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 1081 Tz = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 1082 TA = VFMA(Tu, Tv, VMUL(Ty, Tz));
Chris@10 1083 T3C = VFNMS(Ty, Tv, VMUL(Tu, Tz));
Chris@10 1084 TE = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@10 1085 TI = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@10 1086 TJ = VFMA(TD, TE, VMUL(TH, TI));
Chris@10 1087 T3D = VFNMS(TH, TE, VMUL(TD, TI));
Chris@10 1088 }
Chris@10 1089 {
Chris@10 1090 V Tr, TK, T8a, T8b;
Chris@10 1091 Tr = VADD(T1, Tq);
Chris@10 1092 TK = VADD(TA, TJ);
Chris@10 1093 TL = VADD(Tr, TK);
Chris@10 1094 T6f = VSUB(Tr, TK);
Chris@10 1095 T8a = VSUB(T7G, T7F);
Chris@10 1096 T8b = VSUB(TA, TJ);
Chris@10 1097 T8c = VSUB(T8a, T8b);
Chris@10 1098 T8q = VADD(T8b, T8a);
Chris@10 1099 }
Chris@10 1100 {
Chris@10 1101 V T3B, T3E, T7E, T7H;
Chris@10 1102 T3B = VSUB(T1, Tq);
Chris@10 1103 T3E = VSUB(T3C, T3D);
Chris@10 1104 T3F = VSUB(T3B, T3E);
Chris@10 1105 T5t = VADD(T3B, T3E);
Chris@10 1106 T7E = VADD(T3C, T3D);
Chris@10 1107 T7H = VADD(T7F, T7G);
Chris@10 1108 T7I = VADD(T7E, T7H);
Chris@10 1109 T7W = VSUB(T7H, T7E);
Chris@10 1110 }
Chris@10 1111 }
Chris@10 1112 {
Chris@10 1113 V T2e, T4g, T2w, T4z, T2j, T4h, T2n, T4y;
Chris@10 1114 {
Chris@10 1115 V T2c, T2d, T2r, T2v;
Chris@10 1116 T2c = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1117 T2d = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1118 T2e = VFMA(T2, T2c, VMUL(T5, T2d));
Chris@10 1119 T4g = VFNMS(T5, T2c, VMUL(T2, T2d));
Chris@10 1120 T2r = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1121 T2v = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1122 T2w = VFMA(T2q, T2r, VMUL(T2u, T2v));
Chris@10 1123 T4z = VFNMS(T2u, T2r, VMUL(T2q, T2v));
Chris@10 1124 }
Chris@10 1125 {
Chris@10 1126 V T2g, T2i, T2l, T2m;
Chris@10 1127 T2g = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1128 T2i = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1129 T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
Chris@10 1130 T4h = VFNMS(T2h, T2g, VMUL(T2f, T2i));
Chris@10 1131 T2l = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1132 T2m = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1133 T2n = VFMA(T9, T2l, VMUL(Te, T2m));
Chris@10 1134 T4y = VFNMS(Te, T2l, VMUL(T9, T2m));
Chris@10 1135 }
Chris@10 1136 {
Chris@10 1137 V T2k, T2x, T6w, T6x;
Chris@10 1138 T2k = VADD(T2e, T2j);
Chris@10 1139 T2x = VADD(T2n, T2w);
Chris@10 1140 T2y = VADD(T2k, T2x);
Chris@10 1141 T6B = VSUB(T2k, T2x);
Chris@10 1142 T6w = VADD(T4g, T4h);
Chris@10 1143 T6x = VADD(T4y, T4z);
Chris@10 1144 T6y = VSUB(T6w, T6x);
Chris@10 1145 T7j = VADD(T6w, T6x);
Chris@10 1146 }
Chris@10 1147 {
Chris@10 1148 V T4i, T4j, T4x, T4A;
Chris@10 1149 T4i = VSUB(T4g, T4h);
Chris@10 1150 T4j = VSUB(T2n, T2w);
Chris@10 1151 T4k = VADD(T4i, T4j);
Chris@10 1152 T5J = VSUB(T4i, T4j);
Chris@10 1153 T4x = VSUB(T2e, T2j);
Chris@10 1154 T4A = VSUB(T4y, T4z);
Chris@10 1155 T4B = VSUB(T4x, T4A);
Chris@10 1156 T5G = VADD(T4x, T4A);
Chris@10 1157 }
Chris@10 1158 }
Chris@10 1159 {
Chris@10 1160 V T31, T4Y, T3f, T4J, T36, T4Z, T3a, T4I;
Chris@10 1161 {
Chris@10 1162 V T2W, T30, T3c, T3e;
Chris@10 1163 T2W = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1164 T30 = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1165 T31 = VFMA(T2V, T2W, VMUL(T2Z, T30));
Chris@10 1166 T4Y = VFNMS(T2Z, T2W, VMUL(T2V, T30));
Chris@10 1167 T3c = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1168 T3e = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1169 T3f = VFMA(T3b, T3c, VMUL(T3d, T3e));
Chris@10 1170 T4J = VFNMS(T3d, T3c, VMUL(T3b, T3e));
Chris@10 1171 }
Chris@10 1172 {
Chris@10 1173 V T33, T35, T38, T39;
Chris@10 1174 T33 = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1175 T35 = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1176 T36 = VFMA(T32, T33, VMUL(T34, T35));
Chris@10 1177 T4Z = VFNMS(T34, T33, VMUL(T32, T35));
Chris@10 1178 T38 = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1179 T39 = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1180 T3a = VFMA(TR, T38, VMUL(TS, T39));
Chris@10 1181 T4I = VFNMS(TS, T38, VMUL(TR, T39));
Chris@10 1182 }
Chris@10 1183 {
Chris@10 1184 V T37, T3g, T6M, T6N;
Chris@10 1185 T37 = VADD(T31, T36);
Chris@10 1186 T3g = VADD(T3a, T3f);
Chris@10 1187 T3h = VADD(T37, T3g);
Chris@10 1188 T6H = VSUB(T37, T3g);
Chris@10 1189 T6M = VADD(T4Y, T4Z);
Chris@10 1190 T6N = VADD(T4I, T4J);
Chris@10 1191 T6O = VSUB(T6M, T6N);
Chris@10 1192 T7o = VADD(T6M, T6N);
Chris@10 1193 }
Chris@10 1194 {
Chris@10 1195 V T4H, T4K, T50, T51;
Chris@10 1196 T4H = VSUB(T31, T36);
Chris@10 1197 T4K = VSUB(T4I, T4J);
Chris@10 1198 T4L = VSUB(T4H, T4K);
Chris@10 1199 T5N = VADD(T4H, T4K);
Chris@10 1200 T50 = VSUB(T4Y, T4Z);
Chris@10 1201 T51 = VSUB(T3a, T3f);
Chris@10 1202 T52 = VADD(T50, T51);
Chris@10 1203 T5Q = VSUB(T50, T51);
Chris@10 1204 }
Chris@10 1205 }
Chris@10 1206 {
Chris@10 1207 V TQ, T3G, T1g, T3N, TX, T3H, T17, T3M;
Chris@10 1208 {
Chris@10 1209 V TN, TP, T1b, T1f;
Chris@10 1210 TN = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 1211 TP = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 1212 TQ = VFMA(TM, TN, VMUL(TO, TP));
Chris@10 1213 T3G = VFNMS(TO, TN, VMUL(TM, TP));
Chris@10 1214 T1b = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 1215 T1f = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 1216 T1g = VFMA(T1a, T1b, VMUL(T1e, T1f));
Chris@10 1217 T3N = VFNMS(T1e, T1b, VMUL(T1a, T1f));
Chris@10 1218 }
Chris@10 1219 {
Chris@10 1220 V TU, TW, T12, T16;
Chris@10 1221 TU = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@10 1222 TW = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@10 1223 TX = VFMA(TT, TU, VMUL(TV, TW));
Chris@10 1224 T3H = VFNMS(TV, TU, VMUL(TT, TW));
Chris@10 1225 T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@10 1226 T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@10 1227 T17 = VFMA(T11, T12, VMUL(T15, T16));
Chris@10 1228 T3M = VFNMS(T15, T12, VMUL(T11, T16));
Chris@10 1229 }
Chris@10 1230 {
Chris@10 1231 V TY, T1h, T6g, T6h;
Chris@10 1232 TY = VADD(TQ, TX);
Chris@10 1233 T1h = VADD(T17, T1g);
Chris@10 1234 T1i = VADD(TY, T1h);
Chris@10 1235 T7V = VSUB(T1h, TY);
Chris@10 1236 T6g = VADD(T3G, T3H);
Chris@10 1237 T6h = VADD(T3M, T3N);
Chris@10 1238 T6i = VSUB(T6g, T6h);
Chris@10 1239 T7D = VADD(T6g, T6h);
Chris@10 1240 }
Chris@10 1241 {
Chris@10 1242 V T3I, T3J, T3L, T3O;
Chris@10 1243 T3I = VSUB(T3G, T3H);
Chris@10 1244 T3J = VSUB(TQ, TX);
Chris@10 1245 T3K = VSUB(T3I, T3J);
Chris@10 1246 T5u = VADD(T3J, T3I);
Chris@10 1247 T3L = VSUB(T17, T1g);
Chris@10 1248 T3O = VSUB(T3M, T3N);
Chris@10 1249 T3P = VADD(T3L, T3O);
Chris@10 1250 T5v = VSUB(T3L, T3O);
Chris@10 1251 }
Chris@10 1252 }
Chris@10 1253 {
Chris@10 1254 V T1m, T3S, T1C, T3Z, T1r, T3T, T1x, T3Y;
Chris@10 1255 {
Chris@10 1256 V T1k, T1l, T1z, T1B;
Chris@10 1257 T1k = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 1258 T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 1259 T1m = VFMA(T8, T1k, VMUL(Td, T1l));
Chris@10 1260 T3S = VFNMS(Td, T1k, VMUL(T8, T1l));
Chris@10 1261 T1z = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@10 1262 T1B = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@10 1263 T1C = VFMA(T1y, T1z, VMUL(T1A, T1B));
Chris@10 1264 T3Z = VFNMS(T1A, T1z, VMUL(T1y, T1B));
Chris@10 1265 }
Chris@10 1266 {
Chris@10 1267 V T1o, T1q, T1u, T1w;
Chris@10 1268 T1o = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@10 1269 T1q = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@10 1270 T1r = VFMA(T1n, T1o, VMUL(T1p, T1q));
Chris@10 1271 T3T = VFNMS(T1p, T1o, VMUL(T1n, T1q));
Chris@10 1272 T1u = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 1273 T1w = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 1274 T1x = VFMA(T1t, T1u, VMUL(T1v, T1w));
Chris@10 1275 T3Y = VFNMS(T1v, T1u, VMUL(T1t, T1w));
Chris@10 1276 }
Chris@10 1277 {
Chris@10 1278 V T1s, T1D, T6k, T6l;
Chris@10 1279 T1s = VADD(T1m, T1r);
Chris@10 1280 T1D = VADD(T1x, T1C);
Chris@10 1281 T1E = VADD(T1s, T1D);
Chris@10 1282 T6n = VSUB(T1s, T1D);
Chris@10 1283 T6k = VADD(T3S, T3T);
Chris@10 1284 T6l = VADD(T3Y, T3Z);
Chris@10 1285 T6m = VSUB(T6k, T6l);
Chris@10 1286 T7e = VADD(T6k, T6l);
Chris@10 1287 }
Chris@10 1288 {
Chris@10 1289 V T3U, T3V, T3X, T40;
Chris@10 1290 T3U = VSUB(T3S, T3T);
Chris@10 1291 T3V = VSUB(T1x, T1C);
Chris@10 1292 T3W = VADD(T3U, T3V);
Chris@10 1293 T5y = VSUB(T3U, T3V);
Chris@10 1294 T3X = VSUB(T1m, T1r);
Chris@10 1295 T40 = VSUB(T3Y, T3Z);
Chris@10 1296 T41 = VSUB(T3X, T40);
Chris@10 1297 T5z = VADD(T3X, T40);
Chris@10 1298 }
Chris@10 1299 }
Chris@10 1300 {
Chris@10 1301 V T1J, T43, T27, T4a, T1U, T44, T20, T49;
Chris@10 1302 {
Chris@10 1303 V T1G, T1I, T24, T26;
Chris@10 1304 T1G = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@10 1305 T1I = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@10 1306 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
Chris@10 1307 T43 = VFNMS(T1H, T1G, VMUL(T1F, T1I));
Chris@10 1308 T24 = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@10 1309 T26 = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@10 1310 T27 = VFMA(T23, T24, VMUL(T25, T26));
Chris@10 1311 T4a = VFNMS(T25, T24, VMUL(T23, T26));
Chris@10 1312 }
Chris@10 1313 {
Chris@10 1314 V T1R, T1T, T1X, T1Z;
Chris@10 1315 T1R = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 1316 T1T = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 1317 T1U = VFMA(T1Q, T1R, VMUL(T1S, T1T));
Chris@10 1318 T44 = VFNMS(T1S, T1R, VMUL(T1Q, T1T));
Chris@10 1319 T1X = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 1320 T1Z = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 1321 T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
Chris@10 1322 T49 = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
Chris@10 1323 }
Chris@10 1324 {
Chris@10 1325 V T1V, T28, T6q, T6r;
Chris@10 1326 T1V = VADD(T1J, T1U);
Chris@10 1327 T28 = VADD(T20, T27);
Chris@10 1328 T29 = VADD(T1V, T28);
Chris@10 1329 T6p = VSUB(T1V, T28);
Chris@10 1330 T6q = VADD(T43, T44);
Chris@10 1331 T6r = VADD(T49, T4a);
Chris@10 1332 T6s = VSUB(T6q, T6r);
Chris@10 1333 T7f = VADD(T6q, T6r);
Chris@10 1334 }
Chris@10 1335 {
Chris@10 1336 V T45, T46, T48, T4b;
Chris@10 1337 T45 = VSUB(T43, T44);
Chris@10 1338 T46 = VSUB(T20, T27);
Chris@10 1339 T47 = VADD(T45, T46);
Chris@10 1340 T5B = VSUB(T45, T46);
Chris@10 1341 T48 = VSUB(T1J, T1U);
Chris@10 1342 T4b = VSUB(T49, T4a);
Chris@10 1343 T4c = VSUB(T48, T4b);
Chris@10 1344 T5C = VADD(T48, T4b);
Chris@10 1345 }
Chris@10 1346 }
Chris@10 1347 {
Chris@10 1348 V T2B, T4r, T2G, T4s, T4q, T4t, T2M, T4m, T2P, T4n, T4l, T4o;
Chris@10 1349 {
Chris@10 1350 V T2z, T2A, T2D, T2F;
Chris@10 1351 T2z = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1352 T2A = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1353 T2B = VFMA(T21, T2z, VMUL(T22, T2A));
Chris@10 1354 T4r = VFNMS(T22, T2z, VMUL(T21, T2A));
Chris@10 1355 T2D = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1356 T2F = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1357 T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
Chris@10 1358 T4s = VFNMS(T2E, T2D, VMUL(T2C, T2F));
Chris@10 1359 }
Chris@10 1360 T4q = VSUB(T2B, T2G);
Chris@10 1361 T4t = VSUB(T4r, T4s);
Chris@10 1362 {
Chris@10 1363 V T2J, T2L, T2N, T2O;
Chris@10 1364 T2J = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1365 T2L = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1366 T2M = VFMA(T2I, T2J, VMUL(T2K, T2L));
Chris@10 1367 T4m = VFNMS(T2K, T2J, VMUL(T2I, T2L));
Chris@10 1368 T2N = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1369 T2O = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1370 T2P = VFMA(T1M, T2N, VMUL(T1P, T2O));
Chris@10 1371 T4n = VFNMS(T1P, T2N, VMUL(T1M, T2O));
Chris@10 1372 }
Chris@10 1373 T4l = VSUB(T2M, T2P);
Chris@10 1374 T4o = VSUB(T4m, T4n);
Chris@10 1375 {
Chris@10 1376 V T2H, T2Q, T6C, T6D;
Chris@10 1377 T2H = VADD(T2B, T2G);
Chris@10 1378 T2Q = VADD(T2M, T2P);
Chris@10 1379 T2R = VADD(T2H, T2Q);
Chris@10 1380 T6z = VSUB(T2Q, T2H);
Chris@10 1381 T6C = VADD(T4r, T4s);
Chris@10 1382 T6D = VADD(T4m, T4n);
Chris@10 1383 T6E = VSUB(T6C, T6D);
Chris@10 1384 T7k = VADD(T6C, T6D);
Chris@10 1385 }
Chris@10 1386 {
Chris@10 1387 V T4p, T4u, T4C, T4D;
Chris@10 1388 T4p = VSUB(T4l, T4o);
Chris@10 1389 T4u = VADD(T4q, T4t);
Chris@10 1390 T4v = VMUL(LDK(KP707106781), VSUB(T4p, T4u));
Chris@10 1391 T5H = VMUL(LDK(KP707106781), VADD(T4u, T4p));
Chris@10 1392 T4C = VSUB(T4t, T4q);
Chris@10 1393 T4D = VADD(T4l, T4o);
Chris@10 1394 T4E = VMUL(LDK(KP707106781), VSUB(T4C, T4D));
Chris@10 1395 T5K = VMUL(LDK(KP707106781), VADD(T4C, T4D));
Chris@10 1396 }
Chris@10 1397 }
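	  /* twiddled loads of inputs 3, 11, 19 and 27, again reduced to sums/differences and KP707106781-scaled terms */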
Chris@10 1398 {
Chris@10 1399 V T3k, T4M, T3p, T4N, T4O, T4P, T3t, T4S, T3w, T4T, T4R, T4U;
Chris@10 1400 {
Chris@10 1401 V T3i, T3j, T3m, T3o;
Chris@10 1402 T3i = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1403 T3j = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1404 T3k = VFMA(T3, T3i, VMUL(T6, T3j));
Chris@10 1405 T4M = VFNMS(T6, T3i, VMUL(T3, T3j));
Chris@10 1406 T3m = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1407 T3o = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1408 T3p = VFMA(T3l, T3m, VMUL(T3n, T3o));
Chris@10 1409 T4N = VFNMS(T3n, T3m, VMUL(T3l, T3o));
Chris@10 1410 }
Chris@10 1411 T4O = VSUB(T4M, T4N);
Chris@10 1412 T4P = VSUB(T3k, T3p);
Chris@10 1413 {
Chris@10 1414 V T3r, T3s, T3u, T3v;
Chris@10 1415 T3r = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1416 T3s = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1417 T3t = VFMA(Th, T3r, VMUL(Tl, T3s));
Chris@10 1418 T4S = VFNMS(Tl, T3r, VMUL(Th, T3s));
Chris@10 1419 T3u = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1420 T3v = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1421 T3w = VFMA(Tg, T3u, VMUL(Tk, T3v));
Chris@10 1422 T4T = VFNMS(Tk, T3u, VMUL(Tg, T3v));
Chris@10 1423 }
Chris@10 1424 T4R = VSUB(T3t, T3w);
Chris@10 1425 T4U = VSUB(T4S, T4T);
Chris@10 1426 {
Chris@10 1427 V T3q, T3x, T6I, T6J;
Chris@10 1428 T3q = VADD(T3k, T3p);
Chris@10 1429 T3x = VADD(T3t, T3w);
Chris@10 1430 T3y = VADD(T3q, T3x);
Chris@10 1431 T6P = VSUB(T3x, T3q);
Chris@10 1432 T6I = VADD(T4M, T4N);
Chris@10 1433 T6J = VADD(T4S, T4T);
Chris@10 1434 T6K = VSUB(T6I, T6J);
Chris@10 1435 T7p = VADD(T6I, T6J);
Chris@10 1436 }
Chris@10 1437 {
Chris@10 1438 V T4Q, T4V, T53, T54;
Chris@10 1439 T4Q = VSUB(T4O, T4P);
Chris@10 1440 T4V = VADD(T4R, T4U);
Chris@10 1441 T4W = VMUL(LDK(KP707106781), VSUB(T4Q, T4V));
Chris@10 1442 T5R = VMUL(LDK(KP707106781), VADD(T4Q, T4V));
Chris@10 1443 T53 = VSUB(T4R, T4U);
Chris@10 1444 T54 = VADD(T4P, T4O);
Chris@10 1445 T55 = VMUL(LDK(KP707106781), VSUB(T53, T54));
Chris@10 1446 T5O = VMUL(LDK(KP707106781), VADD(T54, T53));
Chris@10 1447 }
Chris@10 1448 }
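	  /* final combination and stores for outputs 0, 8, 16 and 24 */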
Chris@10 1449 {
Chris@10 1450 V T2b, T7x, T7K, T7M, T3A, T7L, T7A, T7B;
Chris@10 1451 {
Chris@10 1452 V T1j, T2a, T7C, T7J;
Chris@10 1453 T1j = VADD(TL, T1i);
Chris@10 1454 T2a = VADD(T1E, T29);
Chris@10 1455 T2b = VADD(T1j, T2a);
Chris@10 1456 T7x = VSUB(T1j, T2a);
Chris@10 1457 T7C = VADD(T7e, T7f);
Chris@10 1458 T7J = VADD(T7D, T7I);
Chris@10 1459 T7K = VADD(T7C, T7J);
Chris@10 1460 T7M = VSUB(T7J, T7C);
Chris@10 1461 }
Chris@10 1462 {
Chris@10 1463 V T2S, T3z, T7y, T7z;
Chris@10 1464 T2S = VADD(T2y, T2R);
Chris@10 1465 T3z = VADD(T3h, T3y);
Chris@10 1466 T3A = VADD(T2S, T3z);
Chris@10 1467 T7L = VSUB(T3z, T2S);
Chris@10 1468 T7y = VADD(T7j, T7k);
Chris@10 1469 T7z = VADD(T7o, T7p);
Chris@10 1470 T7A = VSUB(T7y, T7z);
Chris@10 1471 T7B = VADD(T7y, T7z);
Chris@10 1472 }
Chris@10 1473 ST(&(ri[WS(rs, 16)]), VSUB(T2b, T3A), ms, &(ri[0]));
Chris@10 1474 ST(&(ii[WS(rs, 16)]), VSUB(T7K, T7B), ms, &(ii[0]));
Chris@10 1475 ST(&(ri[0]), VADD(T2b, T3A), ms, &(ri[0]));
Chris@10 1476 ST(&(ii[0]), VADD(T7B, T7K), ms, &(ii[0]));
Chris@10 1477 ST(&(ri[WS(rs, 24)]), VSUB(T7x, T7A), ms, &(ri[0]));
Chris@10 1478 ST(&(ii[WS(rs, 24)]), VSUB(T7M, T7L), ms, &(ii[0]));
Chris@10 1479 ST(&(ri[WS(rs, 8)]), VADD(T7x, T7A), ms, &(ri[0]));
Chris@10 1480 ST(&(ii[WS(rs, 8)]), VADD(T7L, T7M), ms, &(ii[0]));
Chris@10 1481 }
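	  /* final combination and stores for outputs 4, 12, 20 and 28 (KP707106781 rotation of the cross terms) */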
Chris@10 1482 {
Chris@10 1483 V T7h, T7t, T7Q, T7S, T7m, T7u, T7r, T7v;
Chris@10 1484 {
Chris@10 1485 V T7d, T7g, T7O, T7P;
Chris@10 1486 T7d = VSUB(TL, T1i);
Chris@10 1487 T7g = VSUB(T7e, T7f);
Chris@10 1488 T7h = VADD(T7d, T7g);
Chris@10 1489 T7t = VSUB(T7d, T7g);
Chris@10 1490 T7O = VSUB(T29, T1E);
Chris@10 1491 T7P = VSUB(T7I, T7D);
Chris@10 1492 T7Q = VADD(T7O, T7P);
Chris@10 1493 T7S = VSUB(T7P, T7O);
Chris@10 1494 }
Chris@10 1495 {
Chris@10 1496 V T7i, T7l, T7n, T7q;
Chris@10 1497 T7i = VSUB(T2y, T2R);
Chris@10 1498 T7l = VSUB(T7j, T7k);
Chris@10 1499 T7m = VADD(T7i, T7l);
Chris@10 1500 T7u = VSUB(T7l, T7i);
Chris@10 1501 T7n = VSUB(T3h, T3y);
Chris@10 1502 T7q = VSUB(T7o, T7p);
Chris@10 1503 T7r = VSUB(T7n, T7q);
Chris@10 1504 T7v = VADD(T7n, T7q);
Chris@10 1505 }
Chris@10 1506 {
Chris@10 1507 V T7s, T7N, T7w, T7R;
Chris@10 1508 T7s = VMUL(LDK(KP707106781), VADD(T7m, T7r));
Chris@10 1509 ST(&(ri[WS(rs, 20)]), VSUB(T7h, T7s), ms, &(ri[0]));
Chris@10 1510 ST(&(ri[WS(rs, 4)]), VADD(T7h, T7s), ms, &(ri[0]));
Chris@10 1511 T7N = VMUL(LDK(KP707106781), VADD(T7u, T7v));
Chris@10 1512 ST(&(ii[WS(rs, 4)]), VADD(T7N, T7Q), ms, &(ii[0]));
Chris@10 1513 ST(&(ii[WS(rs, 20)]), VSUB(T7Q, T7N), ms, &(ii[0]));
Chris@10 1514 T7w = VMUL(LDK(KP707106781), VSUB(T7u, T7v));
Chris@10 1515 ST(&(ri[WS(rs, 28)]), VSUB(T7t, T7w), ms, &(ri[0]));
Chris@10 1516 ST(&(ri[WS(rs, 12)]), VADD(T7t, T7w), ms, &(ri[0]));
Chris@10 1517 T7R = VMUL(LDK(KP707106781), VSUB(T7r, T7m));
Chris@10 1518 ST(&(ii[WS(rs, 12)]), VADD(T7R, T7S), ms, &(ii[0]));
Chris@10 1519 ST(&(ii[WS(rs, 28)]), VSUB(T7S, T7R), ms, &(ii[0]));
Chris@10 1520 }
Chris@10 1521 }
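	  /* final combination and stores for outputs 2, 6, 10, 14, 18, 22, 26 and 30, using the KP923879532/KP382683432 pair */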
Chris@10 1522 {
Chris@10 1523 V T6j, T7X, T83, T6X, T6u, T7U, T77, T7b, T70, T82, T6G, T6U, T74, T7a, T6R;
Chris@10 1524 V T6V;
Chris@10 1525 {
Chris@10 1526 V T6o, T6t, T6A, T6F;
Chris@10 1527 T6j = VSUB(T6f, T6i);
Chris@10 1528 T7X = VADD(T7V, T7W);
Chris@10 1529 T83 = VSUB(T7W, T7V);
Chris@10 1530 T6X = VADD(T6f, T6i);
Chris@10 1531 T6o = VSUB(T6m, T6n);
Chris@10 1532 T6t = VADD(T6p, T6s);
Chris@10 1533 T6u = VMUL(LDK(KP707106781), VSUB(T6o, T6t));
Chris@10 1534 T7U = VMUL(LDK(KP707106781), VADD(T6o, T6t));
Chris@10 1535 {
Chris@10 1536 V T75, T76, T6Y, T6Z;
Chris@10 1537 T75 = VADD(T6H, T6K);
Chris@10 1538 T76 = VADD(T6O, T6P);
Chris@10 1539 T77 = VFNMS(LDK(KP382683432), T76, VMUL(LDK(KP923879532), T75));
Chris@10 1540 T7b = VFMA(LDK(KP923879532), T76, VMUL(LDK(KP382683432), T75));
Chris@10 1541 T6Y = VADD(T6n, T6m);
Chris@10 1542 T6Z = VSUB(T6p, T6s);
Chris@10 1543 T70 = VMUL(LDK(KP707106781), VADD(T6Y, T6Z));
Chris@10 1544 T82 = VMUL(LDK(KP707106781), VSUB(T6Z, T6Y));
Chris@10 1545 }
Chris@10 1546 T6A = VSUB(T6y, T6z);
Chris@10 1547 T6F = VSUB(T6B, T6E);
Chris@10 1548 T6G = VFMA(LDK(KP923879532), T6A, VMUL(LDK(KP382683432), T6F));
Chris@10 1549 T6U = VFNMS(LDK(KP923879532), T6F, VMUL(LDK(KP382683432), T6A));
Chris@10 1550 {
Chris@10 1551 V T72, T73, T6L, T6Q;
Chris@10 1552 T72 = VADD(T6y, T6z);
Chris@10 1553 T73 = VADD(T6B, T6E);
Chris@10 1554 T74 = VFMA(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T73));
Chris@10 1555 T7a = VFNMS(LDK(KP382683432), T73, VMUL(LDK(KP923879532), T72));
Chris@10 1556 T6L = VSUB(T6H, T6K);
Chris@10 1557 T6Q = VSUB(T6O, T6P);
Chris@10 1558 T6R = VFNMS(LDK(KP923879532), T6Q, VMUL(LDK(KP382683432), T6L));
Chris@10 1559 T6V = VFMA(LDK(KP382683432), T6Q, VMUL(LDK(KP923879532), T6L));
Chris@10 1560 }
Chris@10 1561 }
Chris@10 1562 {
Chris@10 1563 V T6v, T6S, T81, T84;
Chris@10 1564 T6v = VADD(T6j, T6u);
Chris@10 1565 T6S = VADD(T6G, T6R);
Chris@10 1566 ST(&(ri[WS(rs, 22)]), VSUB(T6v, T6S), ms, &(ri[0]));
Chris@10 1567 ST(&(ri[WS(rs, 6)]), VADD(T6v, T6S), ms, &(ri[0]));
Chris@10 1568 T81 = VADD(T6U, T6V);
Chris@10 1569 T84 = VADD(T82, T83);
Chris@10 1570 ST(&(ii[WS(rs, 6)]), VADD(T81, T84), ms, &(ii[0]));
Chris@10 1571 ST(&(ii[WS(rs, 22)]), VSUB(T84, T81), ms, &(ii[0]));
Chris@10 1572 }
Chris@10 1573 {
Chris@10 1574 V T6T, T6W, T85, T86;
Chris@10 1575 T6T = VSUB(T6j, T6u);
Chris@10 1576 T6W = VSUB(T6U, T6V);
Chris@10 1577 ST(&(ri[WS(rs, 30)]), VSUB(T6T, T6W), ms, &(ri[0]));
Chris@10 1578 ST(&(ri[WS(rs, 14)]), VADD(T6T, T6W), ms, &(ri[0]));
Chris@10 1579 T85 = VSUB(T6R, T6G);
Chris@10 1580 T86 = VSUB(T83, T82);
Chris@10 1581 ST(&(ii[WS(rs, 14)]), VADD(T85, T86), ms, &(ii[0]));
Chris@10 1582 ST(&(ii[WS(rs, 30)]), VSUB(T86, T85), ms, &(ii[0]));
Chris@10 1583 }
Chris@10 1584 {
Chris@10 1585 V T71, T78, T7T, T7Y;
Chris@10 1586 T71 = VADD(T6X, T70);
Chris@10 1587 T78 = VADD(T74, T77);
Chris@10 1588 ST(&(ri[WS(rs, 18)]), VSUB(T71, T78), ms, &(ri[0]));
Chris@10 1589 ST(&(ri[WS(rs, 2)]), VADD(T71, T78), ms, &(ri[0]));
Chris@10 1590 T7T = VADD(T7a, T7b);
Chris@10 1591 T7Y = VADD(T7U, T7X);
Chris@10 1592 ST(&(ii[WS(rs, 2)]), VADD(T7T, T7Y), ms, &(ii[0]));
Chris@10 1593 ST(&(ii[WS(rs, 18)]), VSUB(T7Y, T7T), ms, &(ii[0]));
Chris@10 1594 }
Chris@10 1595 {
Chris@10 1596 V T79, T7c, T7Z, T80;
Chris@10 1597 T79 = VSUB(T6X, T70);
Chris@10 1598 T7c = VSUB(T7a, T7b);
Chris@10 1599 ST(&(ri[WS(rs, 26)]), VSUB(T79, T7c), ms, &(ri[0]));
Chris@10 1600 ST(&(ri[WS(rs, 10)]), VADD(T79, T7c), ms, &(ri[0]));
Chris@10 1601 T7Z = VSUB(T77, T74);
Chris@10 1602 T80 = VSUB(T7X, T7U);
Chris@10 1603 ST(&(ii[WS(rs, 10)]), VADD(T7Z, T80), ms, &(ii[0]));
Chris@10 1604 ST(&(ii[WS(rs, 26)]), VSUB(T80, T7Z), ms, &(ii[0]));
Chris@10 1605 }
Chris@10 1606 }
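	  /* final combination and stores for outputs 3, 7, 11, 15, 19, 23, 27 and 31, using the KP831469612/KP555570233 and KP980785280/KP195090322 pairs */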
Chris@10 1607 {
Chris@10 1608 V T3R, T5d, T8r, T8x, T4e, T8o, T5n, T5r, T4G, T5a, T5g, T8w, T5k, T5q, T57;
Chris@10 1609 V T5b, T3Q, T8p;
Chris@10 1610 T3Q = VMUL(LDK(KP707106781), VSUB(T3K, T3P));
Chris@10 1611 T3R = VSUB(T3F, T3Q);
Chris@10 1612 T5d = VADD(T3F, T3Q);
Chris@10 1613 T8p = VMUL(LDK(KP707106781), VSUB(T5v, T5u));
Chris@10 1614 T8r = VADD(T8p, T8q);
Chris@10 1615 T8x = VSUB(T8q, T8p);
Chris@10 1616 {
Chris@10 1617 V T42, T4d, T5l, T5m;
Chris@10 1618 T42 = VFNMS(LDK(KP923879532), T41, VMUL(LDK(KP382683432), T3W));
Chris@10 1619 T4d = VFMA(LDK(KP382683432), T47, VMUL(LDK(KP923879532), T4c));
Chris@10 1620 T4e = VSUB(T42, T4d);
Chris@10 1621 T8o = VADD(T42, T4d);
Chris@10 1622 T5l = VADD(T4L, T4W);
Chris@10 1623 T5m = VADD(T52, T55);
Chris@10 1624 T5n = VFNMS(LDK(KP555570233), T5m, VMUL(LDK(KP831469612), T5l));
Chris@10 1625 T5r = VFMA(LDK(KP831469612), T5m, VMUL(LDK(KP555570233), T5l));
Chris@10 1626 }
Chris@10 1627 {
Chris@10 1628 V T4w, T4F, T5e, T5f;
Chris@10 1629 T4w = VSUB(T4k, T4v);
Chris@10 1630 T4F = VSUB(T4B, T4E);
Chris@10 1631 T4G = VFMA(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4F));
Chris@10 1632 T5a = VFNMS(LDK(KP980785280), T4F, VMUL(LDK(KP195090322), T4w));
Chris@10 1633 T5e = VFMA(LDK(KP923879532), T3W, VMUL(LDK(KP382683432), T41));
Chris@10 1634 T5f = VFNMS(LDK(KP923879532), T47, VMUL(LDK(KP382683432), T4c));
Chris@10 1635 T5g = VADD(T5e, T5f);
Chris@10 1636 T8w = VSUB(T5f, T5e);
Chris@10 1637 }
Chris@10 1638 {
Chris@10 1639 V T5i, T5j, T4X, T56;
Chris@10 1640 T5i = VADD(T4k, T4v);
Chris@10 1641 T5j = VADD(T4B, T4E);
Chris@10 1642 T5k = VFMA(LDK(KP555570233), T5i, VMUL(LDK(KP831469612), T5j));
Chris@10 1643 T5q = VFNMS(LDK(KP555570233), T5j, VMUL(LDK(KP831469612), T5i));
Chris@10 1644 T4X = VSUB(T4L, T4W);
Chris@10 1645 T56 = VSUB(T52, T55);
Chris@10 1646 T57 = VFNMS(LDK(KP980785280), T56, VMUL(LDK(KP195090322), T4X));
Chris@10 1647 T5b = VFMA(LDK(KP195090322), T56, VMUL(LDK(KP980785280), T4X));
Chris@10 1648 }
Chris@10 1649 {
Chris@10 1650 V T4f, T58, T8v, T8y;
Chris@10 1651 T4f = VADD(T3R, T4e);
Chris@10 1652 T58 = VADD(T4G, T57);
Chris@10 1653 ST(&(ri[WS(rs, 23)]), VSUB(T4f, T58), ms, &(ri[WS(rs, 1)]));
Chris@10 1654 ST(&(ri[WS(rs, 7)]), VADD(T4f, T58), ms, &(ri[WS(rs, 1)]));
Chris@10 1655 T8v = VADD(T5a, T5b);
Chris@10 1656 T8y = VADD(T8w, T8x);
Chris@10 1657 ST(&(ii[WS(rs, 7)]), VADD(T8v, T8y), ms, &(ii[WS(rs, 1)]));
Chris@10 1658 ST(&(ii[WS(rs, 23)]), VSUB(T8y, T8v), ms, &(ii[WS(rs, 1)]));
Chris@10 1659 }
Chris@10 1660 {
Chris@10 1661 V T59, T5c, T8z, T8A;
Chris@10 1662 T59 = VSUB(T3R, T4e);
Chris@10 1663 T5c = VSUB(T5a, T5b);
Chris@10 1664 ST(&(ri[WS(rs, 31)]), VSUB(T59, T5c), ms, &(ri[WS(rs, 1)]));
Chris@10 1665 ST(&(ri[WS(rs, 15)]), VADD(T59, T5c), ms, &(ri[WS(rs, 1)]));
Chris@10 1666 T8z = VSUB(T57, T4G);
Chris@10 1667 T8A = VSUB(T8x, T8w);
Chris@10 1668 ST(&(ii[WS(rs, 15)]), VADD(T8z, T8A), ms, &(ii[WS(rs, 1)]));
Chris@10 1669 ST(&(ii[WS(rs, 31)]), VSUB(T8A, T8z), ms, &(ii[WS(rs, 1)]));
Chris@10 1670 }
Chris@10 1671 {
Chris@10 1672 V T5h, T5o, T8n, T8s;
Chris@10 1673 T5h = VADD(T5d, T5g);
Chris@10 1674 T5o = VADD(T5k, T5n);
Chris@10 1675 ST(&(ri[WS(rs, 19)]), VSUB(T5h, T5o), ms, &(ri[WS(rs, 1)]));
Chris@10 1676 ST(&(ri[WS(rs, 3)]), VADD(T5h, T5o), ms, &(ri[WS(rs, 1)]));
Chris@10 1677 T8n = VADD(T5q, T5r);
Chris@10 1678 T8s = VADD(T8o, T8r);
Chris@10 1679 ST(&(ii[WS(rs, 3)]), VADD(T8n, T8s), ms, &(ii[WS(rs, 1)]));
Chris@10 1680 ST(&(ii[WS(rs, 19)]), VSUB(T8s, T8n), ms, &(ii[WS(rs, 1)]));
Chris@10 1681 }
Chris@10 1682 {
Chris@10 1683 V T5p, T5s, T8t, T8u;
Chris@10 1684 T5p = VSUB(T5d, T5g);
Chris@10 1685 T5s = VSUB(T5q, T5r);
Chris@10 1686 ST(&(ri[WS(rs, 27)]), VSUB(T5p, T5s), ms, &(ri[WS(rs, 1)]));
Chris@10 1687 ST(&(ri[WS(rs, 11)]), VADD(T5p, T5s), ms, &(ri[WS(rs, 1)]));
Chris@10 1688 T8t = VSUB(T5n, T5k);
Chris@10 1689 T8u = VSUB(T8r, T8o);
Chris@10 1690 ST(&(ii[WS(rs, 11)]), VADD(T8t, T8u), ms, &(ii[WS(rs, 1)]));
Chris@10 1691 ST(&(ii[WS(rs, 27)]), VSUB(T8u, T8t), ms, &(ii[WS(rs, 1)]));
Chris@10 1692 }
Chris@10 1693 }
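	  /* final combination and stores for outputs 1, 5, 9, 13, 17, 21, 25 and 29 */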
Chris@10 1694 {
Chris@10 1695 V T5x, T5Z, T8d, T8j, T5E, T88, T69, T6d, T5M, T5W, T62, T8i, T66, T6c, T5T;
Chris@10 1696 V T5X, T5w, T89;
Chris@10 1697 T5w = VMUL(LDK(KP707106781), VADD(T5u, T5v));
Chris@10 1698 T5x = VSUB(T5t, T5w);
Chris@10 1699 T5Z = VADD(T5t, T5w);
Chris@10 1700 T89 = VMUL(LDK(KP707106781), VADD(T3K, T3P));
Chris@10 1701 T8d = VADD(T89, T8c);
Chris@10 1702 T8j = VSUB(T8c, T89);
Chris@10 1703 {
Chris@10 1704 V T5A, T5D, T67, T68;
Chris@10 1705 T5A = VFNMS(LDK(KP382683432), T5z, VMUL(LDK(KP923879532), T5y));
Chris@10 1706 T5D = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5C));
Chris@10 1707 T5E = VSUB(T5A, T5D);
Chris@10 1708 T88 = VADD(T5A, T5D);
Chris@10 1709 T67 = VADD(T5N, T5O);
Chris@10 1710 T68 = VADD(T5Q, T5R);
Chris@10 1711 T69 = VFNMS(LDK(KP195090322), T68, VMUL(LDK(KP980785280), T67));
Chris@10 1712 T6d = VFMA(LDK(KP195090322), T67, VMUL(LDK(KP980785280), T68));
Chris@10 1713 }
Chris@10 1714 {
Chris@10 1715 V T5I, T5L, T60, T61;
Chris@10 1716 T5I = VSUB(T5G, T5H);
Chris@10 1717 T5L = VSUB(T5J, T5K);
Chris@10 1718 T5M = VFMA(LDK(KP555570233), T5I, VMUL(LDK(KP831469612), T5L));
Chris@10 1719 T5W = VFNMS(LDK(KP831469612), T5I, VMUL(LDK(KP555570233), T5L));
Chris@10 1720 T60 = VFMA(LDK(KP382683432), T5y, VMUL(LDK(KP923879532), T5z));
Chris@10 1721 T61 = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5C));
Chris@10 1722 T62 = VADD(T60, T61);
Chris@10 1723 T8i = VSUB(T61, T60);
Chris@10 1724 }
Chris@10 1725 {
Chris@10 1726 V T64, T65, T5P, T5S;
Chris@10 1727 T64 = VADD(T5G, T5H);
Chris@10 1728 T65 = VADD(T5J, T5K);
Chris@10 1729 T66 = VFMA(LDK(KP980785280), T64, VMUL(LDK(KP195090322), T65));
Chris@10 1730 T6c = VFNMS(LDK(KP195090322), T64, VMUL(LDK(KP980785280), T65));
Chris@10 1731 T5P = VSUB(T5N, T5O);
Chris@10 1732 T5S = VSUB(T5Q, T5R);
Chris@10 1733 T5T = VFNMS(LDK(KP831469612), T5S, VMUL(LDK(KP555570233), T5P));
Chris@10 1734 T5X = VFMA(LDK(KP831469612), T5P, VMUL(LDK(KP555570233), T5S));
Chris@10 1735 }
Chris@10 1736 {
Chris@10 1737 V T5F, T5U, T8h, T8k;
Chris@10 1738 T5F = VADD(T5x, T5E);
Chris@10 1739 T5U = VADD(T5M, T5T);
Chris@10 1740 ST(&(ri[WS(rs, 21)]), VSUB(T5F, T5U), ms, &(ri[WS(rs, 1)]));
Chris@10 1741 ST(&(ri[WS(rs, 5)]), VADD(T5F, T5U), ms, &(ri[WS(rs, 1)]));
Chris@10 1742 T8h = VADD(T5W, T5X);
Chris@10 1743 T8k = VADD(T8i, T8j);
Chris@10 1744 ST(&(ii[WS(rs, 5)]), VADD(T8h, T8k), ms, &(ii[WS(rs, 1)]));
Chris@10 1745 ST(&(ii[WS(rs, 21)]), VSUB(T8k, T8h), ms, &(ii[WS(rs, 1)]));
Chris@10 1746 }
Chris@10 1747 {
Chris@10 1748 V T5V, T5Y, T8l, T8m;
Chris@10 1749 T5V = VSUB(T5x, T5E);
Chris@10 1750 T5Y = VSUB(T5W, T5X);
Chris@10 1751 ST(&(ri[WS(rs, 29)]), VSUB(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
Chris@10 1752 ST(&(ri[WS(rs, 13)]), VADD(T5V, T5Y), ms, &(ri[WS(rs, 1)]));
Chris@10 1753 T8l = VSUB(T5T, T5M);
Chris@10 1754 T8m = VSUB(T8j, T8i);
Chris@10 1755 ST(&(ii[WS(rs, 13)]), VADD(T8l, T8m), ms, &(ii[WS(rs, 1)]));
Chris@10 1756 ST(&(ii[WS(rs, 29)]), VSUB(T8m, T8l), ms, &(ii[WS(rs, 1)]));
Chris@10 1757 }
Chris@10 1758 {
Chris@10 1759 V T63, T6a, T87, T8e;
Chris@10 1760 T63 = VADD(T5Z, T62);
Chris@10 1761 T6a = VADD(T66, T69);
Chris@10 1762 ST(&(ri[WS(rs, 17)]), VSUB(T63, T6a), ms, &(ri[WS(rs, 1)]));
Chris@10 1763 ST(&(ri[WS(rs, 1)]), VADD(T63, T6a), ms, &(ri[WS(rs, 1)]));
Chris@10 1764 T87 = VADD(T6c, T6d);
Chris@10 1765 T8e = VADD(T88, T8d);
Chris@10 1766 ST(&(ii[WS(rs, 1)]), VADD(T87, T8e), ms, &(ii[WS(rs, 1)]));
Chris@10 1767 ST(&(ii[WS(rs, 17)]), VSUB(T8e, T87), ms, &(ii[WS(rs, 1)]));
Chris@10 1768 }
Chris@10 1769 {
Chris@10 1770 V T6b, T6e, T8f, T8g;
Chris@10 1771 T6b = VSUB(T5Z, T62);
Chris@10 1772 T6e = VSUB(T6c, T6d);
Chris@10 1773 ST(&(ri[WS(rs, 25)]), VSUB(T6b, T6e), ms, &(ri[WS(rs, 1)]));
Chris@10 1774 ST(&(ri[WS(rs, 9)]), VADD(T6b, T6e), ms, &(ri[WS(rs, 1)]));
Chris@10 1775 T8f = VSUB(T69, T66);
Chris@10 1776 T8g = VSUB(T8d, T88);
Chris@10 1777 ST(&(ii[WS(rs, 9)]), VADD(T8f, T8g), ms, &(ii[WS(rs, 1)]));
Chris@10 1778 ST(&(ii[WS(rs, 25)]), VSUB(T8g, T8f), ms, &(ii[WS(rs, 1)]));
Chris@10 1779 }
Chris@10 1780 }
Chris@10 1781 }
Chris@10 1782 }
Chris@10 1783 }
Chris@10 1784 VLEAVE();
Chris@10 1785 }
Chris@10 1786
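/*
 * Trailer: twiddle layout, codelet descriptor and planner registration.
 * A rough reading based on general FFTW codelet conventions (not stated in
 * this generated file): VTW(0, x) asks the planner to precompute the x-th
 * power of the base twiddle factor, so powers 1, 3, 9 and 27 are stored and
 * the remaining powers are rebuilt from them inside the loop; TW_NEXT
 * advances the table by (2 * VL) transforms per iteration. The ct_desc
 * records the radix (32), the codelet name, the twiddle layout, the SIMD
 * genus, and an operation-count estimate ({adds, muls, fmas, other}) used
 * when comparing candidate plans. The XSIMD(codelet_t2sv_32) entry point
 * registers t2sv_32 with the planner as a decimation-in-time twiddle codelet.
 */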
Chris@10 1787 static const tw_instr twinstr[] = {
Chris@10 1788 VTW(0, 1),
Chris@10 1789 VTW(0, 3),
Chris@10 1790 VTW(0, 9),
Chris@10 1791 VTW(0, 27),
Chris@10 1792 {TW_NEXT, (2 * VL), 0}
Chris@10 1793 };
Chris@10 1794
Chris@10 1795 static const ct_desc desc = { 32, XSIMD_STRING("t2sv_32"), twinstr, &GENUS, {376, 168, 112, 0}, 0, 0, 0 };
Chris@10 1796
Chris@10 1797 void XSIMD(codelet_t2sv_32) (planner *p) {
Chris@10 1798 X(kdft_dit_register) (p, t2sv_32, &desc);
Chris@10 1799 }
Chris@10 1800 #endif /* HAVE_FMA */