annotate src/fftw-3.3.3/dft/simd/common/n2sv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:37:49 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include n2s.h -store-multiple 4 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 372 FP additions, 136 FP multiplications,
Chris@10 32 * (or, 236 additions, 0 multiplications, 136 fused multiply/add),
Chris@10 33 * 194 stack variables, 7 constants, and 144 memory accesses
Chris@10 34 */
Chris@10 35 #include "n2s.h"
Chris@10 36
Chris@10 37 static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
Chris@10 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 40 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 41 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
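/* Annotation (not generator output): the DVK constants above appear to
 * be the standard size-32 twiddle values:
 *   KP980785280 = cos(pi/16)    KP198912367 = tan(pi/16)
 *   KP831469612 = cos(3pi/16)   KP668178637 = tan(3pi/16)
 *   KP923879532 = cos(pi/8)     KP414213562 = tan(pi/8) = sqrt(2)-1
 *   KP707106781 = cos(pi/4) = 1/sqrt(2)
 */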
Chris@10 46 {
Chris@10 47 INT i;
Chris@10 48 for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
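/* Annotation: each pass of this loop appears to handle 2*VL iterations
 * of the caller's vector loop at once (i counts down by 2*VL and every
 * pointer advances by 2*VL strides); the MAKE_VOLATILE_STRIDE calls
 * look like an optimizer hint on the stride values rather than
 * arithmetic that affects the result. */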
Chris@10 49 V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
Chris@10 50 V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
Chris@10 51 V T6v, T6w, T3g, T3f, T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E, T4p, T49, T4l;
Chris@10 52 V T4j, T6F, T6G, T6H, T6I, T6J, T6K, T6L, T6M, T3n, T3b, T3r, T3l, T3o, T3e;
Chris@10 53 V T4q, T4o, T4k, T4g, T3h, T3p;
Chris@10 54 {
Chris@10 55 V T2T, T3T, T4r, T7, T3t, T1z, T18, T4Z, Te, T50, T1f, T4s, T1G, T3U, T2W;
Chris@10 56 V T3u, Tm, T1n, T3X, T3y, T2Z, T1O, T53, T4w, Tt, T1u, T3W, T3B, T2Y, T1V;
Chris@10 57 V T52, T4z, T3O, T2t, T3L, T2K, TZ, T5F, T4R, T5k, T5j, T4W, T5I, T5X, T2E;
Chris@10 58 V T3M, T2N, T3P, T3H, T22, T3E, T2j, T4G, T5h, TK, T5A, T5D, T5W, T2d, T3F;
Chris@10 59 V T4L, T5g, T3I, T2m;
Chris@10 60 {
Chris@10 61 V T1L, T1j, T1k, T1l, T4v, T1K, T3w;
Chris@10 62 {
Chris@10 63 V T1, T2, T12, T13, T4, T5, T15, T16;
Chris@10 64 T1 = LD(&(ri[0]), ivs, &(ri[0]));
Chris@10 65 T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
Chris@10 66 T12 = LD(&(ii[0]), ivs, &(ii[0]));
Chris@10 67 T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
Chris@10 68 T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
Chris@10 69 T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
Chris@10 70 T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
Chris@10 71 T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
Chris@10 72 {
Chris@10 73 V Tb, T1A, Ta, T1B, T1b, Tc, T1c, T1d;
Chris@10 74 {
Chris@10 75 V T8, T1x, T3, T2R, T14, T2S, T6, T1y, T17, T9, T19, T1a;
Chris@10 76 T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
Chris@10 77 T1x = VSUB(T1, T2);
Chris@10 78 T3 = VADD(T1, T2);
Chris@10 79 T2R = VSUB(T12, T13);
Chris@10 80 T14 = VADD(T12, T13);
Chris@10 81 T2S = VSUB(T4, T5);
Chris@10 82 T6 = VADD(T4, T5);
Chris@10 83 T1y = VSUB(T15, T16);
Chris@10 84 T17 = VADD(T15, T16);
Chris@10 85 T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
Chris@10 86 T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
Chris@10 87 T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
Chris@10 88 Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
Chris@10 89 T2T = VSUB(T2R, T2S);
Chris@10 90 T3T = VADD(T2S, T2R);
Chris@10 91 T4r = VSUB(T3, T6);
Chris@10 92 T7 = VADD(T3, T6);
Chris@10 93 T3t = VSUB(T1x, T1y);
Chris@10 94 T1z = VADD(T1x, T1y);
Chris@10 95 T18 = VADD(T14, T17);
Chris@10 96 T4Z = VSUB(T14, T17);
Chris@10 97 T1A = VSUB(T8, T9);
Chris@10 98 Ta = VADD(T8, T9);
Chris@10 99 T1B = VSUB(T19, T1a);
Chris@10 100 T1b = VADD(T19, T1a);
Chris@10 101 Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
Chris@10 102 T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
Chris@10 103 T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
Chris@10 104 }
Chris@10 105 {
Chris@10 106 V Ti, T1I, T1J, Tl;
Chris@10 107 {
Chris@10 108 V T1h, T1C, T2U, T1D, Td, T1E, T1e, T1i, Tg, Th;
Chris@10 109 Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
Chris@10 110 Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
Chris@10 111 T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
Chris@10 112 T1C = VADD(T1A, T1B);
Chris@10 113 T2U = VSUB(T1B, T1A);
Chris@10 114 T1D = VSUB(Tb, Tc);
Chris@10 115 Td = VADD(Tb, Tc);
Chris@10 116 T1E = VSUB(T1c, T1d);
Chris@10 117 T1e = VADD(T1c, T1d);
Chris@10 118 T1L = VSUB(Tg, Th);
Chris@10 119 Ti = VADD(Tg, Th);
Chris@10 120 T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
Chris@10 121 {
Chris@10 122 V T2V, T1F, Tj, Tk;
Chris@10 123 Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
Chris@10 124 Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
Chris@10 125 Te = VADD(Ta, Td);
Chris@10 126 T50 = VSUB(Td, Ta);
Chris@10 127 T2V = VADD(T1D, T1E);
Chris@10 128 T1F = VSUB(T1D, T1E);
Chris@10 129 T1f = VADD(T1b, T1e);
Chris@10 130 T4s = VSUB(T1b, T1e);
Chris@10 131 T1j = VADD(T1h, T1i);
Chris@10 132 T1I = VSUB(T1h, T1i);
Chris@10 133 T1J = VSUB(Tj, Tk);
Chris@10 134 Tl = VADD(Tj, Tk);
Chris@10 135 T1G = VADD(T1C, T1F);
Chris@10 136 T3U = VSUB(T1F, T1C);
Chris@10 137 T2W = VADD(T2U, T2V);
Chris@10 138 T3u = VSUB(T2U, T2V);
Chris@10 139 T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
Chris@10 140 T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
Chris@10 141 }
Chris@10 142 }
Chris@10 143 T4v = VSUB(Ti, Tl);
Chris@10 144 Tm = VADD(Ti, Tl);
Chris@10 145 T1K = VSUB(T1I, T1J);
Chris@10 146 T3w = VADD(T1J, T1I);
Chris@10 147 }
Chris@10 148 }
Chris@10 149 }
Chris@10 150 {
Chris@10 151 V T1r, T1S, T1q, T1s, T4x, T1R, T3z;
Chris@10 152 {
Chris@10 153 V Tp, T1P, T1Q, Ts;
Chris@10 154 {
Chris@10 155 V Tn, To, T1o, T1M, T1m, T1p;
Chris@10 156 Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
Chris@10 157 To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
Chris@10 158 T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
Chris@10 159 T1M = VSUB(T1k, T1l);
Chris@10 160 T1m = VADD(T1k, T1l);
Chris@10 161 T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
Chris@10 162 {
Chris@10 163 V Tq, Tr, T3x, T1N, T4u;
Chris@10 164 Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
Chris@10 165 Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
Chris@10 166 T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
Chris@10 167 T1S = VSUB(Tn, To);
Chris@10 168 Tp = VADD(Tn, To);
Chris@10 169 T3x = VSUB(T1L, T1M);
Chris@10 170 T1N = VADD(T1L, T1M);
Chris@10 171 T4u = VSUB(T1j, T1m);
Chris@10 172 T1n = VADD(T1j, T1m);
Chris@10 173 T1P = VSUB(T1o, T1p);
Chris@10 174 T1q = VADD(T1o, T1p);
Chris@10 175 T1Q = VSUB(Tq, Tr);
Chris@10 176 Ts = VADD(Tq, Tr);
Chris@10 177 T3X = VFNMS(LDK(KP414213562), T3w, T3x);
Chris@10 178 T3y = VFMA(LDK(KP414213562), T3x, T3w);
Chris@10 179 T2Z = VFMA(LDK(KP414213562), T1K, T1N);
Chris@10 180 T1O = VFNMS(LDK(KP414213562), T1N, T1K);
Chris@10 181 T53 = VADD(T4v, T4u);
Chris@10 182 T4w = VSUB(T4u, T4v);
Chris@10 183 T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
Chris@10 184 }
Chris@10 185 }
Chris@10 186 T4x = VSUB(Tp, Ts);
Chris@10 187 Tt = VADD(Tp, Ts);
Chris@10 188 T1R = VSUB(T1P, T1Q);
Chris@10 189 T3z = VADD(T1Q, T1P);
Chris@10 190 }
Chris@10 191 {
Chris@10 192 V T4S, T5G, T2y, T2L, T4V, T5H, T2D, T2M;
Chris@10 193 {
Chris@10 194 V T2G, TN, T4N, T2r, T2s, TQ, T2A, T4O, T2J, T2x, TU, T4T, T2w, T2z, TX;
Chris@10 195 V T2B, T2H, T2I, TR;
Chris@10 196 {
Chris@10 197 V TL, TM, T2p, T1T, T1t, T2q;
Chris@10 198 TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
Chris@10 199 TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
Chris@10 200 T2p = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
Chris@10 201 T1T = VSUB(T1r, T1s);
Chris@10 202 T1t = VADD(T1r, T1s);
Chris@10 203 T2q = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
Chris@10 204 {
Chris@10 205 V TO, TP, T3A, T1U, T4y;
Chris@10 206 TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
Chris@10 207 TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
Chris@10 208 T2H = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
Chris@10 209 T2G = VSUB(TL, TM);
Chris@10 210 TN = VADD(TL, TM);
Chris@10 211 T3A = VSUB(T1S, T1T);
Chris@10 212 T1U = VADD(T1S, T1T);
Chris@10 213 T4y = VSUB(T1q, T1t);
Chris@10 214 T1u = VADD(T1q, T1t);
Chris@10 215 T4N = VADD(T2p, T2q);
Chris@10 216 T2r = VSUB(T2p, T2q);
Chris@10 217 T2s = VSUB(TO, TP);
Chris@10 218 TQ = VADD(TO, TP);
Chris@10 219 T3W = VFMA(LDK(KP414213562), T3z, T3A);
Chris@10 220 T3B = VFNMS(LDK(KP414213562), T3A, T3z);
Chris@10 221 T2Y = VFNMS(LDK(KP414213562), T1R, T1U);
Chris@10 222 T1V = VFMA(LDK(KP414213562), T1U, T1R);
Chris@10 223 T52 = VSUB(T4x, T4y);
Chris@10 224 T4z = VADD(T4x, T4y);
Chris@10 225 T2I = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
Chris@10 226 }
Chris@10 227 }
Chris@10 228 {
Chris@10 229 V TS, TT, T2u, T2v, TV, TW;
Chris@10 230 TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
Chris@10 231 TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
Chris@10 232 T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
Chris@10 233 T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
Chris@10 234 TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
Chris@10 235 TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
Chris@10 236 T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
Chris@10 237 T4O = VADD(T2H, T2I);
Chris@10 238 T2J = VSUB(T2H, T2I);
Chris@10 239 T2x = VSUB(TS, TT);
Chris@10 240 TU = VADD(TS, TT);
Chris@10 241 T4T = VADD(T2u, T2v);
Chris@10 242 T2w = VSUB(T2u, T2v);
Chris@10 243 T2z = VSUB(TV, TW);
Chris@10 244 TX = VADD(TV, TW);
Chris@10 245 T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
Chris@10 246 }
Chris@10 247 T3O = VADD(T2s, T2r);
Chris@10 248 T2t = VSUB(T2r, T2s);
Chris@10 249 T3L = VSUB(T2G, T2J);
Chris@10 250 T2K = VADD(T2G, T2J);
Chris@10 251 T4S = VSUB(TN, TQ);
Chris@10 252 TR = VADD(TN, TQ);
Chris@10 253 {
Chris@10 254 V T4P, T4Q, TY, T4U, T2C;
Chris@10 255 T5G = VADD(T4N, T4O);
Chris@10 256 T4P = VSUB(T4N, T4O);
Chris@10 257 T4Q = VSUB(TX, TU);
Chris@10 258 TY = VADD(TU, TX);
Chris@10 259 T4U = VADD(T2A, T2B);
Chris@10 260 T2C = VSUB(T2A, T2B);
Chris@10 261 T2y = VSUB(T2w, T2x);
Chris@10 262 T2L = VADD(T2x, T2w);
Chris@10 263 TZ = VADD(TR, TY);
Chris@10 264 T5F = VSUB(TR, TY);
Chris@10 265 T4V = VSUB(T4T, T4U);
Chris@10 266 T5H = VADD(T4T, T4U);
Chris@10 267 T2D = VADD(T2z, T2C);
Chris@10 268 T2M = VSUB(T2z, T2C);
Chris@10 269 T4R = VSUB(T4P, T4Q);
Chris@10 270 T5k = VADD(T4Q, T4P);
Chris@10 271 }
Chris@10 272 }
Chris@10 273 {
Chris@10 274 V T2f, Ty, T23, T4C, T20, T21, TB, T4D, T2i, T26, TF, T24, TG, TH, T29;
Chris@10 275 V T2a;
Chris@10 276 {
Chris@10 277 V T1Y, T1Z, Tz, TA, T2g, T2h, Tw, Tx, TD, TE;
Chris@10 278 Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
Chris@10 279 Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
Chris@10 280 T5j = VADD(T4S, T4V);
Chris@10 281 T4W = VSUB(T4S, T4V);
Chris@10 282 T5I = VSUB(T5G, T5H);
Chris@10 283 T5X = VADD(T5G, T5H);
Chris@10 284 T2E = VADD(T2y, T2D);
Chris@10 285 T3M = VSUB(T2D, T2y);
Chris@10 286 T2N = VADD(T2L, T2M);
Chris@10 287 T3P = VSUB(T2L, T2M);
Chris@10 288 T2f = VSUB(Tw, Tx);
Chris@10 289 Ty = VADD(Tw, Tx);
Chris@10 290 T1Y = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
Chris@10 291 T1Z = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
Chris@10 292 Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
Chris@10 293 TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
Chris@10 294 T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
Chris@10 295 T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
Chris@10 296 TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
Chris@10 297 TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
Chris@10 298 T23 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
Chris@10 299 T4C = VADD(T1Y, T1Z);
Chris@10 300 T20 = VSUB(T1Y, T1Z);
Chris@10 301 T21 = VSUB(Tz, TA);
Chris@10 302 TB = VADD(Tz, TA);
Chris@10 303 T4D = VADD(T2g, T2h);
Chris@10 304 T2i = VSUB(T2g, T2h);
Chris@10 305 T26 = VSUB(TD, TE);
Chris@10 306 TF = VADD(TD, TE);
Chris@10 307 T24 = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
Chris@10 308 TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
Chris@10 309 TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
Chris@10 310 T29 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
Chris@10 311 T2a = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
Chris@10 312 }
Chris@10 313 {
Chris@10 314 V T4I, T25, T28, TI, T4J, T2b, T4H, TC, T5B, T4E;
Chris@10 315 T3H = VADD(T21, T20);
Chris@10 316 T22 = VSUB(T20, T21);
Chris@10 317 T3E = VSUB(T2f, T2i);
Chris@10 318 T2j = VADD(T2f, T2i);
Chris@10 319 T4I = VADD(T23, T24);
Chris@10 320 T25 = VSUB(T23, T24);
Chris@10 321 T28 = VSUB(TG, TH);
Chris@10 322 TI = VADD(TG, TH);
Chris@10 323 T4J = VADD(T29, T2a);
Chris@10 324 T2b = VSUB(T29, T2a);
Chris@10 325 T4H = VSUB(Ty, TB);
Chris@10 326 TC = VADD(Ty, TB);
Chris@10 327 T5B = VADD(T4C, T4D);
Chris@10 328 T4E = VSUB(T4C, T4D);
Chris@10 329 {
Chris@10 330 V T27, T2k, TJ, T4F, T4K, T5C, T2c, T2l;
Chris@10 331 T27 = VSUB(T25, T26);
Chris@10 332 T2k = VADD(T26, T25);
Chris@10 333 TJ = VADD(TF, TI);
Chris@10 334 T4F = VSUB(TI, TF);
Chris@10 335 T4K = VSUB(T4I, T4J);
Chris@10 336 T5C = VADD(T4I, T4J);
Chris@10 337 T2c = VADD(T28, T2b);
Chris@10 338 T2l = VSUB(T28, T2b);
Chris@10 339 T4G = VSUB(T4E, T4F);
Chris@10 340 T5h = VADD(T4F, T4E);
Chris@10 341 TK = VADD(TC, TJ);
Chris@10 342 T5A = VSUB(TC, TJ);
Chris@10 343 T5D = VSUB(T5B, T5C);
Chris@10 344 T5W = VADD(T5B, T5C);
Chris@10 345 T2d = VADD(T27, T2c);
Chris@10 346 T3F = VSUB(T2c, T27);
Chris@10 347 T4L = VSUB(T4H, T4K);
Chris@10 348 T5g = VADD(T4H, T4K);
Chris@10 349 T3I = VSUB(T2k, T2l);
Chris@10 350 T2m = VADD(T2k, T2l);
Chris@10 351 }
Chris@10 352 }
Chris@10 353 }
Chris@10 354 }
Chris@10 355 }
Chris@10 356 }
Chris@10 357 {
Chris@10 358 V T1v, T1g, T5V, Tv, T60, T5Y, T11, T10;
Chris@10 359 {
Chris@10 360 V T5o, T5n, T5i, T5r, T5f, T5l, T5w, T5u;
Chris@10 361 {
Chris@10 362 V T5d, T4t, T4A, T4X, T58, T51, T4M, T59, T54, T5e, T5b, T4B;
Chris@10 363 T5d = VADD(T4r, T4s);
Chris@10 364 T4t = VSUB(T4r, T4s);
Chris@10 365 T4A = VSUB(T4w, T4z);
Chris@10 366 T5o = VADD(T4w, T4z);
Chris@10 367 T4X = VFNMS(LDK(KP414213562), T4W, T4R);
Chris@10 368 T58 = VFMA(LDK(KP414213562), T4R, T4W);
Chris@10 369 T5n = VADD(T50, T4Z);
Chris@10 370 T51 = VSUB(T4Z, T50);
Chris@10 371 T4M = VFMA(LDK(KP414213562), T4L, T4G);
Chris@10 372 T59 = VFNMS(LDK(KP414213562), T4G, T4L);
Chris@10 373 T54 = VSUB(T52, T53);
Chris@10 374 T5e = VADD(T53, T52);
Chris@10 375 T5b = VFNMS(LDK(KP707106781), T4A, T4t);
Chris@10 376 T4B = VFMA(LDK(KP707106781), T4A, T4t);
Chris@10 377 {
Chris@10 378 V T5s, T56, T4Y, T5c, T5a, T57, T55, T5t;
Chris@10 379 T5i = VFMA(LDK(KP414213562), T5h, T5g);
Chris@10 380 T5s = VFNMS(LDK(KP414213562), T5g, T5h);
Chris@10 381 T56 = VADD(T4M, T4X);
Chris@10 382 T4Y = VSUB(T4M, T4X);
Chris@10 383 T5c = VADD(T59, T58);
Chris@10 384 T5a = VSUB(T58, T59);
Chris@10 385 T57 = VFMA(LDK(KP707106781), T54, T51);
Chris@10 386 T55 = VFNMS(LDK(KP707106781), T54, T51);
Chris@10 387 T5r = VFNMS(LDK(KP707106781), T5e, T5d);
Chris@10 388 T5f = VFMA(LDK(KP707106781), T5e, T5d);
Chris@10 389 T5t = VFMA(LDK(KP414213562), T5j, T5k);
Chris@10 390 T5l = VFNMS(LDK(KP414213562), T5k, T5j);
Chris@10 391 T61 = VFMA(LDK(KP923879532), T4Y, T4B);
Chris@10 392 STM4(&(ro[6]), T61, ovs, &(ro[0]));
Chris@10 393 T62 = VFNMS(LDK(KP923879532), T4Y, T4B);
Chris@10 394 STM4(&(ro[22]), T62, ovs, &(ro[0]));
Chris@10 395 T63 = VFMA(LDK(KP923879532), T5c, T5b);
Chris@10 396 STM4(&(ro[30]), T63, ovs, &(ro[0]));
Chris@10 397 T64 = VFNMS(LDK(KP923879532), T5c, T5b);
Chris@10 398 STM4(&(ro[14]), T64, ovs, &(ro[0]));
Chris@10 399 T65 = VFMA(LDK(KP923879532), T5a, T57);
Chris@10 400 STM4(&(io[6]), T65, ovs, &(io[0]));
Chris@10 401 T66 = VFNMS(LDK(KP923879532), T5a, T57);
Chris@10 402 STM4(&(io[22]), T66, ovs, &(io[0]));
Chris@10 403 T67 = VFMA(LDK(KP923879532), T56, T55);
Chris@10 404 STM4(&(io[30]), T67, ovs, &(io[0]));
Chris@10 405 T68 = VFNMS(LDK(KP923879532), T56, T55);
Chris@10 406 STM4(&(io[14]), T68, ovs, &(io[0]));
Chris@10 407 T5w = VADD(T5s, T5t);
Chris@10 408 T5u = VSUB(T5s, T5t);
Chris@10 409 }
Chris@10 410 }
Chris@10 411 {
Chris@10 412 V Tf, T5P, T5z, T5S, T5U, T5O, T5K, T5L, T5M, Tu, T5T, T5N;
Chris@10 413 {
Chris@10 414 V T5E, T5Q, T5q, T5m, T5v, T5p, T5R, T5J, T5x, T5y;
Chris@10 415 Tf = VADD(T7, Te);
Chris@10 416 T5x = VSUB(T7, Te);
Chris@10 417 T5y = VSUB(T1n, T1u);
Chris@10 418 T1v = VADD(T1n, T1u);
Chris@10 419 T69 = VFMA(LDK(KP923879532), T5u, T5r);
Chris@10 420 STM4(&(ro[10]), T69, ovs, &(ro[0]));
Chris@10 421 T6a = VFNMS(LDK(KP923879532), T5u, T5r);
Chris@10 422 STM4(&(ro[26]), T6a, ovs, &(ro[0]));
Chris@10 423 T5E = VADD(T5A, T5D);
Chris@10 424 T5Q = VSUB(T5D, T5A);
Chris@10 425 T5q = VSUB(T5l, T5i);
Chris@10 426 T5m = VADD(T5i, T5l);
Chris@10 427 T5v = VFMA(LDK(KP707106781), T5o, T5n);
Chris@10 428 T5p = VFNMS(LDK(KP707106781), T5o, T5n);
Chris@10 429 T5P = VSUB(T5x, T5y);
Chris@10 430 T5z = VADD(T5x, T5y);
Chris@10 431 T5R = VADD(T5F, T5I);
Chris@10 432 T5J = VSUB(T5F, T5I);
Chris@10 433 T6b = VFMA(LDK(KP923879532), T5m, T5f);
Chris@10 434 STM4(&(ro[2]), T6b, ovs, &(ro[0]));
Chris@10 435 T6c = VFNMS(LDK(KP923879532), T5m, T5f);
Chris@10 436 STM4(&(ro[18]), T6c, ovs, &(ro[0]));
Chris@10 437 T6d = VFMA(LDK(KP923879532), T5w, T5v);
Chris@10 438 STM4(&(io[2]), T6d, ovs, &(io[0]));
Chris@10 439 T6e = VFNMS(LDK(KP923879532), T5w, T5v);
Chris@10 440 STM4(&(io[18]), T6e, ovs, &(io[0]));
Chris@10 441 T6f = VFMA(LDK(KP923879532), T5q, T5p);
Chris@10 442 STM4(&(io[10]), T6f, ovs, &(io[0]));
Chris@10 443 T6g = VFNMS(LDK(KP923879532), T5q, T5p);
Chris@10 444 STM4(&(io[26]), T6g, ovs, &(io[0]));
Chris@10 445 T5S = VSUB(T5Q, T5R);
Chris@10 446 T5U = VADD(T5Q, T5R);
Chris@10 447 T5O = VSUB(T5J, T5E);
Chris@10 448 T5K = VADD(T5E, T5J);
Chris@10 449 T1g = VADD(T18, T1f);
Chris@10 450 T5L = VSUB(T18, T1f);
Chris@10 451 T5M = VSUB(Tt, Tm);
Chris@10 452 Tu = VADD(Tm, Tt);
Chris@10 453 }
Chris@10 454 T6h = VFMA(LDK(KP707106781), T5S, T5P);
Chris@10 455 STM4(&(ro[12]), T6h, ovs, &(ro[0]));
Chris@10 456 T6i = VFNMS(LDK(KP707106781), T5S, T5P);
Chris@10 457 STM4(&(ro[28]), T6i, ovs, &(ro[0]));
Chris@10 458 T6j = VFMA(LDK(KP707106781), T5K, T5z);
Chris@10 459 STM4(&(ro[4]), T6j, ovs, &(ro[0]));
Chris@10 460 T6k = VFNMS(LDK(KP707106781), T5K, T5z);
Chris@10 461 STM4(&(ro[20]), T6k, ovs, &(ro[0]));
Chris@10 462 T5T = VADD(T5M, T5L);
Chris@10 463 T5N = VSUB(T5L, T5M);
Chris@10 464 T5V = VSUB(Tf, Tu);
Chris@10 465 Tv = VADD(Tf, Tu);
Chris@10 466 T6l = VFMA(LDK(KP707106781), T5U, T5T);
Chris@10 467 STM4(&(io[4]), T6l, ovs, &(io[0]));
Chris@10 468 T6m = VFNMS(LDK(KP707106781), T5U, T5T);
Chris@10 469 STM4(&(io[20]), T6m, ovs, &(io[0]));
Chris@10 470 T6n = VFMA(LDK(KP707106781), T5O, T5N);
Chris@10 471 STM4(&(io[12]), T6n, ovs, &(io[0]));
Chris@10 472 T6o = VFNMS(LDK(KP707106781), T5O, T5N);
Chris@10 473 STM4(&(io[28]), T6o, ovs, &(io[0]));
Chris@10 474 T60 = VADD(T5W, T5X);
Chris@10 475 T5Y = VSUB(T5W, T5X);
Chris@10 476 T11 = VSUB(TZ, TK);
Chris@10 477 T10 = VADD(TK, TZ);
Chris@10 478 }
Chris@10 479 }
Chris@10 480 {
Chris@10 481 V T39, T3k, T3j, T3a, T1X, T37, T33, T31, T3d, T3c, T47, T4i, T4h, T48, T4b;
Chris@10 482 V T4a, T4e, T3N, T41, T3D, T45, T3Z, T38, T36, T32, T2Q, T42, T3K, T3Q, T4d;
Chris@10 483 {
Chris@10 484 V T2e, T2n, T2F, T2O, T1w, T5Z;
Chris@10 485 {
Chris@10 486 V T1H, T1W, T2X, T30;
Chris@10 487 T39 = VFMA(LDK(KP707106781), T1G, T1z);
Chris@10 488 T1H = VFNMS(LDK(KP707106781), T1G, T1z);
Chris@10 489 T1W = VSUB(T1O, T1V);
Chris@10 490 T3k = VADD(T1O, T1V);
Chris@10 491 T3j = VFMA(LDK(KP707106781), T2W, T2T);
Chris@10 492 T2X = VFNMS(LDK(KP707106781), T2W, T2T);
Chris@10 493 T30 = VSUB(T2Y, T2Z);
Chris@10 494 T3a = VADD(T2Z, T2Y);
Chris@10 495 T6p = VSUB(T5V, T5Y);
Chris@10 496 STM4(&(ro[24]), T6p, ovs, &(ro[0]));
Chris@10 497 T6q = VADD(T5V, T5Y);
Chris@10 498 STM4(&(ro[8]), T6q, ovs, &(ro[0]));
Chris@10 499 T6r = VADD(Tv, T10);
Chris@10 500 STM4(&(ro[0]), T6r, ovs, &(ro[0]));
Chris@10 501 T6s = VSUB(Tv, T10);
Chris@10 502 STM4(&(ro[16]), T6s, ovs, &(ro[0]));
Chris@10 503 T1w = VSUB(T1g, T1v);
Chris@10 504 T5Z = VADD(T1g, T1v);
Chris@10 505 T1X = VFMA(LDK(KP923879532), T1W, T1H);
Chris@10 506 T37 = VFNMS(LDK(KP923879532), T1W, T1H);
Chris@10 507 T33 = VFMA(LDK(KP923879532), T30, T2X);
Chris@10 508 T31 = VFNMS(LDK(KP923879532), T30, T2X);
Chris@10 509 }
Chris@10 510 T3d = VFMA(LDK(KP707106781), T2d, T22);
Chris@10 511 T2e = VFNMS(LDK(KP707106781), T2d, T22);
Chris@10 512 T2n = VFNMS(LDK(KP707106781), T2m, T2j);
Chris@10 513 T3c = VFMA(LDK(KP707106781), T2m, T2j);
Chris@10 514 T6t = VADD(T5Z, T60);
Chris@10 515 STM4(&(io[0]), T6t, ovs, &(io[0]));
Chris@10 516 T6u = VSUB(T5Z, T60);
Chris@10 517 STM4(&(io[16]), T6u, ovs, &(io[0]));
Chris@10 518 T6v = VSUB(T1w, T11);
Chris@10 519 STM4(&(io[24]), T6v, ovs, &(io[0]));
Chris@10 520 T6w = VADD(T11, T1w);
Chris@10 521 STM4(&(io[8]), T6w, ovs, &(io[0]));
Chris@10 522 T3g = VFMA(LDK(KP707106781), T2E, T2t);
Chris@10 523 T2F = VFNMS(LDK(KP707106781), T2E, T2t);
Chris@10 524 T2O = VFNMS(LDK(KP707106781), T2N, T2K);
Chris@10 525 T3f = VFMA(LDK(KP707106781), T2N, T2K);
Chris@10 526 {
Chris@10 527 V T3v, T35, T2o, T3C, T3V, T3Y;
Chris@10 528 T47 = VFNMS(LDK(KP707106781), T3u, T3t);
Chris@10 529 T3v = VFMA(LDK(KP707106781), T3u, T3t);
Chris@10 530 T35 = VFNMS(LDK(KP668178637), T2e, T2n);
Chris@10 531 T2o = VFMA(LDK(KP668178637), T2n, T2e);
Chris@10 532 T3C = VSUB(T3y, T3B);
Chris@10 533 T4i = VADD(T3y, T3B);
Chris@10 534 T4h = VFNMS(LDK(KP707106781), T3U, T3T);
Chris@10 535 T3V = VFMA(LDK(KP707106781), T3U, T3T);
Chris@10 536 T3Y = VSUB(T3W, T3X);
Chris@10 537 T48 = VADD(T3X, T3W);
Chris@10 538 {
Chris@10 539 V T3G, T34, T2P, T3J;
Chris@10 540 T4b = VFMA(LDK(KP707106781), T3F, T3E);
Chris@10 541 T3G = VFNMS(LDK(KP707106781), T3F, T3E);
Chris@10 542 T34 = VFMA(LDK(KP668178637), T2F, T2O);
Chris@10 543 T2P = VFNMS(LDK(KP668178637), T2O, T2F);
Chris@10 544 T3J = VFNMS(LDK(KP707106781), T3I, T3H);
Chris@10 545 T4a = VFMA(LDK(KP707106781), T3I, T3H);
Chris@10 546 T4e = VFMA(LDK(KP707106781), T3M, T3L);
Chris@10 547 T3N = VFNMS(LDK(KP707106781), T3M, T3L);
Chris@10 548 T41 = VFNMS(LDK(KP923879532), T3C, T3v);
Chris@10 549 T3D = VFMA(LDK(KP923879532), T3C, T3v);
Chris@10 550 T45 = VFMA(LDK(KP923879532), T3Y, T3V);
Chris@10 551 T3Z = VFNMS(LDK(KP923879532), T3Y, T3V);
Chris@10 552 T38 = VADD(T35, T34);
Chris@10 553 T36 = VSUB(T34, T35);
Chris@10 554 T32 = VADD(T2o, T2P);
Chris@10 555 T2Q = VSUB(T2o, T2P);
Chris@10 556 T42 = VFNMS(LDK(KP668178637), T3G, T3J);
Chris@10 557 T3K = VFMA(LDK(KP668178637), T3J, T3G);
Chris@10 558 T3Q = VFNMS(LDK(KP707106781), T3P, T3O);
Chris@10 559 T4d = VFMA(LDK(KP707106781), T3P, T3O);
Chris@10 560 }
Chris@10 561 }
Chris@10 562 }
Chris@10 563 {
Chris@10 564 V T4n, T4c, T43, T3R, T4m, T4f;
Chris@10 565 T6x = VFMA(LDK(KP831469612), T38, T37);
Chris@10 566 STM4(&(ro[29]), T6x, ovs, &(ro[1]));
Chris@10 567 T6y = VFNMS(LDK(KP831469612), T38, T37);
Chris@10 568 STM4(&(ro[13]), T6y, ovs, &(ro[1]));
Chris@10 569 T6z = VFMA(LDK(KP831469612), T36, T33);
Chris@10 570 STM4(&(io[5]), T6z, ovs, &(io[1]));
Chris@10 571 T6A = VFNMS(LDK(KP831469612), T36, T33);
Chris@10 572 STM4(&(io[21]), T6A, ovs, &(io[1]));
Chris@10 573 T6B = VFMA(LDK(KP831469612), T32, T31);
Chris@10 574 STM4(&(io[29]), T6B, ovs, &(io[1]));
Chris@10 575 T6C = VFNMS(LDK(KP831469612), T32, T31);
Chris@10 576 STM4(&(io[13]), T6C, ovs, &(io[1]));
Chris@10 577 T6D = VFMA(LDK(KP831469612), T2Q, T1X);
Chris@10 578 STM4(&(ro[5]), T6D, ovs, &(ro[1]));
Chris@10 579 T6E = VFNMS(LDK(KP831469612), T2Q, T1X);
Chris@10 580 STM4(&(ro[21]), T6E, ovs, &(ro[1]));
Chris@10 581 T43 = VFMA(LDK(KP668178637), T3N, T3Q);
Chris@10 582 T3R = VFNMS(LDK(KP668178637), T3Q, T3N);
Chris@10 583 {
Chris@10 584 V T44, T46, T40, T3S;
Chris@10 585 T44 = VSUB(T42, T43);
Chris@10 586 T46 = VADD(T42, T43);
Chris@10 587 T40 = VSUB(T3R, T3K);
Chris@10 588 T3S = VADD(T3K, T3R);
Chris@10 589 T4p = VFMA(LDK(KP923879532), T48, T47);
Chris@10 590 T49 = VFNMS(LDK(KP923879532), T48, T47);
Chris@10 591 T4l = VFNMS(LDK(KP923879532), T4i, T4h);
Chris@10 592 T4j = VFMA(LDK(KP923879532), T4i, T4h);
Chris@10 593 T4n = VFNMS(LDK(KP198912367), T4a, T4b);
Chris@10 594 T4c = VFMA(LDK(KP198912367), T4b, T4a);
Chris@10 595 T6F = VFMA(LDK(KP831469612), T44, T41);
Chris@10 596 STM4(&(ro[11]), T6F, ovs, &(ro[1]));
Chris@10 597 T6G = VFNMS(LDK(KP831469612), T44, T41);
Chris@10 598 STM4(&(ro[27]), T6G, ovs, &(ro[1]));
Chris@10 599 T6H = VFMA(LDK(KP831469612), T46, T45);
Chris@10 600 STM4(&(io[3]), T6H, ovs, &(io[1]));
Chris@10 601 T6I = VFNMS(LDK(KP831469612), T46, T45);
Chris@10 602 STM4(&(io[19]), T6I, ovs, &(io[1]));
Chris@10 603 T6J = VFMA(LDK(KP831469612), T40, T3Z);
Chris@10 604 STM4(&(io[11]), T6J, ovs, &(io[1]));
Chris@10 605 T6K = VFNMS(LDK(KP831469612), T40, T3Z);
Chris@10 606 STM4(&(io[27]), T6K, ovs, &(io[1]));
Chris@10 607 T6L = VFMA(LDK(KP831469612), T3S, T3D);
Chris@10 608 STM4(&(ro[3]), T6L, ovs, &(ro[1]));
Chris@10 609 T6M = VFNMS(LDK(KP831469612), T3S, T3D);
Chris@10 610 STM4(&(ro[19]), T6M, ovs, &(ro[1]));
Chris@10 611 }
Chris@10 612 T4m = VFMA(LDK(KP198912367), T4d, T4e);
Chris@10 613 T4f = VFNMS(LDK(KP198912367), T4e, T4d);
Chris@10 614 T3n = VFNMS(LDK(KP923879532), T3a, T39);
Chris@10 615 T3b = VFMA(LDK(KP923879532), T3a, T39);
Chris@10 616 T3r = VFMA(LDK(KP923879532), T3k, T3j);
Chris@10 617 T3l = VFNMS(LDK(KP923879532), T3k, T3j);
Chris@10 618 T3o = VFNMS(LDK(KP198912367), T3c, T3d);
Chris@10 619 T3e = VFMA(LDK(KP198912367), T3d, T3c);
Chris@10 620 T4q = VADD(T4n, T4m);
Chris@10 621 T4o = VSUB(T4m, T4n);
Chris@10 622 T4k = VADD(T4c, T4f);
Chris@10 623 T4g = VSUB(T4c, T4f);
Chris@10 624 }
Chris@10 625 }
Chris@10 626 }
Chris@10 627 }
Chris@10 628 {
Chris@10 629 V T6N, T6O, T6P, T6Q;
Chris@10 630 T6N = VFMA(LDK(KP980785280), T4q, T4p);
Chris@10 631 STM4(&(ro[31]), T6N, ovs, &(ro[1]));
Chris@10 632 STN4(&(ro[28]), T6i, T6x, T63, T6N, ovs);
Chris@10 633 T6O = VFNMS(LDK(KP980785280), T4q, T4p);
Chris@10 634 STM4(&(ro[15]), T6O, ovs, &(ro[1]));
Chris@10 635 STN4(&(ro[12]), T6h, T6y, T64, T6O, ovs);
Chris@10 636 T6P = VFMA(LDK(KP980785280), T4o, T4l);
Chris@10 637 STM4(&(io[7]), T6P, ovs, &(io[1]));
Chris@10 638 STN4(&(io[4]), T6l, T6z, T65, T6P, ovs);
Chris@10 639 T6Q = VFNMS(LDK(KP980785280), T4o, T4l);
Chris@10 640 STM4(&(io[23]), T6Q, ovs, &(io[1]));
Chris@10 641 STN4(&(io[20]), T6m, T6A, T66, T6Q, ovs);
Chris@10 642 {
Chris@10 643 V T6R, T6S, T6T, T6U;
Chris@10 644 T6R = VFMA(LDK(KP980785280), T4k, T4j);
Chris@10 645 STM4(&(io[31]), T6R, ovs, &(io[1]));
Chris@10 646 STN4(&(io[28]), T6o, T6B, T67, T6R, ovs);
Chris@10 647 T6S = VFNMS(LDK(KP980785280), T4k, T4j);
Chris@10 648 STM4(&(io[15]), T6S, ovs, &(io[1]));
Chris@10 649 STN4(&(io[12]), T6n, T6C, T68, T6S, ovs);
Chris@10 650 T6T = VFMA(LDK(KP980785280), T4g, T49);
Chris@10 651 STM4(&(ro[7]), T6T, ovs, &(ro[1]));
Chris@10 652 STN4(&(ro[4]), T6j, T6D, T61, T6T, ovs);
Chris@10 653 T6U = VFNMS(LDK(KP980785280), T4g, T49);
Chris@10 654 STM4(&(ro[23]), T6U, ovs, &(ro[1]));
Chris@10 655 STN4(&(ro[20]), T6k, T6E, T62, T6U, ovs);
Chris@10 656 T3h = VFNMS(LDK(KP198912367), T3g, T3f);
Chris@10 657 T3p = VFMA(LDK(KP198912367), T3f, T3g);
Chris@10 658 }
Chris@10 659 }
Chris@10 660 {
Chris@10 661 V T3s, T3q, T3i, T3m;
Chris@10 662 T3s = VADD(T3o, T3p);
Chris@10 663 T3q = VSUB(T3o, T3p);
Chris@10 664 T3i = VADD(T3e, T3h);
Chris@10 665 T3m = VSUB(T3h, T3e);
Chris@10 666 {
Chris@10 667 V T6V, T6W, T6X, T6Y;
Chris@10 668 T6V = VFMA(LDK(KP980785280), T3q, T3n);
Chris@10 669 STM4(&(ro[9]), T6V, ovs, &(ro[1]));
Chris@10 670 STN4(&(ro[8]), T6q, T6V, T69, T6F, ovs);
Chris@10 671 T6W = VFNMS(LDK(KP980785280), T3q, T3n);
Chris@10 672 STM4(&(ro[25]), T6W, ovs, &(ro[1]));
Chris@10 673 STN4(&(ro[24]), T6p, T6W, T6a, T6G, ovs);
Chris@10 674 T6X = VFMA(LDK(KP980785280), T3s, T3r);
Chris@10 675 STM4(&(io[1]), T6X, ovs, &(io[1]));
Chris@10 676 STN4(&(io[0]), T6t, T6X, T6d, T6H, ovs);
Chris@10 677 T6Y = VFNMS(LDK(KP980785280), T3s, T3r);
Chris@10 678 STM4(&(io[17]), T6Y, ovs, &(io[1]));
Chris@10 679 STN4(&(io[16]), T6u, T6Y, T6e, T6I, ovs);
Chris@10 680 {
Chris@10 681 V T6Z, T70, T71, T72;
Chris@10 682 T6Z = VFMA(LDK(KP980785280), T3m, T3l);
Chris@10 683 STM4(&(io[9]), T6Z, ovs, &(io[1]));
Chris@10 684 STN4(&(io[8]), T6w, T6Z, T6f, T6J, ovs);
Chris@10 685 T70 = VFNMS(LDK(KP980785280), T3m, T3l);
Chris@10 686 STM4(&(io[25]), T70, ovs, &(io[1]));
Chris@10 687 STN4(&(io[24]), T6v, T70, T6g, T6K, ovs);
Chris@10 688 T71 = VFMA(LDK(KP980785280), T3i, T3b);
Chris@10 689 STM4(&(ro[1]), T71, ovs, &(ro[1]));
Chris@10 690 STN4(&(ro[0]), T6r, T71, T6b, T6L, ovs);
Chris@10 691 T72 = VFNMS(LDK(KP980785280), T3i, T3b);
Chris@10 692 STM4(&(ro[17]), T72, ovs, &(ro[1]));
Chris@10 693 STN4(&(ro[16]), T6s, T72, T6c, T6M, ovs);
Chris@10 694 }
Chris@10 695 }
Chris@10 696 }
Chris@10 697 }
Chris@10 698 }
Chris@10 699 VLEAVE();
Chris@10 700 }
Chris@10 701
Chris@10 702 static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {236, 0, 136, 0}, &GENUS, 0, 1, 0, 0 };
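/* Annotation: in the kdft_desc, 32 is the transform size and
 * {236, 0, 136, 0} mirrors the operation counts quoted in the header
 * comment (adds, muls, fused multiply-adds, other); presumably what
 * the planner uses to cost this codelet against alternatives. */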
Chris@10 703
Chris@10 704 void XSIMD(codelet_n2sv_32) (planner *p) {
Chris@10 705 X(kdft_register) (p, n2sv_32, &desc);
Chris@10 706 }
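/* Annotation: X() expands to the precision-specific fftw_/fftwf_
 * prefix, so this registers n2sv_32 with the planner's table of DFT
 * kernels; a hedged usage sketch appears at the end of the file. */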
Chris@10 707
Chris@10 708 #else /* HAVE_FMA */
Chris@10 709
Chris@10 710 /* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include n2s.h -store-multiple 4 */
Chris@10 711
Chris@10 712 /*
Chris@10 713 * This function contains 372 FP additions, 84 FP multiplications,
Chris@10 714 * (or, 340 additions, 52 multiplications, 32 fused multiply/add),
Chris@10 715 * 130 stack variables, 7 constants, and 144 memory accesses
Chris@10 716 */
Chris@10 717 #include "n2s.h"
Chris@10 718
Chris@10 719 static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 720 {
Chris@10 721 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 722 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 723 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 724 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 725 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 726 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 727 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
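/* Annotation: as in the FMA branch, these appear to be size-32
 * twiddle values:
 *   KP980785280 = cos(pi/16)    KP195090322 = sin(pi/16)
 *   KP831469612 = cos(3pi/16)   KP555570233 = sin(3pi/16)
 *   KP923879532 = cos(pi/8)     KP382683432 = sin(pi/8)
 *   KP707106781 = cos(pi/4)
 * The non-FMA generator keeps explicit sin/cos pairs where the FMA
 * variant above folds them into tangent ratios. */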
Chris@10 728 {
Chris@10 729 INT i;
Chris@10 730 for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@10 731 V T7, T4r, T4Z, T18, T1z, T3t, T3T, T2T, Te, T1f, T50, T4s, T2W, T3u, T1G;
Chris@10 732 V T3U, Tm, T1n, T1O, T2Z, T3y, T3X, T4w, T53, Tt, T1u, T1V, T2Y, T3B, T3W;
Chris@10 733 V T4z, T52, T2t, T3L, T3O, T2K, TR, TY, T5F, T5G, T5H, T5I, T4R, T5j, T2E;
Chris@10 734 V T3P, T4W, T5k, T2N, T3M, T22, T3E, T3H, T2j, TC, TJ, T5A, T5B, T5C, T5D;
Chris@10 735 V T4G, T5g, T2d, T3F, T4L, T5h, T2m, T3I;
Chris@10 736 {
Chris@10 737 V T3, T1x, T14, T2S, T6, T2R, T17, T1y;
Chris@10 738 {
Chris@10 739 V T1, T2, T12, T13;
Chris@10 740 T1 = LD(&(ri[0]), ivs, &(ri[0]));
Chris@10 741 T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
Chris@10 742 T3 = VADD(T1, T2);
Chris@10 743 T1x = VSUB(T1, T2);
Chris@10 744 T12 = LD(&(ii[0]), ivs, &(ii[0]));
Chris@10 745 T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
Chris@10 746 T14 = VADD(T12, T13);
Chris@10 747 T2S = VSUB(T12, T13);
Chris@10 748 }
Chris@10 749 {
Chris@10 750 V T4, T5, T15, T16;
Chris@10 751 T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
Chris@10 752 T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
Chris@10 753 T6 = VADD(T4, T5);
Chris@10 754 T2R = VSUB(T4, T5);
Chris@10 755 T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
Chris@10 756 T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
Chris@10 757 T17 = VADD(T15, T16);
Chris@10 758 T1y = VSUB(T15, T16);
Chris@10 759 }
Chris@10 760 T7 = VADD(T3, T6);
Chris@10 761 T4r = VSUB(T3, T6);
Chris@10 762 T4Z = VSUB(T14, T17);
Chris@10 763 T18 = VADD(T14, T17);
Chris@10 764 T1z = VSUB(T1x, T1y);
Chris@10 765 T3t = VADD(T1x, T1y);
Chris@10 766 T3T = VSUB(T2S, T2R);
Chris@10 767 T2T = VADD(T2R, T2S);
Chris@10 768 }
Chris@10 769 {
Chris@10 770 V Ta, T1B, T1b, T1A, Td, T1D, T1e, T1E;
Chris@10 771 {
Chris@10 772 V T8, T9, T19, T1a;
Chris@10 773 T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
Chris@10 774 T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
Chris@10 775 Ta = VADD(T8, T9);
Chris@10 776 T1B = VSUB(T8, T9);
Chris@10 777 T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
Chris@10 778 T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
Chris@10 779 T1b = VADD(T19, T1a);
Chris@10 780 T1A = VSUB(T19, T1a);
Chris@10 781 }
Chris@10 782 {
Chris@10 783 V Tb, Tc, T1c, T1d;
Chris@10 784 Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
Chris@10 785 Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
Chris@10 786 Td = VADD(Tb, Tc);
Chris@10 787 T1D = VSUB(Tb, Tc);
Chris@10 788 T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
Chris@10 789 T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
Chris@10 790 T1e = VADD(T1c, T1d);
Chris@10 791 T1E = VSUB(T1c, T1d);
Chris@10 792 }
Chris@10 793 Te = VADD(Ta, Td);
Chris@10 794 T1f = VADD(T1b, T1e);
Chris@10 795 T50 = VSUB(Td, Ta);
Chris@10 796 T4s = VSUB(T1b, T1e);
Chris@10 797 {
Chris@10 798 V T2U, T2V, T1C, T1F;
Chris@10 799 T2U = VSUB(T1D, T1E);
Chris@10 800 T2V = VADD(T1B, T1A);
Chris@10 801 T2W = VMUL(LDK(KP707106781), VSUB(T2U, T2V));
Chris@10 802 T3u = VMUL(LDK(KP707106781), VADD(T2V, T2U));
Chris@10 803 T1C = VSUB(T1A, T1B);
Chris@10 804 T1F = VADD(T1D, T1E);
Chris@10 805 T1G = VMUL(LDK(KP707106781), VSUB(T1C, T1F));
Chris@10 806 T3U = VMUL(LDK(KP707106781), VADD(T1C, T1F));
Chris@10 807 }
Chris@10 808 }
Chris@10 809 {
Chris@10 810 V Ti, T1L, T1j, T1J, Tl, T1I, T1m, T1M, T1K, T1N;
Chris@10 811 {
Chris@10 812 V Tg, Th, T1h, T1i;
Chris@10 813 Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
Chris@10 814 Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
Chris@10 815 Ti = VADD(Tg, Th);
Chris@10 816 T1L = VSUB(Tg, Th);
Chris@10 817 T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
Chris@10 818 T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
Chris@10 819 T1j = VADD(T1h, T1i);
Chris@10 820 T1J = VSUB(T1h, T1i);
Chris@10 821 }
Chris@10 822 {
Chris@10 823 V Tj, Tk, T1k, T1l;
Chris@10 824 Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
Chris@10 825 Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
Chris@10 826 Tl = VADD(Tj, Tk);
Chris@10 827 T1I = VSUB(Tj, Tk);
Chris@10 828 T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
Chris@10 829 T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
Chris@10 830 T1m = VADD(T1k, T1l);
Chris@10 831 T1M = VSUB(T1k, T1l);
Chris@10 832 }
Chris@10 833 Tm = VADD(Ti, Tl);
Chris@10 834 T1n = VADD(T1j, T1m);
Chris@10 835 T1K = VADD(T1I, T1J);
Chris@10 836 T1N = VSUB(T1L, T1M);
Chris@10 837 T1O = VFNMS(LDK(KP923879532), T1N, VMUL(LDK(KP382683432), T1K));
Chris@10 838 T2Z = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1N));
Chris@10 839 {
Chris@10 840 V T3w, T3x, T4u, T4v;
Chris@10 841 T3w = VSUB(T1J, T1I);
Chris@10 842 T3x = VADD(T1L, T1M);
Chris@10 843 T3y = VFNMS(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3w));
Chris@10 844 T3X = VFMA(LDK(KP382683432), T3w, VMUL(LDK(KP923879532), T3x));
Chris@10 845 T4u = VSUB(T1j, T1m);
Chris@10 846 T4v = VSUB(Ti, Tl);
Chris@10 847 T4w = VSUB(T4u, T4v);
Chris@10 848 T53 = VADD(T4v, T4u);
Chris@10 849 }
Chris@10 850 }
Chris@10 851 {
Chris@10 852 V Tp, T1S, T1q, T1Q, Ts, T1P, T1t, T1T, T1R, T1U;
Chris@10 853 {
Chris@10 854 V Tn, To, T1o, T1p;
Chris@10 855 Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
Chris@10 856 To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
Chris@10 857 Tp = VADD(Tn, To);
Chris@10 858 T1S = VSUB(Tn, To);
Chris@10 859 T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
Chris@10 860 T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
Chris@10 861 T1q = VADD(T1o, T1p);
Chris@10 862 T1Q = VSUB(T1o, T1p);
Chris@10 863 }
Chris@10 864 {
Chris@10 865 V Tq, Tr, T1r, T1s;
Chris@10 866 Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
Chris@10 867 Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
Chris@10 868 Ts = VADD(Tq, Tr);
Chris@10 869 T1P = VSUB(Tq, Tr);
Chris@10 870 T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
Chris@10 871 T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
Chris@10 872 T1t = VADD(T1r, T1s);
Chris@10 873 T1T = VSUB(T1r, T1s);
Chris@10 874 }
Chris@10 875 Tt = VADD(Tp, Ts);
Chris@10 876 T1u = VADD(T1q, T1t);
Chris@10 877 T1R = VADD(T1P, T1Q);
Chris@10 878 T1U = VSUB(T1S, T1T);
Chris@10 879 T1V = VFMA(LDK(KP382683432), T1R, VMUL(LDK(KP923879532), T1U));
Chris@10 880 T2Y = VFNMS(LDK(KP923879532), T1R, VMUL(LDK(KP382683432), T1U));
Chris@10 881 {
Chris@10 882 V T3z, T3A, T4x, T4y;
Chris@10 883 T3z = VSUB(T1Q, T1P);
Chris@10 884 T3A = VADD(T1S, T1T);
Chris@10 885 T3B = VFMA(LDK(KP923879532), T3z, VMUL(LDK(KP382683432), T3A));
Chris@10 886 T3W = VFNMS(LDK(KP382683432), T3z, VMUL(LDK(KP923879532), T3A));
Chris@10 887 T4x = VSUB(Tp, Ts);
Chris@10 888 T4y = VSUB(T1q, T1t);
Chris@10 889 T4z = VADD(T4x, T4y);
Chris@10 890 T52 = VSUB(T4x, T4y);
Chris@10 891 }
Chris@10 892 }
Chris@10 893 {
Chris@10 894 V TN, T2p, T2J, T4S, TQ, T2G, T2s, T4T, TU, T2x, T2w, T4O, TX, T2z, T2C;
Chris@10 895 V T4P;
Chris@10 896 {
Chris@10 897 V TL, TM, T2H, T2I;
Chris@10 898 TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
Chris@10 899 TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
Chris@10 900 TN = VADD(TL, TM);
Chris@10 901 T2p = VSUB(TL, TM);
Chris@10 902 T2H = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
Chris@10 903 T2I = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
Chris@10 904 T2J = VSUB(T2H, T2I);
Chris@10 905 T4S = VADD(T2H, T2I);
Chris@10 906 }
Chris@10 907 {
Chris@10 908 V TO, TP, T2q, T2r;
Chris@10 909 TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
Chris@10 910 TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
Chris@10 911 TQ = VADD(TO, TP);
Chris@10 912 T2G = VSUB(TO, TP);
Chris@10 913 T2q = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
Chris@10 914 T2r = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
Chris@10 915 T2s = VSUB(T2q, T2r);
Chris@10 916 T4T = VADD(T2q, T2r);
Chris@10 917 }
Chris@10 918 {
Chris@10 919 V TS, TT, T2u, T2v;
Chris@10 920 TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
Chris@10 921 TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
Chris@10 922 TU = VADD(TS, TT);
Chris@10 923 T2x = VSUB(TS, TT);
Chris@10 924 T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
Chris@10 925 T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
Chris@10 926 T2w = VSUB(T2u, T2v);
Chris@10 927 T4O = VADD(T2u, T2v);
Chris@10 928 }
Chris@10 929 {
Chris@10 930 V TV, TW, T2A, T2B;
Chris@10 931 TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
Chris@10 932 TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
Chris@10 933 TX = VADD(TV, TW);
Chris@10 934 T2z = VSUB(TV, TW);
Chris@10 935 T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
Chris@10 936 T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
Chris@10 937 T2C = VSUB(T2A, T2B);
Chris@10 938 T4P = VADD(T2A, T2B);
Chris@10 939 }
Chris@10 940 T2t = VSUB(T2p, T2s);
Chris@10 941 T3L = VADD(T2p, T2s);
Chris@10 942 T3O = VSUB(T2J, T2G);
Chris@10 943 T2K = VADD(T2G, T2J);
Chris@10 944 TR = VADD(TN, TQ);
Chris@10 945 TY = VADD(TU, TX);
Chris@10 946 T5F = VSUB(TR, TY);
Chris@10 947 {
Chris@10 948 V T4N, T4Q, T2y, T2D;
Chris@10 949 T5G = VADD(T4S, T4T);
Chris@10 950 T5H = VADD(T4O, T4P);
Chris@10 951 T5I = VSUB(T5G, T5H);
Chris@10 952 T4N = VSUB(TN, TQ);
Chris@10 953 T4Q = VSUB(T4O, T4P);
Chris@10 954 T4R = VSUB(T4N, T4Q);
Chris@10 955 T5j = VADD(T4N, T4Q);
Chris@10 956 T2y = VSUB(T2w, T2x);
Chris@10 957 T2D = VADD(T2z, T2C);
Chris@10 958 T2E = VMUL(LDK(KP707106781), VSUB(T2y, T2D));
Chris@10 959 T3P = VMUL(LDK(KP707106781), VADD(T2y, T2D));
Chris@10 960 {
Chris@10 961 V T4U, T4V, T2L, T2M;
Chris@10 962 T4U = VSUB(T4S, T4T);
Chris@10 963 T4V = VSUB(TX, TU);
Chris@10 964 T4W = VSUB(T4U, T4V);
Chris@10 965 T5k = VADD(T4V, T4U);
Chris@10 966 T2L = VSUB(T2z, T2C);
Chris@10 967 T2M = VADD(T2x, T2w);
Chris@10 968 T2N = VMUL(LDK(KP707106781), VSUB(T2L, T2M));
Chris@10 969 T3M = VMUL(LDK(KP707106781), VADD(T2M, T2L));
Chris@10 970 }
Chris@10 971 }
Chris@10 972 }
Chris@10 973 {
Chris@10 974 V Ty, T2f, T21, T4C, TB, T1Y, T2i, T4D, TF, T28, T2b, T4I, TI, T23, T26;
Chris@10 975 V T4J;
Chris@10 976 {
Chris@10 977 V Tw, Tx, T1Z, T20;
Chris@10 978 Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
Chris@10 979 Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
Chris@10 980 Ty = VADD(Tw, Tx);
Chris@10 981 T2f = VSUB(Tw, Tx);
Chris@10 982 T1Z = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
Chris@10 983 T20 = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
Chris@10 984 T21 = VSUB(T1Z, T20);
Chris@10 985 T4C = VADD(T1Z, T20);
Chris@10 986 }
Chris@10 987 {
Chris@10 988 V Tz, TA, T2g, T2h;
Chris@10 989 Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
Chris@10 990 TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
Chris@10 991 TB = VADD(Tz, TA);
Chris@10 992 T1Y = VSUB(Tz, TA);
Chris@10 993 T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
Chris@10 994 T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
Chris@10 995 T2i = VSUB(T2g, T2h);
Chris@10 996 T4D = VADD(T2g, T2h);
Chris@10 997 }
Chris@10 998 {
Chris@10 999 V TD, TE, T29, T2a;
Chris@10 1000 TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
Chris@10 1001 TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
Chris@10 1002 TF = VADD(TD, TE);
Chris@10 1003 T28 = VSUB(TD, TE);
Chris@10 1004 T29 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
Chris@10 1005 T2a = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
Chris@10 1006 T2b = VSUB(T29, T2a);
Chris@10 1007 T4I = VADD(T29, T2a);
Chris@10 1008 }
Chris@10 1009 {
Chris@10 1010 V TG, TH, T24, T25;
Chris@10 1011 TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
Chris@10 1012 TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
Chris@10 1013 TI = VADD(TG, TH);
Chris@10 1014 T23 = VSUB(TG, TH);
Chris@10 1015 T24 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
Chris@10 1016 T25 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
Chris@10 1017 T26 = VSUB(T24, T25);
Chris@10 1018 T4J = VADD(T24, T25);
Chris@10 1019 }
Chris@10 1020 T22 = VADD(T1Y, T21);
Chris@10 1021 T3E = VADD(T2f, T2i);
Chris@10 1022 T3H = VSUB(T21, T1Y);
Chris@10 1023 T2j = VSUB(T2f, T2i);
Chris@10 1024 TC = VADD(Ty, TB);
Chris@10 1025 TJ = VADD(TF, TI);
Chris@10 1026 T5A = VSUB(TC, TJ);
Chris@10 1027 {
Chris@10 1028 V T4E, T4F, T27, T2c;
Chris@10 1029 T5B = VADD(T4C, T4D);
Chris@10 1030 T5C = VADD(T4I, T4J);
Chris@10 1031 T5D = VSUB(T5B, T5C);
Chris@10 1032 T4E = VSUB(T4C, T4D);
Chris@10 1033 T4F = VSUB(TI, TF);
Chris@10 1034 T4G = VSUB(T4E, T4F);
Chris@10 1035 T5g = VADD(T4F, T4E);
Chris@10 1036 T27 = VSUB(T23, T26);
Chris@10 1037 T2c = VADD(T28, T2b);
Chris@10 1038 T2d = VMUL(LDK(KP707106781), VSUB(T27, T2c));
Chris@10 1039 T3F = VMUL(LDK(KP707106781), VADD(T2c, T27));
Chris@10 1040 {
Chris@10 1041 V T4H, T4K, T2k, T2l;
Chris@10 1042 T4H = VSUB(Ty, TB);
Chris@10 1043 T4K = VSUB(T4I, T4J);
Chris@10 1044 T4L = VSUB(T4H, T4K);
Chris@10 1045 T5h = VADD(T4H, T4K);
Chris@10 1046 T2k = VSUB(T2b, T28);
Chris@10 1047 T2l = VADD(T23, T26);
Chris@10 1048 T2m = VMUL(LDK(KP707106781), VSUB(T2k, T2l));
Chris@10 1049 T3I = VMUL(LDK(KP707106781), VADD(T2k, T2l));
Chris@10 1050 }
Chris@10 1051 }
Chris@10 1052 }
Chris@10 1053 {
Chris@10 1054 V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
Chris@10 1055 V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
Chris@10 1056 V T6v, T6w;
Chris@10 1057 {
Chris@10 1058 V T4B, T57, T5a, T5c, T4Y, T56, T55, T5b;
Chris@10 1059 {
Chris@10 1060 V T4t, T4A, T58, T59;
Chris@10 1061 T4t = VSUB(T4r, T4s);
Chris@10 1062 T4A = VMUL(LDK(KP707106781), VSUB(T4w, T4z));
Chris@10 1063 T4B = VADD(T4t, T4A);
Chris@10 1064 T57 = VSUB(T4t, T4A);
Chris@10 1065 T58 = VFNMS(LDK(KP923879532), T4L, VMUL(LDK(KP382683432), T4G));
Chris@10 1066 T59 = VFMA(LDK(KP382683432), T4W, VMUL(LDK(KP923879532), T4R));
Chris@10 1067 T5a = VSUB(T58, T59);
Chris@10 1068 T5c = VADD(T58, T59);
Chris@10 1069 }
Chris@10 1070 {
Chris@10 1071 V T4M, T4X, T51, T54;
Chris@10 1072 T4M = VFMA(LDK(KP923879532), T4G, VMUL(LDK(KP382683432), T4L));
Chris@10 1073 T4X = VFNMS(LDK(KP923879532), T4W, VMUL(LDK(KP382683432), T4R));
Chris@10 1074 T4Y = VADD(T4M, T4X);
Chris@10 1075 T56 = VSUB(T4X, T4M);
Chris@10 1076 T51 = VSUB(T4Z, T50);
Chris@10 1077 T54 = VMUL(LDK(KP707106781), VSUB(T52, T53));
Chris@10 1078 T55 = VSUB(T51, T54);
Chris@10 1079 T5b = VADD(T51, T54);
Chris@10 1080 }
Chris@10 1081 T61 = VSUB(T4B, T4Y);
Chris@10 1082 STM4(&(ro[22]), T61, ovs, &(ro[0]));
Chris@10 1083 T62 = VSUB(T5b, T5c);
Chris@10 1084 STM4(&(io[22]), T62, ovs, &(io[0]));
Chris@10 1085 T63 = VADD(T4B, T4Y);
Chris@10 1086 STM4(&(ro[6]), T63, ovs, &(ro[0]));
Chris@10 1087 T64 = VADD(T5b, T5c);
Chris@10 1088 STM4(&(io[6]), T64, ovs, &(io[0]));
Chris@10 1089 T65 = VSUB(T55, T56);
Chris@10 1090 STM4(&(io[30]), T65, ovs, &(io[0]));
Chris@10 1091 T66 = VSUB(T57, T5a);
Chris@10 1092 STM4(&(ro[30]), T66, ovs, &(ro[0]));
Chris@10 1093 T67 = VADD(T55, T56);
Chris@10 1094 STM4(&(io[14]), T67, ovs, &(io[0]));
Chris@10 1095 T68 = VADD(T57, T5a);
Chris@10 1096 STM4(&(ro[14]), T68, ovs, &(ro[0]));
Chris@10 1097 }
Chris@10 1098 {
Chris@10 1099 V T5f, T5r, T5u, T5w, T5m, T5q, T5p, T5v;
Chris@10 1100 {
Chris@10 1101 V T5d, T5e, T5s, T5t;
Chris@10 1102 T5d = VADD(T4r, T4s);
Chris@10 1103 T5e = VMUL(LDK(KP707106781), VADD(T53, T52));
Chris@10 1104 T5f = VADD(T5d, T5e);
Chris@10 1105 T5r = VSUB(T5d, T5e);
Chris@10 1106 T5s = VFNMS(LDK(KP382683432), T5h, VMUL(LDK(KP923879532), T5g));
Chris@10 1107 T5t = VFMA(LDK(KP923879532), T5k, VMUL(LDK(KP382683432), T5j));
Chris@10 1108 T5u = VSUB(T5s, T5t);
Chris@10 1109 T5w = VADD(T5s, T5t);
Chris@10 1110 }
Chris@10 1111 {
Chris@10 1112 V T5i, T5l, T5n, T5o;
Chris@10 1113 T5i = VFMA(LDK(KP382683432), T5g, VMUL(LDK(KP923879532), T5h));
Chris@10 1114 T5l = VFNMS(LDK(KP382683432), T5k, VMUL(LDK(KP923879532), T5j));
Chris@10 1115 T5m = VADD(T5i, T5l);
Chris@10 1116 T5q = VSUB(T5l, T5i);
Chris@10 1117 T5n = VADD(T50, T4Z);
Chris@10 1118 T5o = VMUL(LDK(KP707106781), VADD(T4w, T4z));
Chris@10 1119 T5p = VSUB(T5n, T5o);
Chris@10 1120 T5v = VADD(T5n, T5o);
Chris@10 1121 }
Chris@10 1122 T69 = VSUB(T5f, T5m);
Chris@10 1123 STM4(&(ro[18]), T69, ovs, &(ro[0]));
Chris@10 1124 T6a = VSUB(T5v, T5w);
Chris@10 1125 STM4(&(io[18]), T6a, ovs, &(io[0]));
Chris@10 1126 T6b = VADD(T5f, T5m);
Chris@10 1127 STM4(&(ro[2]), T6b, ovs, &(ro[0]));
Chris@10 1128 T6c = VADD(T5v, T5w);
Chris@10 1129 STM4(&(io[2]), T6c, ovs, &(io[0]));
Chris@10 1130 T6d = VSUB(T5p, T5q);
Chris@10 1131 STM4(&(io[26]), T6d, ovs, &(io[0]));
Chris@10 1132 T6e = VSUB(T5r, T5u);
Chris@10 1133 STM4(&(ro[26]), T6e, ovs, &(ro[0]));
Chris@10 1134 T6f = VADD(T5p, T5q);
Chris@10 1135 STM4(&(io[10]), T6f, ovs, &(io[0]));
Chris@10 1136 T6g = VADD(T5r, T5u);
Chris@10 1137 STM4(&(ro[10]), T6g, ovs, &(ro[0]));
Chris@10 1138 }
Chris@10 1139 {
Chris@10 1140 V T5z, T5P, T5S, T5U, T5K, T5O, T5N, T5T;
Chris@10 1141 {
Chris@10 1142 V T5x, T5y, T5Q, T5R;
Chris@10 1143 T5x = VSUB(T7, Te);
Chris@10 1144 T5y = VSUB(T1n, T1u);
Chris@10 1145 T5z = VADD(T5x, T5y);
Chris@10 1146 T5P = VSUB(T5x, T5y);
Chris@10 1147 T5Q = VSUB(T5D, T5A);
Chris@10 1148 T5R = VADD(T5F, T5I);
Chris@10 1149 T5S = VMUL(LDK(KP707106781), VSUB(T5Q, T5R));
Chris@10 1150 T5U = VMUL(LDK(KP707106781), VADD(T5Q, T5R));
Chris@10 1151 }
Chris@10 1152 {
Chris@10 1153 V T5E, T5J, T5L, T5M;
Chris@10 1154 T5E = VADD(T5A, T5D);
Chris@10 1155 T5J = VSUB(T5F, T5I);
Chris@10 1156 T5K = VMUL(LDK(KP707106781), VADD(T5E, T5J));
Chris@10 1157 T5O = VMUL(LDK(KP707106781), VSUB(T5J, T5E));
Chris@10 1158 T5L = VSUB(T18, T1f);
Chris@10 1159 T5M = VSUB(Tt, Tm);
Chris@10 1160 T5N = VSUB(T5L, T5M);
Chris@10 1161 T5T = VADD(T5M, T5L);
Chris@10 1162 }
Chris@10 1163 T6h = VSUB(T5z, T5K);
Chris@10 1164 STM4(&(ro[20]), T6h, ovs, &(ro[0]));
Chris@10 1165 T6i = VSUB(T5T, T5U);
Chris@10 1166 STM4(&(io[20]), T6i, ovs, &(io[0]));
Chris@10 1167 T6j = VADD(T5z, T5K);
Chris@10 1168 STM4(&(ro[4]), T6j, ovs, &(ro[0]));
Chris@10 1169 T6k = VADD(T5T, T5U);
Chris@10 1170 STM4(&(io[4]), T6k, ovs, &(io[0]));
Chris@10 1171 T6l = VSUB(T5N, T5O);
Chris@10 1172 STM4(&(io[28]), T6l, ovs, &(io[0]));
Chris@10 1173 T6m = VSUB(T5P, T5S);
Chris@10 1174 STM4(&(ro[28]), T6m, ovs, &(ro[0]));
Chris@10 1175 T6n = VADD(T5N, T5O);
Chris@10 1176 STM4(&(io[12]), T6n, ovs, &(io[0]));
Chris@10 1177 T6o = VADD(T5P, T5S);
Chris@10 1178 STM4(&(ro[12]), T6o, ovs, &(ro[0]));
Chris@10 1179 }
Chris@10 1180 {
Chris@10 1181 V Tv, T5V, T5Y, T60, T10, T11, T1w, T5Z;
Chris@10 1182 {
Chris@10 1183 V Tf, Tu, T5W, T5X;
Chris@10 1184 Tf = VADD(T7, Te);
Chris@10 1185 Tu = VADD(Tm, Tt);
Chris@10 1186 Tv = VADD(Tf, Tu);
Chris@10 1187 T5V = VSUB(Tf, Tu);
Chris@10 1188 T5W = VADD(T5B, T5C);
Chris@10 1189 T5X = VADD(T5G, T5H);
Chris@10 1190 T5Y = VSUB(T5W, T5X);
Chris@10 1191 T60 = VADD(T5W, T5X);
Chris@10 1192 }
Chris@10 1193 {
Chris@10 1194 V TK, TZ, T1g, T1v;
Chris@10 1195 TK = VADD(TC, TJ);
Chris@10 1196 TZ = VADD(TR, TY);
Chris@10 1197 T10 = VADD(TK, TZ);
Chris@10 1198 T11 = VSUB(TZ, TK);
Chris@10 1199 T1g = VADD(T18, T1f);
Chris@10 1200 T1v = VADD(T1n, T1u);
Chris@10 1201 T1w = VSUB(T1g, T1v);
Chris@10 1202 T5Z = VADD(T1g, T1v);
Chris@10 1203 }
Chris@10 1204 T6p = VSUB(Tv, T10);
Chris@10 1205 STM4(&(ro[16]), T6p, ovs, &(ro[0]));
Chris@10 1206 T6q = VSUB(T5Z, T60);
Chris@10 1207 STM4(&(io[16]), T6q, ovs, &(io[0]));
Chris@10 1208 T6r = VADD(Tv, T10);
Chris@10 1209 STM4(&(ro[0]), T6r, ovs, &(ro[0]));
Chris@10 1210 T6s = VADD(T5Z, T60);
Chris@10 1211 STM4(&(io[0]), T6s, ovs, &(io[0]));
Chris@10 1212 T6t = VADD(T11, T1w);
Chris@10 1213 STM4(&(io[8]), T6t, ovs, &(io[0]));
Chris@10 1214 T6u = VADD(T5V, T5Y);
Chris@10 1215 STM4(&(ro[8]), T6u, ovs, &(ro[0]));
Chris@10 1216 T6v = VSUB(T1w, T11);
Chris@10 1217 STM4(&(io[24]), T6v, ovs, &(io[0]));
Chris@10 1218 T6w = VSUB(T5V, T5Y);
Chris@10 1219 STM4(&(ro[24]), T6w, ovs, &(ro[0]));
Chris@10 1220 }
Chris@10 1221 {
Chris@10 1222 V T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E;
Chris@10 1223 {
Chris@10 1224 V T1X, T33, T31, T37, T2o, T34, T2P, T35;
Chris@10 1225 {
Chris@10 1226 V T1H, T1W, T2X, T30;
Chris@10 1227 T1H = VSUB(T1z, T1G);
Chris@10 1228 T1W = VSUB(T1O, T1V);
Chris@10 1229 T1X = VADD(T1H, T1W);
Chris@10 1230 T33 = VSUB(T1H, T1W);
Chris@10 1231 T2X = VSUB(T2T, T2W);
Chris@10 1232 T30 = VSUB(T2Y, T2Z);
Chris@10 1233 T31 = VSUB(T2X, T30);
Chris@10 1234 T37 = VADD(T2X, T30);
Chris@10 1235 }
Chris@10 1236 {
Chris@10 1237 V T2e, T2n, T2F, T2O;
Chris@10 1238 T2e = VSUB(T22, T2d);
Chris@10 1239 T2n = VSUB(T2j, T2m);
Chris@10 1240 T2o = VFMA(LDK(KP980785280), T2e, VMUL(LDK(KP195090322), T2n));
Chris@10 1241 T34 = VFNMS(LDK(KP980785280), T2n, VMUL(LDK(KP195090322), T2e));
Chris@10 1242 T2F = VSUB(T2t, T2E);
Chris@10 1243 T2O = VSUB(T2K, T2N);
Chris@10 1244 T2P = VFNMS(LDK(KP980785280), T2O, VMUL(LDK(KP195090322), T2F));
Chris@10 1245 T35 = VFMA(LDK(KP195090322), T2O, VMUL(LDK(KP980785280), T2F));
Chris@10 1246 }
Chris@10 1247 {
Chris@10 1248 V T2Q, T38, T32, T36;
Chris@10 1249 T2Q = VADD(T2o, T2P);
Chris@10 1250 T6x = VSUB(T1X, T2Q);
Chris@10 1251 STM4(&(ro[23]), T6x, ovs, &(ro[1]));
Chris@10 1252 T6y = VADD(T1X, T2Q);
Chris@10 1253 STM4(&(ro[7]), T6y, ovs, &(ro[1]));
Chris@10 1254 T38 = VADD(T34, T35);
Chris@10 1255 T6z = VSUB(T37, T38);
Chris@10 1256 STM4(&(io[23]), T6z, ovs, &(io[1]));
Chris@10 1257 T6A = VADD(T37, T38);
Chris@10 1258 STM4(&(io[7]), T6A, ovs, &(io[1]));
Chris@10 1259 T32 = VSUB(T2P, T2o);
Chris@10 1260 T6B = VSUB(T31, T32);
Chris@10 1261 STM4(&(io[31]), T6B, ovs, &(io[1]));
Chris@10 1262 T6C = VADD(T31, T32);
Chris@10 1263 STM4(&(io[15]), T6C, ovs, &(io[1]));
Chris@10 1264 T36 = VSUB(T34, T35);
Chris@10 1265 T6D = VSUB(T33, T36);
Chris@10 1266 STM4(&(ro[31]), T6D, ovs, &(ro[1]));
Chris@10 1267 T6E = VADD(T33, T36);
Chris@10 1268 STM4(&(ro[15]), T6E, ovs, &(ro[1]));
Chris@10 1269 }
Chris@10 1270 }
Chris@10 1271 {
Chris@10 1272 V T3D, T41, T3Z, T45, T3K, T42, T3R, T43;
Chris@10 1273 {
Chris@10 1274 V T3v, T3C, T3V, T3Y;
Chris@10 1275 T3v = VSUB(T3t, T3u);
Chris@10 1276 T3C = VSUB(T3y, T3B);
Chris@10 1277 T3D = VADD(T3v, T3C);
Chris@10 1278 T41 = VSUB(T3v, T3C);
Chris@10 1279 T3V = VSUB(T3T, T3U);
Chris@10 1280 T3Y = VSUB(T3W, T3X);
Chris@10 1281 T3Z = VSUB(T3V, T3Y);
Chris@10 1282 T45 = VADD(T3V, T3Y);
Chris@10 1283 }
Chris@10 1284 {
Chris@10 1285 V T3G, T3J, T3N, T3Q;
Chris@10 1286 T3G = VSUB(T3E, T3F);
Chris@10 1287 T3J = VSUB(T3H, T3I);
Chris@10 1288 T3K = VFMA(LDK(KP555570233), T3G, VMUL(LDK(KP831469612), T3J));
Chris@10 1289 T42 = VFNMS(LDK(KP831469612), T3G, VMUL(LDK(KP555570233), T3J));
Chris@10 1290 T3N = VSUB(T3L, T3M);
Chris@10 1291 T3Q = VSUB(T3O, T3P);
Chris@10 1292 T3R = VFNMS(LDK(KP831469612), T3Q, VMUL(LDK(KP555570233), T3N));
Chris@10 1293 T43 = VFMA(LDK(KP831469612), T3N, VMUL(LDK(KP555570233), T3Q));
Chris@10 1294 }
Chris@10 1295 {
Chris@10 1296 V T3S, T6F, T6G, T46, T6H, T6I;
Chris@10 1297 T3S = VADD(T3K, T3R);
Chris@10 1298 T6F = VSUB(T3D, T3S);
Chris@10 1299 STM4(&(ro[21]), T6F, ovs, &(ro[1]));
Chris@10 1300 STN4(&(ro[20]), T6h, T6F, T61, T6x, ovs);
Chris@10 1301 T6G = VADD(T3D, T3S);
Chris@10 1302 STM4(&(ro[5]), T6G, ovs, &(ro[1]));
Chris@10 1303 STN4(&(ro[4]), T6j, T6G, T63, T6y, ovs);
Chris@10 1304 T46 = VADD(T42, T43);
Chris@10 1305 T6H = VSUB(T45, T46);
Chris@10 1306 STM4(&(io[21]), T6H, ovs, &(io[1]));
Chris@10 1307 STN4(&(io[20]), T6i, T6H, T62, T6z, ovs);
Chris@10 1308 T6I = VADD(T45, T46);
Chris@10 1309 STM4(&(io[5]), T6I, ovs, &(io[1]));
Chris@10 1310 STN4(&(io[4]), T6k, T6I, T64, T6A, ovs);
Chris@10 1311 }
Chris@10 1312 {
Chris@10 1313 V T40, T6J, T6K, T44, T6L, T6M;
Chris@10 1314 T40 = VSUB(T3R, T3K);
Chris@10 1315 T6J = VSUB(T3Z, T40);
Chris@10 1316 STM4(&(io[29]), T6J, ovs, &(io[1]));
Chris@10 1317 STN4(&(io[28]), T6l, T6J, T65, T6B, ovs);
Chris@10 1318 T6K = VADD(T3Z, T40);
Chris@10 1319 STM4(&(io[13]), T6K, ovs, &(io[1]));
Chris@10 1320 STN4(&(io[12]), T6n, T6K, T67, T6C, ovs);
Chris@10 1321 T44 = VSUB(T42, T43);
Chris@10 1322 T6L = VSUB(T41, T44);
Chris@10 1323 STM4(&(ro[29]), T6L, ovs, &(ro[1]));
Chris@10 1324 STN4(&(ro[28]), T6m, T6L, T66, T6D, ovs);
Chris@10 1325 T6M = VADD(T41, T44);
Chris@10 1326 STM4(&(ro[13]), T6M, ovs, &(ro[1]));
Chris@10 1327 STN4(&(ro[12]), T6o, T6M, T68, T6E, ovs);
Chris@10 1328 }
Chris@10 1329 }
Chris@10 1330 }
Chris@10 1331 {
Chris@10 1332 V T6N, T6O, T6P, T6Q, T6R, T6S, T6T, T6U;
Chris@10 1333 {
Chris@10 1334 V T49, T4l, T4j, T4p, T4c, T4m, T4f, T4n;
Chris@10 1335 {
Chris@10 1336 V T47, T48, T4h, T4i;
Chris@10 1337 T47 = VADD(T3t, T3u);
Chris@10 1338 T48 = VADD(T3X, T3W);
Chris@10 1339 T49 = VADD(T47, T48);
Chris@10 1340 T4l = VSUB(T47, T48);
Chris@10 1341 T4h = VADD(T3T, T3U);
Chris@10 1342 T4i = VADD(T3y, T3B);
Chris@10 1343 T4j = VSUB(T4h, T4i);
Chris@10 1344 T4p = VADD(T4h, T4i);
Chris@10 1345 }
Chris@10 1346 {
Chris@10 1347 V T4a, T4b, T4d, T4e;
Chris@10 1348 T4a = VADD(T3E, T3F);
Chris@10 1349 T4b = VADD(T3H, T3I);
Chris@10 1350 T4c = VFMA(LDK(KP980785280), T4a, VMUL(LDK(KP195090322), T4b));
Chris@10 1351 T4m = VFNMS(LDK(KP195090322), T4a, VMUL(LDK(KP980785280), T4b));
Chris@10 1352 T4d = VADD(T3L, T3M);
Chris@10 1353 T4e = VADD(T3O, T3P);
Chris@10 1354 T4f = VFNMS(LDK(KP195090322), T4e, VMUL(LDK(KP980785280), T4d));
Chris@10 1355 T4n = VFMA(LDK(KP195090322), T4d, VMUL(LDK(KP980785280), T4e));
Chris@10 1356 }
Chris@10 1357 {
Chris@10 1358 V T4g, T4q, T4k, T4o;
Chris@10 1359 T4g = VADD(T4c, T4f);
Chris@10 1360 T6N = VSUB(T49, T4g);
Chris@10 1361 STM4(&(ro[17]), T6N, ovs, &(ro[1]));
Chris@10 1362 T6O = VADD(T49, T4g);
Chris@10 1363 STM4(&(ro[1]), T6O, ovs, &(ro[1]));
Chris@10 1364 T4q = VADD(T4m, T4n);
Chris@10 1365 T6P = VSUB(T4p, T4q);
Chris@10 1366 STM4(&(io[17]), T6P, ovs, &(io[1]));
Chris@10 1367 T6Q = VADD(T4p, T4q);
Chris@10 1368 STM4(&(io[1]), T6Q, ovs, &(io[1]));
Chris@10 1369 T4k = VSUB(T4f, T4c);
Chris@10 1370 T6R = VSUB(T4j, T4k);
Chris@10 1371 STM4(&(io[25]), T6R, ovs, &(io[1]));
Chris@10 1372 T6S = VADD(T4j, T4k);
Chris@10 1373 STM4(&(io[9]), T6S, ovs, &(io[1]));
Chris@10 1374 T4o = VSUB(T4m, T4n);
Chris@10 1375 T6T = VSUB(T4l, T4o);
Chris@10 1376 STM4(&(ro[25]), T6T, ovs, &(ro[1]));
Chris@10 1377 T6U = VADD(T4l, T4o);
Chris@10 1378 STM4(&(ro[9]), T6U, ovs, &(ro[1]));
Chris@10 1379 }
Chris@10 1380 }
Chris@10 1381 {
Chris@10 1382 V T3b, T3n, T3l, T3r, T3e, T3o, T3h, T3p;
Chris@10 1383 {
Chris@10 1384 V T39, T3a, T3j, T3k;
Chris@10 1385 T39 = VADD(T1z, T1G);
Chris@10 1386 T3a = VADD(T2Z, T2Y);
Chris@10 1387 T3b = VADD(T39, T3a);
Chris@10 1388 T3n = VSUB(T39, T3a);
Chris@10 1389 T3j = VADD(T2T, T2W);
Chris@10 1390 T3k = VADD(T1O, T1V);
Chris@10 1391 T3l = VSUB(T3j, T3k);
Chris@10 1392 T3r = VADD(T3j, T3k);
Chris@10 1393 }
Chris@10 1394 {
Chris@10 1395 V T3c, T3d, T3f, T3g;
Chris@10 1396 T3c = VADD(T22, T2d);
Chris@10 1397 T3d = VADD(T2j, T2m);
Chris@10 1398 T3e = VFMA(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
Chris@10 1399 T3o = VFNMS(LDK(KP555570233), T3d, VMUL(LDK(KP831469612), T3c));
Chris@10 1400 T3f = VADD(T2t, T2E);
Chris@10 1401 T3g = VADD(T2K, T2N);
Chris@10 1402 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
Chris@10 1403 T3p = VFMA(LDK(KP831469612), T3g, VMUL(LDK(KP555570233), T3f));
Chris@10 1404 }
Chris@10 1405 {
Chris@10 1406 V T3i, T6V, T6W, T3s, T6X, T6Y;
Chris@10 1407 T3i = VADD(T3e, T3h);
Chris@10 1408 T6V = VSUB(T3b, T3i);
Chris@10 1409 STM4(&(ro[19]), T6V, ovs, &(ro[1]));
Chris@10 1410 STN4(&(ro[16]), T6p, T6N, T69, T6V, ovs);
Chris@10 1411 T6W = VADD(T3b, T3i);
Chris@10 1412 STM4(&(ro[3]), T6W, ovs, &(ro[1]));
Chris@10 1413 STN4(&(ro[0]), T6r, T6O, T6b, T6W, ovs);
Chris@10 1414 T3s = VADD(T3o, T3p);
Chris@10 1415 T6X = VSUB(T3r, T3s);
Chris@10 1416 STM4(&(io[19]), T6X, ovs, &(io[1]));
Chris@10 1417 STN4(&(io[16]), T6q, T6P, T6a, T6X, ovs);
Chris@10 1418 T6Y = VADD(T3r, T3s);
Chris@10 1419 STM4(&(io[3]), T6Y, ovs, &(io[1]));
Chris@10 1420 STN4(&(io[0]), T6s, T6Q, T6c, T6Y, ovs);
Chris@10 1421 }
Chris@10 1422 {
Chris@10 1423 V T3m, T6Z, T70, T3q, T71, T72;
Chris@10 1424 T3m = VSUB(T3h, T3e);
Chris@10 1425 T6Z = VSUB(T3l, T3m);
Chris@10 1426 STM4(&(io[27]), T6Z, ovs, &(io[1]));
Chris@10 1427 STN4(&(io[24]), T6v, T6R, T6d, T6Z, ovs);
Chris@10 1428 T70 = VADD(T3l, T3m);
Chris@10 1429 STM4(&(io[11]), T70, ovs, &(io[1]));
Chris@10 1430 STN4(&(io[8]), T6t, T6S, T6f, T70, ovs);
Chris@10 1431 T3q = VSUB(T3o, T3p);
Chris@10 1432 T71 = VSUB(T3n, T3q);
Chris@10 1433 STM4(&(ro[27]), T71, ovs, &(ro[1]));
Chris@10 1434 STN4(&(ro[24]), T6w, T6T, T6e, T71, ovs);
Chris@10 1435 T72 = VADD(T3n, T3q);
Chris@10 1436 STM4(&(ro[11]), T72, ovs, &(ro[1]));
Chris@10 1437 STN4(&(ro[8]), T6u, T6U, T6g, T72, ovs);
Chris@10 1438 }
Chris@10 1439 }
Chris@10 1440 }
Chris@10 1441 }
Chris@10 1442 }
Chris@10 1443 }
Chris@10 1444 VLEAVE();
Chris@10 1445 }
Chris@10 1446
Chris@10 1447 static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {340, 52, 32, 0}, &GENUS, 0, 1, 0, 0 };
Chris@10 1448
Chris@10 1449 void XSIMD(codelet_n2sv_32) (planner *p) {
Chris@10 1450 X(kdft_register) (p, n2sv_32, &desc);
Chris@10 1451 }
Chris@10 1452
Chris@10 1453 #endif /* HAVE_FMA */
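/* Hedged usage sketch (an annotation, not part of the generated file):
 * user code never calls n2sv_32 directly; the planner may pick it, via
 * the X(kdft_register) calls above, when planning a size-32 complex
 * DFT.  Standard FFTW API usage that could exercise it:
 *
 *   #include <fftw3.h>
 *
 *   fftw_complex in[32], out[32];
 *   fftw_plan p = fftw_plan_dft_1d(32, in, out,
 *                                  FFTW_FORWARD, FFTW_MEASURE);
 *   // ... fill in[] with samples ...
 *   fftw_execute(p);        // may dispatch to n2sv_32 internally
 *   fftw_destroy_plan(p);
 */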