annotate src/fftw-3.3.3/dft/simd/common/n1fv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sun Nov 25 07:36:54 EST 2012 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n1fv_64 -include n1f.h */

/*
 * This function contains 456 FP additions, 258 FP multiplications,
 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
 * 168 stack variables, 15 constants, and 128 memory accesses
 */
#include "n1f.h"

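/*
 * Editorial note (not part of the generated source): by the usual codelet
 * calling convention, ri/ii and ro/io point to the real and imaginary
 * input and output arrays, is/os are the strides within one transform,
 * v is the number of transforms in the batch, and ivs/ovs are the strides
 * between consecutive transforms.
 */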
static void n1fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
  DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
  DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
  DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
  DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
  DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
  DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
  DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
  DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
  DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
  DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
  DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
  DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
  DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
  DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
  DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
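  /*
   * Editorial note: the KP* constants are sines, cosines and tangents of
   * multiples of 2*pi/64, e.g. KP707106781 = cos(pi/4) = 1/sqrt(2),
   * KP923879532 = cos(pi/8), KP414213562 = tan(pi/8) = sqrt(2) - 1,
   * KP198912367 = tan(pi/16); the FMA variant favors tangent-form
   * constants so that twiddles reduce to fused multiply-adds.
   */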
  {
    INT i;
    const R *xi;
    R *xo;
    xi = ri;
    xo = ro;
    for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
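      /*
       * Editorial note: each iteration transforms VL interleaved inputs at
       * once (VL being the SIMD width in complex elements), stepping xi/xo
       * by the per-transform vector strides; MAKE_VOLATILE_STRIDE is, as
       * far as we can tell, only an optimization hint to the compiler and
       * does not affect the arithmetic below.
       */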
      V T5T, T5S, T5X, T65, T5Z, T5R, T67, T63, T5U, T64;
      {
        V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
        V Tm, T3A, T3l, T2a, TC, T5p, T4o, T6E, T6e, T3i, T3B, TR, T29, T4x, T5q;
        V T6h, T6D, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
        V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
        V T6m, T6Y, T5L, T4T;
        {
          V T4g, T4l, T3j, Tu, Tx, T4h, TA, T4i;
          {
            V T1, T2, T23, T24, T4, T5, T20, T21;
            T1 = LD(&(xi[0]), ivs, &(xi[0]));
            T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
            T23 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
            T24 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
            T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
            T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
            T20 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
            T21 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
            {
              V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
              {
                V T8, T43, T3, T44, T25, T5i, T6, T45, T22, T9, Ti, Tj, Tb, Tc;
                T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                T43 = VSUB(T1, T2);
                T3 = VADD(T1, T2);
                T44 = VSUB(T23, T24);
                T25 = VADD(T23, T24);
                T5i = VSUB(T4, T5);
                T6 = VADD(T4, T5);
                T45 = VSUB(T20, T21);
                T22 = VADD(T20, T21);
                T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
                Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
                Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
                Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
                {
                  V T2T, T46, T5j, T2U;
                  T7 = VSUB(T3, T6);
                  T2T = VADD(T3, T6);
                  T46 = VADD(T44, T45);
                  T5j = VSUB(T45, T44);
                  T26 = VSUB(T22, T25);
                  T2U = VADD(T25, T22);
                  Ta = VADD(T8, T9);
                  T48 = VSUB(T8, T9);
                  Tk = VADD(Ti, Tj);
                  T4c = VSUB(Tj, Ti);
                  T5k = VFNMS(LDK(KP707106781), T5j, T5i);
                  T6A = VFMA(LDK(KP707106781), T5j, T5i);
                  T47 = VFMA(LDK(KP707106781), T46, T43);
                  T69 = VFNMS(LDK(KP707106781), T46, T43);
                  T2V = VADD(T2T, T2U);
                  T3z = VSUB(T2T, T2U);
                  T49 = VSUB(Tb, Tc);
                  Td = VADD(Tb, Tc);
                }
                Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
                Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
              }
              {
                V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
                V Tp;
                To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
                {
                  V Th, T4b, Tr, Ts;
                  Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
                  Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
                  Te = VSUB(Ta, Td);
                  T2W = VADD(Ta, Td);
                  T5l = VFMA(LDK(KP414213562), T48, T49);
                  T4a = VFNMS(LDK(KP414213562), T49, T48);
                  Th = VADD(Tf, Tg);
                  T4b = VSUB(Tf, Tg);
                  Tq = VADD(To, Tp);
                  T4g = VSUB(To, Tp);
                  T4l = VSUB(Tr, Ts);
                  Tt = VADD(Tr, Ts);
                  Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                  Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
                  T5m = VFMA(LDK(KP414213562), T4b, T4c);
                  T4d = VFNMS(LDK(KP414213562), T4c, T4b);
                  Tl = VSUB(Th, Tk);
                  T2X = VADD(Th, Tk);
                  Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
                  Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
                }
                T3j = VADD(Tq, Tt);
                Tu = VSUB(Tq, Tt);
                Tx = VADD(Tv, Tw);
                T4h = VSUB(Tv, Tw);
                T6B = VSUB(T4d, T4a);
                T4e = VADD(T4a, T4d);
                T6a = VADD(T5l, T5m);
                T5n = VSUB(T5l, T5m);
                T3M = VSUB(T2X, T2W);
                T2Y = VADD(T2W, T2X);
                T27 = VSUB(Tl, Te);
                Tm = VADD(Te, Tl);
                TA = VADD(Ty, Tz);
                T4i = VSUB(Ty, Tz);
              }
            }
          }
          {
            V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3g, TJ, TF, TI;
            {
              V TD, TE, TG, TH;
              TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
              TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
              TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
              TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
              TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
              {
                V T3k, TB, T4j, T4m;
                T3k = VADD(Tx, TA);
                TB = VSUB(Tx, TA);
                T4j = VADD(T4h, T4i);
                T4m = VSUB(T4h, T4i);
                T4p = VSUB(TD, TE);
                TF = VADD(TD, TE);
                T4u = VSUB(TH, TG);
                TI = VADD(TG, TH);
                T3A = VSUB(T3j, T3k);
                T3l = VADD(T3j, T3k);
                T2a = VFMA(LDK(KP414213562), Tu, TB);
                TC = VFNMS(LDK(KP414213562), TB, Tu);
                T4k = VFMA(LDK(KP707106781), T4j, T4g);
                T6d = VFNMS(LDK(KP707106781), T4j, T4g);
                T4n = VFMA(LDK(KP707106781), T4m, T4l);
                T6c = VFNMS(LDK(KP707106781), T4m, T4l);
                TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
              }
              TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
              TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
            }
            T3g = VADD(TF, TI);
            TJ = VSUB(TF, TI);
            {
              V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
              {
                V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
                {
                  V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
                  T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
                  T5p = VFMA(LDK(KP198912367), T4k, T4n);
                  T4o = VFNMS(LDK(KP198912367), T4n, T4k);
                  T6E = VFMA(LDK(KP668178637), T6c, T6d);
                  T6e = VFNMS(LDK(KP668178637), T6d, T6c);
                  TM = VADD(TK, TL);
                  T4r = VSUB(TK, TL);
                  TP = VADD(TN, TO);
                  T4q = VSUB(TN, TO);
                  T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
                  T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
                  T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
                  {
                    V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
                    T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
                    T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
                    T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                    {
                      V T3h, TQ, T4s, T4v;
                      T3h = VADD(TP, TM);
                      TQ = VSUB(TM, TP);
                      T4s = VADD(T4q, T4r);
                      T4v = VSUB(T4r, T4q);
                      T4V = VSUB(T1r, T1s);
                      T1t = VADD(T1r, T1s);
                      T58 = VSUB(T1v, T1u);
                      T1w = VADD(T1u, T1v);
                      T4X = VSUB(T1O, T1P);
                      T1Q = VADD(T1O, T1P);
                      T3i = VADD(T3g, T3h);
                      T3B = VSUB(T3g, T3h);
                      TR = VFNMS(LDK(KP414213562), TQ, TJ);
                      T29 = VFMA(LDK(KP414213562), TJ, TQ);
                      T6g = VFNMS(LDK(KP707106781), T4s, T4p);
                      T4t = VFMA(LDK(KP707106781), T4s, T4p);
                      T6f = VFNMS(LDK(KP707106781), T4v, T4u);
                      T4w = VFMA(LDK(KP707106781), T4v, T4u);
                      T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
                    }
                    {
                      V T4W, T1A, T50, T51, T1D, T1F, T1G;
                      {
                        V T1y, T1z, T1B, T1C;
                        T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                        T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
                        T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
                        T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
                        T4x = VFNMS(LDK(KP198912367), T4w, T4t);
                        T5q = VFMA(LDK(KP198912367), T4t, T4w);
                        T6h = VFNMS(LDK(KP668178637), T6g, T6f);
                        T6D = VFMA(LDK(KP668178637), T6f, T6g);
                        T4W = VSUB(T1R, T1S);
                        T1T = VADD(T1R, T1S);
                        T1A = VADD(T1y, T1z);
                        T50 = VSUB(T1y, T1z);
                        T51 = VSUB(T1C, T1B);
                        T1D = VADD(T1B, T1C);
                      }
                      T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
                      T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
                      T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                      T4Y = VADD(T4W, T4X);
                      T59 = VSUB(T4X, T4W);
                      T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
                      T3a = VADD(T1A, T1D);
                      T1E = VSUB(T1A, T1D);
                      T52 = VFMA(LDK(KP414213562), T51, T50);
                      T5b = VFNMS(LDK(KP414213562), T50, T51);
                      T53 = VSUB(T1F, T1G);
                      T1H = VADD(T1F, T1G);
                    }
                  }
                }
                {
                  V T37, T54, T1K, T38;
                  T1x = VSUB(T1t, T1w);
                  T37 = VADD(T1t, T1w);
                  T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
                  T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
                  T54 = VSUB(T1J, T1I);
                  T1K = VADD(T1I, T1J);
                  T6u = VFNMS(LDK(KP707106781), T59, T58);
                  T5a = VFMA(LDK(KP707106781), T59, T58);
                  T38 = VADD(T1T, T1Q);
                  T1U = VSUB(T1Q, T1T);
                  T55 = VFNMS(LDK(KP414213562), T54, T53);
                  T5c = VFMA(LDK(KP414213562), T53, T54);
                  T1L = VSUB(T1H, T1K);
                  T3b = VADD(T1H, T1K);
                  T39 = VADD(T37, T38);
                  T3H = VSUB(T37, T38);
                }
              }
              {
                V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
                V T1d;
                {
                  V TU, TV, TX, TY, T56, T6v;
                  TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                  T56 = VADD(T52, T55);
                  T6v = VSUB(T55, T52);
                  {
                    V T5d, T6s, T1V, T1M;
                    T5d = VADD(T5b, T5c);
                    T6s = VSUB(T5c, T5b);
                    T1V = VSUB(T1L, T1E);
                    T1M = VADD(T1E, T1L);
                    T3I = VSUB(T3b, T3a);
                    T3c = VADD(T3a, T3b);
                    T5N = VFNMS(LDK(KP923879532), T56, T4Z);
                    T57 = VFMA(LDK(KP923879532), T56, T4Z);
                    T72 = VFNMS(LDK(KP923879532), T6v, T6u);
                    T6w = VFMA(LDK(KP923879532), T6v, T6u);
                    T5O = VFNMS(LDK(KP923879532), T5d, T5a);
                    T5e = VFMA(LDK(KP923879532), T5d, T5a);
                    T71 = VFMA(LDK(KP923879532), T6s, T6r);
                    T6t = VFNMS(LDK(KP923879532), T6s, T6r);
                    T2y = VFNMS(LDK(KP707106781), T1V, T1U);
                    T1W = VFMA(LDK(KP707106781), T1V, T1U);
                    T2x = VFNMS(LDK(KP707106781), T1M, T1x);
                    T1N = VFMA(LDK(KP707106781), T1M, T1x);
                    TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
                  }
                  TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
                  TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
                  {
                    V T1h, T1i, T1k, T1l;
                    T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
                    T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
                    T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
                    {
                      V T11, T4B, T4C, T12, T14, T15;
                      T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                      T4A = VSUB(TU, TV);
                      TW = VADD(TU, TV);
                      T4N = VSUB(TX, TY);
                      TZ = VADD(TX, TY);
                      T1j = VADD(T1h, T1i);
                      T4B = VSUB(T1h, T1i);
                      T1m = VADD(T1k, T1l);
                      T4C = VSUB(T1k, T1l);
                      T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
                      T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
                      T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
                      {
                        V T18, T19, T1b, T1c;
                        T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
                        T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
                        T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                        T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
                        T4O = VSUB(T4B, T4C);
                        T4D = VADD(T4B, T4C);
                        T13 = VADD(T11, T12);
                        T4F = VSUB(T11, T12);
                        T16 = VADD(T14, T15);
                        T4G = VSUB(T14, T15);
                        T1a = VADD(T18, T19);
                        T4I = VSUB(T18, T19);
                        T4J = VSUB(T1b, T1c);
                        T1d = VADD(T1b, T1c);
                      }
                    }
                  }
                }
                {
                  V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
                  T30 = VADD(TW, TZ);
                  T10 = VSUB(TW, TZ);
                  T6k = VFNMS(LDK(KP707106781), T4D, T4A);
                  T4E = VFMA(LDK(KP707106781), T4D, T4A);
                  T4Q = VFMA(LDK(KP414213562), T4F, T4G);
                  T4H = VFNMS(LDK(KP414213562), T4G, T4F);
                  T33 = VADD(T13, T16);
                  T17 = VSUB(T13, T16);
                  T6n = VFNMS(LDK(KP707106781), T4O, T4N);
                  T4P = VFMA(LDK(KP707106781), T4O, T4N);
                  T34 = VADD(T1a, T1d);
                  T1e = VSUB(T1a, T1d);
                  T4K = VFMA(LDK(KP414213562), T4J, T4I);
                  T4R = VFNMS(LDK(KP414213562), T4I, T4J);
                  T1n = VSUB(T1j, T1m);
                  T31 = VADD(T1j, T1m);
                  {
                    V T1f, T1o, T6o, T4L, T4S, T6l;
                    T1f = VADD(T17, T1e);
                    T1o = VSUB(T17, T1e);
                    T6o = VSUB(T4H, T4K);
                    T4L = VADD(T4H, T4K);
                    T4S = VADD(T4Q, T4R);
                    T6l = VSUB(T4Q, T4R);
                    T3E = VSUB(T30, T31);
                    T32 = VADD(T30, T31);
                    T1p = VFMA(LDK(KP707106781), T1o, T1n);
                    T2v = VFNMS(LDK(KP707106781), T1o, T1n);
                    T1g = VFMA(LDK(KP707106781), T1f, T10);
                    T2u = VFNMS(LDK(KP707106781), T1f, T10);
                    T4M = VFMA(LDK(KP923879532), T4L, T4E);
                    T5K = VFNMS(LDK(KP923879532), T4L, T4E);
                    T6p = VFMA(LDK(KP923879532), T6o, T6n);
                    T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
                    T6m = VFNMS(LDK(KP923879532), T6l, T6k);
                    T6Y = VFMA(LDK(KP923879532), T6l, T6k);
                    T5L = VFNMS(LDK(KP923879532), T4S, T4P);
                    T4T = VFMA(LDK(KP923879532), T4S, T4P);
                  }
                }
              }
            }
          }
        }
        {
          V T6b, T6F, T7f, T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
          {
            V T2Z, T3r, T3s, T3m, T3d, T3v;
            T2Z = VSUB(T2V, T2Y);
            T3r = VADD(T2V, T2Y);
            T3s = VADD(T3l, T3i);
            T3m = VSUB(T3i, T3l);
            T3d = VSUB(T39, T3c);
            T3v = VADD(T39, T3c);
            {
              V T3x, T3t, T3P, T3J, T3D, T3V, T3Q, T3G, T36, T3u, T3Y, T3O, T6V, T6W;
              {
                V T3N, T3C, T3F, T35;
                T3N = VSUB(T3B, T3A);
                T3C = VADD(T3A, T3B);
                T3F = VSUB(T33, T34);
                T35 = VADD(T33, T34);
                T3x = VSUB(T3r, T3s);
                T3t = VADD(T3r, T3s);
                T3P = VFMA(LDK(KP414213562), T3H, T3I);
                T3J = VFNMS(LDK(KP414213562), T3I, T3H);
                T3D = VFMA(LDK(KP707106781), T3C, T3z);
                T3V = VFNMS(LDK(KP707106781), T3C, T3z);
                T3Q = VFMA(LDK(KP414213562), T3E, T3F);
                T3G = VFNMS(LDK(KP414213562), T3F, T3E);
                T36 = VSUB(T32, T35);
                T3u = VADD(T32, T35);
                T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
                T3O = VFMA(LDK(KP707106781), T3N, T3M);
              }
              T6b = VFNMS(LDK(KP923879532), T6a, T69);
              T6V = VFMA(LDK(KP923879532), T6a, T69);
              T6W = VADD(T6E, T6D);
              T6F = VSUB(T6D, T6E);
              {
                V T3K, T3Z, T3e, T3n;
                T3K = VADD(T3G, T3J);
                T3Z = VSUB(T3J, T3G);
                T3e = VADD(T36, T3d);
                T3n = VSUB(T3d, T36);
                {
                  V T3w, T3y, T3R, T3W;
                  T3w = VADD(T3u, T3v);
                  T3y = VSUB(T3v, T3u);
                  T3R = VSUB(T3P, T3Q);
                  T3W = VADD(T3Q, T3P);
                  {
                    V T42, T40, T3L, T3T;
                    T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
                    T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
                    T3L = VFNMS(LDK(KP923879532), T3K, T3D);
                    T3T = VFMA(LDK(KP923879532), T3K, T3D);
                    {
                      V T3o, T3q, T3f, T3p;
                      T3o = VFNMS(LDK(KP707106781), T3n, T3m);
                      T3q = VFMA(LDK(KP707106781), T3n, T3m);
                      T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
                      T3p = VFMA(LDK(KP707106781), T3e, T2Z);
                      ST(&(xo[WS(os, 48)]), VFNMSI(T3y, T3x), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 16)]), VFMAI(T3y, T3x), ovs, &(xo[0]));
                      ST(&(xo[0]), VADD(T3t, T3w), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 32)]), VSUB(T3t, T3w), ovs, &(xo[0]));
                      {
                        V T41, T3X, T3S, T3U;
                        T41 = VFMA(LDK(KP923879532), T3W, T3V);
                        T3X = VFNMS(LDK(KP923879532), T3W, T3V);
                        T3S = VFNMS(LDK(KP923879532), T3R, T3O);
                        T3U = VFMA(LDK(KP923879532), T3R, T3O);
                        ST(&(xo[WS(os, 8)]), VFMAI(T3q, T3p), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 56)]), VFNMSI(T3q, T3p), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 40)]), VFMAI(T3o, T3f), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 24)]), VFNMSI(T3o, T3f), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 44)]), VFNMSI(T40, T3X), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 20)]), VFMAI(T40, T3X), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 52)]), VFMAI(T42, T41), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 12)]), VFNMSI(T42, T41), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 4)]), VFMAI(T3U, T3T), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 60)]), VFNMSI(T3U, T3T), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 36)]), VFMAI(T3S, T3L), ovs, &(xo[0]));
                        ST(&(xo[WS(os, 28)]), VFNMSI(T3S, T3L), ovs, &(xo[0]));
                        T7f = VFNMS(LDK(KP831469612), T6W, T6V);
                        T6X = VFMA(LDK(KP831469612), T6W, T6V);
                      }
                    }
                  }
                }
              }
              T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
              T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
              T7a = VFNMS(LDK(KP303346683), T71, T72);
              T73 = VFMA(LDK(KP303346683), T72, T71);
              T6C = VFNMS(LDK(KP923879532), T6B, T6A);
              T76 = VFMA(LDK(KP923879532), T6B, T6A);
              T77 = VSUB(T6e, T6h);
              T6i = VADD(T6e, T6h);
            }
          }
          {
            V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T5r, T5I, T5x, T5h, T5F, T5B;
            {
              V TT, T2f, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
              {
                V T1X, T2d, T7h, T7l, T2e, T1q, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
                T2r = VFNMS(LDK(KP707106781), Tm, T7);
                Tn = VFMA(LDK(KP707106781), Tm, T7);
                TS = VADD(TC, TR);
                T2D = VSUB(TR, TC);
                {
                  V T7b, T7j, T74, T7i, T78, T7g;
                  T1X = VFNMS(LDK(KP198912367), T1W, T1N);
                  T2d = VFMA(LDK(KP198912367), T1N, T1W);
                  T7g = VADD(T79, T7a);
                  T7b = VSUB(T79, T7a);
                  T7j = VSUB(T73, T70);
                  T74 = VADD(T70, T73);
                  T7i = VFNMS(LDK(KP831469612), T77, T76);
                  T78 = VFMA(LDK(KP831469612), T77, T76);
                  T2j = VFNMS(LDK(KP923879532), TS, Tn);
                  TT = VFMA(LDK(KP923879532), TS, Tn);
                  T7h = VFMA(LDK(KP956940335), T7g, T7f);
                  T7l = VFNMS(LDK(KP956940335), T7g, T7f);
                  T2e = VFMA(LDK(KP198912367), T1g, T1p);
                  T1q = VFNMS(LDK(KP198912367), T1p, T1g);
                  T75 = VFNMS(LDK(KP956940335), T74, T6X);
                  T7d = VFMA(LDK(KP956940335), T74, T6X);
                  T7m = VFNMS(LDK(KP956940335), T7j, T7i);
                  T7k = VFMA(LDK(KP956940335), T7j, T7i);
                  T7c = VFNMS(LDK(KP956940335), T7b, T78);
                  T7e = VFMA(LDK(KP956940335), T7b, T78);
                }
                T2k = VADD(T2e, T2d);
                T2f = VSUB(T2d, T2e);
                ST(&(xo[WS(os, 45)]), VFNMSI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 19)]), VFMAI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 51)]), VFMAI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 13)]), VFNMSI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 3)]), VFMAI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 61)]), VFNMSI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 35)]), VFMAI(T7c, T75), ovs, &(xo[WS(os, 1)]));
                ST(&(xo[WS(os, 29)]), VFNMSI(T7c, T75), ovs, &(xo[WS(os, 1)]));
                T2n = VSUB(T1X, T1q);
                T1Y = VADD(T1q, T1X);
                T2C = VFNMS(LDK(KP707106781), T27, T26);
                T28 = VFMA(LDK(KP707106781), T27, T26);
                T2b = VSUB(T29, T2a);
                T2s = VADD(T2a, T29);
              }
              T2l = VFNMS(LDK(KP980785280), T2k, T2j);
              T2p = VFMA(LDK(KP980785280), T2k, T2j);
              {
                V T5z, T4z, T5A, T5g;
                {
                  V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
                  T5H = VFNMS(LDK(KP923879532), T4e, T47);
                  T4f = VFMA(LDK(KP923879532), T4e, T47);
                  T4y = VADD(T4o, T4x);
                  T5T = VSUB(T4x, T4o);
                  T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
                  T2h = VFMA(LDK(KP980785280), T1Y, TT);
                  T4U = VFNMS(LDK(KP098491403), T4T, T4M);
                  T5t = VFMA(LDK(KP098491403), T4M, T4T);
                  T2m = VFNMS(LDK(KP923879532), T2b, T28);
                  T2c = VFMA(LDK(KP923879532), T2b, T28);
                  T5u = VFMA(LDK(KP098491403), T57, T5e);
                  T5f = VFNMS(LDK(KP098491403), T5e, T57);
                  T5z = VFNMS(LDK(KP980785280), T4y, T4f);
                  T4z = VFMA(LDK(KP980785280), T4y, T4f);
                  T5S = VFNMS(LDK(KP923879532), T5n, T5k);
                  T5o = VFMA(LDK(KP923879532), T5n, T5k);
                  {
                    V T2o, T2q, T2i, T2g;
                    T2o = VFMA(LDK(KP980785280), T2n, T2m);
                    T2q = VFNMS(LDK(KP980785280), T2n, T2m);
                    T2i = VFMA(LDK(KP980785280), T2f, T2c);
                    T2g = VFNMS(LDK(KP980785280), T2f, T2c);
                    T5A = VADD(T5t, T5u);
                    T5v = VSUB(T5t, T5u);
                    T5D = VSUB(T5f, T4U);
                    T5g = VADD(T4U, T5f);
                    ST(&(xo[WS(os, 46)]), VFNMSI(T2o, T2l), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 18)]), VFMAI(T2o, T2l), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 50)]), VFMAI(T2q, T2p), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 14)]), VFNMSI(T2q, T2p), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 2)]), VFMAI(T2i, T2h), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 62)]), VFNMSI(T2i, T2h), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 34)]), VFMAI(T2g, T1Z), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 30)]), VFNMSI(T2g, T1Z), ovs, &(xo[0]));
                    T5r = VSUB(T5p, T5q);
                    T5I = VADD(T5p, T5q);
                  }
                }
                T5x = VFMA(LDK(KP995184726), T5g, T4z);
                T5h = VFNMS(LDK(KP995184726), T5g, T4z);
                T5F = VFMA(LDK(KP995184726), T5A, T5z);
                T5B = VFNMS(LDK(KP995184726), T5A, T5z);
              }
            }
            {
              V T6J, T6R, T6L, T6z, T6T, T6P;
              {
                V T6N, T6j, T6O, T6y;
                {
                  V T6q, T6H, T5C, T5s, T6I, T6x;
                  T6q = VFNMS(LDK(KP534511135), T6p, T6m);
                  T6H = VFMA(LDK(KP534511135), T6m, T6p);
                  T5C = VFNMS(LDK(KP980785280), T5r, T5o);
                  T5s = VFMA(LDK(KP980785280), T5r, T5o);
                  T6I = VFMA(LDK(KP534511135), T6t, T6w);
                  T6x = VFNMS(LDK(KP534511135), T6w, T6t);
                  T6N = VFMA(LDK(KP831469612), T6i, T6b);
                  T6j = VFNMS(LDK(KP831469612), T6i, T6b);
                  {
                    V T5E, T5G, T5y, T5w;
                    T5E = VFNMS(LDK(KP995184726), T5D, T5C);
                    T5G = VFMA(LDK(KP995184726), T5D, T5C);
                    T5y = VFMA(LDK(KP995184726), T5v, T5s);
                    T5w = VFNMS(LDK(KP995184726), T5v, T5s);
                    T6O = VADD(T6H, T6I);
                    T6J = VSUB(T6H, T6I);
                    T6R = VSUB(T6x, T6q);
                    T6y = VADD(T6q, T6x);
                    ST(&(xo[WS(os, 47)]), VFMAI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 17)]), VFNMSI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 49)]), VFNMSI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 15)]), VFMAI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 63)]), VFMAI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 1)]), VFNMSI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 31)]), VFMAI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 33)]), VFNMSI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
                  }
                }
                T6L = VFMA(LDK(KP881921264), T6y, T6j);
                T6z = VFNMS(LDK(KP881921264), T6y, T6j);
                T6T = VFMA(LDK(KP881921264), T6O, T6N);
                T6P = VFNMS(LDK(KP881921264), T6O, T6N);
              }
              {
                V T2H, T2P, T2J, T2B, T2R, T2N;
                {
                  V T2L, T2t, T2M, T2A;
                  {
                    V T2z, T2F, T6Q, T6G, T2G, T2w;
                    T2z = VFMA(LDK(KP668178637), T2y, T2x);
                    T2F = VFNMS(LDK(KP668178637), T2x, T2y);
                    T6Q = VFMA(LDK(KP831469612), T6F, T6C);
                    T6G = VFNMS(LDK(KP831469612), T6F, T6C);
                    T2G = VFNMS(LDK(KP668178637), T2u, T2v);
                    T2w = VFMA(LDK(KP668178637), T2v, T2u);
                    T2L = VFNMS(LDK(KP923879532), T2s, T2r);
                    T2t = VFMA(LDK(KP923879532), T2s, T2r);
                    {
                      V T6S, T6U, T6M, T6K;
                      T6S = VFNMS(LDK(KP881921264), T6R, T6Q);
                      T6U = VFMA(LDK(KP881921264), T6R, T6Q);
                      T6M = VFMA(LDK(KP881921264), T6J, T6G);
                      T6K = VFNMS(LDK(KP881921264), T6J, T6G);
                      T2M = VADD(T2G, T2F);
                      T2H = VSUB(T2F, T2G);
                      T2P = VSUB(T2z, T2w);
                      T2A = VADD(T2w, T2z);
                      ST(&(xo[WS(os, 43)]), VFMAI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 21)]), VFNMSI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 53)]), VFNMSI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 11)]), VFMAI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 59)]), VFMAI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 5)]), VFNMSI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 27)]), VFMAI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
                      ST(&(xo[WS(os, 37)]), VFNMSI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
                    }
                  }
                  T2J = VFMA(LDK(KP831469612), T2A, T2t);
                  T2B = VFNMS(LDK(KP831469612), T2A, T2t);
                  T2R = VFNMS(LDK(KP831469612), T2M, T2L);
                  T2N = VFMA(LDK(KP831469612), T2M, T2L);
                }
                {
                  V T61, T5J, T62, T5Q;
                  {
                    V T5M, T5V, T2O, T2E, T5W, T5P;
                    T5M = VFMA(LDK(KP820678790), T5L, T5K);
                    T5V = VFNMS(LDK(KP820678790), T5K, T5L);
                    T2O = VFMA(LDK(KP923879532), T2D, T2C);
                    T2E = VFNMS(LDK(KP923879532), T2D, T2C);
                    T5W = VFNMS(LDK(KP820678790), T5N, T5O);
                    T5P = VFMA(LDK(KP820678790), T5O, T5N);
                    T61 = VFNMS(LDK(KP980785280), T5I, T5H);
                    T5J = VFMA(LDK(KP980785280), T5I, T5H);
                    {
                      V T2Q, T2S, T2K, T2I;
                      T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
                      T2S = VFMA(LDK(KP831469612), T2P, T2O);
                      T2K = VFMA(LDK(KP831469612), T2H, T2E);
                      T2I = VFNMS(LDK(KP831469612), T2H, T2E);
                      T62 = VADD(T5V, T5W);
                      T5X = VSUB(T5V, T5W);
                      T65 = VSUB(T5P, T5M);
                      T5Q = VADD(T5M, T5P);
                      ST(&(xo[WS(os, 42)]), VFMAI(T2Q, T2N), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 22)]), VFNMSI(T2Q, T2N), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 54)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 10)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 58)]), VFMAI(T2K, T2J), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 6)]), VFNMSI(T2K, T2J), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 26)]), VFMAI(T2I, T2B), ovs, &(xo[0]));
                      ST(&(xo[WS(os, 38)]), VFNMSI(T2I, T2B), ovs, &(xo[0]));
                    }
                  }
                  T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
                  T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
                  T67 = VFNMS(LDK(KP773010453), T62, T61);
                  T63 = VFMA(LDK(KP773010453), T62, T61);
                }
              }
            }
          }
        }
      }
      T5U = VFMA(LDK(KP980785280), T5T, T5S);
      T64 = VFNMS(LDK(KP980785280), T5T, T5S);
      {
        V T68, T66, T5Y, T60;
        T68 = VFNMS(LDK(KP773010453), T65, T64);
        T66 = VFMA(LDK(KP773010453), T65, T64);
        T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
        T60 = VFMA(LDK(KP773010453), T5X, T5U);
        ST(&(xo[WS(os, 41)]), VFNMSI(T66, T63), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 23)]), VFMAI(T66, T63), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 55)]), VFMAI(T68, T67), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 9)]), VFNMSI(T68, T67), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 7)]), VFMAI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 57)]), VFNMSI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 39)]), VFMAI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
        ST(&(xo[WS(os, 25)]), VFNMSI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
      }
    }
  }
  VLEAVE();
}

static const kdft_desc desc = { 64, XSIMD_STRING("n1fv_64"), {198, 0, 258, 0}, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1fv_64) (planner *p) {
  X(kdft_register) (p, n1fv_64, &desc);
}
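
/*
 * Editorial note: the {198, 0, 258, 0} field of desc mirrors the operation
 * counts quoted in the header comment (198 additions, 0 multiplications,
 * 258 fused multiply/adds, 0 other ops); codelet_n1fv_64() hands the kernel
 * and this descriptor to the planner, which can use such counts when
 * estimating the cost of candidate plans.
 */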

#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n1fv_64 -include n1f.h */

/*
 * This function contains 456 FP additions, 124 FP multiplications,
 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
 * 108 stack variables, 15 constants, and 128 memory accesses
 */
#include "n1f.h"

static void n1fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
  DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
  DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
  DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
  DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
  DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
  DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
  DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
  DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
  DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
  DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
  DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
  DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
  DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
  DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
  DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
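  /*
   * Editorial note: this non-FMA variant spells the same twiddles as
   * explicit cosine/sine pairs (e.g. KP923879532/KP382683432 =
   * cos/sin(pi/8), KP980785280/KP195090322 = cos/sin(pi/16)) combined via
   * VMUL and VFMA/VFNMS, instead of the tangent-form constants above.
   */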
  {
    INT i;
    const R *xi;
    R *xo;
    xi = ri;
    xo = ro;
    for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
      V T4p, T5q, Tb, T39, T2n, T3A, T6f, T6T, Tq, T3B, T6i, T76, T2i, T3a, T4w;
      V T5r, TI, T2p, T6C, T6V, T3h, T3E, T4L, T5u, TZ, T2q, T6F, T6U, T3e, T3D;
      V T4E, T5t, T23, T2N, T6t, T71, T6w, T72, T2c, T2O, T3t, T41, T5f, T5R, T5k;
      V T5S, T3w, T42, T1s, T2K, T6m, T6Y, T6p, T6Z, T1B, T2L, T3m, T3Y, T4Y, T5O;
      V T53, T5P, T3p, T3Z;
      {
        V T3, T4n, T2m, T4o, T6, T5p, T9, T5o;
        {
          V T1, T2, T2k, T2l;
          T1 = LD(&(xi[0]), ivs, &(xi[0]));
          T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
          T3 = VSUB(T1, T2);
          T4n = VADD(T1, T2);
          T2k = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
          T2l = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
          T2m = VSUB(T2k, T2l);
          T4o = VADD(T2k, T2l);
        }
        {
          V T4, T5, T7, T8;
          T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
          T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
          T6 = VSUB(T4, T5);
          T5p = VADD(T4, T5);
          T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
          T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
          T9 = VSUB(T7, T8);
          T5o = VADD(T7, T8);
        }
        T4p = VSUB(T4n, T4o);
        T5q = VSUB(T5o, T5p);
        {
          V Ta, T2j, T6d, T6e;
          Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
          Tb = VADD(T3, Ta);
          T39 = VSUB(T3, Ta);
          T2j = VMUL(LDK(KP707106781), VSUB(T9, T6));
          T2n = VSUB(T2j, T2m);
          T3A = VADD(T2m, T2j);
          T6d = VADD(T4n, T4o);
          T6e = VADD(T5p, T5o);
          T6f = VADD(T6d, T6e);
          T6T = VSUB(T6d, T6e);
        }
      }
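      /*
       * Editorial note: the block above is a size-8 sub-DFT over the input
       * samples whose indices are divisible by 8 (x[0], x[8], ..., x[56]);
       * the following blocks treat the other stride-8 subsequences the
       * same way before the results are recombined into the 64 outputs.
       */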
      {
        V Te, T4q, To, T4u, Th, T4r, Tl, T4t;
        {
          V Tc, Td, Tm, Tn;
          Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
          Te = VSUB(Tc, Td);
          T4q = VADD(Tc, Td);
          Tm = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
          Tn = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
          To = VSUB(Tm, Tn);
          T4u = VADD(Tm, Tn);
        }
        {
          V Tf, Tg, Tj, Tk;
          Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
          Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
          Th = VSUB(Tf, Tg);
          T4r = VADD(Tf, Tg);
          Tj = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
          Tk = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
          Tl = VSUB(Tj, Tk);
          T4t = VADD(Tj, Tk);
        }
        {
          V Ti, Tp, T6g, T6h;
          Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
          Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
          Tq = VADD(Ti, Tp);
          T3B = VSUB(Tp, Ti);
          T6g = VADD(T4q, T4r);
          T6h = VADD(T4t, T4u);
          T6i = VADD(T6g, T6h);
          T76 = VSUB(T6h, T6g);
        }
        {
          V T2g, T2h, T4s, T4v;
          T2g = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
          T2h = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
          T2i = VSUB(T2g, T2h);
          T3a = VADD(T2h, T2g);
          T4s = VSUB(T4q, T4r);
          T4v = VSUB(T4t, T4u);
          T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
          T5r = VMUL(LDK(KP707106781), VSUB(T4v, T4s));
        }
      }
      {
        V Tu, T4F, TG, T4G, TB, T4J, TD, T4I;
        {
          V Ts, Tt, TE, TF;
          Ts = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
          Tt = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
          Tu = VSUB(Ts, Tt);
          T4F = VADD(Ts, Tt);
          TE = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
          TF = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
          TG = VSUB(TE, TF);
          T4G = VADD(TE, TF);
          {
            V Tv, Tw, Tx, Ty, Tz, TA;
            Tv = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
            Tw = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
            Tx = VSUB(Tv, Tw);
            Ty = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
            Tz = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
            TA = VSUB(Ty, Tz);
            TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
            T4J = VADD(Tv, Tw);
            TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
            T4I = VADD(Ty, Tz);
          }
        }
        {
          V TC, TH, T6A, T6B;
          TC = VADD(Tu, TB);
          TH = VSUB(TD, TG);
          TI = VFMA(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
          T2p = VFNMS(LDK(KP195090322), TH, VMUL(LDK(KP980785280), TC));
          T6A = VADD(T4F, T4G);
          T6B = VADD(T4J, T4I);
          T6C = VADD(T6A, T6B);
          T6V = VSUB(T6A, T6B);
        }
        {
          V T3f, T3g, T4H, T4K;
          T3f = VSUB(Tu, TB);
          T3g = VADD(TG, TD);
          T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
          T3E = VFMA(LDK(KP555570233), T3f, VMUL(LDK(KP831469612), T3g));
          T4H = VSUB(T4F, T4G);
          T4K = VSUB(T4I, T4J);
          T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
          T5u = VFMA(LDK(KP382683432), T4H, VMUL(LDK(KP923879532), T4K));
        }
      }
      {
        V TS, T4z, TW, T4y, TP, T4C, TX, T4B;
        {
          V TQ, TR, TU, TV;
          TQ = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
          TR = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
          TS = VSUB(TQ, TR);
          T4z = VADD(TQ, TR);
          TU = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          TV = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
          TW = VSUB(TU, TV);
          T4y = VADD(TU, TV);
          {
            V TJ, TK, TL, TM, TN, TO;
            TJ = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
            TK = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
            TL = VSUB(TJ, TK);
            TM = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
            TN = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
            TO = VSUB(TM, TN);
            TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
            T4C = VADD(TM, TN);
            TX = VMUL(LDK(KP707106781), VADD(TO, TL));
            T4B = VADD(TJ, TK);
          }
        }
        {
          V TT, TY, T6D, T6E;
          TT = VSUB(TP, TS);
          TY = VADD(TW, TX);
          TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
          T2q = VFMA(LDK(KP980785280), TY, VMUL(LDK(KP195090322), TT));
          T6D = VADD(T4y, T4z);
          T6E = VADD(T4C, T4B);
          T6F = VADD(T6D, T6E);
          T6U = VSUB(T6D, T6E);
        }
        {
          V T3c, T3d, T4A, T4D;
          T3c = VSUB(TW, TX);
          T3d = VADD(TS, TP);
          T3e = VFMA(LDK(KP831469612), T3c, VMUL(LDK(KP555570233), T3d));
          T3D = VFNMS(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
          T4A = VSUB(T4y, T4z);
          T4D = VSUB(T4B, T4C);
          T4E = VFMA(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4D));
          T5t = VFNMS(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
        }
      }
      {
        V T1F, T55, T2a, T56, T1M, T5h, T27, T5g, T58, T59, T1U, T5a, T25, T5b, T5c;
        V T21, T5d, T24;
        {
          V T1D, T1E, T28, T29;
          T1D = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
          T1E = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
          T1F = VSUB(T1D, T1E);
          T55 = VADD(T1D, T1E);
          T28 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
          T29 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
          T2a = VSUB(T28, T29);
          T56 = VADD(T28, T29);
        }
        {
          V T1G, T1H, T1I, T1J, T1K, T1L;
          T1G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
          T1H = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
          T1I = VSUB(T1G, T1H);
          T1J = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
          T1K = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
          T1L = VSUB(T1J, T1K);
          T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
          T5h = VADD(T1G, T1H);
          T27 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
          T5g = VADD(T1J, T1K);
        }
        {
          V T1Q, T1T, T1X, T20;
          {
            V T1O, T1P, T1R, T1S;
            T1O = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
            T1P = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
            T1Q = VSUB(T1O, T1P);
            T58 = VADD(T1O, T1P);
            T1R = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
            T1S = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
            T1T = VSUB(T1R, T1S);
            T59 = VADD(T1R, T1S);
          }
          T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
          T5a = VSUB(T58, T59);
          T25 = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
          {
            V T1V, T1W, T1Y, T1Z;
            T1V = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
            T1W = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
            T1X = VSUB(T1V, T1W);
            T5b = VADD(T1V, T1W);
            T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
            T1Z = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
            T20 = VSUB(T1Y, T1Z);
            T5c = VADD(T1Y, T1Z);
          }
          T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
          T5d = VSUB(T5b, T5c);
          T24 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
        }
        {
          V T1N, T22, T6r, T6s;
          T1N = VADD(T1F, T1M);
          T22 = VADD(T1U, T21);
          T23 = VSUB(T1N, T22);
          T2N = VADD(T1N, T22);
          T6r = VADD(T55, T56);
          T6s = VADD(T5h, T5g);
          T6t = VADD(T6r, T6s);
          T71 = VSUB(T6r, T6s);
        }
        {
          V T6u, T6v, T26, T2b;
          T6u = VADD(T58, T59);
          T6v = VADD(T5b, T5c);
          T6w = VADD(T6u, T6v);
          T72 = VSUB(T6v, T6u);
          T26 = VSUB(T24, T25);
          T2b = VSUB(T27, T2a);
          T2c = VSUB(T26, T2b);
          T2O = VADD(T2b, T26);
        }
        {
          V T3r, T3s, T57, T5e;
          T3r = VSUB(T1F, T1M);
          T3s = VADD(T25, T24);
          T3t = VADD(T3r, T3s);
          T41 = VSUB(T3r, T3s);
          T57 = VSUB(T55, T56);
          T5e = VMUL(LDK(KP707106781), VADD(T5a, T5d));
          T5f = VADD(T57, T5e);
          T5R = VSUB(T57, T5e);
        }
        {
          V T5i, T5j, T3u, T3v;
          T5i = VSUB(T5g, T5h);
          T5j = VMUL(LDK(KP707106781), VSUB(T5d, T5a));
          T5k = VADD(T5i, T5j);
          T5S = VSUB(T5j, T5i);
          T3u = VADD(T2a, T27);
          T3v = VSUB(T21, T1U);
          T3w = VADD(T3u, T3v);
          T42 = VSUB(T3v, T3u);
        }
      }
      {
        V T1q, T4P, T1v, T4O, T1n, T50, T1w, T4Z, T4U, T4V, T18, T4W, T1z, T4R, T4S;
        V T1f, T4T, T1y;
        {
          V T1o, T1p, T1t, T1u;
          T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
          T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
          T1q = VSUB(T1o, T1p);
          T4P = VADD(T1o, T1p);
          T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
          T1v = VSUB(T1t, T1u);
          T4O = VADD(T1t, T1u);
        }
        {
          V T1h, T1i, T1j, T1k, T1l, T1m;
          T1h = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
          T1i = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
          T1j = VSUB(T1h, T1i);
          T1k = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
          T1l = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
          T1m = VSUB(T1k, T1l);
          T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
          T50 = VADD(T1k, T1l);
          T1w = VMUL(LDK(KP707106781), VADD(T1m, T1j));
          T4Z = VADD(T1h, T1i);
        }
        {
          V T14, T17, T1b, T1e;
          {
            V T12, T13, T15, T16;
            T12 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
            T13 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
            T14 = VSUB(T12, T13);
            T4U = VADD(T12, T13);
            T15 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
            T16 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
            T17 = VSUB(T15, T16);
            T4V = VADD(T15, T16);
          }
          T18 = VFNMS(LDK(KP923879532), T17, VMUL(LDK(KP382683432), T14));
          T4W = VSUB(T4U, T4V);
          T1z = VFMA(LDK(KP923879532), T14, VMUL(LDK(KP382683432), T17));
          {
            V T19, T1a, T1c, T1d;
            T19 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
            T1a = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
            T1b = VSUB(T19, T1a);
            T4R = VADD(T19, T1a);
            T1c = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
            T1d = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
            T1e = VSUB(T1c, T1d);
            T4S = VADD(T1c, T1d);
          }
          T1f = VFMA(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
          T4T = VSUB(T4R, T4S);
          T1y = VFNMS(LDK(KP382683432), T1e, VMUL(LDK(KP923879532), T1b));
        }
        {
          V T1g, T1r, T6k, T6l;
          T1g = VSUB(T18, T1f);
          T1r = VSUB(T1n, T1q);
          T1s = VSUB(T1g, T1r);
          T2K = VADD(T1r, T1g);
          T6k = VADD(T4O, T4P);
          T6l = VADD(T50, T4Z);
          T6m = VADD(T6k, T6l);
          T6Y = VSUB(T6k, T6l);
        }
        {
          V T6n, T6o, T1x, T1A;
          T6n = VADD(T4R, T4S);
          T6o = VADD(T4U, T4V);
          T6p = VADD(T6n, T6o);
          T6Z = VSUB(T6o, T6n);
          T1x = VADD(T1v, T1w);
          T1A = VADD(T1y, T1z);
          T1B = VSUB(T1x, T1A);
          T2L = VADD(T1x, T1A);
        }
        {
          V T3k, T3l, T4Q, T4X;
          T3k = VSUB(T1v, T1w);
          T3l = VADD(T1f, T18);
          T3m = VADD(T3k, T3l);
          T3Y = VSUB(T3k, T3l);
          T4Q = VSUB(T4O, T4P);
          T4X = VMUL(LDK(KP707106781), VADD(T4T, T4W));
          T4Y = VADD(T4Q, T4X);
          T5O = VSUB(T4Q, T4X);
        }
        {
          V T51, T52, T3n, T3o;
          T51 = VSUB(T4Z, T50);
          T52 = VMUL(LDK(KP707106781), VSUB(T4W, T4T));
          T53 = VADD(T51, T52);
          T5P = VSUB(T52, T51);
          T3n = VADD(T1q, T1n);
          T3o = VSUB(T1z, T1y);
          T3p = VADD(T3n, T3o);
          T3Z = VSUB(T3o, T3n);
        }
      }
      {
        V T6N, T6R, T6Q, T6S;
        {
          V T6L, T6M, T6O, T6P;
          T6L = VADD(T6f, T6i);
          T6M = VADD(T6F, T6C);
          T6N = VADD(T6L, T6M);
          T6R = VSUB(T6L, T6M);
          T6O = VADD(T6m, T6p);
          T6P = VADD(T6t, T6w);
          T6Q = VADD(T6O, T6P);
          T6S = VBYI(VSUB(T6P, T6O));
        }
        ST(&(xo[WS(os, 32)]), VSUB(T6N, T6Q), ovs, &(xo[0]));
        ST(&(xo[WS(os, 16)]), VADD(T6R, T6S), ovs, &(xo[0]));
        ST(&(xo[0]), VADD(T6N, T6Q), ovs, &(xo[0]));
        ST(&(xo[WS(os, 48)]), VSUB(T6R, T6S), ovs, &(xo[0]));
      }
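      /*
       * Editorial note: xo[0] receives VADD(T6N, T6Q), the sum of all 64
       * inputs (the DC bin), and xo[32] the even-minus-odd-index sum (the
       * Nyquist bin); VBYI marks contributions that enter with a factor
       * of i (the imaginary unit).
       */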
Chris@10 1213 {
Chris@10 1214 V T6j, T6G, T6y, T6H, T6q, T6x;
Chris@10 1215 T6j = VSUB(T6f, T6i);
Chris@10 1216 T6G = VSUB(T6C, T6F);
Chris@10 1217 T6q = VSUB(T6m, T6p);
Chris@10 1218 T6x = VSUB(T6t, T6w);
Chris@10 1219 T6y = VMUL(LDK(KP707106781), VADD(T6q, T6x));
Chris@10 1220 T6H = VMUL(LDK(KP707106781), VSUB(T6x, T6q));
Chris@10 1221 {
Chris@10 1222 V T6z, T6I, T6J, T6K;
Chris@10 1223 T6z = VADD(T6j, T6y);
Chris@10 1224 T6I = VBYI(VADD(T6G, T6H));
Chris@10 1225 ST(&(xo[WS(os, 56)]), VSUB(T6z, T6I), ovs, &(xo[0]));
Chris@10 1226 ST(&(xo[WS(os, 8)]), VADD(T6z, T6I), ovs, &(xo[0]));
Chris@10 1227 T6J = VSUB(T6j, T6y);
Chris@10 1228 T6K = VBYI(VSUB(T6H, T6G));
Chris@10 1229 ST(&(xo[WS(os, 40)]), VSUB(T6J, T6K), ovs, &(xo[0]));
Chris@10 1230 ST(&(xo[WS(os, 24)]), VADD(T6J, T6K), ovs, &(xo[0]));
Chris@10 1231 }
Chris@10 1232 }
Chris@10 1233 {
Chris@10 1234 V T6X, T7i, T78, T7g, T74, T7f, T7b, T7j, T6W, T77;
Chris@10 1235 T6W = VMUL(LDK(KP707106781), VADD(T6U, T6V));
Chris@10 1236 T6X = VADD(T6T, T6W);
Chris@10 1237 T7i = VSUB(T6T, T6W);
Chris@10 1238 T77 = VMUL(LDK(KP707106781), VSUB(T6V, T6U));
Chris@10 1239 T78 = VADD(T76, T77);
Chris@10 1240 T7g = VSUB(T77, T76);
Chris@10 1241 {
Chris@10 1242 V T70, T73, T79, T7a;
Chris@10 1243 T70 = VFMA(LDK(KP923879532), T6Y, VMUL(LDK(KP382683432), T6Z));
Chris@10 1244 T73 = VFNMS(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T71));
Chris@10 1245 T74 = VADD(T70, T73);
Chris@10 1246 T7f = VSUB(T73, T70);
Chris@10 1247 T79 = VFNMS(LDK(KP382683432), T6Y, VMUL(LDK(KP923879532), T6Z));
Chris@10 1248 T7a = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T72));
Chris@10 1249 T7b = VADD(T79, T7a);
Chris@10 1250 T7j = VSUB(T7a, T79);
Chris@10 1251 }
Chris@10 1252 {
Chris@10 1253 V T75, T7c, T7l, T7m;
Chris@10 1254 T75 = VADD(T6X, T74);
Chris@10 1255 T7c = VBYI(VADD(T78, T7b));
Chris@10 1256 ST(&(xo[WS(os, 60)]), VSUB(T75, T7c), ovs, &(xo[0]));
Chris@10 1257 ST(&(xo[WS(os, 4)]), VADD(T75, T7c), ovs, &(xo[0]));
Chris@10 1258 T7l = VBYI(VADD(T7g, T7f));
Chris@10 1259 T7m = VADD(T7i, T7j);
Chris@10 1260 ST(&(xo[WS(os, 12)]), VADD(T7l, T7m), ovs, &(xo[0]));
Chris@10 1261 ST(&(xo[WS(os, 52)]), VSUB(T7m, T7l), ovs, &(xo[0]));
Chris@10 1262 }
Chris@10 1263 {
Chris@10 1264 V T7d, T7e, T7h, T7k;
Chris@10 1265 T7d = VSUB(T6X, T74);
Chris@10 1266 T7e = VBYI(VSUB(T7b, T78));
Chris@10 1267 ST(&(xo[WS(os, 36)]), VSUB(T7d, T7e), ovs, &(xo[0]));
Chris@10 1268 ST(&(xo[WS(os, 28)]), VADD(T7d, T7e), ovs, &(xo[0]));
Chris@10 1269 T7h = VBYI(VSUB(T7f, T7g));
Chris@10 1270 T7k = VSUB(T7i, T7j);
Chris@10 1271 ST(&(xo[WS(os, 20)]), VADD(T7h, T7k), ovs, &(xo[0]));
Chris@10 1272 ST(&(xo[WS(os, 44)]), VSUB(T7k, T7h), ovs, &(xo[0]));
Chris@10 1273 }
Chris@10 1274 }
Chris@10 1275 {
Chris@10 1276 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@10 1277 {
Chris@10 1278 V T5L, T5M, T5Z, T60;
Chris@10 1279 T5L = VSUB(T4p, T4w);
Chris@10 1280 T5M = VSUB(T5u, T5t);
Chris@10 1281 T5N = VADD(T5L, T5M);
Chris@10 1282 T68 = VSUB(T5L, T5M);
Chris@10 1283 T5Z = VFNMS(LDK(KP555570233), T5O, VMUL(LDK(KP831469612), T5P));
Chris@10 1284 T60 = VFMA(LDK(KP555570233), T5R, VMUL(LDK(KP831469612), T5S));
Chris@10 1285 T61 = VADD(T5Z, T60);
Chris@10 1286 T69 = VSUB(T60, T5Z);
Chris@10 1287 }
Chris@10 1288 {
Chris@10 1289 V T5Q, T5T, T5W, T5X;
Chris@10 1290 T5Q = VFMA(LDK(KP831469612), T5O, VMUL(LDK(KP555570233), T5P));
Chris@10 1291 T5T = VFNMS(LDK(KP555570233), T5S, VMUL(LDK(KP831469612), T5R));
Chris@10 1292 T5U = VADD(T5Q, T5T);
Chris@10 1293 T65 = VSUB(T5T, T5Q);
Chris@10 1294 T5W = VSUB(T5r, T5q);
Chris@10 1295 T5X = VSUB(T4L, T4E);
Chris@10 1296 T5Y = VADD(T5W, T5X);
Chris@10 1297 T66 = VSUB(T5X, T5W);
Chris@10 1298 }
Chris@10 1299 {
Chris@10 1300 V T5V, T62, T6b, T6c;
Chris@10 1301 T5V = VADD(T5N, T5U);
Chris@10 1302 T62 = VBYI(VADD(T5Y, T61));
Chris@10 1303 ST(&(xo[WS(os, 58)]), VSUB(T5V, T62), ovs, &(xo[0]));
Chris@10 1304 ST(&(xo[WS(os, 6)]), VADD(T5V, T62), ovs, &(xo[0]));
Chris@10 1305 T6b = VBYI(VADD(T66, T65));
Chris@10 1306 T6c = VADD(T68, T69);
Chris@10 1307 ST(&(xo[WS(os, 10)]), VADD(T6b, T6c), ovs, &(xo[0]));
Chris@10 1308 ST(&(xo[WS(os, 54)]), VSUB(T6c, T6b), ovs, &(xo[0]));
Chris@10 1309 }
Chris@10 1310 {
Chris@10 1311 V T63, T64, T67, T6a;
Chris@10 1312 T63 = VSUB(T5N, T5U);
Chris@10 1313 T64 = VBYI(VSUB(T61, T5Y));
Chris@10 1314 ST(&(xo[WS(os, 38)]), VSUB(T63, T64), ovs, &(xo[0]));
Chris@10 1315 ST(&(xo[WS(os, 26)]), VADD(T63, T64), ovs, &(xo[0]));
Chris@10 1316 T67 = VBYI(VSUB(T65, T66));
Chris@10 1317 T6a = VSUB(T68, T69);
Chris@10 1318 ST(&(xo[WS(os, 22)]), VADD(T67, T6a), ovs, &(xo[0]));
Chris@10 1319 ST(&(xo[WS(os, 42)]), VSUB(T6a, T67), ovs, &(xo[0]));
Chris@10 1320 }
Chris@10 1321 }
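/* ST(&(xo[WS(os, k)]), v, ovs, hint) stores one SIMD vector of results at
 * output bin k, with WS scaling k by the output stride os; ovs is the
 * stride between the independent transforms packed into each vector. The
 * final pointer argument appears to be an alignment hint: blocks writing
 * even-numbered bins pass &(xo[0]), while blocks writing odd-numbered
 * bins, like the one below, pass &(xo[WS(os, 1)]).
 */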
Chris@10 1322 {
Chris@10 1323 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@10 1324 {
Chris@10 1325 V Tr, T10, T2t, T2u;
Chris@10 1326 Tr = VSUB(Tb, Tq);
Chris@10 1327 T10 = VSUB(TI, TZ);
Chris@10 1328 T11 = VADD(Tr, T10);
Chris@10 1329 T2C = VSUB(Tr, T10);
Chris@10 1330 T2t = VFNMS(LDK(KP634393284), T1B, VMUL(LDK(KP773010453), T1s));
Chris@10 1331 T2u = VFMA(LDK(KP773010453), T2c, VMUL(LDK(KP634393284), T23));
Chris@10 1332 T2v = VADD(T2t, T2u);
Chris@10 1333 T2D = VSUB(T2u, T2t);
Chris@10 1334 }
Chris@10 1335 {
Chris@10 1336 V T1C, T2d, T2o, T2r;
Chris@10 1337 T1C = VFMA(LDK(KP634393284), T1s, VMUL(LDK(KP773010453), T1B));
Chris@10 1338 T2d = VFNMS(LDK(KP634393284), T2c, VMUL(LDK(KP773010453), T23));
Chris@10 1339 T2e = VADD(T1C, T2d);
Chris@10 1340 T2z = VSUB(T2d, T1C);
Chris@10 1341 T2o = VSUB(T2i, T2n);
Chris@10 1342 T2r = VSUB(T2p, T2q);
Chris@10 1343 T2s = VADD(T2o, T2r);
Chris@10 1344 T2A = VSUB(T2r, T2o);
Chris@10 1345 }
Chris@10 1346 {
Chris@10 1347 V T2f, T2w, T2F, T2G;
Chris@10 1348 T2f = VADD(T11, T2e);
Chris@10 1349 T2w = VBYI(VADD(T2s, T2v));
Chris@10 1350 ST(&(xo[WS(os, 57)]), VSUB(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@10 1351 ST(&(xo[WS(os, 7)]), VADD(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@10 1352 T2F = VBYI(VADD(T2A, T2z));
Chris@10 1353 T2G = VADD(T2C, T2D);
Chris@10 1354 ST(&(xo[WS(os, 9)]), VADD(T2F, T2G), ovs, &(xo[WS(os, 1)]));
Chris@10 1355 ST(&(xo[WS(os, 55)]), VSUB(T2G, T2F), ovs, &(xo[WS(os, 1)]));
Chris@10 1356 }
Chris@10 1357 {
Chris@10 1358 V T2x, T2y, T2B, T2E;
Chris@10 1359 T2x = VSUB(T11, T2e);
Chris@10 1360 T2y = VBYI(VSUB(T2v, T2s));
Chris@10 1361 ST(&(xo[WS(os, 39)]), VSUB(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@10 1362 ST(&(xo[WS(os, 25)]), VADD(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@10 1363 T2B = VBYI(VSUB(T2z, T2A));
Chris@10 1364 T2E = VSUB(T2C, T2D);
Chris@10 1365 ST(&(xo[WS(os, 23)]), VADD(T2B, T2E), ovs, &(xo[WS(os, 1)]));
Chris@10 1366 ST(&(xo[WS(os, 41)]), VSUB(T2E, T2B), ovs, &(xo[WS(os, 1)]));
Chris@10 1367 }
Chris@10 1368 }
Chris@10 1369 {
Chris@10 1370 V T3j, T3Q, T3J, T3R, T3y, T3N, T3G, T3O;
Chris@10 1371 {
Chris@10 1372 V T3b, T3i, T3H, T3I;
Chris@10 1373 T3b = VADD(T39, T3a);
Chris@10 1374 T3i = VADD(T3e, T3h);
Chris@10 1375 T3j = VADD(T3b, T3i);
Chris@10 1376 T3Q = VSUB(T3b, T3i);
Chris@10 1377 T3H = VFNMS(LDK(KP290284677), T3m, VMUL(LDK(KP956940335), T3p));
Chris@10 1378 T3I = VFMA(LDK(KP290284677), T3t, VMUL(LDK(KP956940335), T3w));
Chris@10 1379 T3J = VADD(T3H, T3I);
Chris@10 1380 T3R = VSUB(T3I, T3H);
Chris@10 1381 }
Chris@10 1382 {
Chris@10 1383 V T3q, T3x, T3C, T3F;
Chris@10 1384 T3q = VFMA(LDK(KP956940335), T3m, VMUL(LDK(KP290284677), T3p));
Chris@10 1385 T3x = VFNMS(LDK(KP290284677), T3w, VMUL(LDK(KP956940335), T3t));
Chris@10 1386 T3y = VADD(T3q, T3x);
Chris@10 1387 T3N = VSUB(T3x, T3q);
Chris@10 1388 T3C = VADD(T3A, T3B);
Chris@10 1389 T3F = VADD(T3D, T3E);
Chris@10 1390 T3G = VADD(T3C, T3F);
Chris@10 1391 T3O = VSUB(T3F, T3C);
Chris@10 1392 }
Chris@10 1393 {
Chris@10 1394 V T3z, T3K, T3T, T3U;
Chris@10 1395 T3z = VADD(T3j, T3y);
Chris@10 1396 T3K = VBYI(VADD(T3G, T3J));
Chris@10 1397 ST(&(xo[WS(os, 61)]), VSUB(T3z, T3K), ovs, &(xo[WS(os, 1)]));
Chris@10 1398 ST(&(xo[WS(os, 3)]), VADD(T3z, T3K), ovs, &(xo[WS(os, 1)]));
Chris@10 1399 T3T = VBYI(VADD(T3O, T3N));
Chris@10 1400 T3U = VADD(T3Q, T3R);
Chris@10 1401 ST(&(xo[WS(os, 13)]), VADD(T3T, T3U), ovs, &(xo[WS(os, 1)]));
Chris@10 1402 ST(&(xo[WS(os, 51)]), VSUB(T3U, T3T), ovs, &(xo[WS(os, 1)]));
Chris@10 1403 }
Chris@10 1404 {
Chris@10 1405 V T3L, T3M, T3P, T3S;
Chris@10 1406 T3L = VSUB(T3j, T3y);
Chris@10 1407 T3M = VBYI(VSUB(T3J, T3G));
Chris@10 1408 ST(&(xo[WS(os, 35)]), VSUB(T3L, T3M), ovs, &(xo[WS(os, 1)]));
Chris@10 1409 ST(&(xo[WS(os, 29)]), VADD(T3L, T3M), ovs, &(xo[WS(os, 1)]));
Chris@10 1410 T3P = VBYI(VSUB(T3N, T3O));
Chris@10 1411 T3S = VSUB(T3Q, T3R);
Chris@10 1412 ST(&(xo[WS(os, 19)]), VADD(T3P, T3S), ovs, &(xo[WS(os, 1)]));
Chris@10 1413 ST(&(xo[WS(os, 45)]), VSUB(T3S, T3P), ovs, &(xo[WS(os, 1)]));
Chris@10 1414 }
Chris@10 1415 }
Chris@10 1416 {
Chris@10 1417 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@10 1418 {
Chris@10 1419 V T4x, T4M, T5x, T5y;
Chris@10 1420 T4x = VADD(T4p, T4w);
Chris@10 1421 T4M = VADD(T4E, T4L);
Chris@10 1422 T4N = VADD(T4x, T4M);
Chris@10 1423 T5G = VSUB(T4x, T4M);
Chris@10 1424 T5x = VFNMS(LDK(KP195090322), T4Y, VMUL(LDK(KP980785280), T53));
Chris@10 1425 T5y = VFMA(LDK(KP195090322), T5f, VMUL(LDK(KP980785280), T5k));
Chris@10 1426 T5z = VADD(T5x, T5y);
Chris@10 1427 T5H = VSUB(T5y, T5x);
Chris@10 1428 }
Chris@10 1429 {
Chris@10 1430 V T54, T5l, T5s, T5v;
Chris@10 1431 T54 = VFMA(LDK(KP980785280), T4Y, VMUL(LDK(KP195090322), T53));
Chris@10 1432 T5l = VFNMS(LDK(KP195090322), T5k, VMUL(LDK(KP980785280), T5f));
Chris@10 1433 T5m = VADD(T54, T5l);
Chris@10 1434 T5D = VSUB(T5l, T54);
Chris@10 1435 T5s = VADD(T5q, T5r);
Chris@10 1436 T5v = VADD(T5t, T5u);
Chris@10 1437 T5w = VADD(T5s, T5v);
Chris@10 1438 T5E = VSUB(T5v, T5s);
Chris@10 1439 }
Chris@10 1440 {
Chris@10 1441 V T5n, T5A, T5J, T5K;
Chris@10 1442 T5n = VADD(T4N, T5m);
Chris@10 1443 T5A = VBYI(VADD(T5w, T5z));
Chris@10 1444 ST(&(xo[WS(os, 62)]), VSUB(T5n, T5A), ovs, &(xo[0]));
Chris@10 1445 ST(&(xo[WS(os, 2)]), VADD(T5n, T5A), ovs, &(xo[0]));
Chris@10 1446 T5J = VBYI(VADD(T5E, T5D));
Chris@10 1447 T5K = VADD(T5G, T5H);
Chris@10 1448 ST(&(xo[WS(os, 14)]), VADD(T5J, T5K), ovs, &(xo[0]));
Chris@10 1449 ST(&(xo[WS(os, 50)]), VSUB(T5K, T5J), ovs, &(xo[0]));
Chris@10 1450 }
Chris@10 1451 {
Chris@10 1452 V T5B, T5C, T5F, T5I;
Chris@10 1453 T5B = VSUB(T4N, T5m);
Chris@10 1454 T5C = VBYI(VSUB(T5z, T5w));
Chris@10 1455 ST(&(xo[WS(os, 34)]), VSUB(T5B, T5C), ovs, &(xo[0]));
Chris@10 1456 ST(&(xo[WS(os, 30)]), VADD(T5B, T5C), ovs, &(xo[0]));
Chris@10 1457 T5F = VBYI(VSUB(T5D, T5E));
Chris@10 1458 T5I = VSUB(T5G, T5H);
Chris@10 1459 ST(&(xo[WS(os, 18)]), VADD(T5F, T5I), ovs, &(xo[0]));
Chris@10 1460 ST(&(xo[WS(os, 46)]), VSUB(T5I, T5F), ovs, &(xo[0]));
Chris@10 1461 }
Chris@10 1462 }
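/* The KP constants used throughout are cos/sin pairs for the size-64
 * twiddle angles -- e.g. KP980785280/KP195090322 = cos(pi/16)/sin(pi/16)
 * and KP995184726/KP098017140 = cos(pi/32)/sin(pi/32) -- with each name
 * encoding the constant's leading decimal digits, as fixed by the DVK()
 * declarations at the top of the function.
 */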
Chris@10 1463 {
Chris@10 1464 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@10 1465 {
Chris@10 1466 V T2H, T2I, T2V, T2W;
Chris@10 1467 T2H = VADD(Tb, Tq);
Chris@10 1468 T2I = VADD(T2q, T2p);
Chris@10 1469 T2J = VADD(T2H, T2I);
Chris@10 1470 T34 = VSUB(T2H, T2I);
Chris@10 1471 T2V = VFNMS(LDK(KP098017140), T2L, VMUL(LDK(KP995184726), T2K));
Chris@10 1472 T2W = VFMA(LDK(KP995184726), T2O, VMUL(LDK(KP098017140), T2N));
Chris@10 1473 T2X = VADD(T2V, T2W);
Chris@10 1474 T35 = VSUB(T2W, T2V);
Chris@10 1475 }
Chris@10 1476 {
Chris@10 1477 V T2M, T2P, T2S, T2T;
Chris@10 1478 T2M = VFMA(LDK(KP098017140), T2K, VMUL(LDK(KP995184726), T2L));
Chris@10 1479 T2P = VFNMS(LDK(KP098017140), T2O, VMUL(LDK(KP995184726), T2N));
Chris@10 1480 T2Q = VADD(T2M, T2P);
Chris@10 1481 T31 = VSUB(T2P, T2M);
Chris@10 1482 T2S = VADD(T2n, T2i);
Chris@10 1483 T2T = VADD(TZ, TI);
Chris@10 1484 T2U = VADD(T2S, T2T);
Chris@10 1485 T32 = VSUB(T2T, T2S);
Chris@10 1486 }
Chris@10 1487 {
Chris@10 1488 V T2R, T2Y, T37, T38;
Chris@10 1489 T2R = VADD(T2J, T2Q);
Chris@10 1490 T2Y = VBYI(VADD(T2U, T2X));
Chris@10 1491 ST(&(xo[WS(os, 63)]), VSUB(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@10 1492 ST(&(xo[WS(os, 1)]), VADD(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@10 1493 T37 = VBYI(VADD(T32, T31));
Chris@10 1494 T38 = VADD(T34, T35);
Chris@10 1495 ST(&(xo[WS(os, 15)]), VADD(T37, T38), ovs, &(xo[WS(os, 1)]));
Chris@10 1496 ST(&(xo[WS(os, 49)]), VSUB(T38, T37), ovs, &(xo[WS(os, 1)]));
Chris@10 1497 }
Chris@10 1498 {
Chris@10 1499 V T2Z, T30, T33, T36;
Chris@10 1500 T2Z = VSUB(T2J, T2Q);
Chris@10 1501 T30 = VBYI(VSUB(T2X, T2U));
Chris@10 1502 ST(&(xo[WS(os, 33)]), VSUB(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@10 1503 ST(&(xo[WS(os, 31)]), VADD(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@10 1504 T33 = VBYI(VSUB(T31, T32));
Chris@10 1505 T36 = VSUB(T34, T35);
Chris@10 1506 ST(&(xo[WS(os, 17)]), VADD(T33, T36), ovs, &(xo[WS(os, 1)]));
Chris@10 1507 ST(&(xo[WS(os, 47)]), VSUB(T36, T33), ovs, &(xo[WS(os, 1)]));
Chris@10 1508 }
Chris@10 1509 }
Chris@10 1510 {
Chris@10 1511 V T3X, T4i, T4b, T4j, T44, T4f, T48, T4g;
Chris@10 1512 {
Chris@10 1513 V T3V, T3W, T49, T4a;
Chris@10 1514 T3V = VSUB(T39, T3a);
Chris@10 1515 T3W = VSUB(T3E, T3D);
Chris@10 1516 T3X = VADD(T3V, T3W);
Chris@10 1517 T4i = VSUB(T3V, T3W);
Chris@10 1518 T49 = VFNMS(LDK(KP471396736), T3Y, VMUL(LDK(KP881921264), T3Z));
Chris@10 1519 T4a = VFMA(LDK(KP471396736), T41, VMUL(LDK(KP881921264), T42));
Chris@10 1520 T4b = VADD(T49, T4a);
Chris@10 1521 T4j = VSUB(T4a, T49);
Chris@10 1522 }
Chris@10 1523 {
Chris@10 1524 V T40, T43, T46, T47;
Chris@10 1525 T40 = VFMA(LDK(KP881921264), T3Y, VMUL(LDK(KP471396736), T3Z));
Chris@10 1526 T43 = VFNMS(LDK(KP471396736), T42, VMUL(LDK(KP881921264), T41));
Chris@10 1527 T44 = VADD(T40, T43);
Chris@10 1528 T4f = VSUB(T43, T40);
Chris@10 1529 T46 = VSUB(T3B, T3A);
Chris@10 1530 T47 = VSUB(T3h, T3e);
Chris@10 1531 T48 = VADD(T46, T47);
Chris@10 1532 T4g = VSUB(T47, T46);
Chris@10 1533 }
Chris@10 1534 {
Chris@10 1535 V T45, T4c, T4l, T4m;
Chris@10 1536 T45 = VADD(T3X, T44);
Chris@10 1537 T4c = VBYI(VADD(T48, T4b));
Chris@10 1538 ST(&(xo[WS(os, 59)]), VSUB(T45, T4c), ovs, &(xo[WS(os, 1)]));
Chris@10 1539 ST(&(xo[WS(os, 5)]), VADD(T45, T4c), ovs, &(xo[WS(os, 1)]));
Chris@10 1540 T4l = VBYI(VADD(T4g, T4f));
Chris@10 1541 T4m = VADD(T4i, T4j);
Chris@10 1542 ST(&(xo[WS(os, 11)]), VADD(T4l, T4m), ovs, &(xo[WS(os, 1)]));
Chris@10 1543 ST(&(xo[WS(os, 53)]), VSUB(T4m, T4l), ovs, &(xo[WS(os, 1)]));
Chris@10 1544 }
Chris@10 1545 {
Chris@10 1546 V T4d, T4e, T4h, T4k;
Chris@10 1547 T4d = VSUB(T3X, T44);
Chris@10 1548 T4e = VBYI(VSUB(T4b, T48));
Chris@10 1549 ST(&(xo[WS(os, 37)]), VSUB(T4d, T4e), ovs, &(xo[WS(os, 1)]));
Chris@10 1550 ST(&(xo[WS(os, 27)]), VADD(T4d, T4e), ovs, &(xo[WS(os, 1)]));
Chris@10 1551 T4h = VBYI(VSUB(T4f, T4g));
Chris@10 1552 T4k = VSUB(T4i, T4j);
Chris@10 1553 ST(&(xo[WS(os, 21)]), VADD(T4h, T4k), ovs, &(xo[WS(os, 1)]));
Chris@10 1554 ST(&(xo[WS(os, 43)]), VSUB(T4k, T4h), ovs, &(xo[WS(os, 1)]));
Chris@10 1555 }
Chris@10 1556 }
Chris@10 1557 }
Chris@10 1558 }
Chris@10 1559 VLEAVE();
Chris@10 1560 }
Chris@10 1561
Chris@10 1562 static const kdft_desc desc = { 64, XSIMD_STRING("n1fv_64"), {404, 72, 52, 0}, &GENUS, 0, 0, 0, 0 };
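/* Reading the descriptor above per FFTW's kdft_desc layout: transform
 * size 64, codelet name "n1fv_64", and an operation count of 404
 * additions, 72 multiplications, 52 fused multiply-adds, and 0 other
 * ops, followed by the genus pointer and four trailing fields that
 * appear to be stride constraints (zero meaning unconstrained).
 */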
Chris@10 1563
Chris@10 1564 void XSIMD(codelet_n1fv_64) (planner *p) {
Chris@10 1565 X(kdft_register) (p, n1fv_64, &desc);
Chris@10 1566 }
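/* XSIMD() decorates the symbol with the SIMD variant being compiled, and
 * X(kdft_register) adds the codelet to the planner's table, so the
 * planner can select n1fv_64 whenever a size-64 complex DFT of this
 * genus is requested.
 */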
Chris@10 1567
Chris@10 1568 #endif /* HAVE_FMA */