annotate src/fftw-3.3.5/dft/simd/common/n2fv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:20 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@42 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@42 33 * 178 stack variables, 15 constants, and 160 memory accesses
Chris@42 34 */
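/*
 * Reading the generator line above: gen_notw_c emits a "no-twiddle" complex DFT
 * codelet; "-n 64 -name n2fv_64" fixes the transform size and name, "-fma -simd"
 * selects fused multiply-add and vector primitives, and "-with-ostride 2
 * -store-multiple 2" appears to account for the paired STM2/STN2 output stores
 * used below.
 */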
Chris@42 35 #include "n2f.h"
Chris@42 36
Chris@42 37 static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@42 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@42 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@42 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@42 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 50 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 51 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
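/* These are the trigonometric constants needed by the size-64 butterflies:
   KP707106781 = cos(pi/4), KP923879532 = cos(pi/8), and so on, together with
   tangent-form values such as KP414213562 = tan(pi/8) that the FMA
   restructuring appears to prefer. */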
Chris@42 54 {
Chris@42 55 INT i;
Chris@42 56 const R *xi;
Chris@42 57 R *xo;
Chris@42 58 xi = ri;
Chris@42 59 xo = ro;
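/* The loop below handles VL transforms per iteration (one per SIMD lane),
   advancing the input by VL * ivs and the output by VL * ovs on each pass;
   MAKE_VOLATILE_STRIDE is presumably there to keep the compiler from
   optimising the stride expressions too aggressively. */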
Chris@42 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 61 V T7r, T7s, T7t, T7u, T5T, T5S, T5X, T65, T8a, T8b, T8e, T8g, T5Z, T5R, T67;
Chris@42 62 V T63, T5U, T64;
Chris@42 63 {
Chris@42 64 V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
Chris@42 65 V Tm, T3A, T3l, T2a, TC, T5p, T4o, T6E, T6e, T3i, T3B, TR, T29, T4x, T5q;
Chris@42 66 V T6h, T6D, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
Chris@42 67 V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
Chris@42 68 V T6m, T6Y, T5L, T4T;
Chris@42 69 {
Chris@42 70 V T4g, T4l, T3j, Tu, Tx, T4h, TA, T4i;
Chris@42 71 {
Chris@42 72 V T1, T2, T23, T24, T4, T5, T20, T21;
Chris@42 73 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 74 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 75 T23 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 76 T24 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 77 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 78 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 79 T20 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 80 T21 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 81 {
Chris@42 82 V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
Chris@42 83 {
Chris@42 84 V T8, T43, T3, T44, T25, T5i, T6, T45, T22, T9, Ti, Tj, Tb, Tc;
Chris@42 85 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 86 T43 = VSUB(T1, T2);
Chris@42 87 T3 = VADD(T1, T2);
Chris@42 88 T44 = VSUB(T23, T24);
Chris@42 89 T25 = VADD(T23, T24);
Chris@42 90 T5i = VSUB(T4, T5);
Chris@42 91 T6 = VADD(T4, T5);
Chris@42 92 T45 = VSUB(T20, T21);
Chris@42 93 T22 = VADD(T20, T21);
Chris@42 94 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 95 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 96 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 97 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 98 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 99 {
Chris@42 100 V T2T, T46, T5j, T2U;
Chris@42 101 T7 = VSUB(T3, T6);
Chris@42 102 T2T = VADD(T3, T6);
Chris@42 103 T46 = VADD(T44, T45);
Chris@42 104 T5j = VSUB(T45, T44);
Chris@42 105 T26 = VSUB(T22, T25);
Chris@42 106 T2U = VADD(T25, T22);
Chris@42 107 Ta = VADD(T8, T9);
Chris@42 108 T48 = VSUB(T8, T9);
Chris@42 109 Tk = VADD(Ti, Tj);
Chris@42 110 T4c = VSUB(Tj, Ti);
Chris@42 111 T5k = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@42 112 T6A = VFMA(LDK(KP707106781), T5j, T5i);
Chris@42 113 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@42 114 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@42 115 T2V = VADD(T2T, T2U);
Chris@42 116 T3z = VSUB(T2T, T2U);
Chris@42 117 T49 = VSUB(Tb, Tc);
Chris@42 118 Td = VADD(Tb, Tc);
Chris@42 119 }
Chris@42 120 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 121 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 122 }
Chris@42 123 {
Chris@42 124 V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
Chris@42 125 V Tp;
Chris@42 126 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 127 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 128 {
Chris@42 129 V Th, T4b, Tr, Ts;
Chris@42 130 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 131 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 132 Te = VSUB(Ta, Td);
Chris@42 133 T2W = VADD(Ta, Td);
Chris@42 134 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@42 135 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@42 136 Th = VADD(Tf, Tg);
Chris@42 137 T4b = VSUB(Tf, Tg);
Chris@42 138 Tq = VADD(To, Tp);
Chris@42 139 T4g = VSUB(To, Tp);
Chris@42 140 T4l = VSUB(Tr, Ts);
Chris@42 141 Tt = VADD(Tr, Ts);
Chris@42 142 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 143 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 144 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@42 145 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@42 146 Tl = VSUB(Th, Tk);
Chris@42 147 T2X = VADD(Th, Tk);
Chris@42 148 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 149 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 150 }
Chris@42 151 T3j = VADD(Tq, Tt);
Chris@42 152 Tu = VSUB(Tq, Tt);
Chris@42 153 Tx = VADD(Tv, Tw);
Chris@42 154 T4h = VSUB(Tv, Tw);
Chris@42 155 T6B = VSUB(T4d, T4a);
Chris@42 156 T4e = VADD(T4a, T4d);
Chris@42 157 T6a = VADD(T5l, T5m);
Chris@42 158 T5n = VSUB(T5l, T5m);
Chris@42 159 T3M = VSUB(T2X, T2W);
Chris@42 160 T2Y = VADD(T2W, T2X);
Chris@42 161 T27 = VSUB(Tl, Te);
Chris@42 162 Tm = VADD(Te, Tl);
Chris@42 163 TA = VADD(Ty, Tz);
Chris@42 164 T4i = VSUB(Ty, Tz);
Chris@42 165 }
Chris@42 166 }
Chris@42 167 }
Chris@42 168 {
Chris@42 169 V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3g, TJ, TF, TI;
Chris@42 170 {
Chris@42 171 V TD, TE, TG, TH;
Chris@42 172 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 173 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 174 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 175 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 176 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 177 {
Chris@42 178 V T3k, TB, T4j, T4m;
Chris@42 179 T3k = VADD(Tx, TA);
Chris@42 180 TB = VSUB(Tx, TA);
Chris@42 181 T4j = VADD(T4h, T4i);
Chris@42 182 T4m = VSUB(T4h, T4i);
Chris@42 183 T4p = VSUB(TD, TE);
Chris@42 184 TF = VADD(TD, TE);
Chris@42 185 T4u = VSUB(TH, TG);
Chris@42 186 TI = VADD(TG, TH);
Chris@42 187 T3A = VSUB(T3j, T3k);
Chris@42 188 T3l = VADD(T3j, T3k);
Chris@42 189 T2a = VFMA(LDK(KP414213562), Tu, TB);
Chris@42 190 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@42 191 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@42 192 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@42 193 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@42 194 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@42 195 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 196 }
Chris@42 197 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 198 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 199 }
Chris@42 200 T3g = VADD(TF, TI);
Chris@42 201 TJ = VSUB(TF, TI);
Chris@42 202 {
Chris@42 203 V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
Chris@42 204 {
Chris@42 205 V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
Chris@42 206 {
Chris@42 207 V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
Chris@42 208 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 209 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@42 210 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@42 211 T6E = VFMA(LDK(KP668178637), T6c, T6d);
Chris@42 212 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@42 213 TM = VADD(TK, TL);
Chris@42 214 T4r = VSUB(TK, TL);
Chris@42 215 TP = VADD(TN, TO);
Chris@42 216 T4q = VSUB(TN, TO);
Chris@42 217 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 219 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 220 {
Chris@42 221 V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
Chris@42 222 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 223 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 224 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 225 {
Chris@42 226 V T3h, TQ, T4s, T4v;
Chris@42 227 T3h = VADD(TP, TM);
Chris@42 228 TQ = VSUB(TM, TP);
Chris@42 229 T4s = VADD(T4q, T4r);
Chris@42 230 T4v = VSUB(T4r, T4q);
Chris@42 231 T4V = VSUB(T1r, T1s);
Chris@42 232 T1t = VADD(T1r, T1s);
Chris@42 233 T58 = VSUB(T1v, T1u);
Chris@42 234 T1w = VADD(T1u, T1v);
Chris@42 235 T4X = VSUB(T1O, T1P);
Chris@42 236 T1Q = VADD(T1O, T1P);
Chris@42 237 T3i = VADD(T3g, T3h);
Chris@42 238 T3B = VSUB(T3g, T3h);
Chris@42 239 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@42 240 T29 = VFMA(LDK(KP414213562), TJ, TQ);
Chris@42 241 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@42 242 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@42 243 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@42 244 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@42 245 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 246 }
Chris@42 247 {
Chris@42 248 V T4W, T1A, T50, T51, T1D, T1F, T1G;
Chris@42 249 {
Chris@42 250 V T1y, T1z, T1B, T1C;
Chris@42 251 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 252 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 253 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 254 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 255 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@42 256 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@42 257 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@42 258 T6D = VFMA(LDK(KP668178637), T6f, T6g);
Chris@42 259 T4W = VSUB(T1R, T1S);
Chris@42 260 T1T = VADD(T1R, T1S);
Chris@42 261 T1A = VADD(T1y, T1z);
Chris@42 262 T50 = VSUB(T1y, T1z);
Chris@42 263 T51 = VSUB(T1C, T1B);
Chris@42 264 T1D = VADD(T1B, T1C);
Chris@42 265 }
Chris@42 266 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 267 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 268 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 269 T4Y = VADD(T4W, T4X);
Chris@42 270 T59 = VSUB(T4X, T4W);
Chris@42 271 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 272 T3a = VADD(T1A, T1D);
Chris@42 273 T1E = VSUB(T1A, T1D);
Chris@42 274 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@42 275 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@42 276 T53 = VSUB(T1F, T1G);
Chris@42 277 T1H = VADD(T1F, T1G);
Chris@42 278 }
Chris@42 279 }
Chris@42 280 }
Chris@42 281 {
Chris@42 282 V T37, T54, T1K, T38;
Chris@42 283 T1x = VSUB(T1t, T1w);
Chris@42 284 T37 = VADD(T1t, T1w);
Chris@42 285 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@42 286 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@42 287 T54 = VSUB(T1J, T1I);
Chris@42 288 T1K = VADD(T1I, T1J);
Chris@42 289 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@42 290 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@42 291 T38 = VADD(T1T, T1Q);
Chris@42 292 T1U = VSUB(T1Q, T1T);
Chris@42 293 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@42 294 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@42 295 T1L = VSUB(T1H, T1K);
Chris@42 296 T3b = VADD(T1H, T1K);
Chris@42 297 T39 = VADD(T37, T38);
Chris@42 298 T3H = VSUB(T37, T38);
Chris@42 299 }
Chris@42 300 }
Chris@42 301 {
Chris@42 302 V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
Chris@42 303 V T1d;
Chris@42 304 {
Chris@42 305 V TU, TV, TX, TY, T56, T6v;
Chris@42 306 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 307 T56 = VADD(T52, T55);
Chris@42 308 T6v = VSUB(T55, T52);
Chris@42 309 {
Chris@42 310 V T5d, T6s, T1V, T1M;
Chris@42 311 T5d = VADD(T5b, T5c);
Chris@42 312 T6s = VSUB(T5c, T5b);
Chris@42 313 T1V = VSUB(T1L, T1E);
Chris@42 314 T1M = VADD(T1E, T1L);
Chris@42 315 T3I = VSUB(T3b, T3a);
Chris@42 316 T3c = VADD(T3a, T3b);
Chris@42 317 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@42 318 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@42 319 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@42 320 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@42 321 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@42 322 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@42 323 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@42 324 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@42 325 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@42 326 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@42 327 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@42 328 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@42 329 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 330 }
Chris@42 331 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 332 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 333 {
Chris@42 334 V T1h, T1i, T1k, T1l;
Chris@42 335 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 336 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 337 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 338 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 339 {
Chris@42 340 V T11, T4B, T4C, T12, T14, T15;
Chris@42 341 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 342 T4A = VSUB(TU, TV);
Chris@42 343 TW = VADD(TU, TV);
Chris@42 344 T4N = VSUB(TX, TY);
Chris@42 345 TZ = VADD(TX, TY);
Chris@42 346 T1j = VADD(T1h, T1i);
Chris@42 347 T4B = VSUB(T1h, T1i);
Chris@42 348 T1m = VADD(T1k, T1l);
Chris@42 349 T4C = VSUB(T1k, T1l);
Chris@42 350 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 351 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 352 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 353 {
Chris@42 354 V T18, T19, T1b, T1c;
Chris@42 355 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 356 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 357 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 358 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 359 T4O = VSUB(T4B, T4C);
Chris@42 360 T4D = VADD(T4B, T4C);
Chris@42 361 T13 = VADD(T11, T12);
Chris@42 362 T4F = VSUB(T11, T12);
Chris@42 363 T16 = VADD(T14, T15);
Chris@42 364 T4G = VSUB(T14, T15);
Chris@42 365 T1a = VADD(T18, T19);
Chris@42 366 T4I = VSUB(T18, T19);
Chris@42 367 T4J = VSUB(T1b, T1c);
Chris@42 368 T1d = VADD(T1b, T1c);
Chris@42 369 }
Chris@42 370 }
Chris@42 371 }
Chris@42 372 }
Chris@42 373 {
Chris@42 374 V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
Chris@42 375 T30 = VADD(TW, TZ);
Chris@42 376 T10 = VSUB(TW, TZ);
Chris@42 377 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@42 378 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@42 379 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@42 380 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@42 381 T33 = VADD(T13, T16);
Chris@42 382 T17 = VSUB(T13, T16);
Chris@42 383 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@42 384 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@42 385 T34 = VADD(T1a, T1d);
Chris@42 386 T1e = VSUB(T1a, T1d);
Chris@42 387 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@42 388 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@42 389 T1n = VSUB(T1j, T1m);
Chris@42 390 T31 = VADD(T1j, T1m);
Chris@42 391 {
Chris@42 392 V T1f, T1o, T6o, T4L, T4S, T6l;
Chris@42 393 T1f = VADD(T17, T1e);
Chris@42 394 T1o = VSUB(T17, T1e);
Chris@42 395 T6o = VSUB(T4H, T4K);
Chris@42 396 T4L = VADD(T4H, T4K);
Chris@42 397 T4S = VADD(T4Q, T4R);
Chris@42 398 T6l = VSUB(T4Q, T4R);
Chris@42 399 T3E = VSUB(T30, T31);
Chris@42 400 T32 = VADD(T30, T31);
Chris@42 401 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@42 402 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@42 403 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@42 404 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@42 405 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@42 406 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@42 407 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@42 408 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@42 409 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@42 410 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@42 411 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@42 412 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@42 413 }
Chris@42 414 }
Chris@42 415 }
Chris@42 416 }
Chris@42 417 }
Chris@42 418 }
Chris@42 419 {
Chris@42 420 V T6b, T6F, T7n, T7o, T7p, T7q, T7v, T7w, T7x, T7y, T7z, T7A, T7B, T7C, T7f;
Chris@42 421 V T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
Chris@42 422 {
Chris@42 423 V T2Z, T3r, T3s, T3m, T3d, T3v;
Chris@42 424 T2Z = VSUB(T2V, T2Y);
Chris@42 425 T3r = VADD(T2V, T2Y);
Chris@42 426 T3s = VADD(T3l, T3i);
Chris@42 427 T3m = VSUB(T3i, T3l);
Chris@42 428 T3d = VSUB(T39, T3c);
Chris@42 429 T3v = VADD(T39, T3c);
Chris@42 430 {
Chris@42 431 V T3x, T3t, T3P, T3J, T3D, T3V, T3Q, T3G, T36, T3u, T3Y, T3O, T6V, T6W;
Chris@42 432 {
Chris@42 433 V T3N, T3C, T3F, T35;
Chris@42 434 T3N = VSUB(T3B, T3A);
Chris@42 435 T3C = VADD(T3A, T3B);
Chris@42 436 T3F = VSUB(T33, T34);
Chris@42 437 T35 = VADD(T33, T34);
Chris@42 438 T3x = VSUB(T3r, T3s);
Chris@42 439 T3t = VADD(T3r, T3s);
Chris@42 440 T3P = VFMA(LDK(KP414213562), T3H, T3I);
Chris@42 441 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@42 442 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@42 443 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@42 444 T3Q = VFMA(LDK(KP414213562), T3E, T3F);
Chris@42 445 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@42 446 T36 = VSUB(T32, T35);
Chris@42 447 T3u = VADD(T32, T35);
Chris@42 448 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@42 449 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@42 450 }
Chris@42 451 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@42 452 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@42 453 T6W = VADD(T6E, T6D);
Chris@42 454 T6F = VSUB(T6D, T6E);
Chris@42 455 {
Chris@42 456 V T3K, T3Z, T3e, T3n;
Chris@42 457 T3K = VADD(T3G, T3J);
Chris@42 458 T3Z = VSUB(T3J, T3G);
Chris@42 459 T3e = VADD(T36, T3d);
Chris@42 460 T3n = VSUB(T3d, T36);
Chris@42 461 {
Chris@42 462 V T3w, T3y, T3R, T3W;
Chris@42 463 T3w = VADD(T3u, T3v);
Chris@42 464 T3y = VSUB(T3v, T3u);
Chris@42 465 T3R = VSUB(T3P, T3Q);
Chris@42 466 T3W = VADD(T3Q, T3P);
Chris@42 467 {
Chris@42 468 V T42, T40, T3L, T3T;
Chris@42 469 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@42 470 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@42 471 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@42 472 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@42 473 {
Chris@42 474 V T3o, T3q, T3f, T3p;
Chris@42 475 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@42 476 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@42 477 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@42 478 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@42 479 T7n = VFNMSI(T3y, T3x);
Chris@42 480 STM2(&(xo[96]), T7n, ovs, &(xo[0]));
Chris@42 481 T7o = VFMAI(T3y, T3x);
Chris@42 482 STM2(&(xo[32]), T7o, ovs, &(xo[0]));
Chris@42 483 T7p = VADD(T3t, T3w);
Chris@42 484 STM2(&(xo[0]), T7p, ovs, &(xo[0]));
Chris@42 485 T7q = VSUB(T3t, T3w);
Chris@42 486 STM2(&(xo[64]), T7q, ovs, &(xo[0]));
Chris@42 487 {
Chris@42 488 V T41, T3X, T3S, T3U;
Chris@42 489 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@42 490 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@42 491 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@42 492 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@42 493 T7r = VFMAI(T3q, T3p);
Chris@42 494 STM2(&(xo[16]), T7r, ovs, &(xo[0]));
Chris@42 495 T7s = VFNMSI(T3q, T3p);
Chris@42 496 STM2(&(xo[112]), T7s, ovs, &(xo[0]));
Chris@42 497 T7t = VFMAI(T3o, T3f);
Chris@42 498 STM2(&(xo[80]), T7t, ovs, &(xo[0]));
Chris@42 499 T7u = VFNMSI(T3o, T3f);
Chris@42 500 STM2(&(xo[48]), T7u, ovs, &(xo[0]));
Chris@42 501 T7v = VFNMSI(T40, T3X);
Chris@42 502 STM2(&(xo[88]), T7v, ovs, &(xo[0]));
Chris@42 503 T7w = VFMAI(T40, T3X);
Chris@42 504 STM2(&(xo[40]), T7w, ovs, &(xo[0]));
Chris@42 505 T7x = VFMAI(T42, T41);
Chris@42 506 STM2(&(xo[104]), T7x, ovs, &(xo[0]));
Chris@42 507 T7y = VFNMSI(T42, T41);
Chris@42 508 STM2(&(xo[24]), T7y, ovs, &(xo[0]));
Chris@42 509 T7z = VFMAI(T3U, T3T);
Chris@42 510 STM2(&(xo[8]), T7z, ovs, &(xo[0]));
Chris@42 511 T7A = VFNMSI(T3U, T3T);
Chris@42 512 STM2(&(xo[120]), T7A, ovs, &(xo[0]));
Chris@42 513 T7B = VFMAI(T3S, T3L);
Chris@42 514 STM2(&(xo[72]), T7B, ovs, &(xo[0]));
Chris@42 515 T7C = VFNMSI(T3S, T3L);
Chris@42 516 STM2(&(xo[56]), T7C, ovs, &(xo[0]));
Chris@42 517 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@42 518 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@42 519 }
Chris@42 520 }
Chris@42 521 }
Chris@42 522 }
Chris@42 523 }
Chris@42 524 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@42 525 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@42 526 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@42 527 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@42 528 T6C = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@42 529 T76 = VFMA(LDK(KP923879532), T6B, T6A);
Chris@42 530 T77 = VSUB(T6e, T6h);
Chris@42 531 T6i = VADD(T6e, T6h);
Chris@42 532 }
Chris@42 533 }
Chris@42 534 {
Chris@42 535 V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T7L, T7O, T7Q, T7S, T5r, T5I, T5x;
Chris@42 536 V T5h, T5F, T5B;
Chris@42 537 {
Chris@42 538 V TT, T2f, T7E, T7F, T7H, T7J, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
Chris@42 539 {
Chris@42 540 V T1X, T2d, T7h, T7l, T2e, T1q, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
Chris@42 541 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@42 542 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@42 543 TS = VADD(TC, TR);
Chris@42 544 T2D = VSUB(TR, TC);
Chris@42 545 {
Chris@42 546 V T7b, T7j, T74, T7i, T78, T7g;
Chris@42 547 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@42 548 T2d = VFMA(LDK(KP198912367), T1N, T1W);
Chris@42 549 T7g = VADD(T79, T7a);
Chris@42 550 T7b = VSUB(T79, T7a);
Chris@42 551 T7j = VSUB(T73, T70);
Chris@42 552 T74 = VADD(T70, T73);
Chris@42 553 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@42 554 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@42 555 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@42 556 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@42 557 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@42 558 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@42 559 T2e = VFMA(LDK(KP198912367), T1g, T1p);
Chris@42 560 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@42 561 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@42 562 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@42 563 T7m = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@42 564 T7k = VFMA(LDK(KP956940335), T7j, T7i);
Chris@42 565 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@42 566 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@42 567 }
Chris@42 568 T2k = VADD(T2e, T2d);
Chris@42 569 T2f = VSUB(T2d, T2e);
Chris@42 570 {
Chris@42 571 V T7D, T7G, T7I, T7K;
Chris@42 572 T7D = VFNMSI(T7k, T7h);
Chris@42 573 STM2(&(xo[90]), T7D, ovs, &(xo[2]));
Chris@42 574 STN2(&(xo[88]), T7v, T7D, ovs);
Chris@42 575 T7E = VFMAI(T7k, T7h);
Chris@42 576 STM2(&(xo[38]), T7E, ovs, &(xo[2]));
Chris@42 577 T7F = VFMAI(T7m, T7l);
Chris@42 578 STM2(&(xo[102]), T7F, ovs, &(xo[2]));
Chris@42 579 T7G = VFNMSI(T7m, T7l);
Chris@42 580 STM2(&(xo[26]), T7G, ovs, &(xo[2]));
Chris@42 581 STN2(&(xo[24]), T7y, T7G, ovs);
Chris@42 582 T7H = VFMAI(T7e, T7d);
Chris@42 583 STM2(&(xo[6]), T7H, ovs, &(xo[2]));
Chris@42 584 T7I = VFNMSI(T7e, T7d);
Chris@42 585 STM2(&(xo[122]), T7I, ovs, &(xo[2]));
Chris@42 586 STN2(&(xo[120]), T7A, T7I, ovs);
Chris@42 587 T7J = VFMAI(T7c, T75);
Chris@42 588 STM2(&(xo[70]), T7J, ovs, &(xo[2]));
Chris@42 589 T7K = VFNMSI(T7c, T75);
Chris@42 590 STM2(&(xo[58]), T7K, ovs, &(xo[2]));
Chris@42 591 STN2(&(xo[56]), T7C, T7K, ovs);
Chris@42 592 T2n = VSUB(T1X, T1q);
Chris@42 593 T1Y = VADD(T1q, T1X);
Chris@42 594 }
Chris@42 595 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@42 596 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@42 597 T2b = VSUB(T29, T2a);
Chris@42 598 T2s = VADD(T2a, T29);
Chris@42 599 }
Chris@42 600 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@42 601 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@42 602 {
Chris@42 603 V T5z, T4z, T5A, T5g;
Chris@42 604 {
Chris@42 605 V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
Chris@42 606 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@42 607 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@42 608 T4y = VADD(T4o, T4x);
Chris@42 609 T5T = VSUB(T4x, T4o);
Chris@42 610 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@42 611 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@42 612 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@42 613 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@42 614 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@42 615 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@42 616 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@42 617 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@42 618 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@42 619 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@42 620 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@42 621 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@42 622 {
Chris@42 623 V T2o, T2q, T2i, T2g;
Chris@42 624 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@42 625 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@42 626 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@42 627 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@42 628 T5A = VADD(T5t, T5u);
Chris@42 629 T5v = VSUB(T5t, T5u);
Chris@42 630 T5D = VSUB(T5f, T4U);
Chris@42 631 T5g = VADD(T4U, T5f);
Chris@42 632 T7L = VFNMSI(T2o, T2l);
Chris@42 633 STM2(&(xo[92]), T7L, ovs, &(xo[0]));
Chris@42 634 {
Chris@42 635 V T7M, T7N, T7P, T7R;
Chris@42 636 T7M = VFMAI(T2o, T2l);
Chris@42 637 STM2(&(xo[36]), T7M, ovs, &(xo[0]));
Chris@42 638 STN2(&(xo[36]), T7M, T7E, ovs);
Chris@42 639 T7N = VFMAI(T2q, T2p);
Chris@42 640 STM2(&(xo[100]), T7N, ovs, &(xo[0]));
Chris@42 641 STN2(&(xo[100]), T7N, T7F, ovs);
Chris@42 642 T7O = VFNMSI(T2q, T2p);
Chris@42 643 STM2(&(xo[28]), T7O, ovs, &(xo[0]));
Chris@42 644 T7P = VFMAI(T2i, T2h);
Chris@42 645 STM2(&(xo[4]), T7P, ovs, &(xo[0]));
Chris@42 646 STN2(&(xo[4]), T7P, T7H, ovs);
Chris@42 647 T7Q = VFNMSI(T2i, T2h);
Chris@42 648 STM2(&(xo[124]), T7Q, ovs, &(xo[0]));
Chris@42 649 T7R = VFMAI(T2g, T1Z);
Chris@42 650 STM2(&(xo[68]), T7R, ovs, &(xo[0]));
Chris@42 651 STN2(&(xo[68]), T7R, T7J, ovs);
Chris@42 652 T7S = VFNMSI(T2g, T1Z);
Chris@42 653 STM2(&(xo[60]), T7S, ovs, &(xo[0]));
Chris@42 654 T5r = VSUB(T5p, T5q);
Chris@42 655 T5I = VADD(T5p, T5q);
Chris@42 656 }
Chris@42 657 }
Chris@42 658 }
Chris@42 659 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@42 660 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@42 661 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@42 662 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@42 663 }
Chris@42 664 }
Chris@42 665 {
Chris@42 666 V T6J, T6R, T6L, T6z, T6T, T6P;
Chris@42 667 {
Chris@42 668 V T6N, T6j, T6O, T6y;
Chris@42 669 {
Chris@42 670 V T6q, T6H, T5C, T5s, T6I, T6x;
Chris@42 671 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@42 672 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@42 673 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@42 674 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@42 675 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@42 676 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@42 677 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@42 678 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@42 679 {
Chris@42 680 V T5E, T5G, T5y, T5w;
Chris@42 681 T5E = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@42 682 T5G = VFMA(LDK(KP995184726), T5D, T5C);
Chris@42 683 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@42 684 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@42 685 T6O = VADD(T6H, T6I);
Chris@42 686 T6J = VSUB(T6H, T6I);
Chris@42 687 T6R = VSUB(T6x, T6q);
Chris@42 688 T6y = VADD(T6q, T6x);
Chris@42 689 {
Chris@42 690 V T7T, T7U, T7V, T7W;
Chris@42 691 T7T = VFMAI(T5E, T5B);
Chris@42 692 STM2(&(xo[94]), T7T, ovs, &(xo[2]));
Chris@42 693 STN2(&(xo[92]), T7L, T7T, ovs);
Chris@42 694 T7U = VFNMSI(T5E, T5B);
Chris@42 695 STM2(&(xo[34]), T7U, ovs, &(xo[2]));
Chris@42 696 STN2(&(xo[32]), T7o, T7U, ovs);
Chris@42 697 T7V = VFNMSI(T5G, T5F);
Chris@42 698 STM2(&(xo[98]), T7V, ovs, &(xo[2]));
Chris@42 699 STN2(&(xo[96]), T7n, T7V, ovs);
Chris@42 700 T7W = VFMAI(T5G, T5F);
Chris@42 701 STM2(&(xo[30]), T7W, ovs, &(xo[2]));
Chris@42 702 STN2(&(xo[28]), T7O, T7W, ovs);
Chris@42 703 {
Chris@42 704 V T7X, T7Y, T7Z, T80;
Chris@42 705 T7X = VFMAI(T5y, T5x);
Chris@42 706 STM2(&(xo[126]), T7X, ovs, &(xo[2]));
Chris@42 707 STN2(&(xo[124]), T7Q, T7X, ovs);
Chris@42 708 T7Y = VFNMSI(T5y, T5x);
Chris@42 709 STM2(&(xo[2]), T7Y, ovs, &(xo[2]));
Chris@42 710 STN2(&(xo[0]), T7p, T7Y, ovs);
Chris@42 711 T7Z = VFMAI(T5w, T5h);
Chris@42 712 STM2(&(xo[62]), T7Z, ovs, &(xo[2]));
Chris@42 713 STN2(&(xo[60]), T7S, T7Z, ovs);
Chris@42 714 T80 = VFNMSI(T5w, T5h);
Chris@42 715 STM2(&(xo[66]), T80, ovs, &(xo[2]));
Chris@42 716 STN2(&(xo[64]), T7q, T80, ovs);
Chris@42 717 }
Chris@42 718 }
Chris@42 719 }
Chris@42 720 }
Chris@42 721 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@42 722 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@42 723 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@42 724 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@42 725 }
Chris@42 726 {
Chris@42 727 V T2H, T2P, T81, T84, T85, T87, T2J, T2B, T2R, T2N;
Chris@42 728 {
Chris@42 729 V T2L, T2t, T2M, T2A;
Chris@42 730 {
Chris@42 731 V T2z, T2F, T6Q, T6G, T2G, T2w;
Chris@42 732 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@42 733 T2F = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@42 734 T6Q = VFMA(LDK(KP831469612), T6F, T6C);
Chris@42 735 T6G = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@42 736 T2G = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@42 737 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@42 738 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@42 739 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@42 740 {
Chris@42 741 V T6S, T6U, T6M, T6K;
Chris@42 742 T6S = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@42 743 T6U = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@42 744 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@42 745 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@42 746 T2M = VADD(T2G, T2F);
Chris@42 747 T2H = VSUB(T2F, T2G);
Chris@42 748 T2P = VSUB(T2z, T2w);
Chris@42 749 T2A = VADD(T2w, T2z);
Chris@42 750 T81 = VFMAI(T6S, T6P);
Chris@42 751 STM2(&(xo[86]), T81, ovs, &(xo[2]));
Chris@42 752 {
Chris@42 753 V T82, T83, T86, T88;
Chris@42 754 T82 = VFNMSI(T6S, T6P);
Chris@42 755 STM2(&(xo[42]), T82, ovs, &(xo[2]));
Chris@42 756 STN2(&(xo[40]), T7w, T82, ovs);
Chris@42 757 T83 = VFNMSI(T6U, T6T);
Chris@42 758 STM2(&(xo[106]), T83, ovs, &(xo[2]));
Chris@42 759 STN2(&(xo[104]), T7x, T83, ovs);
Chris@42 760 T84 = VFMAI(T6U, T6T);
Chris@42 761 STM2(&(xo[22]), T84, ovs, &(xo[2]));
Chris@42 762 T85 = VFMAI(T6M, T6L);
Chris@42 763 STM2(&(xo[118]), T85, ovs, &(xo[2]));
Chris@42 764 T86 = VFNMSI(T6M, T6L);
Chris@42 765 STM2(&(xo[10]), T86, ovs, &(xo[2]));
Chris@42 766 STN2(&(xo[8]), T7z, T86, ovs);
Chris@42 767 T87 = VFMAI(T6K, T6z);
Chris@42 768 STM2(&(xo[54]), T87, ovs, &(xo[2]));
Chris@42 769 T88 = VFNMSI(T6K, T6z);
Chris@42 770 STM2(&(xo[74]), T88, ovs, &(xo[2]));
Chris@42 771 STN2(&(xo[72]), T7B, T88, ovs);
Chris@42 772 }
Chris@42 773 }
Chris@42 774 }
Chris@42 775 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@42 776 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@42 777 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@42 778 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@42 779 }
Chris@42 780 {
Chris@42 781 V T61, T5J, T62, T5Q;
Chris@42 782 {
Chris@42 783 V T5M, T5V, T2O, T2E, T5W, T5P;
Chris@42 784 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@42 785 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@42 786 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@42 787 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@42 788 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@42 789 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@42 790 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@42 791 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@42 792 {
Chris@42 793 V T2Q, T2S, T2K, T2I;
Chris@42 794 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@42 795 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@42 796 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@42 797 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@42 798 T62 = VADD(T5V, T5W);
Chris@42 799 T5X = VSUB(T5V, T5W);
Chris@42 800 T65 = VSUB(T5P, T5M);
Chris@42 801 T5Q = VADD(T5M, T5P);
Chris@42 802 {
Chris@42 803 V T89, T8c, T8d, T8f;
Chris@42 804 T89 = VFMAI(T2Q, T2N);
Chris@42 805 STM2(&(xo[84]), T89, ovs, &(xo[0]));
Chris@42 806 STN2(&(xo[84]), T89, T81, ovs);
Chris@42 807 T8a = VFNMSI(T2Q, T2N);
Chris@42 808 STM2(&(xo[44]), T8a, ovs, &(xo[0]));
Chris@42 809 T8b = VFNMSI(T2S, T2R);
Chris@42 810 STM2(&(xo[108]), T8b, ovs, &(xo[0]));
Chris@42 811 T8c = VFMAI(T2S, T2R);
Chris@42 812 STM2(&(xo[20]), T8c, ovs, &(xo[0]));
Chris@42 813 STN2(&(xo[20]), T8c, T84, ovs);
Chris@42 814 T8d = VFMAI(T2K, T2J);
Chris@42 815 STM2(&(xo[116]), T8d, ovs, &(xo[0]));
Chris@42 816 STN2(&(xo[116]), T8d, T85, ovs);
Chris@42 817 T8e = VFNMSI(T2K, T2J);
Chris@42 818 STM2(&(xo[12]), T8e, ovs, &(xo[0]));
Chris@42 819 T8f = VFMAI(T2I, T2B);
Chris@42 820 STM2(&(xo[52]), T8f, ovs, &(xo[0]));
Chris@42 821 STN2(&(xo[52]), T8f, T87, ovs);
Chris@42 822 T8g = VFNMSI(T2I, T2B);
Chris@42 823 STM2(&(xo[76]), T8g, ovs, &(xo[0]));
Chris@42 824 }
Chris@42 825 }
Chris@42 826 }
Chris@42 827 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@42 828 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@42 829 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@42 830 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@42 831 }
Chris@42 832 }
Chris@42 833 }
Chris@42 834 }
Chris@42 835 }
Chris@42 836 }
Chris@42 837 T5U = VFMA(LDK(KP980785280), T5T, T5S);
Chris@42 838 T64 = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@42 839 {
Chris@42 840 V T68, T66, T5Y, T60;
Chris@42 841 T68 = VFNMS(LDK(KP773010453), T65, T64);
Chris@42 842 T66 = VFMA(LDK(KP773010453), T65, T64);
Chris@42 843 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@42 844 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@42 845 {
Chris@42 846 V T8h, T8i, T8j, T8k;
Chris@42 847 T8h = VFNMSI(T66, T63);
Chris@42 848 STM2(&(xo[82]), T8h, ovs, &(xo[2]));
Chris@42 849 STN2(&(xo[80]), T7t, T8h, ovs);
Chris@42 850 T8i = VFMAI(T66, T63);
Chris@42 851 STM2(&(xo[46]), T8i, ovs, &(xo[2]));
Chris@42 852 STN2(&(xo[44]), T8a, T8i, ovs);
Chris@42 853 T8j = VFMAI(T68, T67);
Chris@42 854 STM2(&(xo[110]), T8j, ovs, &(xo[2]));
Chris@42 855 STN2(&(xo[108]), T8b, T8j, ovs);
Chris@42 856 T8k = VFNMSI(T68, T67);
Chris@42 857 STM2(&(xo[18]), T8k, ovs, &(xo[2]));
Chris@42 858 STN2(&(xo[16]), T7r, T8k, ovs);
Chris@42 859 {
Chris@42 860 V T8l, T8m, T8n, T8o;
Chris@42 861 T8l = VFMAI(T60, T5Z);
Chris@42 862 STM2(&(xo[14]), T8l, ovs, &(xo[2]));
Chris@42 863 STN2(&(xo[12]), T8e, T8l, ovs);
Chris@42 864 T8m = VFNMSI(T60, T5Z);
Chris@42 865 STM2(&(xo[114]), T8m, ovs, &(xo[2]));
Chris@42 866 STN2(&(xo[112]), T7s, T8m, ovs);
Chris@42 867 T8n = VFMAI(T5Y, T5R);
Chris@42 868 STM2(&(xo[78]), T8n, ovs, &(xo[2]));
Chris@42 869 STN2(&(xo[76]), T8g, T8n, ovs);
Chris@42 870 T8o = VFNMSI(T5Y, T5R);
Chris@42 871 STM2(&(xo[50]), T8o, ovs, &(xo[2]));
Chris@42 872 STN2(&(xo[48]), T7u, T8o, ovs);
Chris@42 873 }
Chris@42 874 }
Chris@42 875 }
Chris@42 876 }
Chris@42 877 }
Chris@42 878 VLEAVE();
Chris@42 879 }
Chris@42 880
Chris@42 881 static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
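/* The {198, 0, 258, 0} operation counts match the header comment for this
   variant (198 additions, 0 multiplications, 258 fused multiply/adds), and the
   leading 64 is the transform size; the descriptor is what gets handed to the
   planner below, presumably so it can weigh this codelet against alternatives. */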
Chris@42 882
Chris@42 883 void XSIMD(codelet_n2fv_64) (planner *p) {
Chris@42 884 X(kdft_register) (p, n2fv_64, &desc);
Chris@42 885 }
Chris@42 886
Chris@42 887 #else /* HAVE_FMA */
Chris@42 888
Chris@42 889 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include n2f.h -store-multiple 2 */
Chris@42 890
Chris@42 891 /*
Chris@42 892 * This function contains 456 FP additions, 124 FP multiplications,
Chris@42 893 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@42 894 * 128 stack variables, 15 constants, and 160 memory accesses
Chris@42 895 */
Chris@42 896 #include "n2f.h"
Chris@42 897
Chris@42 898 static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 899 {
Chris@42 900 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@42 901 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 902 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@42 903 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 904 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@42 905 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 906 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@42 907 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 908 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 909 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 910 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 911 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 912 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 913 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 914 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
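/* In this fallback variant the constants are plain sines and cosines of
   multiples of pi/32 (for example KP980785280 = cos(pi/16) and
   KP195090322 = sin(pi/16)), combined through explicit VMUL calls rather than
   the tangent-form factors used by the FMA code above. */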
Chris@42 915 {
Chris@42 916 INT i;
Chris@42 917 const R *xi;
Chris@42 918 R *xo;
Chris@42 919 xi = ri;
Chris@42 920 xo = ro;
Chris@42 921 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 922 V T4p, T5q, Tb, T39, T2n, T3A, T6f, T6T, Tq, T3B, T6i, T76, T2i, T3a, T4w;
Chris@42 923 V T5r, TI, T2p, T6C, T6V, T3h, T3E, T4L, T5u, TZ, T2q, T6F, T6U, T3e, T3D;
Chris@42 924 V T4E, T5t, T23, T2N, T6t, T71, T6w, T72, T2c, T2O, T3t, T41, T5f, T5R, T5k;
Chris@42 925 V T5S, T3w, T42, T1s, T2K, T6m, T6Y, T6p, T6Z, T1B, T2L, T3m, T3Y, T4Y, T5O;
Chris@42 926 V T53, T5P, T3p, T3Z;
Chris@42 927 {
Chris@42 928 V T3, T4n, T2m, T4o, T6, T5p, T9, T5o;
Chris@42 929 {
Chris@42 930 V T1, T2, T2k, T2l;
Chris@42 931 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 932 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 933 T3 = VSUB(T1, T2);
Chris@42 934 T4n = VADD(T1, T2);
Chris@42 935 T2k = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 936 T2l = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 937 T2m = VSUB(T2k, T2l);
Chris@42 938 T4o = VADD(T2k, T2l);
Chris@42 939 }
Chris@42 940 {
Chris@42 941 V T4, T5, T7, T8;
Chris@42 942 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 943 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 944 T6 = VSUB(T4, T5);
Chris@42 945 T5p = VADD(T4, T5);
Chris@42 946 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 947 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 948 T9 = VSUB(T7, T8);
Chris@42 949 T5o = VADD(T7, T8);
Chris@42 950 }
Chris@42 951 T4p = VSUB(T4n, T4o);
Chris@42 952 T5q = VSUB(T5o, T5p);
Chris@42 953 {
Chris@42 954 V Ta, T2j, T6d, T6e;
Chris@42 955 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@42 956 Tb = VADD(T3, Ta);
Chris@42 957 T39 = VSUB(T3, Ta);
Chris@42 958 T2j = VMUL(LDK(KP707106781), VSUB(T9, T6));
Chris@42 959 T2n = VSUB(T2j, T2m);
Chris@42 960 T3A = VADD(T2m, T2j);
Chris@42 961 T6d = VADD(T4n, T4o);
Chris@42 962 T6e = VADD(T5p, T5o);
Chris@42 963 T6f = VADD(T6d, T6e);
Chris@42 964 T6T = VSUB(T6d, T6e);
Chris@42 965 }
Chris@42 966 }
Chris@42 967 {
Chris@42 968 V Te, T4q, To, T4u, Th, T4r, Tl, T4t;
Chris@42 969 {
Chris@42 970 V Tc, Td, Tm, Tn;
Chris@42 971 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 972 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 973 Te = VSUB(Tc, Td);
Chris@42 974 T4q = VADD(Tc, Td);
Chris@42 975 Tm = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 976 Tn = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 977 To = VSUB(Tm, Tn);
Chris@42 978 T4u = VADD(Tm, Tn);
Chris@42 979 }
Chris@42 980 {
Chris@42 981 V Tf, Tg, Tj, Tk;
Chris@42 982 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 983 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 984 Th = VSUB(Tf, Tg);
Chris@42 985 T4r = VADD(Tf, Tg);
Chris@42 986 Tj = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 987 Tk = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 988 Tl = VSUB(Tj, Tk);
Chris@42 989 T4t = VADD(Tj, Tk);
Chris@42 990 }
Chris@42 991 {
Chris@42 992 V Ti, Tp, T6g, T6h;
Chris@42 993 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@42 994 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
Chris@42 995 Tq = VADD(Ti, Tp);
Chris@42 996 T3B = VSUB(Tp, Ti);
Chris@42 997 T6g = VADD(T4q, T4r);
Chris@42 998 T6h = VADD(T4t, T4u);
Chris@42 999 T6i = VADD(T6g, T6h);
Chris@42 1000 T76 = VSUB(T6h, T6g);
Chris@42 1001 }
Chris@42 1002 {
Chris@42 1003 V T2g, T2h, T4s, T4v;
Chris@42 1004 T2g = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@42 1005 T2h = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@42 1006 T2i = VSUB(T2g, T2h);
Chris@42 1007 T3a = VADD(T2h, T2g);
Chris@42 1008 T4s = VSUB(T4q, T4r);
Chris@42 1009 T4v = VSUB(T4t, T4u);
Chris@42 1010 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@42 1011 T5r = VMUL(LDK(KP707106781), VSUB(T4v, T4s));
Chris@42 1012 }
Chris@42 1013 }
Chris@42 1014 {
Chris@42 1015 V Tu, T4F, TG, T4G, TB, T4J, TD, T4I;
Chris@42 1016 {
Chris@42 1017 V Ts, Tt, TE, TF;
Chris@42 1018 Ts = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 1019 Tt = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 1020 Tu = VSUB(Ts, Tt);
Chris@42 1021 T4F = VADD(Ts, Tt);
Chris@42 1022 TE = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 1023 TF = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 1024 TG = VSUB(TE, TF);
Chris@42 1025 T4G = VADD(TE, TF);
Chris@42 1026 {
Chris@42 1027 V Tv, Tw, Tx, Ty, Tz, TA;
Chris@42 1028 Tv = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 1029 Tw = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 1030 Tx = VSUB(Tv, Tw);
Chris@42 1031 Ty = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 1032 Tz = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 1033 TA = VSUB(Ty, Tz);
Chris@42 1034 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
Chris@42 1035 T4J = VADD(Tv, Tw);
Chris@42 1036 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
Chris@42 1037 T4I = VADD(Ty, Tz);
Chris@42 1038 }
Chris@42 1039 }
Chris@42 1040 {
Chris@42 1041 V TC, TH, T6A, T6B;
Chris@42 1042 TC = VADD(Tu, TB);
Chris@42 1043 TH = VSUB(TD, TG);
Chris@42 1044 TI = VFMA(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
Chris@42 1045 T2p = VFNMS(LDK(KP195090322), TH, VMUL(LDK(KP980785280), TC));
Chris@42 1046 T6A = VADD(T4F, T4G);
Chris@42 1047 T6B = VADD(T4J, T4I);
Chris@42 1048 T6C = VADD(T6A, T6B);
Chris@42 1049 T6V = VSUB(T6A, T6B);
Chris@42 1050 }
Chris@42 1051 {
Chris@42 1052 V T3f, T3g, T4H, T4K;
Chris@42 1053 T3f = VSUB(Tu, TB);
Chris@42 1054 T3g = VADD(TG, TD);
Chris@42 1055 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
Chris@42 1056 T3E = VFMA(LDK(KP555570233), T3f, VMUL(LDK(KP831469612), T3g));
Chris@42 1057 T4H = VSUB(T4F, T4G);
Chris@42 1058 T4K = VSUB(T4I, T4J);
Chris@42 1059 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@42 1060 T5u = VFMA(LDK(KP382683432), T4H, VMUL(LDK(KP923879532), T4K));
Chris@42 1061 }
Chris@42 1062 }
Chris@42 1063 {
Chris@42 1064 V TS, T4z, TW, T4y, TP, T4C, TX, T4B;
Chris@42 1065 {
Chris@42 1066 V TQ, TR, TU, TV;
Chris@42 1067 TQ = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 1068 TR = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 1069 TS = VSUB(TQ, TR);
Chris@42 1070 T4z = VADD(TQ, TR);
Chris@42 1071 TU = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 1072 TV = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 1073 TW = VSUB(TU, TV);
Chris@42 1074 T4y = VADD(TU, TV);
Chris@42 1075 {
Chris@42 1076 V TJ, TK, TL, TM, TN, TO;
Chris@42 1077 TJ = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 1078 TK = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 1079 TL = VSUB(TJ, TK);
Chris@42 1080 TM = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 1081 TN = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 1082 TO = VSUB(TM, TN);
Chris@42 1083 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@42 1084 T4C = VADD(TM, TN);
Chris@42 1085 TX = VMUL(LDK(KP707106781), VADD(TO, TL));
Chris@42 1086 T4B = VADD(TJ, TK);
Chris@42 1087 }
Chris@42 1088 }
Chris@42 1089 {
Chris@42 1090 V TT, TY, T6D, T6E;
Chris@42 1091 TT = VSUB(TP, TS);
Chris@42 1092 TY = VADD(TW, TX);
Chris@42 1093 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
Chris@42 1094 T2q = VFMA(LDK(KP980785280), TY, VMUL(LDK(KP195090322), TT));
Chris@42 1095 T6D = VADD(T4y, T4z);
Chris@42 1096 T6E = VADD(T4C, T4B);
Chris@42 1097 T6F = VADD(T6D, T6E);
Chris@42 1098 T6U = VSUB(T6D, T6E);
Chris@42 1099 }
Chris@42 1100 {
Chris@42 1101 V T3c, T3d, T4A, T4D;
Chris@42 1102 T3c = VSUB(TW, TX);
Chris@42 1103 T3d = VADD(TS, TP);
Chris@42 1104 T3e = VFMA(LDK(KP831469612), T3c, VMUL(LDK(KP555570233), T3d));
Chris@42 1105 T3D = VFNMS(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
Chris@42 1106 T4A = VSUB(T4y, T4z);
Chris@42 1107 T4D = VSUB(T4B, T4C);
Chris@42 1108 T4E = VFMA(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4D));
Chris@42 1109 T5t = VFNMS(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@42 1110 }
Chris@42 1111 }
Chris@42 1112 {
Chris@42 1113 V T1F, T55, T2a, T56, T1M, T5h, T27, T5g, T58, T59, T1U, T5a, T25, T5b, T5c;
Chris@42 1114 V T21, T5d, T24;
Chris@42 1115 {
Chris@42 1116 V T1D, T1E, T28, T29;
Chris@42 1117 T1D = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1118 T1E = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1119 T1F = VSUB(T1D, T1E);
Chris@42 1120 T55 = VADD(T1D, T1E);
Chris@42 1121 T28 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1122 T29 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1123 T2a = VSUB(T28, T29);
Chris@42 1124 T56 = VADD(T28, T29);
Chris@42 1125 }
Chris@42 1126 {
Chris@42 1127 V T1G, T1H, T1I, T1J, T1K, T1L;
Chris@42 1128 T1G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1129 T1H = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1130 T1I = VSUB(T1G, T1H);
Chris@42 1131 T1J = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1132 T1K = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1133 T1L = VSUB(T1J, T1K);
Chris@42 1134 T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
Chris@42 1135 T5h = VADD(T1G, T1H);
Chris@42 1136 T27 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
Chris@42 1137 T5g = VADD(T1J, T1K);
Chris@42 1138 }
Chris@42 1139 {
Chris@42 1140 V T1Q, T1T, T1X, T20;
Chris@42 1141 {
Chris@42 1142 V T1O, T1P, T1R, T1S;
Chris@42 1143 T1O = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1144 T1P = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1145 T1Q = VSUB(T1O, T1P);
Chris@42 1146 T58 = VADD(T1O, T1P);
Chris@42 1147 T1R = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1148 T1S = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1149 T1T = VSUB(T1R, T1S);
Chris@42 1150 T59 = VADD(T1R, T1S);
Chris@42 1151 }
Chris@42 1152 T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
Chris@42 1153 T5a = VSUB(T58, T59);
Chris@42 1154 T25 = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
Chris@42 1155 {
Chris@42 1156 V T1V, T1W, T1Y, T1Z;
Chris@42 1157 T1V = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1158 T1W = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1159 T1X = VSUB(T1V, T1W);
Chris@42 1160 T5b = VADD(T1V, T1W);
Chris@42 1161 T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1162 T1Z = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1163 T20 = VSUB(T1Y, T1Z);
Chris@42 1164 T5c = VADD(T1Y, T1Z);
Chris@42 1165 }
Chris@42 1166 T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
Chris@42 1167 T5d = VSUB(T5b, T5c);
Chris@42 1168 T24 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
Chris@42 1169 }
Chris@42 1170 {
Chris@42 1171 V T1N, T22, T6r, T6s;
Chris@42 1172 T1N = VADD(T1F, T1M);
Chris@42 1173 T22 = VADD(T1U, T21);
Chris@42 1174 T23 = VSUB(T1N, T22);
Chris@42 1175 T2N = VADD(T1N, T22);
Chris@42 1176 T6r = VADD(T55, T56);
Chris@42 1177 T6s = VADD(T5h, T5g);
Chris@42 1178 T6t = VADD(T6r, T6s);
Chris@42 1179 T71 = VSUB(T6r, T6s);
Chris@42 1180 }
Chris@42 1181 {
Chris@42 1182 V T6u, T6v, T26, T2b;
Chris@42 1183 T6u = VADD(T58, T59);
Chris@42 1184 T6v = VADD(T5b, T5c);
Chris@42 1185 T6w = VADD(T6u, T6v);
Chris@42 1186 T72 = VSUB(T6v, T6u);
Chris@42 1187 T26 = VSUB(T24, T25);
Chris@42 1188 T2b = VSUB(T27, T2a);
Chris@42 1189 T2c = VSUB(T26, T2b);
Chris@42 1190 T2O = VADD(T2b, T26);
Chris@42 1191 }
Chris@42 1192 {
Chris@42 1193 V T3r, T3s, T57, T5e;
Chris@42 1194 T3r = VSUB(T1F, T1M);
Chris@42 1195 T3s = VADD(T25, T24);
Chris@42 1196 T3t = VADD(T3r, T3s);
Chris@42 1197 T41 = VSUB(T3r, T3s);
Chris@42 1198 T57 = VSUB(T55, T56);
Chris@42 1199 T5e = VMUL(LDK(KP707106781), VADD(T5a, T5d));
Chris@42 1200 T5f = VADD(T57, T5e);
Chris@42 1201 T5R = VSUB(T57, T5e);
Chris@42 1202 }
Chris@42 1203 {
Chris@42 1204 V T5i, T5j, T3u, T3v;
Chris@42 1205 T5i = VSUB(T5g, T5h);
Chris@42 1206 T5j = VMUL(LDK(KP707106781), VSUB(T5d, T5a));
Chris@42 1207 T5k = VADD(T5i, T5j);
Chris@42 1208 T5S = VSUB(T5j, T5i);
Chris@42 1209 T3u = VADD(T2a, T27);
Chris@42 1210 T3v = VSUB(T21, T1U);
Chris@42 1211 T3w = VADD(T3u, T3v);
Chris@42 1212 T42 = VSUB(T3v, T3u);
Chris@42 1213 }
Chris@42 1214 }
Chris@42 1215 {
Chris@42 1216 V T1q, T4P, T1v, T4O, T1n, T50, T1w, T4Z, T4U, T4V, T18, T4W, T1z, T4R, T4S;
Chris@42 1217 V T1f, T4T, T1y;
Chris@42 1218 {
Chris@42 1219 V T1o, T1p, T1t, T1u;
Chris@42 1220 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1221 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1222 T1q = VSUB(T1o, T1p);
Chris@42 1223 T4P = VADD(T1o, T1p);
Chris@42 1224 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1225 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1226 T1v = VSUB(T1t, T1u);
Chris@42 1227 T4O = VADD(T1t, T1u);
Chris@42 1228 }
Chris@42 1229 {
Chris@42 1230 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@42 1231 T1h = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1232 T1i = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1233 T1j = VSUB(T1h, T1i);
Chris@42 1234 T1k = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1235 T1l = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1236 T1m = VSUB(T1k, T1l);
Chris@42 1237 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@42 1238 T50 = VADD(T1k, T1l);
Chris@42 1239 T1w = VMUL(LDK(KP707106781), VADD(T1m, T1j));
Chris@42 1240 T4Z = VADD(T1h, T1i);
Chris@42 1241 }
Chris@42 1242 {
Chris@42 1243 V T14, T17, T1b, T1e;
Chris@42 1244 {
Chris@42 1245 V T12, T13, T15, T16;
Chris@42 1246 T12 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1247 T13 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1248 T14 = VSUB(T12, T13);
Chris@42 1249 T4U = VADD(T12, T13);
Chris@42 1250 T15 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1251 T16 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1252 T17 = VSUB(T15, T16);
Chris@42 1253 T4V = VADD(T15, T16);
Chris@42 1254 }
Chris@42 1255 T18 = VFNMS(LDK(KP923879532), T17, VMUL(LDK(KP382683432), T14));
Chris@42 1256 T4W = VSUB(T4U, T4V);
Chris@42 1257 T1z = VFMA(LDK(KP923879532), T14, VMUL(LDK(KP382683432), T17));
Chris@42 1258 {
Chris@42 1259 V T19, T1a, T1c, T1d;
Chris@42 1260 T19 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1261 T1a = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1262 T1b = VSUB(T19, T1a);
Chris@42 1263 T4R = VADD(T19, T1a);
Chris@42 1264 T1c = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1265 T1d = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1266 T1e = VSUB(T1c, T1d);
Chris@42 1267 T4S = VADD(T1c, T1d);
Chris@42 1268 }
Chris@42 1269 T1f = VFMA(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@42 1270 T4T = VSUB(T4R, T4S);
Chris@42 1271 T1y = VFNMS(LDK(KP382683432), T1e, VMUL(LDK(KP923879532), T1b));
Chris@42 1272 }
Chris@42 1273 {
Chris@42 1274 V T1g, T1r, T6k, T6l;
Chris@42 1275 T1g = VSUB(T18, T1f);
Chris@42 1276 T1r = VSUB(T1n, T1q);
Chris@42 1277 T1s = VSUB(T1g, T1r);
Chris@42 1278 T2K = VADD(T1r, T1g);
Chris@42 1279 T6k = VADD(T4O, T4P);
Chris@42 1280 T6l = VADD(T50, T4Z);
Chris@42 1281 T6m = VADD(T6k, T6l);
Chris@42 1282 T6Y = VSUB(T6k, T6l);
Chris@42 1283 }
Chris@42 1284 {
Chris@42 1285 V T6n, T6o, T1x, T1A;
Chris@42 1286 T6n = VADD(T4R, T4S);
Chris@42 1287 T6o = VADD(T4U, T4V);
Chris@42 1288 T6p = VADD(T6n, T6o);
Chris@42 1289 T6Z = VSUB(T6o, T6n);
Chris@42 1290 T1x = VADD(T1v, T1w);
Chris@42 1291 T1A = VADD(T1y, T1z);
Chris@42 1292 T1B = VSUB(T1x, T1A);
Chris@42 1293 T2L = VADD(T1x, T1A);
Chris@42 1294 }
Chris@42 1295 {
Chris@42 1296 V T3k, T3l, T4Q, T4X;
Chris@42 1297 T3k = VSUB(T1v, T1w);
Chris@42 1298 T3l = VADD(T1f, T18);
Chris@42 1299 T3m = VADD(T3k, T3l);
Chris@42 1300 T3Y = VSUB(T3k, T3l);
Chris@42 1301 T4Q = VSUB(T4O, T4P);
Chris@42 1302 T4X = VMUL(LDK(KP707106781), VADD(T4T, T4W));
Chris@42 1303 T4Y = VADD(T4Q, T4X);
Chris@42 1304 T5O = VSUB(T4Q, T4X);
Chris@42 1305 }
Chris@42 1306 {
Chris@42 1307 V T51, T52, T3n, T3o;
Chris@42 1308 T51 = VSUB(T4Z, T50);
Chris@42 1309 T52 = VMUL(LDK(KP707106781), VSUB(T4W, T4T));
Chris@42 1310 T53 = VADD(T51, T52);
Chris@42 1311 T5P = VSUB(T52, T51);
Chris@42 1312 T3n = VADD(T1q, T1n);
Chris@42 1313 T3o = VSUB(T1z, T1y);
Chris@42 1314 T3p = VADD(T3n, T3o);
Chris@42 1315 T3Z = VSUB(T3o, T3n);
Chris@42 1316 }
Chris@42 1317 }
Chris@42 1318 {
Chris@42 1319 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
Chris@42 1320 V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
Chris@42 1321 {
Chris@42 1322 V T6N, T6R, T6Q, T6S;
Chris@42 1323 {
Chris@42 1324 V T6L, T6M, T6O, T6P;
Chris@42 1325 T6L = VADD(T6f, T6i);
Chris@42 1326 T6M = VADD(T6F, T6C);
Chris@42 1327 T6N = VADD(T6L, T6M);
Chris@42 1328 T6R = VSUB(T6L, T6M);
Chris@42 1329 T6O = VADD(T6m, T6p);
Chris@42 1330 T6P = VADD(T6t, T6w);
Chris@42 1331 T6Q = VADD(T6O, T6P);
Chris@42 1332 T6S = VBYI(VSUB(T6P, T6O));
Chris@42 1333 }
Chris@42 1334 T7n = VSUB(T6N, T6Q);
Chris@42 1335 STM2(&(xo[64]), T7n, ovs, &(xo[0]));
Chris@42 1336 T7o = VADD(T6R, T6S);
Chris@42 1337 STM2(&(xo[32]), T7o, ovs, &(xo[0]));
Chris@42 1338 T7p = VADD(T6N, T6Q);
Chris@42 1339 STM2(&(xo[0]), T7p, ovs, &(xo[0]));
Chris@42 1340 T7q = VSUB(T6R, T6S);
Chris@42 1341 STM2(&(xo[96]), T7q, ovs, &(xo[0]));
Chris@42 1342 }
Chris@42 1343 {
Chris@42 1344 V T6j, T6G, T6y, T6H, T6q, T6x;
Chris@42 1345 T6j = VSUB(T6f, T6i);
Chris@42 1346 T6G = VSUB(T6C, T6F);
Chris@42 1347 T6q = VSUB(T6m, T6p);
Chris@42 1348 T6x = VSUB(T6t, T6w);
Chris@42 1349 T6y = VMUL(LDK(KP707106781), VADD(T6q, T6x));
Chris@42 1350 T6H = VMUL(LDK(KP707106781), VSUB(T6x, T6q));
Chris@42 1351 {
Chris@42 1352 V T6z, T6I, T6J, T6K;
Chris@42 1353 T6z = VADD(T6j, T6y);
Chris@42 1354 T6I = VBYI(VADD(T6G, T6H));
Chris@42 1355 T7r = VSUB(T6z, T6I);
Chris@42 1356 STM2(&(xo[112]), T7r, ovs, &(xo[0]));
Chris@42 1357 T7s = VADD(T6z, T6I);
Chris@42 1358 STM2(&(xo[16]), T7s, ovs, &(xo[0]));
Chris@42 1359 T6J = VSUB(T6j, T6y);
Chris@42 1360 T6K = VBYI(VSUB(T6H, T6G));
Chris@42 1361 T7t = VSUB(T6J, T6K);
Chris@42 1362 STM2(&(xo[80]), T7t, ovs, &(xo[0]));
Chris@42 1363 T7u = VADD(T6J, T6K);
Chris@42 1364 STM2(&(xo[48]), T7u, ovs, &(xo[0]));
Chris@42 1365 }
Chris@42 1366 }
Chris@42 1367 {
Chris@42 1368 V T6X, T7i, T78, T7g, T74, T7f, T7b, T7j, T6W, T77;
Chris@42 1369 T6W = VMUL(LDK(KP707106781), VADD(T6U, T6V));
Chris@42 1370 T6X = VADD(T6T, T6W);
Chris@42 1371 T7i = VSUB(T6T, T6W);
Chris@42 1372 T77 = VMUL(LDK(KP707106781), VSUB(T6V, T6U));
Chris@42 1373 T78 = VADD(T76, T77);
Chris@42 1374 T7g = VSUB(T77, T76);
Chris@42 1375 {
Chris@42 1376 V T70, T73, T79, T7a;
Chris@42 1377 T70 = VFMA(LDK(KP923879532), T6Y, VMUL(LDK(KP382683432), T6Z));
Chris@42 1378 T73 = VFNMS(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T71));
Chris@42 1379 T74 = VADD(T70, T73);
Chris@42 1380 T7f = VSUB(T73, T70);
Chris@42 1381 T79 = VFNMS(LDK(KP382683432), T6Y, VMUL(LDK(KP923879532), T6Z));
Chris@42 1382 T7a = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T72));
Chris@42 1383 T7b = VADD(T79, T7a);
Chris@42 1384 T7j = VSUB(T7a, T79);
Chris@42 1385 }
Chris@42 1386 {
Chris@42 1387 V T75, T7c, T7l, T7m;
Chris@42 1388 T75 = VADD(T6X, T74);
Chris@42 1389 T7c = VBYI(VADD(T78, T7b));
Chris@42 1390 T7v = VSUB(T75, T7c);
Chris@42 1391 STM2(&(xo[120]), T7v, ovs, &(xo[0]));
Chris@42 1392 T7w = VADD(T75, T7c);
Chris@42 1393 STM2(&(xo[8]), T7w, ovs, &(xo[0]));
Chris@42 1394 T7l = VBYI(VADD(T7g, T7f));
Chris@42 1395 T7m = VADD(T7i, T7j);
Chris@42 1396 T7x = VADD(T7l, T7m);
Chris@42 1397 STM2(&(xo[24]), T7x, ovs, &(xo[0]));
Chris@42 1398 T7y = VSUB(T7m, T7l);
Chris@42 1399 STM2(&(xo[104]), T7y, ovs, &(xo[0]));
Chris@42 1400 }
Chris@42 1401 {
Chris@42 1402 V T7d, T7e, T7h, T7k;
Chris@42 1403 T7d = VSUB(T6X, T74);
Chris@42 1404 T7e = VBYI(VSUB(T7b, T78));
Chris@42 1405 T7z = VSUB(T7d, T7e);
Chris@42 1406 STM2(&(xo[72]), T7z, ovs, &(xo[0]));
Chris@42 1407 T7A = VADD(T7d, T7e);
Chris@42 1408 STM2(&(xo[56]), T7A, ovs, &(xo[0]));
Chris@42 1409 T7h = VBYI(VSUB(T7f, T7g));
Chris@42 1410 T7k = VSUB(T7i, T7j);
Chris@42 1411 T7B = VADD(T7h, T7k);
Chris@42 1412 STM2(&(xo[40]), T7B, ovs, &(xo[0]));
Chris@42 1413 T7C = VSUB(T7k, T7h);
Chris@42 1414 STM2(&(xo[88]), T7C, ovs, &(xo[0]));
Chris@42 1415 }
Chris@42 1416 }
Chris@42 1417 {
Chris@42 1418 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@42 1419 {
Chris@42 1420 V T5L, T5M, T5Z, T60;
Chris@42 1421 T5L = VSUB(T4p, T4w);
Chris@42 1422 T5M = VSUB(T5u, T5t);
Chris@42 1423 T5N = VADD(T5L, T5M);
Chris@42 1424 T68 = VSUB(T5L, T5M);
Chris@42 1425 T5Z = VFNMS(LDK(KP555570233), T5O, VMUL(LDK(KP831469612), T5P));
Chris@42 1426 T60 = VFMA(LDK(KP555570233), T5R, VMUL(LDK(KP831469612), T5S));
Chris@42 1427 T61 = VADD(T5Z, T60);
Chris@42 1428 T69 = VSUB(T60, T5Z);
Chris@42 1429 }
Chris@42 1430 {
Chris@42 1431 V T5Q, T5T, T5W, T5X;
Chris@42 1432 T5Q = VFMA(LDK(KP831469612), T5O, VMUL(LDK(KP555570233), T5P));
Chris@42 1433 T5T = VFNMS(LDK(KP555570233), T5S, VMUL(LDK(KP831469612), T5R));
Chris@42 1434 T5U = VADD(T5Q, T5T);
Chris@42 1435 T65 = VSUB(T5T, T5Q);
Chris@42 1436 T5W = VSUB(T5r, T5q);
Chris@42 1437 T5X = VSUB(T4L, T4E);
Chris@42 1438 T5Y = VADD(T5W, T5X);
Chris@42 1439 T66 = VSUB(T5X, T5W);
Chris@42 1440 }
Chris@42 1441 {
Chris@42 1442 V T5V, T62, T6b, T6c;
Chris@42 1443 T5V = VADD(T5N, T5U);
Chris@42 1444 T62 = VBYI(VADD(T5Y, T61));
Chris@42 1445 T7D = VSUB(T5V, T62);
Chris@42 1446 STM2(&(xo[116]), T7D, ovs, &(xo[0]));
Chris@42 1447 T7E = VADD(T5V, T62);
Chris@42 1448 STM2(&(xo[12]), T7E, ovs, &(xo[0]));
Chris@42 1449 T6b = VBYI(VADD(T66, T65));
Chris@42 1450 T6c = VADD(T68, T69);
Chris@42 1451 T7F = VADD(T6b, T6c);
Chris@42 1452 STM2(&(xo[20]), T7F, ovs, &(xo[0]));
Chris@42 1453 T7G = VSUB(T6c, T6b);
Chris@42 1454 STM2(&(xo[108]), T7G, ovs, &(xo[0]));
Chris@42 1455 }
Chris@42 1456 {
Chris@42 1457 V T63, T64, T67, T6a;
Chris@42 1458 T63 = VSUB(T5N, T5U);
Chris@42 1459 T64 = VBYI(VSUB(T61, T5Y));
Chris@42 1460 T7H = VSUB(T63, T64);
Chris@42 1461 STM2(&(xo[76]), T7H, ovs, &(xo[0]));
Chris@42 1462 T7I = VADD(T63, T64);
Chris@42 1463 STM2(&(xo[52]), T7I, ovs, &(xo[0]));
Chris@42 1464 T67 = VBYI(VSUB(T65, T66));
Chris@42 1465 T6a = VSUB(T68, T69);
Chris@42 1466 T7J = VADD(T67, T6a);
Chris@42 1467 STM2(&(xo[44]), T7J, ovs, &(xo[0]));
Chris@42 1468 T7K = VSUB(T6a, T67);
Chris@42 1469 STM2(&(xo[84]), T7K, ovs, &(xo[0]));
Chris@42 1470 }
Chris@42 1471 }
Chris@42 1472 {
Chris@42 1473 V T7U, T7W, T7X, T7Z;
Chris@42 1474 {
Chris@42 1475 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@42 1476 {
Chris@42 1477 V Tr, T10, T2t, T2u;
Chris@42 1478 Tr = VSUB(Tb, Tq);
Chris@42 1479 T10 = VSUB(TI, TZ);
Chris@42 1480 T11 = VADD(Tr, T10);
Chris@42 1481 T2C = VSUB(Tr, T10);
Chris@42 1482 T2t = VFNMS(LDK(KP634393284), T1B, VMUL(LDK(KP773010453), T1s));
Chris@42 1483 T2u = VFMA(LDK(KP773010453), T2c, VMUL(LDK(KP634393284), T23));
Chris@42 1484 T2v = VADD(T2t, T2u);
Chris@42 1485 T2D = VSUB(T2u, T2t);
Chris@42 1486 }
Chris@42 1487 {
Chris@42 1488 V T1C, T2d, T2o, T2r;
Chris@42 1489 T1C = VFMA(LDK(KP634393284), T1s, VMUL(LDK(KP773010453), T1B));
Chris@42 1490 T2d = VFNMS(LDK(KP634393284), T2c, VMUL(LDK(KP773010453), T23));
Chris@42 1491 T2e = VADD(T1C, T2d);
Chris@42 1492 T2z = VSUB(T2d, T1C);
Chris@42 1493 T2o = VSUB(T2i, T2n);
Chris@42 1494 T2r = VSUB(T2p, T2q);
Chris@42 1495 T2s = VADD(T2o, T2r);
Chris@42 1496 T2A = VSUB(T2r, T2o);
Chris@42 1497 }
Chris@42 1498 {
Chris@42 1499 V T2f, T2w, T7L, T7M;
Chris@42 1500 T2f = VADD(T11, T2e);
Chris@42 1501 T2w = VBYI(VADD(T2s, T2v));
Chris@42 1502 T7L = VSUB(T2f, T2w);
Chris@42 1503 STM2(&(xo[114]), T7L, ovs, &(xo[2]));
Chris@42 1504 STN2(&(xo[112]), T7r, T7L, ovs);
Chris@42 1505 T7M = VADD(T2f, T2w);
Chris@42 1506 STM2(&(xo[14]), T7M, ovs, &(xo[2]));
Chris@42 1507 STN2(&(xo[12]), T7E, T7M, ovs);
Chris@42 1508 }
Chris@42 1509 {
Chris@42 1510 V T2F, T2G, T7N, T7O;
Chris@42 1511 T2F = VBYI(VADD(T2A, T2z));
Chris@42 1512 T2G = VADD(T2C, T2D);
Chris@42 1513 T7N = VADD(T2F, T2G);
Chris@42 1514 STM2(&(xo[18]), T7N, ovs, &(xo[2]));
Chris@42 1515 STN2(&(xo[16]), T7s, T7N, ovs);
Chris@42 1516 T7O = VSUB(T2G, T2F);
Chris@42 1517 STM2(&(xo[110]), T7O, ovs, &(xo[2]));
Chris@42 1518 STN2(&(xo[108]), T7G, T7O, ovs);
Chris@42 1519 }
Chris@42 1520 {
Chris@42 1521 V T2x, T2y, T7P, T7Q;
Chris@42 1522 T2x = VSUB(T11, T2e);
Chris@42 1523 T2y = VBYI(VSUB(T2v, T2s));
Chris@42 1524 T7P = VSUB(T2x, T2y);
Chris@42 1525 STM2(&(xo[78]), T7P, ovs, &(xo[2]));
Chris@42 1526 STN2(&(xo[76]), T7H, T7P, ovs);
Chris@42 1527 T7Q = VADD(T2x, T2y);
Chris@42 1528 STM2(&(xo[50]), T7Q, ovs, &(xo[2]));
Chris@42 1529 STN2(&(xo[48]), T7u, T7Q, ovs);
Chris@42 1530 }
Chris@42 1531 {
Chris@42 1532 V T2B, T2E, T7R, T7S;
Chris@42 1533 T2B = VBYI(VSUB(T2z, T2A));
Chris@42 1534 T2E = VSUB(T2C, T2D);
Chris@42 1535 T7R = VADD(T2B, T2E);
Chris@42 1536 STM2(&(xo[46]), T7R, ovs, &(xo[2]));
Chris@42 1537 STN2(&(xo[44]), T7J, T7R, ovs);
Chris@42 1538 T7S = VSUB(T2E, T2B);
Chris@42 1539 STM2(&(xo[82]), T7S, ovs, &(xo[2]));
Chris@42 1540 STN2(&(xo[80]), T7t, T7S, ovs);
Chris@42 1541 }
Chris@42 1542 }
Chris@42 1543 {
Chris@42 1544 V T3j, T3Q, T3J, T3R, T3y, T3N, T3G, T3O;
Chris@42 1545 {
Chris@42 1546 V T3b, T3i, T3H, T3I;
Chris@42 1547 T3b = VADD(T39, T3a);
Chris@42 1548 T3i = VADD(T3e, T3h);
Chris@42 1549 T3j = VADD(T3b, T3i);
Chris@42 1550 T3Q = VSUB(T3b, T3i);
Chris@42 1551 T3H = VFNMS(LDK(KP290284677), T3m, VMUL(LDK(KP956940335), T3p));
Chris@42 1552 T3I = VFMA(LDK(KP290284677), T3t, VMUL(LDK(KP956940335), T3w));
Chris@42 1553 T3J = VADD(T3H, T3I);
Chris@42 1554 T3R = VSUB(T3I, T3H);
Chris@42 1555 }
Chris@42 1556 {
Chris@42 1557 V T3q, T3x, T3C, T3F;
Chris@42 1558 T3q = VFMA(LDK(KP956940335), T3m, VMUL(LDK(KP290284677), T3p));
Chris@42 1559 T3x = VFNMS(LDK(KP290284677), T3w, VMUL(LDK(KP956940335), T3t));
Chris@42 1560 T3y = VADD(T3q, T3x);
Chris@42 1561 T3N = VSUB(T3x, T3q);
Chris@42 1562 T3C = VADD(T3A, T3B);
Chris@42 1563 T3F = VADD(T3D, T3E);
Chris@42 1564 T3G = VADD(T3C, T3F);
Chris@42 1565 T3O = VSUB(T3F, T3C);
Chris@42 1566 }
Chris@42 1567 {
Chris@42 1568 V T3z, T3K, T7T, T3T, T3U, T7V;
Chris@42 1569 T3z = VADD(T3j, T3y);
Chris@42 1570 T3K = VBYI(VADD(T3G, T3J));
Chris@42 1571 T7T = VSUB(T3z, T3K);
Chris@42 1572 STM2(&(xo[122]), T7T, ovs, &(xo[2]));
Chris@42 1573 STN2(&(xo[120]), T7v, T7T, ovs);
Chris@42 1574 T7U = VADD(T3z, T3K);
Chris@42 1575 STM2(&(xo[6]), T7U, ovs, &(xo[2]));
Chris@42 1576 T3T = VBYI(VADD(T3O, T3N));
Chris@42 1577 T3U = VADD(T3Q, T3R);
Chris@42 1578 T7V = VADD(T3T, T3U);
Chris@42 1579 STM2(&(xo[26]), T7V, ovs, &(xo[2]));
Chris@42 1580 STN2(&(xo[24]), T7x, T7V, ovs);
Chris@42 1581 T7W = VSUB(T3U, T3T);
Chris@42 1582 STM2(&(xo[102]), T7W, ovs, &(xo[2]));
Chris@42 1583 }
Chris@42 1584 {
Chris@42 1585 V T3L, T3M, T7Y, T3P, T3S, T80;
Chris@42 1586 T3L = VSUB(T3j, T3y);
Chris@42 1587 T3M = VBYI(VSUB(T3J, T3G));
Chris@42 1588 T7X = VSUB(T3L, T3M);
Chris@42 1589 STM2(&(xo[70]), T7X, ovs, &(xo[2]));
Chris@42 1590 T7Y = VADD(T3L, T3M);
Chris@42 1591 STM2(&(xo[58]), T7Y, ovs, &(xo[2]));
Chris@42 1592 STN2(&(xo[56]), T7A, T7Y, ovs);
Chris@42 1593 T3P = VBYI(VSUB(T3N, T3O));
Chris@42 1594 T3S = VSUB(T3Q, T3R);
Chris@42 1595 T7Z = VADD(T3P, T3S);
Chris@42 1596 STM2(&(xo[38]), T7Z, ovs, &(xo[2]));
Chris@42 1597 T80 = VSUB(T3S, T3P);
Chris@42 1598 STM2(&(xo[90]), T80, ovs, &(xo[2]));
Chris@42 1599 STN2(&(xo[88]), T7C, T80, ovs);
Chris@42 1600 }
Chris@42 1601 }
Chris@42 1602 {
Chris@42 1603 V T81, T83, T86, T88;
Chris@42 1604 {
Chris@42 1605 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@42 1606 {
Chris@42 1607 V T4x, T4M, T5x, T5y;
Chris@42 1608 T4x = VADD(T4p, T4w);
Chris@42 1609 T4M = VADD(T4E, T4L);
Chris@42 1610 T4N = VADD(T4x, T4M);
Chris@42 1611 T5G = VSUB(T4x, T4M);
Chris@42 1612 T5x = VFNMS(LDK(KP195090322), T4Y, VMUL(LDK(KP980785280), T53));
Chris@42 1613 T5y = VFMA(LDK(KP195090322), T5f, VMUL(LDK(KP980785280), T5k));
Chris@42 1614 T5z = VADD(T5x, T5y);
Chris@42 1615 T5H = VSUB(T5y, T5x);
Chris@42 1616 }
Chris@42 1617 {
Chris@42 1618 V T54, T5l, T5s, T5v;
Chris@42 1619 T54 = VFMA(LDK(KP980785280), T4Y, VMUL(LDK(KP195090322), T53));
Chris@42 1620 T5l = VFNMS(LDK(KP195090322), T5k, VMUL(LDK(KP980785280), T5f));
Chris@42 1621 T5m = VADD(T54, T5l);
Chris@42 1622 T5D = VSUB(T5l, T54);
Chris@42 1623 T5s = VADD(T5q, T5r);
Chris@42 1624 T5v = VADD(T5t, T5u);
Chris@42 1625 T5w = VADD(T5s, T5v);
Chris@42 1626 T5E = VSUB(T5v, T5s);
Chris@42 1627 }
Chris@42 1628 {
Chris@42 1629 V T5n, T5A, T82, T5J, T5K, T84;
Chris@42 1630 T5n = VADD(T4N, T5m);
Chris@42 1631 T5A = VBYI(VADD(T5w, T5z));
Chris@42 1632 T81 = VSUB(T5n, T5A);
Chris@42 1633 STM2(&(xo[124]), T81, ovs, &(xo[0]));
Chris@42 1634 T82 = VADD(T5n, T5A);
Chris@42 1635 STM2(&(xo[4]), T82, ovs, &(xo[0]));
Chris@42 1636 STN2(&(xo[4]), T82, T7U, ovs);
Chris@42 1637 T5J = VBYI(VADD(T5E, T5D));
Chris@42 1638 T5K = VADD(T5G, T5H);
Chris@42 1639 T83 = VADD(T5J, T5K);
Chris@42 1640 STM2(&(xo[28]), T83, ovs, &(xo[0]));
Chris@42 1641 T84 = VSUB(T5K, T5J);
Chris@42 1642 STM2(&(xo[100]), T84, ovs, &(xo[0]));
Chris@42 1643 STN2(&(xo[100]), T84, T7W, ovs);
Chris@42 1644 }
Chris@42 1645 {
Chris@42 1646 V T5B, T5C, T85, T5F, T5I, T87;
Chris@42 1647 T5B = VSUB(T4N, T5m);
Chris@42 1648 T5C = VBYI(VSUB(T5z, T5w));
Chris@42 1649 T85 = VSUB(T5B, T5C);
Chris@42 1650 STM2(&(xo[68]), T85, ovs, &(xo[0]));
Chris@42 1651 STN2(&(xo[68]), T85, T7X, ovs);
Chris@42 1652 T86 = VADD(T5B, T5C);
Chris@42 1653 STM2(&(xo[60]), T86, ovs, &(xo[0]));
Chris@42 1654 T5F = VBYI(VSUB(T5D, T5E));
Chris@42 1655 T5I = VSUB(T5G, T5H);
Chris@42 1656 T87 = VADD(T5F, T5I);
Chris@42 1657 STM2(&(xo[36]), T87, ovs, &(xo[0]));
Chris@42 1658 STN2(&(xo[36]), T87, T7Z, ovs);
Chris@42 1659 T88 = VSUB(T5I, T5F);
Chris@42 1660 STM2(&(xo[92]), T88, ovs, &(xo[0]));
Chris@42 1661 }
Chris@42 1662 }
Chris@42 1663 {
Chris@42 1664 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@42 1665 {
Chris@42 1666 V T2H, T2I, T2V, T2W;
Chris@42 1667 T2H = VADD(Tb, Tq);
Chris@42 1668 T2I = VADD(T2q, T2p);
Chris@42 1669 T2J = VADD(T2H, T2I);
Chris@42 1670 T34 = VSUB(T2H, T2I);
Chris@42 1671 T2V = VFNMS(LDK(KP098017140), T2L, VMUL(LDK(KP995184726), T2K));
Chris@42 1672 T2W = VFMA(LDK(KP995184726), T2O, VMUL(LDK(KP098017140), T2N));
Chris@42 1673 T2X = VADD(T2V, T2W);
Chris@42 1674 T35 = VSUB(T2W, T2V);
Chris@42 1675 }
Chris@42 1676 {
Chris@42 1677 V T2M, T2P, T2S, T2T;
Chris@42 1678 T2M = VFMA(LDK(KP098017140), T2K, VMUL(LDK(KP995184726), T2L));
Chris@42 1679 T2P = VFNMS(LDK(KP098017140), T2O, VMUL(LDK(KP995184726), T2N));
Chris@42 1680 T2Q = VADD(T2M, T2P);
Chris@42 1681 T31 = VSUB(T2P, T2M);
Chris@42 1682 T2S = VADD(T2n, T2i);
Chris@42 1683 T2T = VADD(TZ, TI);
Chris@42 1684 T2U = VADD(T2S, T2T);
Chris@42 1685 T32 = VSUB(T2T, T2S);
Chris@42 1686 }
Chris@42 1687 {
Chris@42 1688 V T2R, T2Y, T89, T8a;
Chris@42 1689 T2R = VADD(T2J, T2Q);
Chris@42 1690 T2Y = VBYI(VADD(T2U, T2X));
Chris@42 1691 T89 = VSUB(T2R, T2Y);
Chris@42 1692 STM2(&(xo[126]), T89, ovs, &(xo[2]));
Chris@42 1693 STN2(&(xo[124]), T81, T89, ovs);
Chris@42 1694 T8a = VADD(T2R, T2Y);
Chris@42 1695 STM2(&(xo[2]), T8a, ovs, &(xo[2]));
Chris@42 1696 STN2(&(xo[0]), T7p, T8a, ovs);
Chris@42 1697 }
Chris@42 1698 {
Chris@42 1699 V T37, T38, T8b, T8c;
Chris@42 1700 T37 = VBYI(VADD(T32, T31));
Chris@42 1701 T38 = VADD(T34, T35);
Chris@42 1702 T8b = VADD(T37, T38);
Chris@42 1703 STM2(&(xo[30]), T8b, ovs, &(xo[2]));
Chris@42 1704 STN2(&(xo[28]), T83, T8b, ovs);
Chris@42 1705 T8c = VSUB(T38, T37);
Chris@42 1706 STM2(&(xo[98]), T8c, ovs, &(xo[2]));
Chris@42 1707 STN2(&(xo[96]), T7q, T8c, ovs);
Chris@42 1708 }
Chris@42 1709 {
Chris@42 1710 V T2Z, T30, T8d, T8e;
Chris@42 1711 T2Z = VSUB(T2J, T2Q);
Chris@42 1712 T30 = VBYI(VSUB(T2X, T2U));
Chris@42 1713 T8d = VSUB(T2Z, T30);
Chris@42 1714 STM2(&(xo[66]), T8d, ovs, &(xo[2]));
Chris@42 1715 STN2(&(xo[64]), T7n, T8d, ovs);
Chris@42 1716 T8e = VADD(T2Z, T30);
Chris@42 1717 STM2(&(xo[62]), T8e, ovs, &(xo[2]));
Chris@42 1718 STN2(&(xo[60]), T86, T8e, ovs);
Chris@42 1719 }
Chris@42 1720 {
Chris@42 1721 V T33, T36, T8f, T8g;
Chris@42 1722 T33 = VBYI(VSUB(T31, T32));
Chris@42 1723 T36 = VSUB(T34, T35);
Chris@42 1724 T8f = VADD(T33, T36);
Chris@42 1725 STM2(&(xo[34]), T8f, ovs, &(xo[2]));
Chris@42 1726 STN2(&(xo[32]), T7o, T8f, ovs);
Chris@42 1727 T8g = VSUB(T36, T33);
Chris@42 1728 STM2(&(xo[94]), T8g, ovs, &(xo[2]));
Chris@42 1729 STN2(&(xo[92]), T88, T8g, ovs);
Chris@42 1730 }
Chris@42 1731 }
Chris@42 1732 {
Chris@42 1733 V T3X, T4i, T4b, T4j, T44, T4f, T48, T4g;
Chris@42 1734 {
Chris@42 1735 V T3V, T3W, T49, T4a;
Chris@42 1736 T3V = VSUB(T39, T3a);
Chris@42 1737 T3W = VSUB(T3E, T3D);
Chris@42 1738 T3X = VADD(T3V, T3W);
Chris@42 1739 T4i = VSUB(T3V, T3W);
Chris@42 1740 T49 = VFNMS(LDK(KP471396736), T3Y, VMUL(LDK(KP881921264), T3Z));
Chris@42 1741 T4a = VFMA(LDK(KP471396736), T41, VMUL(LDK(KP881921264), T42));
Chris@42 1742 T4b = VADD(T49, T4a);
Chris@42 1743 T4j = VSUB(T4a, T49);
Chris@42 1744 }
Chris@42 1745 {
Chris@42 1746 V T40, T43, T46, T47;
Chris@42 1747 T40 = VFMA(LDK(KP881921264), T3Y, VMUL(LDK(KP471396736), T3Z));
Chris@42 1748 T43 = VFNMS(LDK(KP471396736), T42, VMUL(LDK(KP881921264), T41));
Chris@42 1749 T44 = VADD(T40, T43);
Chris@42 1750 T4f = VSUB(T43, T40);
Chris@42 1751 T46 = VSUB(T3B, T3A);
Chris@42 1752 T47 = VSUB(T3h, T3e);
Chris@42 1753 T48 = VADD(T46, T47);
Chris@42 1754 T4g = VSUB(T47, T46);
Chris@42 1755 }
Chris@42 1756 {
Chris@42 1757 V T45, T4c, T8h, T8i;
Chris@42 1758 T45 = VADD(T3X, T44);
Chris@42 1759 T4c = VBYI(VADD(T48, T4b));
Chris@42 1760 T8h = VSUB(T45, T4c);
Chris@42 1761 STM2(&(xo[118]), T8h, ovs, &(xo[2]));
Chris@42 1762 STN2(&(xo[116]), T7D, T8h, ovs);
Chris@42 1763 T8i = VADD(T45, T4c);
Chris@42 1764 STM2(&(xo[10]), T8i, ovs, &(xo[2]));
Chris@42 1765 STN2(&(xo[8]), T7w, T8i, ovs);
Chris@42 1766 }
Chris@42 1767 {
Chris@42 1768 V T4l, T4m, T8j, T8k;
Chris@42 1769 T4l = VBYI(VADD(T4g, T4f));
Chris@42 1770 T4m = VADD(T4i, T4j);
Chris@42 1771 T8j = VADD(T4l, T4m);
Chris@42 1772 STM2(&(xo[22]), T8j, ovs, &(xo[2]));
Chris@42 1773 STN2(&(xo[20]), T7F, T8j, ovs);
Chris@42 1774 T8k = VSUB(T4m, T4l);
Chris@42 1775 STM2(&(xo[106]), T8k, ovs, &(xo[2]));
Chris@42 1776 STN2(&(xo[104]), T7y, T8k, ovs);
Chris@42 1777 }
Chris@42 1778 {
Chris@42 1779 V T4d, T4e, T8l, T8m;
Chris@42 1780 T4d = VSUB(T3X, T44);
Chris@42 1781 T4e = VBYI(VSUB(T4b, T48));
Chris@42 1782 T8l = VSUB(T4d, T4e);
Chris@42 1783 STM2(&(xo[74]), T8l, ovs, &(xo[2]));
Chris@42 1784 STN2(&(xo[72]), T7z, T8l, ovs);
Chris@42 1785 T8m = VADD(T4d, T4e);
Chris@42 1786 STM2(&(xo[54]), T8m, ovs, &(xo[2]));
Chris@42 1787 STN2(&(xo[52]), T7I, T8m, ovs);
Chris@42 1788 }
Chris@42 1789 {
Chris@42 1790 V T4h, T4k, T8n, T8o;
Chris@42 1791 T4h = VBYI(VSUB(T4f, T4g));
Chris@42 1792 T4k = VSUB(T4i, T4j);
Chris@42 1793 T8n = VADD(T4h, T4k);
Chris@42 1794 STM2(&(xo[42]), T8n, ovs, &(xo[2]));
Chris@42 1795 STN2(&(xo[40]), T7B, T8n, ovs);
Chris@42 1796 T8o = VSUB(T4k, T4h);
Chris@42 1797 STM2(&(xo[86]), T8o, ovs, &(xo[2]));
Chris@42 1798 STN2(&(xo[84]), T7K, T8o, ovs);
Chris@42 1799 }
Chris@42 1800 }
Chris@42 1801 }
Chris@42 1802 }
Chris@42 1803 }
Chris@42 1804 }
Chris@42 1805 }
Chris@42 1806 VLEAVE();
Chris@42 1807 }
Chris@42 1808
Chris@42 1809 static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 1810
Chris@42 1811 void XSIMD(codelet_n2fv_64) (planner *p) {
Chris@42 1812 X(kdft_register) (p, n2fv_64, &desc);
Chris@42 1813 }
Chris@42 1814
Chris@42 1815 #endif /* HAVE_FMA */