annotate src/fftw-3.3.8/dft/simd/common/n2bv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:15 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include dft/simd/n2b.h -store-multiple 2 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@82 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@82 33 * 120 stack variables, 15 constants, and 160 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2b.h"
Chris@82 36
Chris@82 37 static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@82 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@82 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@82 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@82 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 50 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 51 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 54 {
Chris@82 55 INT i;
Chris@82 56 const R *xi;
Chris@82 57 R *xo;
Chris@82 58 xi = ii;
Chris@82 59 xo = io;
Chris@82 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 61 V T7, T26, T47, T69, T5k, T6A, T2V, T3z, Tm, T27, T5n, T6a, T2Y, T3M, T4e;
Chris@82 62 V T6B, TC, T29, T6e, T6D, T3i, T3A, T4o, T5p, TR, T2a, T6h, T6E, T3l, T3B;
Chris@82 63 V T4x, T5q, T1N, T2x, T6t, T71, T6w, T72, T1W, T2y, T39, T3H, T57, T5N, T5e;
Chris@82 64 V T5O, T3c, T3I, T1g, T2u, T6m, T6Y, T6p, T6Z, T1p, T2v, T32, T3E, T4M, T5K;
Chris@82 65 V T4T, T5L, T35, T3F;
Chris@82 66 {
Chris@82 67 V T3, T43, T25, T45, T6, T5i, T22, T44;
Chris@82 68 {
Chris@82 69 V T1, T2, T23, T24;
Chris@82 70 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 71 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 72 T3 = VADD(T1, T2);
Chris@82 73 T43 = VSUB(T1, T2);
Chris@82 74 T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 75 T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 76 T25 = VADD(T23, T24);
Chris@82 77 T45 = VSUB(T23, T24);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V T4, T5, T20, T21;
Chris@82 81 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 82 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 83 T6 = VADD(T4, T5);
Chris@82 84 T5i = VSUB(T4, T5);
Chris@82 85 T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 86 T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 87 T22 = VADD(T20, T21);
Chris@82 88 T44 = VSUB(T20, T21);
Chris@82 89 }
Chris@82 90 T7 = VSUB(T3, T6);
Chris@82 91 T26 = VSUB(T22, T25);
Chris@82 92 {
Chris@82 93 V T46, T5j, T2T, T2U;
Chris@82 94 T46 = VADD(T44, T45);
Chris@82 95 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@82 96 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@82 97 T5j = VSUB(T44, T45);
Chris@82 98 T5k = VFMA(LDK(KP707106781), T5j, T5i);
Chris@82 99 T6A = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@82 100 T2T = VADD(T3, T6);
Chris@82 101 T2U = VADD(T22, T25);
Chris@82 102 T2V = VADD(T2T, T2U);
Chris@82 103 T3z = VSUB(T2T, T2U);
Chris@82 104 }
Chris@82 105 }
Chris@82 106 {
Chris@82 107 V Ta, T48, Tk, T4c, Td, T49, Th, T4b;
Chris@82 108 {
Chris@82 109 V T8, T9, Ti, Tj;
Chris@82 110 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 111 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 112 Ta = VADD(T8, T9);
Chris@82 113 T48 = VSUB(T8, T9);
Chris@82 114 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 115 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 116 Tk = VADD(Ti, Tj);
Chris@82 117 T4c = VSUB(Tj, Ti);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 V Tb, Tc, Tf, Tg;
Chris@82 121 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 122 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 123 Td = VADD(Tb, Tc);
Chris@82 124 T49 = VSUB(Tb, Tc);
Chris@82 125 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 126 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 127 Th = VADD(Tf, Tg);
Chris@82 128 T4b = VSUB(Tf, Tg);
Chris@82 129 }
Chris@82 130 {
Chris@82 131 V Te, Tl, T5l, T5m;
Chris@82 132 Te = VSUB(Ta, Td);
Chris@82 133 Tl = VSUB(Th, Tk);
Chris@82 134 Tm = VADD(Te, Tl);
Chris@82 135 T27 = VSUB(Te, Tl);
Chris@82 136 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@82 137 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@82 138 T5n = VSUB(T5l, T5m);
Chris@82 139 T6a = VADD(T5l, T5m);
Chris@82 140 }
Chris@82 141 {
Chris@82 142 V T2W, T2X, T4a, T4d;
Chris@82 143 T2W = VADD(Ta, Td);
Chris@82 144 T2X = VADD(Th, Tk);
Chris@82 145 T2Y = VADD(T2W, T2X);
Chris@82 146 T3M = VSUB(T2W, T2X);
Chris@82 147 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@82 148 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@82 149 T4e = VADD(T4a, T4d);
Chris@82 150 T6B = VSUB(T4a, T4d);
Chris@82 151 }
Chris@82 152 }
Chris@82 153 {
Chris@82 154 V Tq, T4g, Tt, T4l, Tx, T4m, TA, T4j;
Chris@82 155 {
Chris@82 156 V To, Tp, Tr, Ts;
Chris@82 157 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 158 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 159 Tq = VADD(To, Tp);
Chris@82 160 T4g = VSUB(To, Tp);
Chris@82 161 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 162 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 163 Tt = VADD(Tr, Ts);
Chris@82 164 T4l = VSUB(Tr, Ts);
Chris@82 165 {
Chris@82 166 V Tv, Tw, T4h, Ty, Tz, T4i;
Chris@82 167 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 168 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 169 T4h = VSUB(Tv, Tw);
Chris@82 170 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 171 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 172 T4i = VSUB(Ty, Tz);
Chris@82 173 Tx = VADD(Tv, Tw);
Chris@82 174 T4m = VSUB(T4h, T4i);
Chris@82 175 TA = VADD(Ty, Tz);
Chris@82 176 T4j = VADD(T4h, T4i);
Chris@82 177 }
Chris@82 178 }
Chris@82 179 {
Chris@82 180 V Tu, TB, T6c, T6d;
Chris@82 181 Tu = VSUB(Tq, Tt);
Chris@82 182 TB = VSUB(Tx, TA);
Chris@82 183 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@82 184 T29 = VFMA(LDK(KP414213562), Tu, TB);
Chris@82 185 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@82 186 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@82 187 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@82 188 T6D = VFMA(LDK(KP668178637), T6c, T6d);
Chris@82 189 }
Chris@82 190 {
Chris@82 191 V T3g, T3h, T4k, T4n;
Chris@82 192 T3g = VADD(Tq, Tt);
Chris@82 193 T3h = VADD(Tx, TA);
Chris@82 194 T3i = VADD(T3g, T3h);
Chris@82 195 T3A = VSUB(T3g, T3h);
Chris@82 196 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@82 197 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@82 198 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@82 199 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@82 200 }
Chris@82 201 }
Chris@82 202 {
Chris@82 203 V TF, T4p, TI, T4u, TM, T4v, TP, T4s;
Chris@82 204 {
Chris@82 205 V TD, TE, TG, TH;
Chris@82 206 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 207 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 208 TF = VADD(TD, TE);
Chris@82 209 T4p = VSUB(TD, TE);
Chris@82 210 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 211 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 212 TI = VADD(TG, TH);
Chris@82 213 T4u = VSUB(TH, TG);
Chris@82 214 {
Chris@82 215 V TK, TL, T4r, TN, TO, T4q;
Chris@82 216 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 217 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 218 T4r = VSUB(TK, TL);
Chris@82 219 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 220 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 221 T4q = VSUB(TN, TO);
Chris@82 222 TM = VADD(TK, TL);
Chris@82 223 T4v = VSUB(T4r, T4q);
Chris@82 224 TP = VADD(TN, TO);
Chris@82 225 T4s = VADD(T4q, T4r);
Chris@82 226 }
Chris@82 227 }
Chris@82 228 {
Chris@82 229 V TJ, TQ, T6f, T6g;
Chris@82 230 TJ = VSUB(TF, TI);
Chris@82 231 TQ = VSUB(TM, TP);
Chris@82 232 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@82 233 T2a = VFMA(LDK(KP414213562), TJ, TQ);
Chris@82 234 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@82 235 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@82 236 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@82 237 T6E = VFMA(LDK(KP668178637), T6f, T6g);
Chris@82 238 }
Chris@82 239 {
Chris@82 240 V T3j, T3k, T4t, T4w;
Chris@82 241 T3j = VADD(TF, TI);
Chris@82 242 T3k = VADD(TP, TM);
Chris@82 243 T3l = VADD(T3j, T3k);
Chris@82 244 T3B = VSUB(T3j, T3k);
Chris@82 245 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@82 246 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@82 247 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@82 248 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@82 249 }
Chris@82 250 }
Chris@82 251 {
Chris@82 252 V T1t, T4V, T1w, T58, T1Q, T59, T1T, T4Y, T1A, T1D, T1E, T5b, T52, T1H, T1K;
Chris@82 253 V T1L, T5c, T55;
Chris@82 254 {
Chris@82 255 V T1r, T1s, T1u, T1v;
Chris@82 256 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 257 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 258 T1t = VADD(T1r, T1s);
Chris@82 259 T4V = VSUB(T1r, T1s);
Chris@82 260 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 261 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 262 T1w = VADD(T1u, T1v);
Chris@82 263 T58 = VSUB(T1v, T1u);
Chris@82 264 }
Chris@82 265 {
Chris@82 266 V T1O, T1P, T4X, T1R, T1S, T4W;
Chris@82 267 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 268 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 269 T4X = VSUB(T1O, T1P);
Chris@82 270 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 271 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 272 T4W = VSUB(T1R, T1S);
Chris@82 273 T1Q = VADD(T1O, T1P);
Chris@82 274 T59 = VSUB(T4X, T4W);
Chris@82 275 T1T = VADD(T1R, T1S);
Chris@82 276 T4Y = VADD(T4W, T4X);
Chris@82 277 }
Chris@82 278 {
Chris@82 279 V T50, T51, T53, T54;
Chris@82 280 {
Chris@82 281 V T1y, T1z, T1B, T1C;
Chris@82 282 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 283 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 284 T1A = VADD(T1y, T1z);
Chris@82 285 T50 = VSUB(T1y, T1z);
Chris@82 286 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 287 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 288 T1D = VADD(T1B, T1C);
Chris@82 289 T51 = VSUB(T1C, T1B);
Chris@82 290 }
Chris@82 291 T1E = VSUB(T1A, T1D);
Chris@82 292 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@82 293 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@82 294 {
Chris@82 295 V T1F, T1G, T1I, T1J;
Chris@82 296 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 297 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 298 T1H = VADD(T1F, T1G);
Chris@82 299 T53 = VSUB(T1F, T1G);
Chris@82 300 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 301 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 302 T1K = VADD(T1I, T1J);
Chris@82 303 T54 = VSUB(T1J, T1I);
Chris@82 304 }
Chris@82 305 T1L = VSUB(T1H, T1K);
Chris@82 306 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@82 307 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@82 308 }
Chris@82 309 {
Chris@82 310 V T1x, T1M, T6r, T6s;
Chris@82 311 T1x = VSUB(T1t, T1w);
Chris@82 312 T1M = VADD(T1E, T1L);
Chris@82 313 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@82 314 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@82 315 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@82 316 T6s = VSUB(T5c, T5b);
Chris@82 317 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@82 318 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@82 319 }
Chris@82 320 {
Chris@82 321 V T6u, T6v, T1U, T1V;
Chris@82 322 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@82 323 T6v = VSUB(T55, T52);
Chris@82 324 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@82 325 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@82 326 T1U = VSUB(T1Q, T1T);
Chris@82 327 T1V = VSUB(T1L, T1E);
Chris@82 328 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@82 329 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@82 330 }
Chris@82 331 {
Chris@82 332 V T37, T38, T4Z, T56;
Chris@82 333 T37 = VADD(T1t, T1w);
Chris@82 334 T38 = VADD(T1T, T1Q);
Chris@82 335 T39 = VADD(T37, T38);
Chris@82 336 T3H = VSUB(T37, T38);
Chris@82 337 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@82 338 T56 = VADD(T52, T55);
Chris@82 339 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@82 340 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@82 341 }
Chris@82 342 {
Chris@82 343 V T5a, T5d, T3a, T3b;
Chris@82 344 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@82 345 T5d = VADD(T5b, T5c);
Chris@82 346 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@82 347 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@82 348 T3a = VADD(T1A, T1D);
Chris@82 349 T3b = VADD(T1H, T1K);
Chris@82 350 T3c = VADD(T3a, T3b);
Chris@82 351 T3I = VSUB(T3b, T3a);
Chris@82 352 }
Chris@82 353 }
Chris@82 354 {
Chris@82 355 V TW, T4A, TZ, T4N, T1j, T4O, T1m, T4D, T13, T16, T17, T4Q, T4H, T1a, T1d;
Chris@82 356 V T1e, T4R, T4K;
Chris@82 357 {
Chris@82 358 V TU, TV, TX, TY;
Chris@82 359 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 360 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 361 TW = VADD(TU, TV);
Chris@82 362 T4A = VSUB(TU, TV);
Chris@82 363 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 364 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 365 TZ = VADD(TX, TY);
Chris@82 366 T4N = VSUB(TX, TY);
Chris@82 367 }
Chris@82 368 {
Chris@82 369 V T1h, T1i, T4B, T1k, T1l, T4C;
Chris@82 370 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 371 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 372 T4B = VSUB(T1h, T1i);
Chris@82 373 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 374 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 375 T4C = VSUB(T1k, T1l);
Chris@82 376 T1j = VADD(T1h, T1i);
Chris@82 377 T4O = VSUB(T4B, T4C);
Chris@82 378 T1m = VADD(T1k, T1l);
Chris@82 379 T4D = VADD(T4B, T4C);
Chris@82 380 }
Chris@82 381 {
Chris@82 382 V T4F, T4G, T4I, T4J;
Chris@82 383 {
Chris@82 384 V T11, T12, T14, T15;
Chris@82 385 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 386 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 387 T13 = VADD(T11, T12);
Chris@82 388 T4F = VSUB(T11, T12);
Chris@82 389 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 390 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 391 T16 = VADD(T14, T15);
Chris@82 392 T4G = VSUB(T14, T15);
Chris@82 393 }
Chris@82 394 T17 = VSUB(T13, T16);
Chris@82 395 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@82 396 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@82 397 {
Chris@82 398 V T18, T19, T1b, T1c;
Chris@82 399 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 400 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 401 T1a = VADD(T18, T19);
Chris@82 402 T4I = VSUB(T18, T19);
Chris@82 403 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 404 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 405 T1d = VADD(T1b, T1c);
Chris@82 406 T4J = VSUB(T1b, T1c);
Chris@82 407 }
Chris@82 408 T1e = VSUB(T1a, T1d);
Chris@82 409 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@82 410 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@82 411 }
Chris@82 412 {
Chris@82 413 V T10, T1f, T6k, T6l;
Chris@82 414 T10 = VSUB(TW, TZ);
Chris@82 415 T1f = VADD(T17, T1e);
Chris@82 416 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@82 417 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@82 418 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@82 419 T6l = VSUB(T4Q, T4R);
Chris@82 420 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@82 421 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@82 422 }
Chris@82 423 {
Chris@82 424 V T6n, T6o, T1n, T1o;
Chris@82 425 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@82 426 T6o = VSUB(T4H, T4K);
Chris@82 427 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@82 428 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@82 429 T1n = VSUB(T1j, T1m);
Chris@82 430 T1o = VSUB(T17, T1e);
Chris@82 431 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@82 432 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@82 433 }
Chris@82 434 {
Chris@82 435 V T30, T31, T4E, T4L;
Chris@82 436 T30 = VADD(TW, TZ);
Chris@82 437 T31 = VADD(T1j, T1m);
Chris@82 438 T32 = VADD(T30, T31);
Chris@82 439 T3E = VSUB(T30, T31);
Chris@82 440 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@82 441 T4L = VADD(T4H, T4K);
Chris@82 442 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@82 443 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@82 444 }
Chris@82 445 {
Chris@82 446 V T4P, T4S, T33, T34;
Chris@82 447 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@82 448 T4S = VADD(T4Q, T4R);
Chris@82 449 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@82 450 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@82 451 T33 = VADD(T13, T16);
Chris@82 452 T34 = VADD(T1a, T1d);
Chris@82 453 T35 = VADD(T33, T34);
Chris@82 454 T3F = VSUB(T33, T34);
Chris@82 455 }
Chris@82 456 }
Chris@82 457 {
Chris@82 458 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7w, T7y, T7A, T7B, T7D, T7G, T7H;
Chris@82 459 V T7J;
Chris@82 460 {
Chris@82 461 V T3t, T3x, T3w, T3y;
Chris@82 462 {
Chris@82 463 V T3r, T3s, T3u, T3v;
Chris@82 464 T3r = VADD(T2V, T2Y);
Chris@82 465 T3s = VADD(T3i, T3l);
Chris@82 466 T3t = VSUB(T3r, T3s);
Chris@82 467 T3x = VADD(T3r, T3s);
Chris@82 468 T3u = VADD(T32, T35);
Chris@82 469 T3v = VADD(T39, T3c);
Chris@82 470 T3w = VSUB(T3u, T3v);
Chris@82 471 T3y = VADD(T3u, T3v);
Chris@82 472 }
Chris@82 473 T7n = VFNMSI(T3w, T3t);
Chris@82 474 STM2(&(xo[96]), T7n, ovs, &(xo[0]));
Chris@82 475 T7o = VADD(T3x, T3y);
Chris@82 476 STM2(&(xo[0]), T7o, ovs, &(xo[0]));
Chris@82 477 T7p = VFMAI(T3w, T3t);
Chris@82 478 STM2(&(xo[32]), T7p, ovs, &(xo[0]));
Chris@82 479 T7q = VSUB(T3x, T3y);
Chris@82 480 STM2(&(xo[64]), T7q, ovs, &(xo[0]));
Chris@82 481 }
Chris@82 482 {
Chris@82 483 V T2Z, T3m, T3e, T3n, T36, T3d;
Chris@82 484 T2Z = VSUB(T2V, T2Y);
Chris@82 485 T3m = VSUB(T3i, T3l);
Chris@82 486 T36 = VSUB(T32, T35);
Chris@82 487 T3d = VSUB(T39, T3c);
Chris@82 488 T3e = VADD(T36, T3d);
Chris@82 489 T3n = VSUB(T36, T3d);
Chris@82 490 {
Chris@82 491 V T3f, T3o, T3p, T3q;
Chris@82 492 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@82 493 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@82 494 T7r = VFNMSI(T3o, T3f);
Chris@82 495 STM2(&(xo[48]), T7r, ovs, &(xo[0]));
Chris@82 496 T7s = VFMAI(T3o, T3f);
Chris@82 497 STM2(&(xo[80]), T7s, ovs, &(xo[0]));
Chris@82 498 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@82 499 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@82 500 T7t = VFMAI(T3q, T3p);
Chris@82 501 STM2(&(xo[16]), T7t, ovs, &(xo[0]));
Chris@82 502 T7u = VFNMSI(T3q, T3p);
Chris@82 503 STM2(&(xo[112]), T7u, ovs, &(xo[0]));
Chris@82 504 }
Chris@82 505 }
Chris@82 506 {
Chris@82 507 V T7v, T7x, T7z, T7C;
Chris@82 508 {
Chris@82 509 V T3D, T3V, T3O, T3Y, T3K, T3Z, T3R, T3W, T3C, T3N;
Chris@82 510 T3C = VADD(T3A, T3B);
Chris@82 511 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@82 512 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@82 513 T3N = VSUB(T3A, T3B);
Chris@82 514 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@82 515 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@82 516 {
Chris@82 517 V T3G, T3J, T3P, T3Q;
Chris@82 518 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@82 519 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@82 520 T3K = VADD(T3G, T3J);
Chris@82 521 T3Z = VSUB(T3G, T3J);
Chris@82 522 T3P = VFMA(LDK(KP414213562), T3E, T3F);
Chris@82 523 T3Q = VFMA(LDK(KP414213562), T3H, T3I);
Chris@82 524 T3R = VSUB(T3P, T3Q);
Chris@82 525 T3W = VADD(T3P, T3Q);
Chris@82 526 }
Chris@82 527 {
Chris@82 528 V T3L, T3S, T41, T42;
Chris@82 529 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@82 530 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@82 531 T7v = VFNMSI(T3S, T3L);
Chris@82 532 STM2(&(xo[56]), T7v, ovs, &(xo[0]));
Chris@82 533 T7w = VFMAI(T3S, T3L);
Chris@82 534 STM2(&(xo[72]), T7w, ovs, &(xo[0]));
Chris@82 535 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@82 536 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@82 537 T7x = VFNMSI(T42, T41);
Chris@82 538 STM2(&(xo[24]), T7x, ovs, &(xo[0]));
Chris@82 539 T7y = VFMAI(T42, T41);
Chris@82 540 STM2(&(xo[104]), T7y, ovs, &(xo[0]));
Chris@82 541 }
Chris@82 542 {
Chris@82 543 V T3T, T3U, T3X, T40;
Chris@82 544 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@82 545 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@82 546 T7z = VFNMSI(T3U, T3T);
Chris@82 547 STM2(&(xo[120]), T7z, ovs, &(xo[0]));
Chris@82 548 T7A = VFMAI(T3U, T3T);
Chris@82 549 STM2(&(xo[8]), T7A, ovs, &(xo[0]));
Chris@82 550 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@82 551 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@82 552 T7B = VFMAI(T40, T3X);
Chris@82 553 STM2(&(xo[40]), T7B, ovs, &(xo[0]));
Chris@82 554 T7C = VFNMSI(T40, T3X);
Chris@82 555 STM2(&(xo[88]), T7C, ovs, &(xo[0]));
Chris@82 556 }
Chris@82 557 }
Chris@82 558 {
Chris@82 559 V T6X, T7f, T7b, T7g, T74, T7j, T78, T7i;
Chris@82 560 {
Chris@82 561 V T6V, T6W, T79, T7a;
Chris@82 562 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@82 563 T6W = VADD(T6D, T6E);
Chris@82 564 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@82 565 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@82 566 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@82 567 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@82 568 T7b = VSUB(T79, T7a);
Chris@82 569 T7g = VADD(T79, T7a);
Chris@82 570 }
Chris@82 571 {
Chris@82 572 V T70, T73, T76, T77;
Chris@82 573 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@82 574 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@82 575 T74 = VADD(T70, T73);
Chris@82 576 T7j = VSUB(T70, T73);
Chris@82 577 T76 = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@82 578 T77 = VSUB(T6e, T6h);
Chris@82 579 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@82 580 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@82 581 }
Chris@82 582 {
Chris@82 583 V T75, T7c, T7E, T7l, T7m, T7F;
Chris@82 584 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@82 585 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@82 586 T7D = VFNMSI(T7c, T75);
Chris@82 587 STM2(&(xo[70]), T7D, ovs, &(xo[2]));
Chris@82 588 T7E = VFMAI(T7c, T75);
Chris@82 589 STM2(&(xo[58]), T7E, ovs, &(xo[2]));
Chris@82 590 STN2(&(xo[56]), T7v, T7E, ovs);
Chris@82 591 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@82 592 T7m = VFMA(LDK(KP956940335), T7j, T7i);
Chris@82 593 T7F = VFMAI(T7m, T7l);
Chris@82 594 STM2(&(xo[26]), T7F, ovs, &(xo[2]));
Chris@82 595 STN2(&(xo[24]), T7x, T7F, ovs);
Chris@82 596 T7G = VFNMSI(T7m, T7l);
Chris@82 597 STM2(&(xo[102]), T7G, ovs, &(xo[2]));
Chris@82 598 }
Chris@82 599 {
Chris@82 600 V T7d, T7e, T7I, T7h, T7k, T7K;
Chris@82 601 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@82 602 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@82 603 T7H = VFNMSI(T7e, T7d);
Chris@82 604 STM2(&(xo[6]), T7H, ovs, &(xo[2]));
Chris@82 605 T7I = VFMAI(T7e, T7d);
Chris@82 606 STM2(&(xo[122]), T7I, ovs, &(xo[2]));
Chris@82 607 STN2(&(xo[120]), T7z, T7I, ovs);
Chris@82 608 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@82 609 T7k = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@82 610 T7J = VFNMSI(T7k, T7h);
Chris@82 611 STM2(&(xo[38]), T7J, ovs, &(xo[2]));
Chris@82 612 T7K = VFMAI(T7k, T7h);
Chris@82 613 STM2(&(xo[90]), T7K, ovs, &(xo[2]));
Chris@82 614 STN2(&(xo[88]), T7C, T7K, ovs);
Chris@82 615 }
Chris@82 616 }
Chris@82 617 }
Chris@82 618 {
Chris@82 619 V T7L, T7N, T7P, T7S;
Chris@82 620 {
Chris@82 621 V TT, T2j, T2f, T2k, T1Y, T2n, T2c, T2m;
Chris@82 622 {
Chris@82 623 V Tn, TS, T2d, T2e;
Chris@82 624 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@82 625 TS = VADD(TC, TR);
Chris@82 626 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@82 627 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@82 628 T2d = VFMA(LDK(KP198912367), T1g, T1p);
Chris@82 629 T2e = VFMA(LDK(KP198912367), T1N, T1W);
Chris@82 630 T2f = VSUB(T2d, T2e);
Chris@82 631 T2k = VADD(T2d, T2e);
Chris@82 632 }
Chris@82 633 {
Chris@82 634 V T1q, T1X, T28, T2b;
Chris@82 635 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@82 636 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@82 637 T1Y = VADD(T1q, T1X);
Chris@82 638 T2n = VSUB(T1q, T1X);
Chris@82 639 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@82 640 T2b = VSUB(T29, T2a);
Chris@82 641 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@82 642 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@82 643 }
Chris@82 644 {
Chris@82 645 V T1Z, T2g, T7M, T2p, T2q, T7O;
Chris@82 646 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@82 647 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@82 648 T7L = VFNMSI(T2g, T1Z);
Chris@82 649 STM2(&(xo[60]), T7L, ovs, &(xo[0]));
Chris@82 650 T7M = VFMAI(T2g, T1Z);
Chris@82 651 STM2(&(xo[68]), T7M, ovs, &(xo[0]));
Chris@82 652 STN2(&(xo[68]), T7M, T7D, ovs);
Chris@82 653 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@82 654 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@82 655 T7N = VFNMSI(T2q, T2p);
Chris@82 656 STM2(&(xo[28]), T7N, ovs, &(xo[0]));
Chris@82 657 T7O = VFMAI(T2q, T2p);
Chris@82 658 STM2(&(xo[100]), T7O, ovs, &(xo[0]));
Chris@82 659 STN2(&(xo[100]), T7O, T7G, ovs);
Chris@82 660 }
Chris@82 661 {
Chris@82 662 V T2h, T2i, T7Q, T2l, T2o, T7R;
Chris@82 663 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@82 664 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@82 665 T7P = VFNMSI(T2i, T2h);
Chris@82 666 STM2(&(xo[124]), T7P, ovs, &(xo[0]));
Chris@82 667 T7Q = VFMAI(T2i, T2h);
Chris@82 668 STM2(&(xo[4]), T7Q, ovs, &(xo[0]));
Chris@82 669 STN2(&(xo[4]), T7Q, T7H, ovs);
Chris@82 670 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@82 671 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@82 672 T7R = VFMAI(T2o, T2l);
Chris@82 673 STM2(&(xo[36]), T7R, ovs, &(xo[0]));
Chris@82 674 STN2(&(xo[36]), T7R, T7J, ovs);
Chris@82 675 T7S = VFNMSI(T2o, T2l);
Chris@82 676 STM2(&(xo[92]), T7S, ovs, &(xo[0]));
Chris@82 677 }
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V T4z, T5z, T5v, T5A, T5g, T5D, T5s, T5C;
Chris@82 681 {
Chris@82 682 V T4f, T4y, T5t, T5u;
Chris@82 683 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@82 684 T4y = VADD(T4o, T4x);
Chris@82 685 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@82 686 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@82 687 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@82 688 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@82 689 T5v = VSUB(T5t, T5u);
Chris@82 690 T5A = VADD(T5t, T5u);
Chris@82 691 }
Chris@82 692 {
Chris@82 693 V T4U, T5f, T5o, T5r;
Chris@82 694 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@82 695 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@82 696 T5g = VADD(T4U, T5f);
Chris@82 697 T5D = VSUB(T4U, T5f);
Chris@82 698 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@82 699 T5r = VSUB(T5p, T5q);
Chris@82 700 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@82 701 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@82 702 }
Chris@82 703 {
Chris@82 704 V T5h, T5w, T7T, T7U;
Chris@82 705 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@82 706 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@82 707 T7T = VFNMSI(T5w, T5h);
Chris@82 708 STM2(&(xo[62]), T7T, ovs, &(xo[2]));
Chris@82 709 STN2(&(xo[60]), T7L, T7T, ovs);
Chris@82 710 T7U = VFMAI(T5w, T5h);
Chris@82 711 STM2(&(xo[66]), T7U, ovs, &(xo[2]));
Chris@82 712 STN2(&(xo[64]), T7q, T7U, ovs);
Chris@82 713 }
Chris@82 714 {
Chris@82 715 V T5F, T5G, T7V, T7W;
Chris@82 716 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@82 717 T5G = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@82 718 T7V = VFNMSI(T5G, T5F);
Chris@82 719 STM2(&(xo[30]), T7V, ovs, &(xo[2]));
Chris@82 720 STN2(&(xo[28]), T7N, T7V, ovs);
Chris@82 721 T7W = VFMAI(T5G, T5F);
Chris@82 722 STM2(&(xo[98]), T7W, ovs, &(xo[2]));
Chris@82 723 STN2(&(xo[96]), T7n, T7W, ovs);
Chris@82 724 }
Chris@82 725 {
Chris@82 726 V T5x, T5y, T7X, T7Y;
Chris@82 727 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@82 728 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@82 729 T7X = VFNMSI(T5y, T5x);
Chris@82 730 STM2(&(xo[126]), T7X, ovs, &(xo[2]));
Chris@82 731 STN2(&(xo[124]), T7P, T7X, ovs);
Chris@82 732 T7Y = VFMAI(T5y, T5x);
Chris@82 733 STM2(&(xo[2]), T7Y, ovs, &(xo[2]));
Chris@82 734 STN2(&(xo[0]), T7o, T7Y, ovs);
Chris@82 735 }
Chris@82 736 {
Chris@82 737 V T5B, T5E, T7Z, T80;
Chris@82 738 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@82 739 T5E = VFMA(LDK(KP995184726), T5D, T5C);
Chris@82 740 T7Z = VFMAI(T5E, T5B);
Chris@82 741 STM2(&(xo[34]), T7Z, ovs, &(xo[2]));
Chris@82 742 STN2(&(xo[32]), T7p, T7Z, ovs);
Chris@82 743 T80 = VFNMSI(T5E, T5B);
Chris@82 744 STM2(&(xo[94]), T80, ovs, &(xo[2]));
Chris@82 745 STN2(&(xo[92]), T7S, T80, ovs);
Chris@82 746 }
Chris@82 747 }
Chris@82 748 }
Chris@82 749 {
Chris@82 750 V T81, T83, T85, T88;
Chris@82 751 {
Chris@82 752 V T6j, T6N, T6J, T6O, T6y, T6R, T6G, T6Q;
Chris@82 753 {
Chris@82 754 V T6b, T6i, T6H, T6I;
Chris@82 755 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@82 756 T6i = VADD(T6e, T6h);
Chris@82 757 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@82 758 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@82 759 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@82 760 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@82 761 T6J = VSUB(T6H, T6I);
Chris@82 762 T6O = VADD(T6H, T6I);
Chris@82 763 }
Chris@82 764 {
Chris@82 765 V T6q, T6x, T6C, T6F;
Chris@82 766 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@82 767 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@82 768 T6y = VADD(T6q, T6x);
Chris@82 769 T6R = VSUB(T6q, T6x);
Chris@82 770 T6C = VFMA(LDK(KP923879532), T6B, T6A);
Chris@82 771 T6F = VSUB(T6D, T6E);
Chris@82 772 T6G = VFMA(LDK(KP831469612), T6F, T6C);
Chris@82 773 T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@82 774 }
Chris@82 775 {
Chris@82 776 V T6z, T6K, T82, T6T, T6U, T84;
Chris@82 777 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@82 778 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@82 779 T81 = VFNMSI(T6K, T6z);
Chris@82 780 STM2(&(xo[54]), T81, ovs, &(xo[2]));
Chris@82 781 T82 = VFMAI(T6K, T6z);
Chris@82 782 STM2(&(xo[74]), T82, ovs, &(xo[2]));
Chris@82 783 STN2(&(xo[72]), T7w, T82, ovs);
Chris@82 784 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@82 785 T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@82 786 T83 = VFNMSI(T6U, T6T);
Chris@82 787 STM2(&(xo[22]), T83, ovs, &(xo[2]));
Chris@82 788 T84 = VFMAI(T6U, T6T);
Chris@82 789 STM2(&(xo[106]), T84, ovs, &(xo[2]));
Chris@82 790 STN2(&(xo[104]), T7y, T84, ovs);
Chris@82 791 }
Chris@82 792 {
Chris@82 793 V T6L, T6M, T86, T6P, T6S, T87;
Chris@82 794 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@82 795 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@82 796 T85 = VFNMSI(T6M, T6L);
Chris@82 797 STM2(&(xo[118]), T85, ovs, &(xo[2]));
Chris@82 798 T86 = VFMAI(T6M, T6L);
Chris@82 799 STM2(&(xo[10]), T86, ovs, &(xo[2]));
Chris@82 800 STN2(&(xo[8]), T7A, T86, ovs);
Chris@82 801 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@82 802 T6S = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@82 803 T87 = VFMAI(T6S, T6P);
Chris@82 804 STM2(&(xo[42]), T87, ovs, &(xo[2]));
Chris@82 805 STN2(&(xo[40]), T7B, T87, ovs);
Chris@82 806 T88 = VFNMSI(T6S, T6P);
Chris@82 807 STM2(&(xo[86]), T88, ovs, &(xo[2]));
Chris@82 808 }
Chris@82 809 }
Chris@82 810 {
Chris@82 811 V T89, T8c, T8d, T8f;
Chris@82 812 {
Chris@82 813 V T2t, T2L, T2H, T2M, T2A, T2P, T2E, T2O;
Chris@82 814 {
Chris@82 815 V T2r, T2s, T2F, T2G;
Chris@82 816 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@82 817 T2s = VADD(T29, T2a);
Chris@82 818 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@82 819 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@82 820 T2F = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@82 821 T2G = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@82 822 T2H = VSUB(T2F, T2G);
Chris@82 823 T2M = VADD(T2F, T2G);
Chris@82 824 }
Chris@82 825 {
Chris@82 826 V T2w, T2z, T2C, T2D;
Chris@82 827 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@82 828 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@82 829 T2A = VADD(T2w, T2z);
Chris@82 830 T2P = VSUB(T2w, T2z);
Chris@82 831 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@82 832 T2D = VSUB(TC, TR);
Chris@82 833 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@82 834 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@82 835 }
Chris@82 836 {
Chris@82 837 V T2B, T2I, T8a, T2R, T2S, T8b;
Chris@82 838 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@82 839 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@82 840 T89 = VFNMSI(T2I, T2B);
Chris@82 841 STM2(&(xo[76]), T89, ovs, &(xo[0]));
Chris@82 842 T8a = VFMAI(T2I, T2B);
Chris@82 843 STM2(&(xo[52]), T8a, ovs, &(xo[0]));
Chris@82 844 STN2(&(xo[52]), T8a, T81, ovs);
Chris@82 845 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@82 846 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@82 847 T8b = VFMAI(T2S, T2R);
Chris@82 848 STM2(&(xo[20]), T8b, ovs, &(xo[0]));
Chris@82 849 STN2(&(xo[20]), T8b, T83, ovs);
Chris@82 850 T8c = VFNMSI(T2S, T2R);
Chris@82 851 STM2(&(xo[108]), T8c, ovs, &(xo[0]));
Chris@82 852 }
Chris@82 853 {
Chris@82 854 V T2J, T2K, T8e, T2N, T2Q, T8g;
Chris@82 855 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@82 856 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@82 857 T8d = VFNMSI(T2K, T2J);
Chris@82 858 STM2(&(xo[12]), T8d, ovs, &(xo[0]));
Chris@82 859 T8e = VFMAI(T2K, T2J);
Chris@82 860 STM2(&(xo[116]), T8e, ovs, &(xo[0]));
Chris@82 861 STN2(&(xo[116]), T8e, T85, ovs);
Chris@82 862 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@82 863 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@82 864 T8f = VFNMSI(T2Q, T2N);
Chris@82 865 STM2(&(xo[44]), T8f, ovs, &(xo[0]));
Chris@82 866 T8g = VFMAI(T2Q, T2N);
Chris@82 867 STM2(&(xo[84]), T8g, ovs, &(xo[0]));
Chris@82 868 STN2(&(xo[84]), T8g, T88, ovs);
Chris@82 869 }
Chris@82 870 }
Chris@82 871 {
Chris@82 872 V T5J, T61, T5X, T62, T5Q, T65, T5U, T64;
Chris@82 873 {
Chris@82 874 V T5H, T5I, T5V, T5W;
Chris@82 875 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@82 876 T5I = VADD(T5p, T5q);
Chris@82 877 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@82 878 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@82 879 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@82 880 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@82 881 T5X = VSUB(T5V, T5W);
Chris@82 882 T62 = VADD(T5V, T5W);
Chris@82 883 }
Chris@82 884 {
Chris@82 885 V T5M, T5P, T5S, T5T;
Chris@82 886 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@82 887 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@82 888 T5Q = VADD(T5M, T5P);
Chris@82 889 T65 = VSUB(T5M, T5P);
Chris@82 890 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@82 891 T5T = VSUB(T4o, T4x);
Chris@82 892 T5U = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@82 893 T64 = VFMA(LDK(KP980785280), T5T, T5S);
Chris@82 894 }
Chris@82 895 {
Chris@82 896 V T5R, T5Y, T8h, T8i;
Chris@82 897 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@82 898 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@82 899 T8h = VFNMSI(T5Y, T5R);
Chris@82 900 STM2(&(xo[78]), T8h, ovs, &(xo[2]));
Chris@82 901 STN2(&(xo[76]), T89, T8h, ovs);
Chris@82 902 T8i = VFMAI(T5Y, T5R);
Chris@82 903 STM2(&(xo[50]), T8i, ovs, &(xo[2]));
Chris@82 904 STN2(&(xo[48]), T7r, T8i, ovs);
Chris@82 905 }
Chris@82 906 {
Chris@82 907 V T67, T68, T8j, T8k;
Chris@82 908 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@82 909 T68 = VFMA(LDK(KP773010453), T65, T64);
Chris@82 910 T8j = VFMAI(T68, T67);
Chris@82 911 STM2(&(xo[18]), T8j, ovs, &(xo[2]));
Chris@82 912 STN2(&(xo[16]), T7t, T8j, ovs);
Chris@82 913 T8k = VFNMSI(T68, T67);
Chris@82 914 STM2(&(xo[110]), T8k, ovs, &(xo[2]));
Chris@82 915 STN2(&(xo[108]), T8c, T8k, ovs);
Chris@82 916 }
Chris@82 917 {
Chris@82 918 V T5Z, T60, T8l, T8m;
Chris@82 919 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@82 920 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@82 921 T8l = VFNMSI(T60, T5Z);
Chris@82 922 STM2(&(xo[14]), T8l, ovs, &(xo[2]));
Chris@82 923 STN2(&(xo[12]), T8d, T8l, ovs);
Chris@82 924 T8m = VFMAI(T60, T5Z);
Chris@82 925 STM2(&(xo[114]), T8m, ovs, &(xo[2]));
Chris@82 926 STN2(&(xo[112]), T7u, T8m, ovs);
Chris@82 927 }
Chris@82 928 {
Chris@82 929 V T63, T66, T8n, T8o;
Chris@82 930 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@82 931 T66 = VFNMS(LDK(KP773010453), T65, T64);
Chris@82 932 T8n = VFNMSI(T66, T63);
Chris@82 933 STM2(&(xo[46]), T8n, ovs, &(xo[2]));
Chris@82 934 STN2(&(xo[44]), T8f, T8n, ovs);
Chris@82 935 T8o = VFMAI(T66, T63);
Chris@82 936 STM2(&(xo[82]), T8o, ovs, &(xo[2]));
Chris@82 937 STN2(&(xo[80]), T7s, T8o, ovs);
Chris@82 938 }
Chris@82 939 }
Chris@82 940 }
Chris@82 941 }
Chris@82 942 }
Chris@82 943 }
Chris@82 944 }
Chris@82 945 VLEAVE();
Chris@82 946 }
Chris@82 947
/* Descriptor for this codelet: transform size 64, SIMD-mangled name
   "n2bv_64".  The opcnt {198, 0, 258, 0} matches the generated FP counts
   in the header comment above (198 additions, 0 multiplications, 258 fused
   multiply/adds) for the FMA variant.  NOTE(review): the trailing fields
   {0, 2, 0, 0} presumably encode the input/output stride constraints —
   the 2 matching the "-with-ostride 2" generator option — but kdft_desc's
   field order is declared in dft/codelet-dft.h; confirm there. */
Chris@82 948 static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 949
/* Registration entry point: adds the n2bv_64 codelet (the static function
   defined above in whichever #if branch was compiled) to planner p via the
   generic kdft registration hook, using the descriptor `desc` above.
   NOTE(review): X() and XSIMD() are FFTW name-mangling macros (precision /
   SIMD-ISA prefixes) defined in project headers not visible here. */
Chris@82 950 void XSIMD(codelet_n2bv_64) (planner *p) {
Chris@82 951 X(kdft_register) (p, n2bv_64, &desc);
Chris@82 952 }
Chris@82 953
Chris@82 954 #else
Chris@82 955
Chris@82 956 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include dft/simd/n2b.h -store-multiple 2 */
Chris@82 957
Chris@82 958 /*
Chris@82 959 * This function contains 456 FP additions, 124 FP multiplications,
Chris@82 960 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@82 961 * 128 stack variables, 15 constants, and 160 memory accesses
Chris@82 962 */
Chris@82 963 #include "dft/simd/n2b.h"
Chris@82 964
Chris@82 965 static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 966 {
Chris@82 967 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@82 968 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 969 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@82 970 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 971 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@82 972 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 973 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@82 974 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 975 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 976 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 977 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 978 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 979 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 980 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 981 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 982 {
Chris@82 983 INT i;
Chris@82 984 const R *xi;
Chris@82 985 R *xo;
Chris@82 986 xi = ii;
Chris@82 987 xo = io;
Chris@82 988 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 989 V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
Chris@82 990 V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
Chris@82 991 V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
Chris@82 992 V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
Chris@82 993 V T53, T5P, T3e, T3W;
Chris@82 994 {
Chris@82 995 V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
Chris@82 996 {
Chris@82 997 V T1, T2, T2n, T2o;
Chris@82 998 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 999 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 1000 T3 = VSUB(T1, T2);
Chris@82 1001 T4n = VADD(T1, T2);
Chris@82 1002 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 1003 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 1004 T2p = VSUB(T2n, T2o);
Chris@82 1005 T4o = VADD(T2n, T2o);
Chris@82 1006 }
Chris@82 1007 {
Chris@82 1008 V T4, T5, T7, T8;
Chris@82 1009 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 1010 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 1011 T6 = VSUB(T4, T5);
Chris@82 1012 T5s = VADD(T4, T5);
Chris@82 1013 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 1014 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 1015 T9 = VSUB(T7, T8);
Chris@82 1016 T5t = VADD(T7, T8);
Chris@82 1017 }
Chris@82 1018 T4p = VSUB(T4n, T4o);
Chris@82 1019 T5u = VSUB(T5s, T5t);
Chris@82 1020 {
Chris@82 1021 V Ta, T2m, T6E, T6F;
Chris@82 1022 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 1023 Tb = VSUB(T3, Ta);
Chris@82 1024 T3A = VADD(T3, Ta);
Chris@82 1025 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@82 1026 T2q = VSUB(T2m, T2p);
Chris@82 1027 T3v = VADD(T2p, T2m);
Chris@82 1028 T6E = VADD(T4n, T4o);
Chris@82 1029 T6F = VADD(T5s, T5t);
Chris@82 1030 T6G = VSUB(T6E, T6F);
Chris@82 1031 T78 = VADD(T6E, T6F);
Chris@82 1032 }
Chris@82 1033 }
Chris@82 1034 {
Chris@82 1035 V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
Chris@82 1036 {
Chris@82 1037 V Tc, Td, Tm, Tn;
Chris@82 1038 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 1039 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 1040 Te = VSUB(Tc, Td);
Chris@82 1041 T4q = VADD(Tc, Td);
Chris@82 1042 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 1043 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 1044 To = VSUB(Tm, Tn);
Chris@82 1045 T4t = VADD(Tm, Tn);
Chris@82 1046 }
Chris@82 1047 {
Chris@82 1048 V Tf, Tg, Tj, Tk;
Chris@82 1049 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 1050 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 1051 Th = VSUB(Tf, Tg);
Chris@82 1052 T4r = VADD(Tf, Tg);
Chris@82 1053 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 1054 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 1055 Tl = VSUB(Tj, Tk);
Chris@82 1056 T4u = VADD(Tj, Tk);
Chris@82 1057 }
Chris@82 1058 {
Chris@82 1059 V Ti, Tp, T6z, T6A;
Chris@82 1060 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 1061 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@82 1062 Tq = VSUB(Ti, Tp);
Chris@82 1063 T3w = VADD(Ti, Tp);
Chris@82 1064 T6z = VADD(T4q, T4r);
Chris@82 1065 T6A = VADD(T4t, T4u);
Chris@82 1066 T6B = VSUB(T6z, T6A);
Chris@82 1067 T79 = VADD(T6z, T6A);
Chris@82 1068 }
Chris@82 1069 {
Chris@82 1070 V T2j, T2k, T4s, T4v;
Chris@82 1071 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 1072 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 1073 T2l = VSUB(T2j, T2k);
Chris@82 1074 T3B = VADD(T2j, T2k);
Chris@82 1075 T4s = VSUB(T4q, T4r);
Chris@82 1076 T4v = VSUB(T4t, T4u);
Chris@82 1077 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@82 1078 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
Chris@82 1079 }
Chris@82 1080 }
Chris@82 1081 {
Chris@82 1082 V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
Chris@82 1083 {
Chris@82 1084 V Tz, TA, TD, TE;
Chris@82 1085 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 1086 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 1087 TB = VSUB(Tz, TA);
Chris@82 1088 T4z = VADD(Tz, TA);
Chris@82 1089 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 1090 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 1091 TF = VSUB(TD, TE);
Chris@82 1092 T4y = VADD(TD, TE);
Chris@82 1093 {
Chris@82 1094 V Ts, Tt, Tu, Tv, Tw, Tx;
Chris@82 1095 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 1096 Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 1097 Tu = VSUB(Ts, Tt);
Chris@82 1098 Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 1099 Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 1100 Tx = VSUB(Tv, Tw);
Chris@82 1101 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@82 1102 T4C = VADD(Tv, Tw);
Chris@82 1103 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@82 1104 T4B = VADD(Ts, Tt);
Chris@82 1105 }
Chris@82 1106 }
Chris@82 1107 {
Chris@82 1108 V TC, TH, T6s, T6t;
Chris@82 1109 TC = VSUB(Ty, TB);
Chris@82 1110 TH = VSUB(TF, TG);
Chris@82 1111 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@82 1112 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@82 1113 T6s = VADD(T4y, T4z);
Chris@82 1114 T6t = VADD(T4B, T4C);
Chris@82 1115 T6u = VSUB(T6s, T6t);
Chris@82 1116 T74 = VADD(T6s, T6t);
Chris@82 1117 }
Chris@82 1118 {
Chris@82 1119 V T3o, T3p, T4A, T4D;
Chris@82 1120 T3o = VADD(TB, Ty);
Chris@82 1121 T3p = VADD(TF, TG);
Chris@82 1122 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
Chris@82 1123 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
Chris@82 1124 T4A = VSUB(T4y, T4z);
Chris@82 1125 T4D = VSUB(T4B, T4C);
Chris@82 1126 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@82 1127 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
Chris@82 1128 }
Chris@82 1129 }
Chris@82 1130 {
Chris@82 1131 V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
Chris@82 1132 {
Chris@82 1133 V TQ, TR, TU, TV;
Chris@82 1134 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 1135 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 1136 TS = VSUB(TQ, TR);
Chris@82 1137 T4J = VADD(TQ, TR);
Chris@82 1138 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 1139 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 1140 TW = VSUB(TU, TV);
Chris@82 1141 T4I = VADD(TU, TV);
Chris@82 1142 {
Chris@82 1143 V TJ, TK, TL, TM, TN, TO;
Chris@82 1144 TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 1145 TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 1146 TL = VSUB(TJ, TK);
Chris@82 1147 TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 1148 TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 1149 TO = VSUB(TM, TN);
Chris@82 1150 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@82 1151 T4G = VADD(TM, TN);
Chris@82 1152 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@82 1153 T4F = VADD(TJ, TK);
Chris@82 1154 }
Chris@82 1155 }
Chris@82 1156 {
Chris@82 1157 V TT, TY, T6v, T6w;
Chris@82 1158 TT = VSUB(TP, TS);
Chris@82 1159 TY = VSUB(TW, TX);
Chris@82 1160 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@82 1161 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@82 1162 T6v = VADD(T4I, T4J);
Chris@82 1163 T6w = VADD(T4F, T4G);
Chris@82 1164 T6x = VSUB(T6v, T6w);
Chris@82 1165 T75 = VADD(T6v, T6w);
Chris@82 1166 }
Chris@82 1167 {
Chris@82 1168 V T3r, T3s, T4H, T4K;
Chris@82 1169 T3r = VADD(TS, TP);
Chris@82 1170 T3s = VADD(TW, TX);
Chris@82 1171 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
Chris@82 1172 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
Chris@82 1173 T4H = VSUB(T4F, T4G);
Chris@82 1174 T4K = VSUB(T4I, T4J);
Chris@82 1175 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@82 1176 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
Chris@82 1177 }
Chris@82 1178 }
Chris@82 1179 {
Chris@82 1180 V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
Chris@82 1181 V T1Q, T5a, T2a;
Chris@82 1182 {
Chris@82 1183 V T1Z, T20, T24, T25;
Chris@82 1184 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1185 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1186 T21 = VSUB(T1Z, T20);
Chris@82 1187 T5h = VADD(T1Z, T20);
Chris@82 1188 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1189 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1190 T26 = VSUB(T24, T25);
Chris@82 1191 T5g = VADD(T24, T25);
Chris@82 1192 }
Chris@82 1193 {
Chris@82 1194 V T1S, T1T, T1U, T1V, T1W, T1X;
Chris@82 1195 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1196 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1197 T1U = VSUB(T1S, T1T);
Chris@82 1198 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1199 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1200 T1X = VSUB(T1V, T1W);
Chris@82 1201 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
Chris@82 1202 T5d = VADD(T1V, T1W);
Chris@82 1203 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
Chris@82 1204 T5c = VADD(T1S, T1T);
Chris@82 1205 }
Chris@82 1206 {
Chris@82 1207 V T1F, T1I, T1M, T1P;
Chris@82 1208 {
Chris@82 1209 V T1D, T1E, T1G, T1H;
Chris@82 1210 T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1211 T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1212 T1F = VSUB(T1D, T1E);
Chris@82 1213 T55 = VADD(T1D, T1E);
Chris@82 1214 T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1215 T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1216 T1I = VSUB(T1G, T1H);
Chris@82 1217 T56 = VADD(T1G, T1H);
Chris@82 1218 }
Chris@82 1219 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
Chris@82 1220 T57 = VSUB(T55, T56);
Chris@82 1221 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
Chris@82 1222 {
Chris@82 1223 V T1K, T1L, T1N, T1O;
Chris@82 1224 T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1225 T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1226 T1M = VSUB(T1K, T1L);
Chris@82 1227 T58 = VADD(T1K, T1L);
Chris@82 1228 T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1229 T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1230 T1P = VSUB(T1N, T1O);
Chris@82 1231 T59 = VADD(T1N, T1O);
Chris@82 1232 }
Chris@82 1233 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
Chris@82 1234 T5a = VSUB(T58, T59);
Chris@82 1235 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
Chris@82 1236 }
Chris@82 1237 {
Chris@82 1238 V T1R, T22, T6k, T6l;
Chris@82 1239 T1R = VSUB(T1J, T1Q);
Chris@82 1240 T22 = VSUB(T1Y, T21);
Chris@82 1241 T23 = VSUB(T1R, T22);
Chris@82 1242 T2N = VADD(T22, T1R);
Chris@82 1243 T6k = VADD(T5g, T5h);
Chris@82 1244 T6l = VADD(T5c, T5d);
Chris@82 1245 T6m = VSUB(T6k, T6l);
Chris@82 1246 T70 = VADD(T6k, T6l);
Chris@82 1247 }
Chris@82 1248 {
Chris@82 1249 V T6n, T6o, T28, T2b;
Chris@82 1250 T6n = VADD(T55, T56);
Chris@82 1251 T6o = VADD(T58, T59);
Chris@82 1252 T6p = VSUB(T6n, T6o);
Chris@82 1253 T71 = VADD(T6n, T6o);
Chris@82 1254 T28 = VSUB(T26, T27);
Chris@82 1255 T2b = VSUB(T29, T2a);
Chris@82 1256 T2c = VSUB(T28, T2b);
Chris@82 1257 T2O = VADD(T28, T2b);
Chris@82 1258 }
Chris@82 1259 {
Chris@82 1260 V T3g, T3h, T5b, T5e;
Chris@82 1261 T3g = VADD(T26, T27);
Chris@82 1262 T3h = VADD(T1J, T1Q);
Chris@82 1263 T3i = VADD(T3g, T3h);
Chris@82 1264 T3Y = VSUB(T3g, T3h);
Chris@82 1265 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
Chris@82 1266 T5e = VSUB(T5c, T5d);
Chris@82 1267 T5f = VSUB(T5b, T5e);
Chris@82 1268 T5R = VADD(T5e, T5b);
Chris@82 1269 }
Chris@82 1270 {
Chris@82 1271 V T5i, T5j, T3j, T3k;
Chris@82 1272 T5i = VSUB(T5g, T5h);
Chris@82 1273 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
Chris@82 1274 T5k = VSUB(T5i, T5j);
Chris@82 1275 T5S = VADD(T5i, T5j);
Chris@82 1276 T3j = VADD(T21, T1Y);
Chris@82 1277 T3k = VADD(T29, T2a);
Chris@82 1278 T3l = VADD(T3j, T3k);
Chris@82 1279 T3Z = VSUB(T3k, T3j);
Chris@82 1280 }
Chris@82 1281 }
Chris@82 1282 {
Chris@82 1283 V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
Chris@82 1284 V T1f, T4T, T1z;
Chris@82 1285 {
Chris@82 1286 V T1o, T1p, T1t, T1u;
Chris@82 1287 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1288 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1289 T1q = VSUB(T1o, T1p);
Chris@82 1290 T50 = VADD(T1o, T1p);
Chris@82 1291 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1292 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1293 T1v = VSUB(T1t, T1u);
Chris@82 1294 T4Z = VADD(T1t, T1u);
Chris@82 1295 }
Chris@82 1296 {
Chris@82 1297 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@82 1298 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1299 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1300 T1j = VSUB(T1h, T1i);
Chris@82 1301 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1302 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1303 T1m = VSUB(T1k, T1l);
Chris@82 1304 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@82 1305 T4W = VADD(T1k, T1l);
Chris@82 1306 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
Chris@82 1307 T4V = VADD(T1h, T1i);
Chris@82 1308 }
Chris@82 1309 {
Chris@82 1310 V T14, T17, T1b, T1e;
Chris@82 1311 {
Chris@82 1312 V T12, T13, T15, T16;
Chris@82 1313 T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1314 T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1315 T14 = VSUB(T12, T13);
Chris@82 1316 T4O = VADD(T12, T13);
Chris@82 1317 T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1318 T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1319 T17 = VSUB(T15, T16);
Chris@82 1320 T4P = VADD(T15, T16);
Chris@82 1321 }
Chris@82 1322 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
Chris@82 1323 T4Q = VSUB(T4O, T4P);
Chris@82 1324 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
Chris@82 1325 {
Chris@82 1326 V T19, T1a, T1c, T1d;
Chris@82 1327 T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1328 T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1329 T1b = VSUB(T19, T1a);
Chris@82 1330 T4R = VADD(T19, T1a);
Chris@82 1331 T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1332 T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1333 T1e = VSUB(T1c, T1d);
Chris@82 1334 T4S = VADD(T1c, T1d);
Chris@82 1335 }
Chris@82 1336 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
Chris@82 1337 T4T = VSUB(T4R, T4S);
Chris@82 1338 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@82 1339 }
Chris@82 1340 {
Chris@82 1341 V T1g, T1r, T6d, T6e;
Chris@82 1342 T1g = VSUB(T18, T1f);
Chris@82 1343 T1r = VSUB(T1n, T1q);
Chris@82 1344 T1s = VSUB(T1g, T1r);
Chris@82 1345 T2K = VADD(T1r, T1g);
Chris@82 1346 T6d = VADD(T4Z, T50);
Chris@82 1347 T6e = VADD(T4V, T4W);
Chris@82 1348 T6f = VSUB(T6d, T6e);
Chris@82 1349 T6X = VADD(T6d, T6e);
Chris@82 1350 }
Chris@82 1351 {
Chris@82 1352 V T6g, T6h, T1x, T1A;
Chris@82 1353 T6g = VADD(T4O, T4P);
Chris@82 1354 T6h = VADD(T4R, T4S);
Chris@82 1355 T6i = VSUB(T6g, T6h);
Chris@82 1356 T6Y = VADD(T6g, T6h);
Chris@82 1357 T1x = VSUB(T1v, T1w);
Chris@82 1358 T1A = VSUB(T1y, T1z);
Chris@82 1359 T1B = VSUB(T1x, T1A);
Chris@82 1360 T2L = VADD(T1x, T1A);
Chris@82 1361 }
Chris@82 1362 {
Chris@82 1363 V T39, T3a, T4U, T4X;
Chris@82 1364 T39 = VADD(T1v, T1w);
Chris@82 1365 T3a = VADD(T18, T1f);
Chris@82 1366 T3b = VADD(T39, T3a);
Chris@82 1367 T3V = VSUB(T39, T3a);
Chris@82 1368 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
Chris@82 1369 T4X = VSUB(T4V, T4W);
Chris@82 1370 T4Y = VSUB(T4U, T4X);
Chris@82 1371 T5O = VADD(T4X, T4U);
Chris@82 1372 }
Chris@82 1373 {
Chris@82 1374 V T51, T52, T3c, T3d;
Chris@82 1375 T51 = VSUB(T4Z, T50);
Chris@82 1376 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
Chris@82 1377 T53 = VSUB(T51, T52);
Chris@82 1378 T5P = VADD(T51, T52);
Chris@82 1379 T3c = VADD(T1q, T1n);
Chris@82 1380 T3d = VADD(T1y, T1z);
Chris@82 1381 T3e = VADD(T3c, T3d);
Chris@82 1382 T3W = VSUB(T3d, T3c);
Chris@82 1383 }
Chris@82 1384 }
Chris@82 1385 {
Chris@82 1386 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
Chris@82 1387 V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
Chris@82 1388 {
Chris@82 1389 V T7h, T7l, T7k, T7m;
Chris@82 1390 {
Chris@82 1391 V T7f, T7g, T7i, T7j;
Chris@82 1392 T7f = VADD(T78, T79);
Chris@82 1393 T7g = VADD(T74, T75);
Chris@82 1394 T7h = VSUB(T7f, T7g);
Chris@82 1395 T7l = VADD(T7f, T7g);
Chris@82 1396 T7i = VADD(T6X, T6Y);
Chris@82 1397 T7j = VADD(T70, T71);
Chris@82 1398 T7k = VBYI(VSUB(T7i, T7j));
Chris@82 1399 T7m = VADD(T7i, T7j);
Chris@82 1400 }
Chris@82 1401 T7n = VSUB(T7h, T7k);
Chris@82 1402 STM2(&(xo[96]), T7n, ovs, &(xo[0]));
Chris@82 1403 T7o = VADD(T7l, T7m);
Chris@82 1404 STM2(&(xo[0]), T7o, ovs, &(xo[0]));
Chris@82 1405 T7p = VADD(T7h, T7k);
Chris@82 1406 STM2(&(xo[32]), T7p, ovs, &(xo[0]));
Chris@82 1407 T7q = VSUB(T7l, T7m);
Chris@82 1408 STM2(&(xo[64]), T7q, ovs, &(xo[0]));
Chris@82 1409 }
Chris@82 1410 {
Chris@82 1411 V T76, T7a, T73, T7b, T6Z, T72;
Chris@82 1412 T76 = VSUB(T74, T75);
Chris@82 1413 T7a = VSUB(T78, T79);
Chris@82 1414 T6Z = VSUB(T6X, T6Y);
Chris@82 1415 T72 = VSUB(T70, T71);
Chris@82 1416 T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
Chris@82 1417 T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
Chris@82 1418 {
Chris@82 1419 V T77, T7c, T7d, T7e;
Chris@82 1420 T77 = VBYI(VSUB(T73, T76));
Chris@82 1421 T7c = VSUB(T7a, T7b);
Chris@82 1422 T7r = VADD(T77, T7c);
Chris@82 1423 STM2(&(xo[48]), T7r, ovs, &(xo[0]));
Chris@82 1424 T7s = VSUB(T7c, T77);
Chris@82 1425 STM2(&(xo[80]), T7s, ovs, &(xo[0]));
Chris@82 1426 T7d = VBYI(VADD(T76, T73));
Chris@82 1427 T7e = VADD(T7a, T7b);
Chris@82 1428 T7t = VADD(T7d, T7e);
Chris@82 1429 STM2(&(xo[16]), T7t, ovs, &(xo[0]));
Chris@82 1430 T7u = VSUB(T7e, T7d);
Chris@82 1431 STM2(&(xo[112]), T7u, ovs, &(xo[0]));
Chris@82 1432 }
Chris@82 1433 }
Chris@82 1434 {
Chris@82 1435 V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
Chris@82 1436 T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
Chris@82 1437 T6C = VSUB(T6y, T6B);
Chris@82 1438 T6S = VADD(T6B, T6y);
Chris@82 1439 T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
Chris@82 1440 T6I = VSUB(T6G, T6H);
Chris@82 1441 T6P = VADD(T6G, T6H);
Chris@82 1442 {
Chris@82 1443 V T6j, T6q, T6J, T6K;
Chris@82 1444 T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
Chris@82 1445 T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
Chris@82 1446 T6r = VSUB(T6j, T6q);
Chris@82 1447 T6Q = VADD(T6j, T6q);
Chris@82 1448 T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
Chris@82 1449 T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
Chris@82 1450 T6L = VSUB(T6J, T6K);
Chris@82 1451 T6T = VADD(T6J, T6K);
Chris@82 1452 }
Chris@82 1453 {
Chris@82 1454 V T6D, T6M, T6V, T6W;
Chris@82 1455 T6D = VBYI(VSUB(T6r, T6C));
Chris@82 1456 T6M = VSUB(T6I, T6L);
Chris@82 1457 T7v = VADD(T6D, T6M);
Chris@82 1458 STM2(&(xo[40]), T7v, ovs, &(xo[0]));
Chris@82 1459 T7w = VSUB(T6M, T6D);
Chris@82 1460 STM2(&(xo[88]), T7w, ovs, &(xo[0]));
Chris@82 1461 T6V = VSUB(T6P, T6Q);
Chris@82 1462 T6W = VBYI(VSUB(T6T, T6S));
Chris@82 1463 T7x = VSUB(T6V, T6W);
Chris@82 1464 STM2(&(xo[72]), T7x, ovs, &(xo[0]));
Chris@82 1465 T7y = VADD(T6V, T6W);
Chris@82 1466 STM2(&(xo[56]), T7y, ovs, &(xo[0]));
Chris@82 1467 }
Chris@82 1468 {
Chris@82 1469 V T6N, T6O, T6R, T6U;
Chris@82 1470 T6N = VBYI(VADD(T6C, T6r));
Chris@82 1471 T6O = VADD(T6I, T6L);
Chris@82 1472 T7z = VADD(T6N, T6O);
Chris@82 1473 STM2(&(xo[24]), T7z, ovs, &(xo[0]));
Chris@82 1474 T7A = VSUB(T6O, T6N);
Chris@82 1475 STM2(&(xo[104]), T7A, ovs, &(xo[0]));
Chris@82 1476 T6R = VADD(T6P, T6Q);
Chris@82 1477 T6U = VBYI(VADD(T6S, T6T));
Chris@82 1478 T7B = VSUB(T6R, T6U);
Chris@82 1479 STM2(&(xo[120]), T7B, ovs, &(xo[0]));
Chris@82 1480 T7C = VADD(T6R, T6U);
Chris@82 1481 STM2(&(xo[8]), T7C, ovs, &(xo[0]));
Chris@82 1482 }
Chris@82 1483 }
Chris@82 1484 {
Chris@82 1485 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@82 1486 {
Chris@82 1487 V T5L, T5M, T5Z, T60;
Chris@82 1488 T5L = VADD(T4p, T4w);
Chris@82 1489 T5M = VADD(T5o, T5p);
Chris@82 1490 T5N = VSUB(T5L, T5M);
Chris@82 1491 T68 = VADD(T5L, T5M);
Chris@82 1492 T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
Chris@82 1493 T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
Chris@82 1494 T61 = VSUB(T5Z, T60);
Chris@82 1495 T69 = VADD(T5Z, T60);
Chris@82 1496 }
Chris@82 1497 {
Chris@82 1498 V T5Q, T5T, T5W, T5X;
Chris@82 1499 T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
Chris@82 1500 T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
Chris@82 1501 T5U = VSUB(T5Q, T5T);
Chris@82 1502 T65 = VADD(T5Q, T5T);
Chris@82 1503 T5W = VADD(T4E, T4L);
Chris@82 1504 T5X = VADD(T5u, T5r);
Chris@82 1505 T5Y = VSUB(T5W, T5X);
Chris@82 1506 T66 = VADD(T5X, T5W);
Chris@82 1507 }
Chris@82 1508 {
Chris@82 1509 V T5V, T62, T6b, T6c;
Chris@82 1510 T5V = VADD(T5N, T5U);
Chris@82 1511 T62 = VBYI(VADD(T5Y, T61));
Chris@82 1512 T7D = VSUB(T5V, T62);
Chris@82 1513 STM2(&(xo[100]), T7D, ovs, &(xo[0]));
Chris@82 1514 T7E = VADD(T5V, T62);
Chris@82 1515 STM2(&(xo[28]), T7E, ovs, &(xo[0]));
Chris@82 1516 T6b = VBYI(VADD(T66, T65));
Chris@82 1517 T6c = VADD(T68, T69);
Chris@82 1518 T7F = VADD(T6b, T6c);
Chris@82 1519 STM2(&(xo[4]), T7F, ovs, &(xo[0]));
Chris@82 1520 T7G = VSUB(T6c, T6b);
Chris@82 1521 STM2(&(xo[124]), T7G, ovs, &(xo[0]));
Chris@82 1522 }
Chris@82 1523 {
Chris@82 1524 V T63, T64, T67, T6a;
Chris@82 1525 T63 = VSUB(T5N, T5U);
Chris@82 1526 T64 = VBYI(VSUB(T61, T5Y));
Chris@82 1527 T7H = VSUB(T63, T64);
Chris@82 1528 STM2(&(xo[92]), T7H, ovs, &(xo[0]));
Chris@82 1529 T7I = VADD(T63, T64);
Chris@82 1530 STM2(&(xo[36]), T7I, ovs, &(xo[0]));
Chris@82 1531 T67 = VBYI(VSUB(T65, T66));
Chris@82 1532 T6a = VSUB(T68, T69);
Chris@82 1533 T7J = VADD(T67, T6a);
Chris@82 1534 STM2(&(xo[60]), T7J, ovs, &(xo[0]));
Chris@82 1535 T7K = VSUB(T6a, T67);
Chris@82 1536 STM2(&(xo[68]), T7K, ovs, &(xo[0]));
Chris@82 1537 }
Chris@82 1538 }
Chris@82 1539 {
Chris@82 1540 V T7M, T7O, T7P, T7R;
Chris@82 1541 {
Chris@82 1542 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@82 1543 {
Chris@82 1544 V Tr, T10, T2t, T2u;
Chris@82 1545 Tr = VSUB(Tb, Tq);
Chris@82 1546 T10 = VSUB(TI, TZ);
Chris@82 1547 T11 = VSUB(Tr, T10);
Chris@82 1548 T2C = VADD(Tr, T10);
Chris@82 1549 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
Chris@82 1550 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
Chris@82 1551 T2v = VSUB(T2t, T2u);
Chris@82 1552 T2D = VADD(T2t, T2u);
Chris@82 1553 }
Chris@82 1554 {
Chris@82 1555 V T1C, T2d, T2i, T2r;
Chris@82 1556 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
Chris@82 1557 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
Chris@82 1558 T2e = VSUB(T1C, T2d);
Chris@82 1559 T2z = VADD(T1C, T2d);
Chris@82 1560 T2i = VSUB(T2g, T2h);
Chris@82 1561 T2r = VSUB(T2l, T2q);
Chris@82 1562 T2s = VSUB(T2i, T2r);
Chris@82 1563 T2A = VADD(T2r, T2i);
Chris@82 1564 }
Chris@82 1565 {
Chris@82 1566 V T2f, T2w, T7L, T2F, T2G, T7N;
Chris@82 1567 T2f = VADD(T11, T2e);
Chris@82 1568 T2w = VBYI(VADD(T2s, T2v));
Chris@82 1569 T7L = VSUB(T2f, T2w);
Chris@82 1570 STM2(&(xo[106]), T7L, ovs, &(xo[2]));
Chris@82 1571 STN2(&(xo[104]), T7A, T7L, ovs);
Chris@82 1572 T7M = VADD(T2f, T2w);
Chris@82 1573 STM2(&(xo[22]), T7M, ovs, &(xo[2]));
Chris@82 1574 T2F = VBYI(VADD(T2A, T2z));
Chris@82 1575 T2G = VADD(T2C, T2D);
Chris@82 1576 T7N = VADD(T2F, T2G);
Chris@82 1577 STM2(&(xo[10]), T7N, ovs, &(xo[2]));
Chris@82 1578 STN2(&(xo[8]), T7C, T7N, ovs);
Chris@82 1579 T7O = VSUB(T2G, T2F);
Chris@82 1580 STM2(&(xo[118]), T7O, ovs, &(xo[2]));
Chris@82 1581 }
Chris@82 1582 {
Chris@82 1583 V T2x, T2y, T7Q, T2B, T2E, T7S;
Chris@82 1584 T2x = VSUB(T11, T2e);
Chris@82 1585 T2y = VBYI(VSUB(T2v, T2s));
Chris@82 1586 T7P = VSUB(T2x, T2y);
Chris@82 1587 STM2(&(xo[86]), T7P, ovs, &(xo[2]));
Chris@82 1588 T7Q = VADD(T2x, T2y);
Chris@82 1589 STM2(&(xo[42]), T7Q, ovs, &(xo[2]));
Chris@82 1590 STN2(&(xo[40]), T7v, T7Q, ovs);
Chris@82 1591 T2B = VBYI(VSUB(T2z, T2A));
Chris@82 1592 T2E = VSUB(T2C, T2D);
Chris@82 1593 T7R = VADD(T2B, T2E);
Chris@82 1594 STM2(&(xo[54]), T7R, ovs, &(xo[2]));
Chris@82 1595 T7S = VSUB(T2E, T2B);
Chris@82 1596 STM2(&(xo[74]), T7S, ovs, &(xo[2]));
Chris@82 1597 STN2(&(xo[72]), T7x, T7S, ovs);
Chris@82 1598 }
Chris@82 1599 }
Chris@82 1600 {
Chris@82 1601 V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
Chris@82 1602 {
Chris@82 1603 V T3f, T3m, T3H, T3I;
Chris@82 1604 T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
Chris@82 1605 T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
Chris@82 1606 T3n = VSUB(T3f, T3m);
Chris@82 1607 T3O = VADD(T3f, T3m);
Chris@82 1608 T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
Chris@82 1609 T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
Chris@82 1610 T3J = VSUB(T3H, T3I);
Chris@82 1611 T3R = VADD(T3H, T3I);
Chris@82 1612 }
Chris@82 1613 {
Chris@82 1614 V T3u, T3x, T3C, T3F;
Chris@82 1615 T3u = VADD(T3q, T3t);
Chris@82 1616 T3x = VADD(T3v, T3w);
Chris@82 1617 T3y = VSUB(T3u, T3x);
Chris@82 1618 T3Q = VADD(T3x, T3u);
Chris@82 1619 T3C = VADD(T3A, T3B);
Chris@82 1620 T3F = VADD(T3D, T3E);
Chris@82 1621 T3G = VSUB(T3C, T3F);
Chris@82 1622 T3N = VADD(T3C, T3F);
Chris@82 1623 }
Chris@82 1624 {
Chris@82 1625 V T3z, T3K, T7T, T7U;
Chris@82 1626 T3z = VBYI(VSUB(T3n, T3y));
Chris@82 1627 T3K = VSUB(T3G, T3J);
Chris@82 1628 T7T = VADD(T3z, T3K);
Chris@82 1629 STM2(&(xo[34]), T7T, ovs, &(xo[2]));
Chris@82 1630 STN2(&(xo[32]), T7p, T7T, ovs);
Chris@82 1631 T7U = VSUB(T3K, T3z);
Chris@82 1632 STM2(&(xo[94]), T7U, ovs, &(xo[2]));
Chris@82 1633 STN2(&(xo[92]), T7H, T7U, ovs);
Chris@82 1634 }
Chris@82 1635 {
Chris@82 1636 V T3T, T3U, T7V, T7W;
Chris@82 1637 T3T = VSUB(T3N, T3O);
Chris@82 1638 T3U = VBYI(VSUB(T3R, T3Q));
Chris@82 1639 T7V = VSUB(T3T, T3U);
Chris@82 1640 STM2(&(xo[66]), T7V, ovs, &(xo[2]));
Chris@82 1641 STN2(&(xo[64]), T7q, T7V, ovs);
Chris@82 1642 T7W = VADD(T3T, T3U);
Chris@82 1643 STM2(&(xo[62]), T7W, ovs, &(xo[2]));
Chris@82 1644 STN2(&(xo[60]), T7J, T7W, ovs);
Chris@82 1645 }
Chris@82 1646 {
Chris@82 1647 V T3L, T3M, T7X, T7Y;
Chris@82 1648 T3L = VBYI(VADD(T3y, T3n));
Chris@82 1649 T3M = VADD(T3G, T3J);
Chris@82 1650 T7X = VADD(T3L, T3M);
Chris@82 1651 STM2(&(xo[30]), T7X, ovs, &(xo[2]));
Chris@82 1652 STN2(&(xo[28]), T7E, T7X, ovs);
Chris@82 1653 T7Y = VSUB(T3M, T3L);
Chris@82 1654 STM2(&(xo[98]), T7Y, ovs, &(xo[2]));
Chris@82 1655 STN2(&(xo[96]), T7n, T7Y, ovs);
Chris@82 1656 }
Chris@82 1657 {
Chris@82 1658 V T3P, T3S, T7Z, T80;
Chris@82 1659 T3P = VADD(T3N, T3O);
Chris@82 1660 T3S = VBYI(VADD(T3Q, T3R));
Chris@82 1661 T7Z = VSUB(T3P, T3S);
Chris@82 1662 STM2(&(xo[126]), T7Z, ovs, &(xo[2]));
Chris@82 1663 STN2(&(xo[124]), T7G, T7Z, ovs);
Chris@82 1664 T80 = VADD(T3P, T3S);
Chris@82 1665 STM2(&(xo[2]), T80, ovs, &(xo[2]));
Chris@82 1666 STN2(&(xo[0]), T7o, T80, ovs);
Chris@82 1667 }
Chris@82 1668 }
Chris@82 1669 {
Chris@82 1670 V T81, T83, T86, T88;
Chris@82 1671 {
Chris@82 1672 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@82 1673 {
Chris@82 1674 V T4x, T4M, T5x, T5y;
Chris@82 1675 T4x = VSUB(T4p, T4w);
Chris@82 1676 T4M = VSUB(T4E, T4L);
Chris@82 1677 T4N = VSUB(T4x, T4M);
Chris@82 1678 T5G = VADD(T4x, T4M);
Chris@82 1679 T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
Chris@82 1680 T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
Chris@82 1681 T5z = VSUB(T5x, T5y);
Chris@82 1682 T5H = VADD(T5x, T5y);
Chris@82 1683 }
Chris@82 1684 {
Chris@82 1685 V T54, T5l, T5q, T5v;
Chris@82 1686 T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
Chris@82 1687 T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
Chris@82 1688 T5m = VSUB(T54, T5l);
Chris@82 1689 T5D = VADD(T54, T5l);
Chris@82 1690 T5q = VSUB(T5o, T5p);
Chris@82 1691 T5v = VSUB(T5r, T5u);
Chris@82 1692 T5w = VSUB(T5q, T5v);
Chris@82 1693 T5E = VADD(T5v, T5q);
Chris@82 1694 }
Chris@82 1695 {
Chris@82 1696 V T5n, T5A, T82, T5J, T5K, T84;
Chris@82 1697 T5n = VADD(T4N, T5m);
Chris@82 1698 T5A = VBYI(VADD(T5w, T5z));
Chris@82 1699 T81 = VSUB(T5n, T5A);
Chris@82 1700 STM2(&(xo[108]), T81, ovs, &(xo[0]));
Chris@82 1701 T82 = VADD(T5n, T5A);
Chris@82 1702 STM2(&(xo[20]), T82, ovs, &(xo[0]));
Chris@82 1703 STN2(&(xo[20]), T82, T7M, ovs);
Chris@82 1704 T5J = VBYI(VADD(T5E, T5D));
Chris@82 1705 T5K = VADD(T5G, T5H);
Chris@82 1706 T83 = VADD(T5J, T5K);
Chris@82 1707 STM2(&(xo[12]), T83, ovs, &(xo[0]));
Chris@82 1708 T84 = VSUB(T5K, T5J);
Chris@82 1709 STM2(&(xo[116]), T84, ovs, &(xo[0]));
Chris@82 1710 STN2(&(xo[116]), T84, T7O, ovs);
Chris@82 1711 }
Chris@82 1712 {
Chris@82 1713 V T5B, T5C, T85, T5F, T5I, T87;
Chris@82 1714 T5B = VSUB(T4N, T5m);
Chris@82 1715 T5C = VBYI(VSUB(T5z, T5w));
Chris@82 1716 T85 = VSUB(T5B, T5C);
Chris@82 1717 STM2(&(xo[84]), T85, ovs, &(xo[0]));
Chris@82 1718 STN2(&(xo[84]), T85, T7P, ovs);
Chris@82 1719 T86 = VADD(T5B, T5C);
Chris@82 1720 STM2(&(xo[44]), T86, ovs, &(xo[0]));
Chris@82 1721 T5F = VBYI(VSUB(T5D, T5E));
Chris@82 1722 T5I = VSUB(T5G, T5H);
Chris@82 1723 T87 = VADD(T5F, T5I);
Chris@82 1724 STM2(&(xo[52]), T87, ovs, &(xo[0]));
Chris@82 1725 STN2(&(xo[52]), T87, T7R, ovs);
Chris@82 1726 T88 = VSUB(T5I, T5F);
Chris@82 1727 STM2(&(xo[76]), T88, ovs, &(xo[0]));
Chris@82 1728 }
Chris@82 1729 }
Chris@82 1730 {
Chris@82 1731 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@82 1732 {
Chris@82 1733 V T2H, T2I, T2V, T2W;
Chris@82 1734 T2H = VADD(Tb, Tq);
Chris@82 1735 T2I = VADD(T2g, T2h);
Chris@82 1736 T2J = VSUB(T2H, T2I);
Chris@82 1737 T34 = VADD(T2H, T2I);
Chris@82 1738 T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
Chris@82 1739 T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
Chris@82 1740 T2X = VSUB(T2V, T2W);
Chris@82 1741 T35 = VADD(T2V, T2W);
Chris@82 1742 }
Chris@82 1743 {
Chris@82 1744 V T2M, T2P, T2S, T2T;
Chris@82 1745 T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
Chris@82 1746 T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
Chris@82 1747 T2Q = VSUB(T2M, T2P);
Chris@82 1748 T31 = VADD(T2M, T2P);
Chris@82 1749 T2S = VADD(TI, TZ);
Chris@82 1750 T2T = VADD(T2q, T2l);
Chris@82 1751 T2U = VSUB(T2S, T2T);
Chris@82 1752 T32 = VADD(T2T, T2S);
Chris@82 1753 }
Chris@82 1754 {
Chris@82 1755 V T2R, T2Y, T89, T8a;
Chris@82 1756 T2R = VADD(T2J, T2Q);
Chris@82 1757 T2Y = VBYI(VADD(T2U, T2X));
Chris@82 1758 T89 = VSUB(T2R, T2Y);
Chris@82 1759 STM2(&(xo[102]), T89, ovs, &(xo[2]));
Chris@82 1760 STN2(&(xo[100]), T7D, T89, ovs);
Chris@82 1761 T8a = VADD(T2R, T2Y);
Chris@82 1762 STM2(&(xo[26]), T8a, ovs, &(xo[2]));
Chris@82 1763 STN2(&(xo[24]), T7z, T8a, ovs);
Chris@82 1764 }
Chris@82 1765 {
Chris@82 1766 V T37, T38, T8b, T8c;
Chris@82 1767 T37 = VBYI(VADD(T32, T31));
Chris@82 1768 T38 = VADD(T34, T35);
Chris@82 1769 T8b = VADD(T37, T38);
Chris@82 1770 STM2(&(xo[6]), T8b, ovs, &(xo[2]));
Chris@82 1771 STN2(&(xo[4]), T7F, T8b, ovs);
Chris@82 1772 T8c = VSUB(T38, T37);
Chris@82 1773 STM2(&(xo[122]), T8c, ovs, &(xo[2]));
Chris@82 1774 STN2(&(xo[120]), T7B, T8c, ovs);
Chris@82 1775 }
Chris@82 1776 {
Chris@82 1777 V T2Z, T30, T8d, T8e;
Chris@82 1778 T2Z = VSUB(T2J, T2Q);
Chris@82 1779 T30 = VBYI(VSUB(T2X, T2U));
Chris@82 1780 T8d = VSUB(T2Z, T30);
Chris@82 1781 STM2(&(xo[90]), T8d, ovs, &(xo[2]));
Chris@82 1782 STN2(&(xo[88]), T7w, T8d, ovs);
Chris@82 1783 T8e = VADD(T2Z, T30);
Chris@82 1784 STM2(&(xo[38]), T8e, ovs, &(xo[2]));
Chris@82 1785 STN2(&(xo[36]), T7I, T8e, ovs);
Chris@82 1786 }
Chris@82 1787 {
Chris@82 1788 V T33, T36, T8f, T8g;
Chris@82 1789 T33 = VBYI(VSUB(T31, T32));
Chris@82 1790 T36 = VSUB(T34, T35);
Chris@82 1791 T8f = VADD(T33, T36);
Chris@82 1792 STM2(&(xo[58]), T8f, ovs, &(xo[2]));
Chris@82 1793 STN2(&(xo[56]), T7y, T8f, ovs);
Chris@82 1794 T8g = VSUB(T36, T33);
Chris@82 1795 STM2(&(xo[70]), T8g, ovs, &(xo[2]));
Chris@82 1796 STN2(&(xo[68]), T7K, T8g, ovs);
Chris@82 1797 }
Chris@82 1798 }
Chris@82 1799 {
Chris@82 1800 V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
Chris@82 1801 {
Chris@82 1802 V T3X, T40, T49, T4a;
Chris@82 1803 T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
Chris@82 1804 T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
Chris@82 1805 T41 = VSUB(T3X, T40);
Chris@82 1806 T4g = VADD(T3X, T40);
Chris@82 1807 T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
Chris@82 1808 T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
Chris@82 1809 T4b = VSUB(T49, T4a);
Chris@82 1810 T4j = VADD(T49, T4a);
Chris@82 1811 }
Chris@82 1812 {
Chris@82 1813 V T42, T43, T46, T47;
Chris@82 1814 T42 = VSUB(T3D, T3E);
Chris@82 1815 T43 = VSUB(T3w, T3v);
Chris@82 1816 T44 = VSUB(T42, T43);
Chris@82 1817 T4i = VADD(T43, T42);
Chris@82 1818 T46 = VSUB(T3A, T3B);
Chris@82 1819 T47 = VSUB(T3q, T3t);
Chris@82 1820 T48 = VSUB(T46, T47);
Chris@82 1821 T4f = VADD(T46, T47);
Chris@82 1822 }
Chris@82 1823 {
Chris@82 1824 V T45, T4c, T8h, T8i;
Chris@82 1825 T45 = VBYI(VSUB(T41, T44));
Chris@82 1826 T4c = VSUB(T48, T4b);
Chris@82 1827 T8h = VADD(T45, T4c);
Chris@82 1828 STM2(&(xo[46]), T8h, ovs, &(xo[2]));
Chris@82 1829 STN2(&(xo[44]), T86, T8h, ovs);
Chris@82 1830 T8i = VSUB(T4c, T45);
Chris@82 1831 STM2(&(xo[82]), T8i, ovs, &(xo[2]));
Chris@82 1832 STN2(&(xo[80]), T7s, T8i, ovs);
Chris@82 1833 }
Chris@82 1834 {
Chris@82 1835 V T4l, T4m, T8j, T8k;
Chris@82 1836 T4l = VSUB(T4f, T4g);
Chris@82 1837 T4m = VBYI(VSUB(T4j, T4i));
Chris@82 1838 T8j = VSUB(T4l, T4m);
Chris@82 1839 STM2(&(xo[78]), T8j, ovs, &(xo[2]));
Chris@82 1840 STN2(&(xo[76]), T88, T8j, ovs);
Chris@82 1841 T8k = VADD(T4l, T4m);
Chris@82 1842 STM2(&(xo[50]), T8k, ovs, &(xo[2]));
Chris@82 1843 STN2(&(xo[48]), T7r, T8k, ovs);
Chris@82 1844 }
Chris@82 1845 {
Chris@82 1846 V T4d, T4e, T8l, T8m;
Chris@82 1847 T4d = VBYI(VADD(T44, T41));
Chris@82 1848 T4e = VADD(T48, T4b);
Chris@82 1849 T8l = VADD(T4d, T4e);
Chris@82 1850 STM2(&(xo[18]), T8l, ovs, &(xo[2]));
Chris@82 1851 STN2(&(xo[16]), T7t, T8l, ovs);
Chris@82 1852 T8m = VSUB(T4e, T4d);
Chris@82 1853 STM2(&(xo[110]), T8m, ovs, &(xo[2]));
Chris@82 1854 STN2(&(xo[108]), T81, T8m, ovs);
Chris@82 1855 }
Chris@82 1856 {
Chris@82 1857 V T4h, T4k, T8n, T8o;
Chris@82 1858 T4h = VADD(T4f, T4g);
Chris@82 1859 T4k = VBYI(VADD(T4i, T4j));
Chris@82 1860 T8n = VSUB(T4h, T4k);
Chris@82 1861 STM2(&(xo[114]), T8n, ovs, &(xo[2]));
Chris@82 1862 STN2(&(xo[112]), T7u, T8n, ovs);
Chris@82 1863 T8o = VADD(T4h, T4k);
Chris@82 1864 STM2(&(xo[14]), T8o, ovs, &(xo[2]));
Chris@82 1865 STN2(&(xo[12]), T83, T8o, ovs);
Chris@82 1866 }
Chris@82 1867 }
Chris@82 1868 }
Chris@82 1869 }
Chris@82 1870 }
Chris@82 1871 }
Chris@82 1872 }
Chris@82 1873 VLEAVE();
Chris@82 1874 }
Chris@82 1875
Chris@82 1876 /* Codelet descriptor handed to the planner: transform size n = 64,
Chris@82 1876    codelet name "n2bv_64", and the operation-count signature
Chris@82 1876    {404, 72, 52, 0} (per the generator's header comment: adds, muls,
Chris@82 1876    fused multiply-adds, other).  The trailing fields presumably encode
Chris@82 1876    stride/vector constraints -- ostride fixed to 2 to match the
Chris@82 1876    "-with-ostride 2" generation flag; confirm against the kdft_desc
Chris@82 1876    declaration in dft/codelet-dft.h. */
Chris@82 1876 static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 1877
Chris@82 1878 /* Entry point: register the n2bv_64 SIMD codelet (and its descriptor
Chris@82 1878    above) with the FFTW planner so it can be selected when planning
Chris@82 1878    size-64 backward complex DFTs. */
Chris@82 1878 void XSIMD(codelet_n2bv_64) (planner *p) {
Chris@82 1879      X(kdft_register) (p, n2bv_64, &desc);
Chris@82 1880 }
Chris@82 1881
Chris@82 1882 #endif