annotate src/fftw-3.3.8/dft/simd/common/n2fv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:09 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@82 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@82 33 * 120 stack variables, 15 constants, and 160 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2f.h"
Chris@82 36
Chris@82 37 static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@82 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@82 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@82 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@82 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 50 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 51 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 54 {
Chris@82 55 INT i;
Chris@82 56 const R *xi;
Chris@82 57 R *xo;
Chris@82 58 xi = ri;
Chris@82 59 xo = ro;
Chris@82 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 61 V T7, T26, T47, T69, T5k, T6A, T2V, T3z, Tm, T27, T5n, T6a, T2Y, T3M, T4e;
Chris@82 62 V T6B, TC, T2a, T6e, T6E, T3l, T3A, T4o, T5p, TR, T29, T6h, T6D, T3i, T3B;
Chris@82 63 V T4x, T5q, T1N, T2x, T6t, T71, T6w, T72, T1W, T2y, T39, T3H, T57, T5N, T5e;
Chris@82 64 V T5O, T3c, T3I, T1g, T2u, T6m, T6Y, T6p, T6Z, T1p, T2v, T32, T3E, T4M, T5K;
Chris@82 65 V T4T, T5L, T35, T3F;
Chris@82 66 {
Chris@82 67 V T3, T43, T25, T44, T6, T5i, T22, T45;
Chris@82 68 {
Chris@82 69 V T1, T2, T23, T24;
Chris@82 70 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 71 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 72 T3 = VADD(T1, T2);
Chris@82 73 T43 = VSUB(T1, T2);
Chris@82 74 T23 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 75 T24 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 76 T25 = VADD(T23, T24);
Chris@82 77 T44 = VSUB(T23, T24);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V T4, T5, T20, T21;
Chris@82 81 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 82 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 83 T6 = VADD(T4, T5);
Chris@82 84 T5i = VSUB(T4, T5);
Chris@82 85 T20 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 86 T21 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 87 T22 = VADD(T20, T21);
Chris@82 88 T45 = VSUB(T20, T21);
Chris@82 89 }
Chris@82 90 T7 = VSUB(T3, T6);
Chris@82 91 T26 = VSUB(T22, T25);
Chris@82 92 {
Chris@82 93 V T46, T5j, T2T, T2U;
Chris@82 94 T46 = VADD(T44, T45);
Chris@82 95 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@82 96 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@82 97 T5j = VSUB(T45, T44);
Chris@82 98 T5k = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@82 99 T6A = VFMA(LDK(KP707106781), T5j, T5i);
Chris@82 100 T2T = VADD(T3, T6);
Chris@82 101 T2U = VADD(T25, T22);
Chris@82 102 T2V = VADD(T2T, T2U);
Chris@82 103 T3z = VSUB(T2T, T2U);
Chris@82 104 }
Chris@82 105 }
Chris@82 106 {
Chris@82 107 V Ta, T48, Tk, T4c, Td, T49, Th, T4b;
Chris@82 108 {
Chris@82 109 V T8, T9, Ti, Tj;
Chris@82 110 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 111 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 112 Ta = VADD(T8, T9);
Chris@82 113 T48 = VSUB(T8, T9);
Chris@82 114 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 115 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 116 Tk = VADD(Ti, Tj);
Chris@82 117 T4c = VSUB(Tj, Ti);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 V Tb, Tc, Tf, Tg;
Chris@82 121 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 122 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 123 Td = VADD(Tb, Tc);
Chris@82 124 T49 = VSUB(Tb, Tc);
Chris@82 125 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 126 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 127 Th = VADD(Tf, Tg);
Chris@82 128 T4b = VSUB(Tf, Tg);
Chris@82 129 }
Chris@82 130 {
Chris@82 131 V Te, Tl, T5l, T5m;
Chris@82 132 Te = VSUB(Ta, Td);
Chris@82 133 Tl = VSUB(Th, Tk);
Chris@82 134 Tm = VADD(Te, Tl);
Chris@82 135 T27 = VSUB(Tl, Te);
Chris@82 136 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@82 137 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@82 138 T5n = VSUB(T5l, T5m);
Chris@82 139 T6a = VADD(T5l, T5m);
Chris@82 140 }
Chris@82 141 {
Chris@82 142 V T2W, T2X, T4a, T4d;
Chris@82 143 T2W = VADD(Ta, Td);
Chris@82 144 T2X = VADD(Th, Tk);
Chris@82 145 T2Y = VADD(T2W, T2X);
Chris@82 146 T3M = VSUB(T2X, T2W);
Chris@82 147 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@82 148 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@82 149 T4e = VADD(T4a, T4d);
Chris@82 150 T6B = VSUB(T4d, T4a);
Chris@82 151 }
Chris@82 152 }
Chris@82 153 {
Chris@82 154 V Tq, T4g, Tt, T4l, Tx, T4m, TA, T4j;
Chris@82 155 {
Chris@82 156 V To, Tp, Tr, Ts;
Chris@82 157 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 158 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 159 Tq = VADD(To, Tp);
Chris@82 160 T4g = VSUB(To, Tp);
Chris@82 161 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 162 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 163 Tt = VADD(Tr, Ts);
Chris@82 164 T4l = VSUB(Tr, Ts);
Chris@82 165 {
Chris@82 166 V Tv, Tw, T4h, Ty, Tz, T4i;
Chris@82 167 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 168 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 169 T4h = VSUB(Tv, Tw);
Chris@82 170 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 171 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 172 T4i = VSUB(Ty, Tz);
Chris@82 173 Tx = VADD(Tv, Tw);
Chris@82 174 T4m = VSUB(T4h, T4i);
Chris@82 175 TA = VADD(Ty, Tz);
Chris@82 176 T4j = VADD(T4h, T4i);
Chris@82 177 }
Chris@82 178 }
Chris@82 179 {
Chris@82 180 V Tu, TB, T6c, T6d;
Chris@82 181 Tu = VSUB(Tq, Tt);
Chris@82 182 TB = VSUB(Tx, TA);
Chris@82 183 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@82 184 T2a = VFMA(LDK(KP414213562), Tu, TB);
Chris@82 185 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@82 186 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@82 187 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@82 188 T6E = VFMA(LDK(KP668178637), T6c, T6d);
Chris@82 189 }
Chris@82 190 {
Chris@82 191 V T3j, T3k, T4k, T4n;
Chris@82 192 T3j = VADD(Tq, Tt);
Chris@82 193 T3k = VADD(Tx, TA);
Chris@82 194 T3l = VADD(T3j, T3k);
Chris@82 195 T3A = VSUB(T3j, T3k);
Chris@82 196 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@82 197 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@82 198 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@82 199 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@82 200 }
Chris@82 201 }
Chris@82 202 {
Chris@82 203 V TF, T4p, TI, T4u, TM, T4v, TP, T4s;
Chris@82 204 {
Chris@82 205 V TD, TE, TG, TH;
Chris@82 206 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 207 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 208 TF = VADD(TD, TE);
Chris@82 209 T4p = VSUB(TD, TE);
Chris@82 210 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 211 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 212 TI = VADD(TG, TH);
Chris@82 213 T4u = VSUB(TH, TG);
Chris@82 214 {
Chris@82 215 V TK, TL, T4r, TN, TO, T4q;
Chris@82 216 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 217 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 218 T4r = VSUB(TK, TL);
Chris@82 219 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 220 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 221 T4q = VSUB(TN, TO);
Chris@82 222 TM = VADD(TK, TL);
Chris@82 223 T4v = VSUB(T4r, T4q);
Chris@82 224 TP = VADD(TN, TO);
Chris@82 225 T4s = VADD(T4q, T4r);
Chris@82 226 }
Chris@82 227 }
Chris@82 228 {
Chris@82 229 V TJ, TQ, T6f, T6g;
Chris@82 230 TJ = VSUB(TF, TI);
Chris@82 231 TQ = VSUB(TM, TP);
Chris@82 232 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@82 233 T29 = VFMA(LDK(KP414213562), TJ, TQ);
Chris@82 234 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@82 235 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@82 236 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@82 237 T6D = VFMA(LDK(KP668178637), T6f, T6g);
Chris@82 238 }
Chris@82 239 {
Chris@82 240 V T3g, T3h, T4t, T4w;
Chris@82 241 T3g = VADD(TF, TI);
Chris@82 242 T3h = VADD(TP, TM);
Chris@82 243 T3i = VADD(T3g, T3h);
Chris@82 244 T3B = VSUB(T3g, T3h);
Chris@82 245 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@82 246 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@82 247 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@82 248 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@82 249 }
Chris@82 250 }
Chris@82 251 {
Chris@82 252 V T1t, T4V, T1w, T58, T1Q, T59, T1T, T4Y, T1A, T1D, T1E, T5b, T52, T1H, T1K;
Chris@82 253 V T1L, T5c, T55;
Chris@82 254 {
Chris@82 255 V T1r, T1s, T1u, T1v;
Chris@82 256 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 257 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 258 T1t = VADD(T1r, T1s);
Chris@82 259 T4V = VSUB(T1r, T1s);
Chris@82 260 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 261 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 262 T1w = VADD(T1u, T1v);
Chris@82 263 T58 = VSUB(T1v, T1u);
Chris@82 264 }
Chris@82 265 {
Chris@82 266 V T1O, T1P, T4X, T1R, T1S, T4W;
Chris@82 267 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 268 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 269 T4X = VSUB(T1O, T1P);
Chris@82 270 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 271 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 272 T4W = VSUB(T1R, T1S);
Chris@82 273 T1Q = VADD(T1O, T1P);
Chris@82 274 T59 = VSUB(T4X, T4W);
Chris@82 275 T1T = VADD(T1R, T1S);
Chris@82 276 T4Y = VADD(T4W, T4X);
Chris@82 277 }
Chris@82 278 {
Chris@82 279 V T50, T51, T53, T54;
Chris@82 280 {
Chris@82 281 V T1y, T1z, T1B, T1C;
Chris@82 282 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 283 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 284 T1A = VADD(T1y, T1z);
Chris@82 285 T50 = VSUB(T1y, T1z);
Chris@82 286 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 287 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 288 T1D = VADD(T1B, T1C);
Chris@82 289 T51 = VSUB(T1C, T1B);
Chris@82 290 }
Chris@82 291 T1E = VSUB(T1A, T1D);
Chris@82 292 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@82 293 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@82 294 {
Chris@82 295 V T1F, T1G, T1I, T1J;
Chris@82 296 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 297 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 298 T1H = VADD(T1F, T1G);
Chris@82 299 T53 = VSUB(T1F, T1G);
Chris@82 300 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 301 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 302 T1K = VADD(T1I, T1J);
Chris@82 303 T54 = VSUB(T1J, T1I);
Chris@82 304 }
Chris@82 305 T1L = VSUB(T1H, T1K);
Chris@82 306 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@82 307 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@82 308 }
Chris@82 309 {
Chris@82 310 V T1x, T1M, T6r, T6s;
Chris@82 311 T1x = VSUB(T1t, T1w);
Chris@82 312 T1M = VADD(T1E, T1L);
Chris@82 313 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@82 314 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@82 315 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@82 316 T6s = VSUB(T5c, T5b);
Chris@82 317 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@82 318 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@82 319 }
Chris@82 320 {
Chris@82 321 V T6u, T6v, T1U, T1V;
Chris@82 322 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@82 323 T6v = VSUB(T55, T52);
Chris@82 324 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@82 325 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@82 326 T1U = VSUB(T1Q, T1T);
Chris@82 327 T1V = VSUB(T1L, T1E);
Chris@82 328 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@82 329 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@82 330 }
Chris@82 331 {
Chris@82 332 V T37, T38, T4Z, T56;
Chris@82 333 T37 = VADD(T1t, T1w);
Chris@82 334 T38 = VADD(T1T, T1Q);
Chris@82 335 T39 = VADD(T37, T38);
Chris@82 336 T3H = VSUB(T37, T38);
Chris@82 337 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@82 338 T56 = VADD(T52, T55);
Chris@82 339 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@82 340 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@82 341 }
Chris@82 342 {
Chris@82 343 V T5a, T5d, T3a, T3b;
Chris@82 344 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@82 345 T5d = VADD(T5b, T5c);
Chris@82 346 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@82 347 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@82 348 T3a = VADD(T1A, T1D);
Chris@82 349 T3b = VADD(T1H, T1K);
Chris@82 350 T3c = VADD(T3a, T3b);
Chris@82 351 T3I = VSUB(T3b, T3a);
Chris@82 352 }
Chris@82 353 }
Chris@82 354 {
Chris@82 355 V TW, T4A, TZ, T4N, T1j, T4O, T1m, T4D, T13, T16, T17, T4Q, T4H, T1a, T1d;
Chris@82 356 V T1e, T4R, T4K;
Chris@82 357 {
Chris@82 358 V TU, TV, TX, TY;
Chris@82 359 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 360 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 361 TW = VADD(TU, TV);
Chris@82 362 T4A = VSUB(TU, TV);
Chris@82 363 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 364 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 365 TZ = VADD(TX, TY);
Chris@82 366 T4N = VSUB(TX, TY);
Chris@82 367 }
Chris@82 368 {
Chris@82 369 V T1h, T1i, T4B, T1k, T1l, T4C;
Chris@82 370 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 371 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 372 T4B = VSUB(T1h, T1i);
Chris@82 373 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 374 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 375 T4C = VSUB(T1k, T1l);
Chris@82 376 T1j = VADD(T1h, T1i);
Chris@82 377 T4O = VSUB(T4B, T4C);
Chris@82 378 T1m = VADD(T1k, T1l);
Chris@82 379 T4D = VADD(T4B, T4C);
Chris@82 380 }
Chris@82 381 {
Chris@82 382 V T4F, T4G, T4I, T4J;
Chris@82 383 {
Chris@82 384 V T11, T12, T14, T15;
Chris@82 385 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 386 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 387 T13 = VADD(T11, T12);
Chris@82 388 T4F = VSUB(T11, T12);
Chris@82 389 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 390 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 391 T16 = VADD(T14, T15);
Chris@82 392 T4G = VSUB(T14, T15);
Chris@82 393 }
Chris@82 394 T17 = VSUB(T13, T16);
Chris@82 395 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@82 396 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@82 397 {
Chris@82 398 V T18, T19, T1b, T1c;
Chris@82 399 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 400 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 401 T1a = VADD(T18, T19);
Chris@82 402 T4I = VSUB(T18, T19);
Chris@82 403 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 404 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 405 T1d = VADD(T1b, T1c);
Chris@82 406 T4J = VSUB(T1b, T1c);
Chris@82 407 }
Chris@82 408 T1e = VSUB(T1a, T1d);
Chris@82 409 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@82 410 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@82 411 }
Chris@82 412 {
Chris@82 413 V T10, T1f, T6k, T6l;
Chris@82 414 T10 = VSUB(TW, TZ);
Chris@82 415 T1f = VADD(T17, T1e);
Chris@82 416 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@82 417 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@82 418 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@82 419 T6l = VSUB(T4Q, T4R);
Chris@82 420 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@82 421 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@82 422 }
Chris@82 423 {
Chris@82 424 V T6n, T6o, T1n, T1o;
Chris@82 425 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@82 426 T6o = VSUB(T4H, T4K);
Chris@82 427 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@82 428 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@82 429 T1n = VSUB(T1j, T1m);
Chris@82 430 T1o = VSUB(T17, T1e);
Chris@82 431 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@82 432 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@82 433 }
Chris@82 434 {
Chris@82 435 V T30, T31, T4E, T4L;
Chris@82 436 T30 = VADD(TW, TZ);
Chris@82 437 T31 = VADD(T1j, T1m);
Chris@82 438 T32 = VADD(T30, T31);
Chris@82 439 T3E = VSUB(T30, T31);
Chris@82 440 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@82 441 T4L = VADD(T4H, T4K);
Chris@82 442 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@82 443 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@82 444 }
Chris@82 445 {
Chris@82 446 V T4P, T4S, T33, T34;
Chris@82 447 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@82 448 T4S = VADD(T4Q, T4R);
Chris@82 449 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@82 450 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@82 451 T33 = VADD(T13, T16);
Chris@82 452 T34 = VADD(T1a, T1d);
Chris@82 453 T35 = VADD(T33, T34);
Chris@82 454 T3F = VSUB(T33, T34);
Chris@82 455 }
Chris@82 456 }
Chris@82 457 {
Chris@82 458 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7w, T7y, T7A, T7B, T7E, T7G, T7I;
Chris@82 459 V T7J;
Chris@82 460 {
Chris@82 461 V T3t, T3x, T3w, T3y;
Chris@82 462 {
Chris@82 463 V T3r, T3s, T3u, T3v;
Chris@82 464 T3r = VADD(T2V, T2Y);
Chris@82 465 T3s = VADD(T3l, T3i);
Chris@82 466 T3t = VADD(T3r, T3s);
Chris@82 467 T3x = VSUB(T3r, T3s);
Chris@82 468 T3u = VADD(T32, T35);
Chris@82 469 T3v = VADD(T39, T3c);
Chris@82 470 T3w = VADD(T3u, T3v);
Chris@82 471 T3y = VSUB(T3v, T3u);
Chris@82 472 }
Chris@82 473 T7n = VSUB(T3t, T3w);
Chris@82 474 STM2(&(xo[64]), T7n, ovs, &(xo[0]));
Chris@82 475 T7o = VFMAI(T3y, T3x);
Chris@82 476 STM2(&(xo[32]), T7o, ovs, &(xo[0]));
Chris@82 477 T7p = VADD(T3t, T3w);
Chris@82 478 STM2(&(xo[0]), T7p, ovs, &(xo[0]));
Chris@82 479 T7q = VFNMSI(T3y, T3x);
Chris@82 480 STM2(&(xo[96]), T7q, ovs, &(xo[0]));
Chris@82 481 }
Chris@82 482 {
Chris@82 483 V T2Z, T3m, T3e, T3n, T36, T3d;
Chris@82 484 T2Z = VSUB(T2V, T2Y);
Chris@82 485 T3m = VSUB(T3i, T3l);
Chris@82 486 T36 = VSUB(T32, T35);
Chris@82 487 T3d = VSUB(T39, T3c);
Chris@82 488 T3e = VADD(T36, T3d);
Chris@82 489 T3n = VSUB(T3d, T36);
Chris@82 490 {
Chris@82 491 V T3f, T3o, T3p, T3q;
Chris@82 492 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@82 493 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@82 494 T7r = VFNMSI(T3o, T3f);
Chris@82 495 STM2(&(xo[48]), T7r, ovs, &(xo[0]));
Chris@82 496 T7s = VFMAI(T3o, T3f);
Chris@82 497 STM2(&(xo[80]), T7s, ovs, &(xo[0]));
Chris@82 498 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@82 499 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@82 500 T7t = VFNMSI(T3q, T3p);
Chris@82 501 STM2(&(xo[112]), T7t, ovs, &(xo[0]));
Chris@82 502 T7u = VFMAI(T3q, T3p);
Chris@82 503 STM2(&(xo[16]), T7u, ovs, &(xo[0]));
Chris@82 504 }
Chris@82 505 }
Chris@82 506 {
Chris@82 507 V T7v, T7x, T7z, T7C;
Chris@82 508 {
Chris@82 509 V T3D, T3V, T3O, T3Y, T3K, T3Z, T3R, T3W, T3C, T3N;
Chris@82 510 T3C = VADD(T3A, T3B);
Chris@82 511 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@82 512 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@82 513 T3N = VSUB(T3B, T3A);
Chris@82 514 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@82 515 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@82 516 {
Chris@82 517 V T3G, T3J, T3P, T3Q;
Chris@82 518 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@82 519 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@82 520 T3K = VADD(T3G, T3J);
Chris@82 521 T3Z = VSUB(T3J, T3G);
Chris@82 522 T3P = VFMA(LDK(KP414213562), T3H, T3I);
Chris@82 523 T3Q = VFMA(LDK(KP414213562), T3E, T3F);
Chris@82 524 T3R = VSUB(T3P, T3Q);
Chris@82 525 T3W = VADD(T3Q, T3P);
Chris@82 526 }
Chris@82 527 {
Chris@82 528 V T3L, T3S, T41, T42;
Chris@82 529 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@82 530 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@82 531 T7v = VFNMSI(T3S, T3L);
Chris@82 532 STM2(&(xo[56]), T7v, ovs, &(xo[0]));
Chris@82 533 T7w = VFMAI(T3S, T3L);
Chris@82 534 STM2(&(xo[72]), T7w, ovs, &(xo[0]));
Chris@82 535 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@82 536 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@82 537 T7x = VFNMSI(T42, T41);
Chris@82 538 STM2(&(xo[24]), T7x, ovs, &(xo[0]));
Chris@82 539 T7y = VFMAI(T42, T41);
Chris@82 540 STM2(&(xo[104]), T7y, ovs, &(xo[0]));
Chris@82 541 }
Chris@82 542 {
Chris@82 543 V T3T, T3U, T3X, T40;
Chris@82 544 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@82 545 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@82 546 T7z = VFNMSI(T3U, T3T);
Chris@82 547 STM2(&(xo[120]), T7z, ovs, &(xo[0]));
Chris@82 548 T7A = VFMAI(T3U, T3T);
Chris@82 549 STM2(&(xo[8]), T7A, ovs, &(xo[0]));
Chris@82 550 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@82 551 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@82 552 T7B = VFMAI(T40, T3X);
Chris@82 553 STM2(&(xo[40]), T7B, ovs, &(xo[0]));
Chris@82 554 T7C = VFNMSI(T40, T3X);
Chris@82 555 STM2(&(xo[88]), T7C, ovs, &(xo[0]));
Chris@82 556 }
Chris@82 557 }
Chris@82 558 {
Chris@82 559 V T6X, T7f, T7b, T7g, T74, T7j, T78, T7i;
Chris@82 560 {
Chris@82 561 V T6V, T6W, T79, T7a;
Chris@82 562 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@82 563 T6W = VADD(T6E, T6D);
Chris@82 564 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@82 565 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@82 566 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@82 567 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@82 568 T7b = VSUB(T79, T7a);
Chris@82 569 T7g = VADD(T79, T7a);
Chris@82 570 }
Chris@82 571 {
Chris@82 572 V T70, T73, T76, T77;
Chris@82 573 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@82 574 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@82 575 T74 = VADD(T70, T73);
Chris@82 576 T7j = VSUB(T73, T70);
Chris@82 577 T76 = VFMA(LDK(KP923879532), T6B, T6A);
Chris@82 578 T77 = VSUB(T6e, T6h);
Chris@82 579 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@82 580 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@82 581 }
Chris@82 582 {
Chris@82 583 V T75, T7c, T7D, T7l, T7m, T7F;
Chris@82 584 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@82 585 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@82 586 T7D = VFNMSI(T7c, T75);
Chris@82 587 STM2(&(xo[58]), T7D, ovs, &(xo[2]));
Chris@82 588 STN2(&(xo[56]), T7v, T7D, ovs);
Chris@82 589 T7E = VFMAI(T7c, T75);
Chris@82 590 STM2(&(xo[70]), T7E, ovs, &(xo[2]));
Chris@82 591 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@82 592 T7m = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@82 593 T7F = VFNMSI(T7m, T7l);
Chris@82 594 STM2(&(xo[26]), T7F, ovs, &(xo[2]));
Chris@82 595 STN2(&(xo[24]), T7x, T7F, ovs);
Chris@82 596 T7G = VFMAI(T7m, T7l);
Chris@82 597 STM2(&(xo[102]), T7G, ovs, &(xo[2]));
Chris@82 598 }
Chris@82 599 {
Chris@82 600 V T7d, T7e, T7H, T7h, T7k, T7K;
Chris@82 601 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@82 602 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@82 603 T7H = VFNMSI(T7e, T7d);
Chris@82 604 STM2(&(xo[122]), T7H, ovs, &(xo[2]));
Chris@82 605 STN2(&(xo[120]), T7z, T7H, ovs);
Chris@82 606 T7I = VFMAI(T7e, T7d);
Chris@82 607 STM2(&(xo[6]), T7I, ovs, &(xo[2]));
Chris@82 608 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@82 609 T7k = VFMA(LDK(KP956940335), T7j, T7i);
Chris@82 610 T7J = VFMAI(T7k, T7h);
Chris@82 611 STM2(&(xo[38]), T7J, ovs, &(xo[2]));
Chris@82 612 T7K = VFNMSI(T7k, T7h);
Chris@82 613 STM2(&(xo[90]), T7K, ovs, &(xo[2]));
Chris@82 614 STN2(&(xo[88]), T7C, T7K, ovs);
Chris@82 615 }
Chris@82 616 }
Chris@82 617 }
Chris@82 618 {
Chris@82 619 V T7L, T7N, T7P, T7S;
Chris@82 620 {
Chris@82 621 V TT, T2j, T2f, T2k, T1Y, T2n, T2c, T2m;
Chris@82 622 {
Chris@82 623 V Tn, TS, T2d, T2e;
Chris@82 624 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@82 625 TS = VADD(TC, TR);
Chris@82 626 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@82 627 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@82 628 T2d = VFMA(LDK(KP198912367), T1N, T1W);
Chris@82 629 T2e = VFMA(LDK(KP198912367), T1g, T1p);
Chris@82 630 T2f = VSUB(T2d, T2e);
Chris@82 631 T2k = VADD(T2e, T2d);
Chris@82 632 }
Chris@82 633 {
Chris@82 634 V T1q, T1X, T28, T2b;
Chris@82 635 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@82 636 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@82 637 T1Y = VADD(T1q, T1X);
Chris@82 638 T2n = VSUB(T1X, T1q);
Chris@82 639 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@82 640 T2b = VSUB(T29, T2a);
Chris@82 641 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@82 642 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@82 643 }
Chris@82 644 {
Chris@82 645 V T1Z, T2g, T7M, T2p, T2q, T7O;
Chris@82 646 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@82 647 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@82 648 T7L = VFNMSI(T2g, T1Z);
Chris@82 649 STM2(&(xo[60]), T7L, ovs, &(xo[0]));
Chris@82 650 T7M = VFMAI(T2g, T1Z);
Chris@82 651 STM2(&(xo[68]), T7M, ovs, &(xo[0]));
Chris@82 652 STN2(&(xo[68]), T7M, T7E, ovs);
Chris@82 653 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@82 654 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@82 655 T7N = VFNMSI(T2q, T2p);
Chris@82 656 STM2(&(xo[28]), T7N, ovs, &(xo[0]));
Chris@82 657 T7O = VFMAI(T2q, T2p);
Chris@82 658 STM2(&(xo[100]), T7O, ovs, &(xo[0]));
Chris@82 659 STN2(&(xo[100]), T7O, T7G, ovs);
Chris@82 660 }
Chris@82 661 {
Chris@82 662 V T2h, T2i, T7Q, T2l, T2o, T7R;
Chris@82 663 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@82 664 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@82 665 T7P = VFNMSI(T2i, T2h);
Chris@82 666 STM2(&(xo[124]), T7P, ovs, &(xo[0]));
Chris@82 667 T7Q = VFMAI(T2i, T2h);
Chris@82 668 STM2(&(xo[4]), T7Q, ovs, &(xo[0]));
Chris@82 669 STN2(&(xo[4]), T7Q, T7I, ovs);
Chris@82 670 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@82 671 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@82 672 T7R = VFMAI(T2o, T2l);
Chris@82 673 STM2(&(xo[36]), T7R, ovs, &(xo[0]));
Chris@82 674 STN2(&(xo[36]), T7R, T7J, ovs);
Chris@82 675 T7S = VFNMSI(T2o, T2l);
Chris@82 676 STM2(&(xo[92]), T7S, ovs, &(xo[0]));
Chris@82 677 }
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V T4z, T5z, T5v, T5A, T5g, T5D, T5s, T5C;
Chris@82 681 {
Chris@82 682 V T4f, T4y, T5t, T5u;
Chris@82 683 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@82 684 T4y = VADD(T4o, T4x);
Chris@82 685 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@82 686 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@82 687 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@82 688 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@82 689 T5v = VSUB(T5t, T5u);
Chris@82 690 T5A = VADD(T5t, T5u);
Chris@82 691 }
Chris@82 692 {
Chris@82 693 V T4U, T5f, T5o, T5r;
Chris@82 694 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@82 695 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@82 696 T5g = VADD(T4U, T5f);
Chris@82 697 T5D = VSUB(T5f, T4U);
Chris@82 698 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@82 699 T5r = VSUB(T5p, T5q);
Chris@82 700 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@82 701 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@82 702 }
Chris@82 703 {
Chris@82 704 V T5h, T5w, T7T, T7U;
Chris@82 705 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@82 706 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@82 707 T7T = VFNMSI(T5w, T5h);
Chris@82 708 STM2(&(xo[66]), T7T, ovs, &(xo[2]));
Chris@82 709 STN2(&(xo[64]), T7n, T7T, ovs);
Chris@82 710 T7U = VFMAI(T5w, T5h);
Chris@82 711 STM2(&(xo[62]), T7U, ovs, &(xo[2]));
Chris@82 712 STN2(&(xo[60]), T7L, T7U, ovs);
Chris@82 713 }
Chris@82 714 {
Chris@82 715 V T5F, T5G, T7V, T7W;
Chris@82 716 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@82 717 T5G = VFMA(LDK(KP995184726), T5D, T5C);
Chris@82 718 T7V = VFMAI(T5G, T5F);
Chris@82 719 STM2(&(xo[30]), T7V, ovs, &(xo[2]));
Chris@82 720 STN2(&(xo[28]), T7N, T7V, ovs);
Chris@82 721 T7W = VFNMSI(T5G, T5F);
Chris@82 722 STM2(&(xo[98]), T7W, ovs, &(xo[2]));
Chris@82 723 STN2(&(xo[96]), T7q, T7W, ovs);
Chris@82 724 }
Chris@82 725 {
Chris@82 726 V T5x, T5y, T7X, T7Y;
Chris@82 727 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@82 728 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@82 729 T7X = VFNMSI(T5y, T5x);
Chris@82 730 STM2(&(xo[2]), T7X, ovs, &(xo[2]));
Chris@82 731 STN2(&(xo[0]), T7p, T7X, ovs);
Chris@82 732 T7Y = VFMAI(T5y, T5x);
Chris@82 733 STM2(&(xo[126]), T7Y, ovs, &(xo[2]));
Chris@82 734 STN2(&(xo[124]), T7P, T7Y, ovs);
Chris@82 735 }
Chris@82 736 {
Chris@82 737 V T5B, T5E, T7Z, T80;
Chris@82 738 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@82 739 T5E = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@82 740 T7Z = VFNMSI(T5E, T5B);
Chris@82 741 STM2(&(xo[34]), T7Z, ovs, &(xo[2]));
Chris@82 742 STN2(&(xo[32]), T7o, T7Z, ovs);
Chris@82 743 T80 = VFMAI(T5E, T5B);
Chris@82 744 STM2(&(xo[94]), T80, ovs, &(xo[2]));
Chris@82 745 STN2(&(xo[92]), T7S, T80, ovs);
Chris@82 746 }
Chris@82 747 }
Chris@82 748 }
Chris@82 749 {
Chris@82 750 V T82, T83, T86, T88;
Chris@82 751 {
Chris@82 752 V T6j, T6N, T6J, T6O, T6y, T6R, T6G, T6Q;
Chris@82 753 {
Chris@82 754 V T6b, T6i, T6H, T6I;
Chris@82 755 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@82 756 T6i = VADD(T6e, T6h);
Chris@82 757 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@82 758 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@82 759 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@82 760 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@82 761 T6J = VSUB(T6H, T6I);
Chris@82 762 T6O = VADD(T6H, T6I);
Chris@82 763 }
Chris@82 764 {
Chris@82 765 V T6q, T6x, T6C, T6F;
Chris@82 766 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@82 767 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@82 768 T6y = VADD(T6q, T6x);
Chris@82 769 T6R = VSUB(T6x, T6q);
Chris@82 770 T6C = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@82 771 T6F = VSUB(T6D, T6E);
Chris@82 772 T6G = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@82 773 T6Q = VFMA(LDK(KP831469612), T6F, T6C);
Chris@82 774 }
Chris@82 775 {
Chris@82 776 V T6z, T6K, T81, T6T, T6U, T84;
Chris@82 777 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@82 778 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@82 779 T81 = VFNMSI(T6K, T6z);
Chris@82 780 STM2(&(xo[74]), T81, ovs, &(xo[2]));
Chris@82 781 STN2(&(xo[72]), T7w, T81, ovs);
Chris@82 782 T82 = VFMAI(T6K, T6z);
Chris@82 783 STM2(&(xo[54]), T82, ovs, &(xo[2]));
Chris@82 784 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@82 785 T6U = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@82 786 T83 = VFMAI(T6U, T6T);
Chris@82 787 STM2(&(xo[22]), T83, ovs, &(xo[2]));
Chris@82 788 T84 = VFNMSI(T6U, T6T);
Chris@82 789 STM2(&(xo[106]), T84, ovs, &(xo[2]));
Chris@82 790 STN2(&(xo[104]), T7y, T84, ovs);
Chris@82 791 }
Chris@82 792 {
Chris@82 793 V T6L, T6M, T85, T6P, T6S, T87;
Chris@82 794 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@82 795 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@82 796 T85 = VFNMSI(T6M, T6L);
Chris@82 797 STM2(&(xo[10]), T85, ovs, &(xo[2]));
Chris@82 798 STN2(&(xo[8]), T7A, T85, ovs);
Chris@82 799 T86 = VFMAI(T6M, T6L);
Chris@82 800 STM2(&(xo[118]), T86, ovs, &(xo[2]));
Chris@82 801 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@82 802 T6S = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@82 803 T87 = VFNMSI(T6S, T6P);
Chris@82 804 STM2(&(xo[42]), T87, ovs, &(xo[2]));
Chris@82 805 STN2(&(xo[40]), T7B, T87, ovs);
Chris@82 806 T88 = VFMAI(T6S, T6P);
Chris@82 807 STM2(&(xo[86]), T88, ovs, &(xo[2]));
Chris@82 808 }
Chris@82 809 }
Chris@82 810 {
Chris@82 811 V T89, T8c, T8d, T8f;
Chris@82 812 {
Chris@82 813 V T2t, T2L, T2H, T2M, T2A, T2P, T2E, T2O;
Chris@82 814 {
Chris@82 815 V T2r, T2s, T2F, T2G;
Chris@82 816 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@82 817 T2s = VADD(T2a, T29);
Chris@82 818 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@82 819 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@82 820 T2F = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@82 821 T2G = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@82 822 T2H = VSUB(T2F, T2G);
Chris@82 823 T2M = VADD(T2G, T2F);
Chris@82 824 }
Chris@82 825 {
Chris@82 826 V T2w, T2z, T2C, T2D;
Chris@82 827 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@82 828 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@82 829 T2A = VADD(T2w, T2z);
Chris@82 830 T2P = VSUB(T2z, T2w);
Chris@82 831 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@82 832 T2D = VSUB(TR, TC);
Chris@82 833 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@82 834 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@82 835 }
Chris@82 836 {
Chris@82 837 V T2B, T2I, T8a, T2R, T2S, T8b;
Chris@82 838 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@82 839 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@82 840 T89 = VFNMSI(T2I, T2B);
Chris@82 841 STM2(&(xo[76]), T89, ovs, &(xo[0]));
Chris@82 842 T8a = VFMAI(T2I, T2B);
Chris@82 843 STM2(&(xo[52]), T8a, ovs, &(xo[0]));
Chris@82 844 STN2(&(xo[52]), T8a, T82, ovs);
Chris@82 845 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@82 846 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@82 847 T8b = VFMAI(T2S, T2R);
Chris@82 848 STM2(&(xo[20]), T8b, ovs, &(xo[0]));
Chris@82 849 STN2(&(xo[20]), T8b, T83, ovs);
Chris@82 850 T8c = VFNMSI(T2S, T2R);
Chris@82 851 STM2(&(xo[108]), T8c, ovs, &(xo[0]));
Chris@82 852 }
Chris@82 853 {
Chris@82 854 V T2J, T2K, T8e, T2N, T2Q, T8g;
Chris@82 855 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@82 856 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@82 857 T8d = VFNMSI(T2K, T2J);
Chris@82 858 STM2(&(xo[12]), T8d, ovs, &(xo[0]));
Chris@82 859 T8e = VFMAI(T2K, T2J);
Chris@82 860 STM2(&(xo[116]), T8e, ovs, &(xo[0]));
Chris@82 861 STN2(&(xo[116]), T8e, T86, ovs);
Chris@82 862 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@82 863 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@82 864 T8f = VFNMSI(T2Q, T2N);
Chris@82 865 STM2(&(xo[44]), T8f, ovs, &(xo[0]));
Chris@82 866 T8g = VFMAI(T2Q, T2N);
Chris@82 867 STM2(&(xo[84]), T8g, ovs, &(xo[0]));
Chris@82 868 STN2(&(xo[84]), T8g, T88, ovs);
Chris@82 869 }
Chris@82 870 }
Chris@82 871 {
Chris@82 872 V T5J, T61, T5X, T62, T5Q, T65, T5U, T64;
Chris@82 873 {
Chris@82 874 V T5H, T5I, T5V, T5W;
Chris@82 875 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@82 876 T5I = VADD(T5p, T5q);
Chris@82 877 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@82 878 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@82 879 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@82 880 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@82 881 T5X = VSUB(T5V, T5W);
Chris@82 882 T62 = VADD(T5V, T5W);
Chris@82 883 }
Chris@82 884 {
Chris@82 885 V T5M, T5P, T5S, T5T;
Chris@82 886 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@82 887 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@82 888 T5Q = VADD(T5M, T5P);
Chris@82 889 T65 = VSUB(T5P, T5M);
Chris@82 890 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@82 891 T5T = VSUB(T4x, T4o);
Chris@82 892 T5U = VFMA(LDK(KP980785280), T5T, T5S);
Chris@82 893 T64 = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@82 894 }
Chris@82 895 {
Chris@82 896 V T5R, T5Y, T8h, T8i;
Chris@82 897 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@82 898 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@82 899 T8h = VFNMSI(T5Y, T5R);
Chris@82 900 STM2(&(xo[50]), T8h, ovs, &(xo[2]));
Chris@82 901 STN2(&(xo[48]), T7r, T8h, ovs);
Chris@82 902 T8i = VFMAI(T5Y, T5R);
Chris@82 903 STM2(&(xo[78]), T8i, ovs, &(xo[2]));
Chris@82 904 STN2(&(xo[76]), T89, T8i, ovs);
Chris@82 905 }
Chris@82 906 {
Chris@82 907 V T67, T68, T8j, T8k;
Chris@82 908 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@82 909 T68 = VFNMS(LDK(KP773010453), T65, T64);
Chris@82 910 T8j = VFNMSI(T68, T67);
Chris@82 911 STM2(&(xo[18]), T8j, ovs, &(xo[2]));
Chris@82 912 STN2(&(xo[16]), T7u, T8j, ovs);
Chris@82 913 T8k = VFMAI(T68, T67);
Chris@82 914 STM2(&(xo[110]), T8k, ovs, &(xo[2]));
Chris@82 915 STN2(&(xo[108]), T8c, T8k, ovs);
Chris@82 916 }
Chris@82 917 {
Chris@82 918 V T5Z, T60, T8l, T8m;
Chris@82 919 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@82 920 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@82 921 T8l = VFNMSI(T60, T5Z);
Chris@82 922 STM2(&(xo[114]), T8l, ovs, &(xo[2]));
Chris@82 923 STN2(&(xo[112]), T7t, T8l, ovs);
Chris@82 924 T8m = VFMAI(T60, T5Z);
Chris@82 925 STM2(&(xo[14]), T8m, ovs, &(xo[2]));
Chris@82 926 STN2(&(xo[12]), T8d, T8m, ovs);
Chris@82 927 }
Chris@82 928 {
Chris@82 929 V T63, T66, T8n, T8o;
Chris@82 930 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@82 931 T66 = VFMA(LDK(KP773010453), T65, T64);
Chris@82 932 T8n = VFMAI(T66, T63);
Chris@82 933 STM2(&(xo[46]), T8n, ovs, &(xo[2]));
Chris@82 934 STN2(&(xo[44]), T8f, T8n, ovs);
Chris@82 935 T8o = VFNMSI(T66, T63);
Chris@82 936 STM2(&(xo[82]), T8o, ovs, &(xo[2]));
Chris@82 937 STN2(&(xo[80]), T7s, T8o, ovs);
Chris@82 938 }
Chris@82 939 }
Chris@82 940 }
Chris@82 941 }
Chris@82 942 }
Chris@82 943 }
Chris@82 944 }
Chris@82 945 VLEAVE();
Chris@82 946 }
Chris@82 947
Chris@82 948 static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 949
Chris@82 950 void XSIMD(codelet_n2fv_64) (planner *p) {
Chris@82 951 X(kdft_register) (p, n2fv_64, &desc);
Chris@82 952 }
Chris@82 953
Chris@82 954 #else
Chris@82 955
Chris@82 956 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name n2fv_64 -with-ostride 2 -include dft/simd/n2f.h -store-multiple 2 */
Chris@82 957
Chris@82 958 /*
Chris@82 959 * This function contains 456 FP additions, 124 FP multiplications,
Chris@82 960 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@82 961 * 128 stack variables, 15 constants, and 160 memory accesses
Chris@82 962 */
Chris@82 963 #include "dft/simd/n2f.h"
Chris@82 964
Chris@82 965 static void n2fv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 966 {
Chris@82 967 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@82 968 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 969 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@82 970 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 971 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@82 972 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 973 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@82 974 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 975 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 976 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 977 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 978 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 979 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 980 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 981 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 982 {
Chris@82 983 INT i;
Chris@82 984 const R *xi;
Chris@82 985 R *xo;
Chris@82 986 xi = ri;
Chris@82 987 xo = ro;
Chris@82 988 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 989 V T4p, T5q, Tb, T39, T2n, T3A, T6f, T6T, Tq, T3B, T6i, T76, T2i, T3a, T4w;
Chris@82 990 V T5r, TI, T2p, T6C, T6V, T3h, T3E, T4L, T5u, TZ, T2q, T6F, T6U, T3e, T3D;
Chris@82 991 V T4E, T5t, T23, T2N, T6t, T71, T6w, T72, T2c, T2O, T3t, T41, T5f, T5R, T5k;
Chris@82 992 V T5S, T3w, T42, T1s, T2K, T6m, T6Y, T6p, T6Z, T1B, T2L, T3m, T3Y, T4Y, T5O;
Chris@82 993 V T53, T5P, T3p, T3Z;
Chris@82 994 {
Chris@82 995 V T3, T4n, T2m, T4o, T6, T5p, T9, T5o;
Chris@82 996 {
Chris@82 997 V T1, T2, T2k, T2l;
Chris@82 998 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 999 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 1000 T3 = VSUB(T1, T2);
Chris@82 1001 T4n = VADD(T1, T2);
Chris@82 1002 T2k = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 1003 T2l = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 1004 T2m = VSUB(T2k, T2l);
Chris@82 1005 T4o = VADD(T2k, T2l);
Chris@82 1006 }
Chris@82 1007 {
Chris@82 1008 V T4, T5, T7, T8;
Chris@82 1009 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 1010 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 1011 T6 = VSUB(T4, T5);
Chris@82 1012 T5p = VADD(T4, T5);
Chris@82 1013 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 1014 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 1015 T9 = VSUB(T7, T8);
Chris@82 1016 T5o = VADD(T7, T8);
Chris@82 1017 }
Chris@82 1018 T4p = VSUB(T4n, T4o);
Chris@82 1019 T5q = VSUB(T5o, T5p);
Chris@82 1020 {
Chris@82 1021 V Ta, T2j, T6d, T6e;
Chris@82 1022 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 1023 Tb = VADD(T3, Ta);
Chris@82 1024 T39 = VSUB(T3, Ta);
Chris@82 1025 T2j = VMUL(LDK(KP707106781), VSUB(T9, T6));
Chris@82 1026 T2n = VSUB(T2j, T2m);
Chris@82 1027 T3A = VADD(T2m, T2j);
Chris@82 1028 T6d = VADD(T4n, T4o);
Chris@82 1029 T6e = VADD(T5p, T5o);
Chris@82 1030 T6f = VADD(T6d, T6e);
Chris@82 1031 T6T = VSUB(T6d, T6e);
Chris@82 1032 }
Chris@82 1033 }
Chris@82 1034 {
Chris@82 1035 V Te, T4q, To, T4u, Th, T4r, Tl, T4t;
Chris@82 1036 {
Chris@82 1037 V Tc, Td, Tm, Tn;
Chris@82 1038 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 1039 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 1040 Te = VSUB(Tc, Td);
Chris@82 1041 T4q = VADD(Tc, Td);
Chris@82 1042 Tm = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 1043 Tn = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 1044 To = VSUB(Tm, Tn);
Chris@82 1045 T4u = VADD(Tm, Tn);
Chris@82 1046 }
Chris@82 1047 {
Chris@82 1048 V Tf, Tg, Tj, Tk;
Chris@82 1049 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 1050 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 1051 Th = VSUB(Tf, Tg);
Chris@82 1052 T4r = VADD(Tf, Tg);
Chris@82 1053 Tj = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 1054 Tk = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 1055 Tl = VSUB(Tj, Tk);
Chris@82 1056 T4t = VADD(Tj, Tk);
Chris@82 1057 }
Chris@82 1058 {
Chris@82 1059 V Ti, Tp, T6g, T6h;
Chris@82 1060 Ti = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 1061 Tp = VFMA(LDK(KP923879532), Tl, VMUL(LDK(KP382683432), To));
Chris@82 1062 Tq = VADD(Ti, Tp);
Chris@82 1063 T3B = VSUB(Tp, Ti);
Chris@82 1064 T6g = VADD(T4q, T4r);
Chris@82 1065 T6h = VADD(T4t, T4u);
Chris@82 1066 T6i = VADD(T6g, T6h);
Chris@82 1067 T76 = VSUB(T6h, T6g);
Chris@82 1068 }
Chris@82 1069 {
Chris@82 1070 V T2g, T2h, T4s, T4v;
Chris@82 1071 T2g = VFNMS(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 1072 T2h = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 1073 T2i = VSUB(T2g, T2h);
Chris@82 1074 T3a = VADD(T2h, T2g);
Chris@82 1075 T4s = VSUB(T4q, T4r);
Chris@82 1076 T4v = VSUB(T4t, T4u);
Chris@82 1077 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@82 1078 T5r = VMUL(LDK(KP707106781), VSUB(T4v, T4s));
Chris@82 1079 }
Chris@82 1080 }
Chris@82 1081 {
Chris@82 1082 V Tu, T4F, TG, T4G, TB, T4J, TD, T4I;
Chris@82 1083 {
Chris@82 1084 V Ts, Tt, TE, TF;
Chris@82 1085 Ts = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 1086 Tt = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 1087 Tu = VSUB(Ts, Tt);
Chris@82 1088 T4F = VADD(Ts, Tt);
Chris@82 1089 TE = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 1090 TF = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 1091 TG = VSUB(TE, TF);
Chris@82 1092 T4G = VADD(TE, TF);
Chris@82 1093 {
Chris@82 1094 V Tv, Tw, Tx, Ty, Tz, TA;
Chris@82 1095 Tv = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 1096 Tw = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 1097 Tx = VSUB(Tv, Tw);
Chris@82 1098 Ty = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 1099 Tz = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 1100 TA = VSUB(Ty, Tz);
Chris@82 1101 TB = VMUL(LDK(KP707106781), VADD(Tx, TA));
Chris@82 1102 T4J = VADD(Tv, Tw);
Chris@82 1103 TD = VMUL(LDK(KP707106781), VSUB(TA, Tx));
Chris@82 1104 T4I = VADD(Ty, Tz);
Chris@82 1105 }
Chris@82 1106 }
Chris@82 1107 {
Chris@82 1108 V TC, TH, T6A, T6B;
Chris@82 1109 TC = VADD(Tu, TB);
Chris@82 1110 TH = VSUB(TD, TG);
Chris@82 1111 TI = VFMA(LDK(KP195090322), TC, VMUL(LDK(KP980785280), TH));
Chris@82 1112 T2p = VFNMS(LDK(KP195090322), TH, VMUL(LDK(KP980785280), TC));
Chris@82 1113 T6A = VADD(T4F, T4G);
Chris@82 1114 T6B = VADD(T4J, T4I);
Chris@82 1115 T6C = VADD(T6A, T6B);
Chris@82 1116 T6V = VSUB(T6A, T6B);
Chris@82 1117 }
Chris@82 1118 {
Chris@82 1119 V T3f, T3g, T4H, T4K;
Chris@82 1120 T3f = VSUB(Tu, TB);
Chris@82 1121 T3g = VADD(TG, TD);
Chris@82 1122 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
Chris@82 1123 T3E = VFMA(LDK(KP555570233), T3f, VMUL(LDK(KP831469612), T3g));
Chris@82 1124 T4H = VSUB(T4F, T4G);
Chris@82 1125 T4K = VSUB(T4I, T4J);
Chris@82 1126 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@82 1127 T5u = VFMA(LDK(KP382683432), T4H, VMUL(LDK(KP923879532), T4K));
Chris@82 1128 }
Chris@82 1129 }
Chris@82 1130 {
Chris@82 1131 V TS, T4z, TW, T4y, TP, T4C, TX, T4B;
Chris@82 1132 {
Chris@82 1133 V TQ, TR, TU, TV;
Chris@82 1134 TQ = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 1135 TR = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 1136 TS = VSUB(TQ, TR);
Chris@82 1137 T4z = VADD(TQ, TR);
Chris@82 1138 TU = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 1139 TV = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 1140 TW = VSUB(TU, TV);
Chris@82 1141 T4y = VADD(TU, TV);
Chris@82 1142 {
Chris@82 1143 V TJ, TK, TL, TM, TN, TO;
Chris@82 1144 TJ = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 1145 TK = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 1146 TL = VSUB(TJ, TK);
Chris@82 1147 TM = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 1148 TN = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 1149 TO = VSUB(TM, TN);
Chris@82 1150 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@82 1151 T4C = VADD(TM, TN);
Chris@82 1152 TX = VMUL(LDK(KP707106781), VADD(TO, TL));
Chris@82 1153 T4B = VADD(TJ, TK);
Chris@82 1154 }
Chris@82 1155 }
Chris@82 1156 {
Chris@82 1157 V TT, TY, T6D, T6E;
Chris@82 1158 TT = VSUB(TP, TS);
Chris@82 1159 TY = VADD(TW, TX);
Chris@82 1160 TZ = VFNMS(LDK(KP195090322), TY, VMUL(LDK(KP980785280), TT));
Chris@82 1161 T2q = VFMA(LDK(KP980785280), TY, VMUL(LDK(KP195090322), TT));
Chris@82 1162 T6D = VADD(T4y, T4z);
Chris@82 1163 T6E = VADD(T4C, T4B);
Chris@82 1164 T6F = VADD(T6D, T6E);
Chris@82 1165 T6U = VSUB(T6D, T6E);
Chris@82 1166 }
Chris@82 1167 {
Chris@82 1168 V T3c, T3d, T4A, T4D;
Chris@82 1169 T3c = VSUB(TW, TX);
Chris@82 1170 T3d = VADD(TS, TP);
Chris@82 1171 T3e = VFMA(LDK(KP831469612), T3c, VMUL(LDK(KP555570233), T3d));
Chris@82 1172 T3D = VFNMS(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
Chris@82 1173 T4A = VSUB(T4y, T4z);
Chris@82 1174 T4D = VSUB(T4B, T4C);
Chris@82 1175 T4E = VFMA(LDK(KP923879532), T4A, VMUL(LDK(KP382683432), T4D));
Chris@82 1176 T5t = VFNMS(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@82 1177 }
Chris@82 1178 }
Chris@82 1179 {
Chris@82 1180 V T1F, T55, T2a, T56, T1M, T5h, T27, T5g, T58, T59, T1U, T5a, T25, T5b, T5c;
Chris@82 1181 V T21, T5d, T24;
Chris@82 1182 {
Chris@82 1183 V T1D, T1E, T28, T29;
Chris@82 1184 T1D = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1185 T1E = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1186 T1F = VSUB(T1D, T1E);
Chris@82 1187 T55 = VADD(T1D, T1E);
Chris@82 1188 T28 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1189 T29 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1190 T2a = VSUB(T28, T29);
Chris@82 1191 T56 = VADD(T28, T29);
Chris@82 1192 }
Chris@82 1193 {
Chris@82 1194 V T1G, T1H, T1I, T1J, T1K, T1L;
Chris@82 1195 T1G = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1196 T1H = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1197 T1I = VSUB(T1G, T1H);
Chris@82 1198 T1J = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1199 T1K = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1200 T1L = VSUB(T1J, T1K);
Chris@82 1201 T1M = VMUL(LDK(KP707106781), VADD(T1I, T1L));
Chris@82 1202 T5h = VADD(T1G, T1H);
Chris@82 1203 T27 = VMUL(LDK(KP707106781), VSUB(T1L, T1I));
Chris@82 1204 T5g = VADD(T1J, T1K);
Chris@82 1205 }
Chris@82 1206 {
Chris@82 1207 V T1Q, T1T, T1X, T20;
Chris@82 1208 {
Chris@82 1209 V T1O, T1P, T1R, T1S;
Chris@82 1210 T1O = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1211 T1P = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1212 T1Q = VSUB(T1O, T1P);
Chris@82 1213 T58 = VADD(T1O, T1P);
Chris@82 1214 T1R = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1215 T1S = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1216 T1T = VSUB(T1R, T1S);
Chris@82 1217 T59 = VADD(T1R, T1S);
Chris@82 1218 }
Chris@82 1219 T1U = VFNMS(LDK(KP382683432), T1T, VMUL(LDK(KP923879532), T1Q));
Chris@82 1220 T5a = VSUB(T58, T59);
Chris@82 1221 T25 = VFMA(LDK(KP382683432), T1Q, VMUL(LDK(KP923879532), T1T));
Chris@82 1222 {
Chris@82 1223 V T1V, T1W, T1Y, T1Z;
Chris@82 1224 T1V = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1225 T1W = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1226 T1X = VSUB(T1V, T1W);
Chris@82 1227 T5b = VADD(T1V, T1W);
Chris@82 1228 T1Y = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1229 T1Z = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1230 T20 = VSUB(T1Y, T1Z);
Chris@82 1231 T5c = VADD(T1Y, T1Z);
Chris@82 1232 }
Chris@82 1233 T21 = VFMA(LDK(KP923879532), T1X, VMUL(LDK(KP382683432), T20));
Chris@82 1234 T5d = VSUB(T5b, T5c);
Chris@82 1235 T24 = VFNMS(LDK(KP923879532), T20, VMUL(LDK(KP382683432), T1X));
Chris@82 1236 }
Chris@82 1237 {
Chris@82 1238 V T1N, T22, T6r, T6s;
Chris@82 1239 T1N = VADD(T1F, T1M);
Chris@82 1240 T22 = VADD(T1U, T21);
Chris@82 1241 T23 = VSUB(T1N, T22);
Chris@82 1242 T2N = VADD(T1N, T22);
Chris@82 1243 T6r = VADD(T55, T56);
Chris@82 1244 T6s = VADD(T5h, T5g);
Chris@82 1245 T6t = VADD(T6r, T6s);
Chris@82 1246 T71 = VSUB(T6r, T6s);
Chris@82 1247 }
Chris@82 1248 {
Chris@82 1249 V T6u, T6v, T26, T2b;
Chris@82 1250 T6u = VADD(T58, T59);
Chris@82 1251 T6v = VADD(T5b, T5c);
Chris@82 1252 T6w = VADD(T6u, T6v);
Chris@82 1253 T72 = VSUB(T6v, T6u);
Chris@82 1254 T26 = VSUB(T24, T25);
Chris@82 1255 T2b = VSUB(T27, T2a);
Chris@82 1256 T2c = VSUB(T26, T2b);
Chris@82 1257 T2O = VADD(T2b, T26);
Chris@82 1258 }
Chris@82 1259 {
Chris@82 1260 V T3r, T3s, T57, T5e;
Chris@82 1261 T3r = VSUB(T1F, T1M);
Chris@82 1262 T3s = VADD(T25, T24);
Chris@82 1263 T3t = VADD(T3r, T3s);
Chris@82 1264 T41 = VSUB(T3r, T3s);
Chris@82 1265 T57 = VSUB(T55, T56);
Chris@82 1266 T5e = VMUL(LDK(KP707106781), VADD(T5a, T5d));
Chris@82 1267 T5f = VADD(T57, T5e);
Chris@82 1268 T5R = VSUB(T57, T5e);
Chris@82 1269 }
Chris@82 1270 {
Chris@82 1271 V T5i, T5j, T3u, T3v;
Chris@82 1272 T5i = VSUB(T5g, T5h);
Chris@82 1273 T5j = VMUL(LDK(KP707106781), VSUB(T5d, T5a));
Chris@82 1274 T5k = VADD(T5i, T5j);
Chris@82 1275 T5S = VSUB(T5j, T5i);
Chris@82 1276 T3u = VADD(T2a, T27);
Chris@82 1277 T3v = VSUB(T21, T1U);
Chris@82 1278 T3w = VADD(T3u, T3v);
Chris@82 1279 T42 = VSUB(T3v, T3u);
Chris@82 1280 }
Chris@82 1281 }
Chris@82 1282 {
Chris@82 1283 V T1q, T4P, T1v, T4O, T1n, T50, T1w, T4Z, T4U, T4V, T18, T4W, T1z, T4R, T4S;
Chris@82 1284 V T1f, T4T, T1y;
Chris@82 1285 {
Chris@82 1286 V T1o, T1p, T1t, T1u;
Chris@82 1287 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1288 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1289 T1q = VSUB(T1o, T1p);
Chris@82 1290 T4P = VADD(T1o, T1p);
Chris@82 1291 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1292 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1293 T1v = VSUB(T1t, T1u);
Chris@82 1294 T4O = VADD(T1t, T1u);
Chris@82 1295 }
Chris@82 1296 {
Chris@82 1297 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@82 1298 T1h = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1299 T1i = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1300 T1j = VSUB(T1h, T1i);
Chris@82 1301 T1k = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1302 T1l = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1303 T1m = VSUB(T1k, T1l);
Chris@82 1304 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@82 1305 T50 = VADD(T1k, T1l);
Chris@82 1306 T1w = VMUL(LDK(KP707106781), VADD(T1m, T1j));
Chris@82 1307 T4Z = VADD(T1h, T1i);
Chris@82 1308 }
Chris@82 1309 {
Chris@82 1310 V T14, T17, T1b, T1e;
Chris@82 1311 {
Chris@82 1312 V T12, T13, T15, T16;
Chris@82 1313 T12 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1314 T13 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1315 T14 = VSUB(T12, T13);
Chris@82 1316 T4U = VADD(T12, T13);
Chris@82 1317 T15 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1318 T16 = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1319 T17 = VSUB(T15, T16);
Chris@82 1320 T4V = VADD(T15, T16);
Chris@82 1321 }
Chris@82 1322 T18 = VFNMS(LDK(KP923879532), T17, VMUL(LDK(KP382683432), T14));
Chris@82 1323 T4W = VSUB(T4U, T4V);
Chris@82 1324 T1z = VFMA(LDK(KP923879532), T14, VMUL(LDK(KP382683432), T17));
Chris@82 1325 {
Chris@82 1326 V T19, T1a, T1c, T1d;
Chris@82 1327 T19 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1328 T1a = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1329 T1b = VSUB(T19, T1a);
Chris@82 1330 T4R = VADD(T19, T1a);
Chris@82 1331 T1c = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1332 T1d = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1333 T1e = VSUB(T1c, T1d);
Chris@82 1334 T4S = VADD(T1c, T1d);
Chris@82 1335 }
Chris@82 1336 T1f = VFMA(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@82 1337 T4T = VSUB(T4R, T4S);
Chris@82 1338 T1y = VFNMS(LDK(KP382683432), T1e, VMUL(LDK(KP923879532), T1b));
Chris@82 1339 }
Chris@82 1340 {
Chris@82 1341 V T1g, T1r, T6k, T6l;
Chris@82 1342 T1g = VSUB(T18, T1f);
Chris@82 1343 T1r = VSUB(T1n, T1q);
Chris@82 1344 T1s = VSUB(T1g, T1r);
Chris@82 1345 T2K = VADD(T1r, T1g);
Chris@82 1346 T6k = VADD(T4O, T4P);
Chris@82 1347 T6l = VADD(T50, T4Z);
Chris@82 1348 T6m = VADD(T6k, T6l);
Chris@82 1349 T6Y = VSUB(T6k, T6l);
Chris@82 1350 }
Chris@82 1351 {
Chris@82 1352 V T6n, T6o, T1x, T1A;
Chris@82 1353 T6n = VADD(T4R, T4S);
Chris@82 1354 T6o = VADD(T4U, T4V);
Chris@82 1355 T6p = VADD(T6n, T6o);
Chris@82 1356 T6Z = VSUB(T6o, T6n);
Chris@82 1357 T1x = VADD(T1v, T1w);
Chris@82 1358 T1A = VADD(T1y, T1z);
Chris@82 1359 T1B = VSUB(T1x, T1A);
Chris@82 1360 T2L = VADD(T1x, T1A);
Chris@82 1361 }
Chris@82 1362 {
Chris@82 1363 V T3k, T3l, T4Q, T4X;
Chris@82 1364 T3k = VSUB(T1v, T1w);
Chris@82 1365 T3l = VADD(T1f, T18);
Chris@82 1366 T3m = VADD(T3k, T3l);
Chris@82 1367 T3Y = VSUB(T3k, T3l);
Chris@82 1368 T4Q = VSUB(T4O, T4P);
Chris@82 1369 T4X = VMUL(LDK(KP707106781), VADD(T4T, T4W));
Chris@82 1370 T4Y = VADD(T4Q, T4X);
Chris@82 1371 T5O = VSUB(T4Q, T4X);
Chris@82 1372 }
Chris@82 1373 {
Chris@82 1374 V T51, T52, T3n, T3o;
Chris@82 1375 T51 = VSUB(T4Z, T50);
Chris@82 1376 T52 = VMUL(LDK(KP707106781), VSUB(T4W, T4T));
Chris@82 1377 T53 = VADD(T51, T52);
Chris@82 1378 T5P = VSUB(T52, T51);
Chris@82 1379 T3n = VADD(T1q, T1n);
Chris@82 1380 T3o = VSUB(T1z, T1y);
Chris@82 1381 T3p = VADD(T3n, T3o);
Chris@82 1382 T3Z = VSUB(T3o, T3n);
Chris@82 1383 }
Chris@82 1384 }
Chris@82 1385 {
Chris@82 1386 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
Chris@82 1387 V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
Chris@82 1388 {
Chris@82 1389 V T6N, T6R, T6Q, T6S;
Chris@82 1390 {
Chris@82 1391 V T6L, T6M, T6O, T6P;
Chris@82 1392 T6L = VADD(T6f, T6i);
Chris@82 1393 T6M = VADD(T6F, T6C);
Chris@82 1394 T6N = VADD(T6L, T6M);
Chris@82 1395 T6R = VSUB(T6L, T6M);
Chris@82 1396 T6O = VADD(T6m, T6p);
Chris@82 1397 T6P = VADD(T6t, T6w);
Chris@82 1398 T6Q = VADD(T6O, T6P);
Chris@82 1399 T6S = VBYI(VSUB(T6P, T6O));
Chris@82 1400 }
Chris@82 1401 T7n = VSUB(T6N, T6Q);
Chris@82 1402 STM2(&(xo[64]), T7n, ovs, &(xo[0]));
Chris@82 1403 T7o = VADD(T6R, T6S);
Chris@82 1404 STM2(&(xo[32]), T7o, ovs, &(xo[0]));
Chris@82 1405 T7p = VADD(T6N, T6Q);
Chris@82 1406 STM2(&(xo[0]), T7p, ovs, &(xo[0]));
Chris@82 1407 T7q = VSUB(T6R, T6S);
Chris@82 1408 STM2(&(xo[96]), T7q, ovs, &(xo[0]));
Chris@82 1409 }
Chris@82 1410 {
Chris@82 1411 V T6j, T6G, T6y, T6H, T6q, T6x;
Chris@82 1412 T6j = VSUB(T6f, T6i);
Chris@82 1413 T6G = VSUB(T6C, T6F);
Chris@82 1414 T6q = VSUB(T6m, T6p);
Chris@82 1415 T6x = VSUB(T6t, T6w);
Chris@82 1416 T6y = VMUL(LDK(KP707106781), VADD(T6q, T6x));
Chris@82 1417 T6H = VMUL(LDK(KP707106781), VSUB(T6x, T6q));
Chris@82 1418 {
Chris@82 1419 V T6z, T6I, T6J, T6K;
Chris@82 1420 T6z = VADD(T6j, T6y);
Chris@82 1421 T6I = VBYI(VADD(T6G, T6H));
Chris@82 1422 T7r = VSUB(T6z, T6I);
Chris@82 1423 STM2(&(xo[112]), T7r, ovs, &(xo[0]));
Chris@82 1424 T7s = VADD(T6z, T6I);
Chris@82 1425 STM2(&(xo[16]), T7s, ovs, &(xo[0]));
Chris@82 1426 T6J = VSUB(T6j, T6y);
Chris@82 1427 T6K = VBYI(VSUB(T6H, T6G));
Chris@82 1428 T7t = VSUB(T6J, T6K);
Chris@82 1429 STM2(&(xo[80]), T7t, ovs, &(xo[0]));
Chris@82 1430 T7u = VADD(T6J, T6K);
Chris@82 1431 STM2(&(xo[48]), T7u, ovs, &(xo[0]));
Chris@82 1432 }
Chris@82 1433 }
Chris@82 1434 {
Chris@82 1435 V T6X, T7i, T78, T7g, T74, T7f, T7b, T7j, T6W, T77;
Chris@82 1436 T6W = VMUL(LDK(KP707106781), VADD(T6U, T6V));
Chris@82 1437 T6X = VADD(T6T, T6W);
Chris@82 1438 T7i = VSUB(T6T, T6W);
Chris@82 1439 T77 = VMUL(LDK(KP707106781), VSUB(T6V, T6U));
Chris@82 1440 T78 = VADD(T76, T77);
Chris@82 1441 T7g = VSUB(T77, T76);
Chris@82 1442 {
Chris@82 1443 V T70, T73, T79, T7a;
Chris@82 1444 T70 = VFMA(LDK(KP923879532), T6Y, VMUL(LDK(KP382683432), T6Z));
Chris@82 1445 T73 = VFNMS(LDK(KP382683432), T72, VMUL(LDK(KP923879532), T71));
Chris@82 1446 T74 = VADD(T70, T73);
Chris@82 1447 T7f = VSUB(T73, T70);
Chris@82 1448 T79 = VFNMS(LDK(KP382683432), T6Y, VMUL(LDK(KP923879532), T6Z));
Chris@82 1449 T7a = VFMA(LDK(KP382683432), T71, VMUL(LDK(KP923879532), T72));
Chris@82 1450 T7b = VADD(T79, T7a);
Chris@82 1451 T7j = VSUB(T7a, T79);
Chris@82 1452 }
Chris@82 1453 {
Chris@82 1454 V T75, T7c, T7l, T7m;
Chris@82 1455 T75 = VADD(T6X, T74);
Chris@82 1456 T7c = VBYI(VADD(T78, T7b));
Chris@82 1457 T7v = VSUB(T75, T7c);
Chris@82 1458 STM2(&(xo[120]), T7v, ovs, &(xo[0]));
Chris@82 1459 T7w = VADD(T75, T7c);
Chris@82 1460 STM2(&(xo[8]), T7w, ovs, &(xo[0]));
Chris@82 1461 T7l = VBYI(VADD(T7g, T7f));
Chris@82 1462 T7m = VADD(T7i, T7j);
Chris@82 1463 T7x = VADD(T7l, T7m);
Chris@82 1464 STM2(&(xo[24]), T7x, ovs, &(xo[0]));
Chris@82 1465 T7y = VSUB(T7m, T7l);
Chris@82 1466 STM2(&(xo[104]), T7y, ovs, &(xo[0]));
Chris@82 1467 }
Chris@82 1468 {
Chris@82 1469 V T7d, T7e, T7h, T7k;
Chris@82 1470 T7d = VSUB(T6X, T74);
Chris@82 1471 T7e = VBYI(VSUB(T7b, T78));
Chris@82 1472 T7z = VSUB(T7d, T7e);
Chris@82 1473 STM2(&(xo[72]), T7z, ovs, &(xo[0]));
Chris@82 1474 T7A = VADD(T7d, T7e);
Chris@82 1475 STM2(&(xo[56]), T7A, ovs, &(xo[0]));
Chris@82 1476 T7h = VBYI(VSUB(T7f, T7g));
Chris@82 1477 T7k = VSUB(T7i, T7j);
Chris@82 1478 T7B = VADD(T7h, T7k);
Chris@82 1479 STM2(&(xo[40]), T7B, ovs, &(xo[0]));
Chris@82 1480 T7C = VSUB(T7k, T7h);
Chris@82 1481 STM2(&(xo[88]), T7C, ovs, &(xo[0]));
Chris@82 1482 }
Chris@82 1483 }
Chris@82 1484 {
Chris@82 1485 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@82 1486 {
Chris@82 1487 V T5L, T5M, T5Z, T60;
Chris@82 1488 T5L = VSUB(T4p, T4w);
Chris@82 1489 T5M = VSUB(T5u, T5t);
Chris@82 1490 T5N = VADD(T5L, T5M);
Chris@82 1491 T68 = VSUB(T5L, T5M);
Chris@82 1492 T5Z = VFNMS(LDK(KP555570233), T5O, VMUL(LDK(KP831469612), T5P));
Chris@82 1493 T60 = VFMA(LDK(KP555570233), T5R, VMUL(LDK(KP831469612), T5S));
Chris@82 1494 T61 = VADD(T5Z, T60);
Chris@82 1495 T69 = VSUB(T60, T5Z);
Chris@82 1496 }
Chris@82 1497 {
Chris@82 1498 V T5Q, T5T, T5W, T5X;
Chris@82 1499 T5Q = VFMA(LDK(KP831469612), T5O, VMUL(LDK(KP555570233), T5P));
Chris@82 1500 T5T = VFNMS(LDK(KP555570233), T5S, VMUL(LDK(KP831469612), T5R));
Chris@82 1501 T5U = VADD(T5Q, T5T);
Chris@82 1502 T65 = VSUB(T5T, T5Q);
Chris@82 1503 T5W = VSUB(T5r, T5q);
Chris@82 1504 T5X = VSUB(T4L, T4E);
Chris@82 1505 T5Y = VADD(T5W, T5X);
Chris@82 1506 T66 = VSUB(T5X, T5W);
Chris@82 1507 }
Chris@82 1508 {
Chris@82 1509 V T5V, T62, T6b, T6c;
Chris@82 1510 T5V = VADD(T5N, T5U);
Chris@82 1511 T62 = VBYI(VADD(T5Y, T61));
Chris@82 1512 T7D = VSUB(T5V, T62);
Chris@82 1513 STM2(&(xo[116]), T7D, ovs, &(xo[0]));
Chris@82 1514 T7E = VADD(T5V, T62);
Chris@82 1515 STM2(&(xo[12]), T7E, ovs, &(xo[0]));
Chris@82 1516 T6b = VBYI(VADD(T66, T65));
Chris@82 1517 T6c = VADD(T68, T69);
Chris@82 1518 T7F = VADD(T6b, T6c);
Chris@82 1519 STM2(&(xo[20]), T7F, ovs, &(xo[0]));
Chris@82 1520 T7G = VSUB(T6c, T6b);
Chris@82 1521 STM2(&(xo[108]), T7G, ovs, &(xo[0]));
Chris@82 1522 }
Chris@82 1523 {
Chris@82 1524 V T63, T64, T67, T6a;
Chris@82 1525 T63 = VSUB(T5N, T5U);
Chris@82 1526 T64 = VBYI(VSUB(T61, T5Y));
Chris@82 1527 T7H = VSUB(T63, T64);
Chris@82 1528 STM2(&(xo[76]), T7H, ovs, &(xo[0]));
Chris@82 1529 T7I = VADD(T63, T64);
Chris@82 1530 STM2(&(xo[52]), T7I, ovs, &(xo[0]));
Chris@82 1531 T67 = VBYI(VSUB(T65, T66));
Chris@82 1532 T6a = VSUB(T68, T69);
Chris@82 1533 T7J = VADD(T67, T6a);
Chris@82 1534 STM2(&(xo[44]), T7J, ovs, &(xo[0]));
Chris@82 1535 T7K = VSUB(T6a, T67);
Chris@82 1536 STM2(&(xo[84]), T7K, ovs, &(xo[0]));
Chris@82 1537 }
Chris@82 1538 }
Chris@82 1539 {
Chris@82 1540 V T7U, T7W, T7X, T7Z;
Chris@82 1541 {
Chris@82 1542 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@82 1543 {
Chris@82 1544 V Tr, T10, T2t, T2u;
Chris@82 1545 Tr = VSUB(Tb, Tq);
Chris@82 1546 T10 = VSUB(TI, TZ);
Chris@82 1547 T11 = VADD(Tr, T10);
Chris@82 1548 T2C = VSUB(Tr, T10);
Chris@82 1549 T2t = VFNMS(LDK(KP634393284), T1B, VMUL(LDK(KP773010453), T1s));
Chris@82 1550 T2u = VFMA(LDK(KP773010453), T2c, VMUL(LDK(KP634393284), T23));
Chris@82 1551 T2v = VADD(T2t, T2u);
Chris@82 1552 T2D = VSUB(T2u, T2t);
Chris@82 1553 }
Chris@82 1554 {
Chris@82 1555 V T1C, T2d, T2o, T2r;
Chris@82 1556 T1C = VFMA(LDK(KP634393284), T1s, VMUL(LDK(KP773010453), T1B));
Chris@82 1557 T2d = VFNMS(LDK(KP634393284), T2c, VMUL(LDK(KP773010453), T23));
Chris@82 1558 T2e = VADD(T1C, T2d);
Chris@82 1559 T2z = VSUB(T2d, T1C);
Chris@82 1560 T2o = VSUB(T2i, T2n);
Chris@82 1561 T2r = VSUB(T2p, T2q);
Chris@82 1562 T2s = VADD(T2o, T2r);
Chris@82 1563 T2A = VSUB(T2r, T2o);
Chris@82 1564 }
Chris@82 1565 {
Chris@82 1566 V T2f, T2w, T7L, T7M;
Chris@82 1567 T2f = VADD(T11, T2e);
Chris@82 1568 T2w = VBYI(VADD(T2s, T2v));
Chris@82 1569 T7L = VSUB(T2f, T2w);
Chris@82 1570 STM2(&(xo[114]), T7L, ovs, &(xo[2]));
Chris@82 1571 STN2(&(xo[112]), T7r, T7L, ovs);
Chris@82 1572 T7M = VADD(T2f, T2w);
Chris@82 1573 STM2(&(xo[14]), T7M, ovs, &(xo[2]));
Chris@82 1574 STN2(&(xo[12]), T7E, T7M, ovs);
Chris@82 1575 }
Chris@82 1576 {
Chris@82 1577 V T2F, T2G, T7N, T7O;
Chris@82 1578 T2F = VBYI(VADD(T2A, T2z));
Chris@82 1579 T2G = VADD(T2C, T2D);
Chris@82 1580 T7N = VADD(T2F, T2G);
Chris@82 1581 STM2(&(xo[18]), T7N, ovs, &(xo[2]));
Chris@82 1582 STN2(&(xo[16]), T7s, T7N, ovs);
Chris@82 1583 T7O = VSUB(T2G, T2F);
Chris@82 1584 STM2(&(xo[110]), T7O, ovs, &(xo[2]));
Chris@82 1585 STN2(&(xo[108]), T7G, T7O, ovs);
Chris@82 1586 }
Chris@82 1587 {
Chris@82 1588 V T2x, T2y, T7P, T7Q;
Chris@82 1589 T2x = VSUB(T11, T2e);
Chris@82 1590 T2y = VBYI(VSUB(T2v, T2s));
Chris@82 1591 T7P = VSUB(T2x, T2y);
Chris@82 1592 STM2(&(xo[78]), T7P, ovs, &(xo[2]));
Chris@82 1593 STN2(&(xo[76]), T7H, T7P, ovs);
Chris@82 1594 T7Q = VADD(T2x, T2y);
Chris@82 1595 STM2(&(xo[50]), T7Q, ovs, &(xo[2]));
Chris@82 1596 STN2(&(xo[48]), T7u, T7Q, ovs);
Chris@82 1597 }
Chris@82 1598 {
Chris@82 1599 V T2B, T2E, T7R, T7S;
Chris@82 1600 T2B = VBYI(VSUB(T2z, T2A));
Chris@82 1601 T2E = VSUB(T2C, T2D);
Chris@82 1602 T7R = VADD(T2B, T2E);
Chris@82 1603 STM2(&(xo[46]), T7R, ovs, &(xo[2]));
Chris@82 1604 STN2(&(xo[44]), T7J, T7R, ovs);
Chris@82 1605 T7S = VSUB(T2E, T2B);
Chris@82 1606 STM2(&(xo[82]), T7S, ovs, &(xo[2]));
Chris@82 1607 STN2(&(xo[80]), T7t, T7S, ovs);
Chris@82 1608 }
Chris@82 1609 }
Chris@82 1610 {
Chris@82 1611 V T3j, T3Q, T3J, T3R, T3y, T3N, T3G, T3O;
Chris@82 1612 {
Chris@82 1613 V T3b, T3i, T3H, T3I;
Chris@82 1614 T3b = VADD(T39, T3a);
Chris@82 1615 T3i = VADD(T3e, T3h);
Chris@82 1616 T3j = VADD(T3b, T3i);
Chris@82 1617 T3Q = VSUB(T3b, T3i);
Chris@82 1618 T3H = VFNMS(LDK(KP290284677), T3m, VMUL(LDK(KP956940335), T3p));
Chris@82 1619 T3I = VFMA(LDK(KP290284677), T3t, VMUL(LDK(KP956940335), T3w));
Chris@82 1620 T3J = VADD(T3H, T3I);
Chris@82 1621 T3R = VSUB(T3I, T3H);
Chris@82 1622 }
Chris@82 1623 {
Chris@82 1624 V T3q, T3x, T3C, T3F;
Chris@82 1625 T3q = VFMA(LDK(KP956940335), T3m, VMUL(LDK(KP290284677), T3p));
Chris@82 1626 T3x = VFNMS(LDK(KP290284677), T3w, VMUL(LDK(KP956940335), T3t));
Chris@82 1627 T3y = VADD(T3q, T3x);
Chris@82 1628 T3N = VSUB(T3x, T3q);
Chris@82 1629 T3C = VADD(T3A, T3B);
Chris@82 1630 T3F = VADD(T3D, T3E);
Chris@82 1631 T3G = VADD(T3C, T3F);
Chris@82 1632 T3O = VSUB(T3F, T3C);
Chris@82 1633 }
Chris@82 1634 {
Chris@82 1635 V T3z, T3K, T7T, T3T, T3U, T7V;
Chris@82 1636 T3z = VADD(T3j, T3y);
Chris@82 1637 T3K = VBYI(VADD(T3G, T3J));
Chris@82 1638 T7T = VSUB(T3z, T3K);
Chris@82 1639 STM2(&(xo[122]), T7T, ovs, &(xo[2]));
Chris@82 1640 STN2(&(xo[120]), T7v, T7T, ovs);
Chris@82 1641 T7U = VADD(T3z, T3K);
Chris@82 1642 STM2(&(xo[6]), T7U, ovs, &(xo[2]));
Chris@82 1643 T3T = VBYI(VADD(T3O, T3N));
Chris@82 1644 T3U = VADD(T3Q, T3R);
Chris@82 1645 T7V = VADD(T3T, T3U);
Chris@82 1646 STM2(&(xo[26]), T7V, ovs, &(xo[2]));
Chris@82 1647 STN2(&(xo[24]), T7x, T7V, ovs);
Chris@82 1648 T7W = VSUB(T3U, T3T);
Chris@82 1649 STM2(&(xo[102]), T7W, ovs, &(xo[2]));
Chris@82 1650 }
Chris@82 1651 {
Chris@82 1652 V T3L, T3M, T7Y, T3P, T3S, T80;
Chris@82 1653 T3L = VSUB(T3j, T3y);
Chris@82 1654 T3M = VBYI(VSUB(T3J, T3G));
Chris@82 1655 T7X = VSUB(T3L, T3M);
Chris@82 1656 STM2(&(xo[70]), T7X, ovs, &(xo[2]));
Chris@82 1657 T7Y = VADD(T3L, T3M);
Chris@82 1658 STM2(&(xo[58]), T7Y, ovs, &(xo[2]));
Chris@82 1659 STN2(&(xo[56]), T7A, T7Y, ovs);
Chris@82 1660 T3P = VBYI(VSUB(T3N, T3O));
Chris@82 1661 T3S = VSUB(T3Q, T3R);
Chris@82 1662 T7Z = VADD(T3P, T3S);
Chris@82 1663 STM2(&(xo[38]), T7Z, ovs, &(xo[2]));
Chris@82 1664 T80 = VSUB(T3S, T3P);
Chris@82 1665 STM2(&(xo[90]), T80, ovs, &(xo[2]));
Chris@82 1666 STN2(&(xo[88]), T7C, T80, ovs);
Chris@82 1667 }
Chris@82 1668 }
Chris@82 1669 {
Chris@82 1670 V T81, T83, T86, T88;
Chris@82 1671 {
Chris@82 1672 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@82 1673 {
Chris@82 1674 V T4x, T4M, T5x, T5y;
Chris@82 1675 T4x = VADD(T4p, T4w);
Chris@82 1676 T4M = VADD(T4E, T4L);
Chris@82 1677 T4N = VADD(T4x, T4M);
Chris@82 1678 T5G = VSUB(T4x, T4M);
Chris@82 1679 T5x = VFNMS(LDK(KP195090322), T4Y, VMUL(LDK(KP980785280), T53));
Chris@82 1680 T5y = VFMA(LDK(KP195090322), T5f, VMUL(LDK(KP980785280), T5k));
Chris@82 1681 T5z = VADD(T5x, T5y);
Chris@82 1682 T5H = VSUB(T5y, T5x);
Chris@82 1683 }
Chris@82 1684 {
Chris@82 1685 V T54, T5l, T5s, T5v;
Chris@82 1686 T54 = VFMA(LDK(KP980785280), T4Y, VMUL(LDK(KP195090322), T53));
Chris@82 1687 T5l = VFNMS(LDK(KP195090322), T5k, VMUL(LDK(KP980785280), T5f));
Chris@82 1688 T5m = VADD(T54, T5l);
Chris@82 1689 T5D = VSUB(T5l, T54);
Chris@82 1690 T5s = VADD(T5q, T5r);
Chris@82 1691 T5v = VADD(T5t, T5u);
Chris@82 1692 T5w = VADD(T5s, T5v);
Chris@82 1693 T5E = VSUB(T5v, T5s);
Chris@82 1694 }
Chris@82 1695 {
Chris@82 1696 V T5n, T5A, T82, T5J, T5K, T84;
Chris@82 1697 T5n = VADD(T4N, T5m);
Chris@82 1698 T5A = VBYI(VADD(T5w, T5z));
Chris@82 1699 T81 = VSUB(T5n, T5A);
Chris@82 1700 STM2(&(xo[124]), T81, ovs, &(xo[0]));
Chris@82 1701 T82 = VADD(T5n, T5A);
Chris@82 1702 STM2(&(xo[4]), T82, ovs, &(xo[0]));
Chris@82 1703 STN2(&(xo[4]), T82, T7U, ovs);
Chris@82 1704 T5J = VBYI(VADD(T5E, T5D));
Chris@82 1705 T5K = VADD(T5G, T5H);
Chris@82 1706 T83 = VADD(T5J, T5K);
Chris@82 1707 STM2(&(xo[28]), T83, ovs, &(xo[0]));
Chris@82 1708 T84 = VSUB(T5K, T5J);
Chris@82 1709 STM2(&(xo[100]), T84, ovs, &(xo[0]));
Chris@82 1710 STN2(&(xo[100]), T84, T7W, ovs);
Chris@82 1711 }
Chris@82 1712 {
Chris@82 1713 V T5B, T5C, T85, T5F, T5I, T87;
Chris@82 1714 T5B = VSUB(T4N, T5m);
Chris@82 1715 T5C = VBYI(VSUB(T5z, T5w));
Chris@82 1716 T85 = VSUB(T5B, T5C);
Chris@82 1717 STM2(&(xo[68]), T85, ovs, &(xo[0]));
Chris@82 1718 STN2(&(xo[68]), T85, T7X, ovs);
Chris@82 1719 T86 = VADD(T5B, T5C);
Chris@82 1720 STM2(&(xo[60]), T86, ovs, &(xo[0]));
Chris@82 1721 T5F = VBYI(VSUB(T5D, T5E));
Chris@82 1722 T5I = VSUB(T5G, T5H);
Chris@82 1723 T87 = VADD(T5F, T5I);
Chris@82 1724 STM2(&(xo[36]), T87, ovs, &(xo[0]));
Chris@82 1725 STN2(&(xo[36]), T87, T7Z, ovs);
Chris@82 1726 T88 = VSUB(T5I, T5F);
Chris@82 1727 STM2(&(xo[92]), T88, ovs, &(xo[0]));
Chris@82 1728 }
Chris@82 1729 }
Chris@82 1730 {
Chris@82 1731 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@82 1732 {
Chris@82 1733 V T2H, T2I, T2V, T2W;
Chris@82 1734 T2H = VADD(Tb, Tq);
Chris@82 1735 T2I = VADD(T2q, T2p);
Chris@82 1736 T2J = VADD(T2H, T2I);
Chris@82 1737 T34 = VSUB(T2H, T2I);
Chris@82 1738 T2V = VFNMS(LDK(KP098017140), T2L, VMUL(LDK(KP995184726), T2K));
Chris@82 1739 T2W = VFMA(LDK(KP995184726), T2O, VMUL(LDK(KP098017140), T2N));
Chris@82 1740 T2X = VADD(T2V, T2W);
Chris@82 1741 T35 = VSUB(T2W, T2V);
Chris@82 1742 }
Chris@82 1743 {
Chris@82 1744 V T2M, T2P, T2S, T2T;
Chris@82 1745 T2M = VFMA(LDK(KP098017140), T2K, VMUL(LDK(KP995184726), T2L));
Chris@82 1746 T2P = VFNMS(LDK(KP098017140), T2O, VMUL(LDK(KP995184726), T2N));
Chris@82 1747 T2Q = VADD(T2M, T2P);
Chris@82 1748 T31 = VSUB(T2P, T2M);
Chris@82 1749 T2S = VADD(T2n, T2i);
Chris@82 1750 T2T = VADD(TZ, TI);
Chris@82 1751 T2U = VADD(T2S, T2T);
Chris@82 1752 T32 = VSUB(T2T, T2S);
Chris@82 1753 }
Chris@82 1754 {
Chris@82 1755 V T2R, T2Y, T89, T8a;
Chris@82 1756 T2R = VADD(T2J, T2Q);
Chris@82 1757 T2Y = VBYI(VADD(T2U, T2X));
Chris@82 1758 T89 = VSUB(T2R, T2Y);
Chris@82 1759 STM2(&(xo[126]), T89, ovs, &(xo[2]));
Chris@82 1760 STN2(&(xo[124]), T81, T89, ovs);
Chris@82 1761 T8a = VADD(T2R, T2Y);
Chris@82 1762 STM2(&(xo[2]), T8a, ovs, &(xo[2]));
Chris@82 1763 STN2(&(xo[0]), T7p, T8a, ovs);
Chris@82 1764 }
Chris@82 1765 {
Chris@82 1766 V T37, T38, T8b, T8c;
Chris@82 1767 T37 = VBYI(VADD(T32, T31));
Chris@82 1768 T38 = VADD(T34, T35);
Chris@82 1769 T8b = VADD(T37, T38);
Chris@82 1770 STM2(&(xo[30]), T8b, ovs, &(xo[2]));
Chris@82 1771 STN2(&(xo[28]), T83, T8b, ovs);
Chris@82 1772 T8c = VSUB(T38, T37);
Chris@82 1773 STM2(&(xo[98]), T8c, ovs, &(xo[2]));
Chris@82 1774 STN2(&(xo[96]), T7q, T8c, ovs);
Chris@82 1775 }
Chris@82 1776 {
Chris@82 1777 V T2Z, T30, T8d, T8e;
Chris@82 1778 T2Z = VSUB(T2J, T2Q);
Chris@82 1779 T30 = VBYI(VSUB(T2X, T2U));
Chris@82 1780 T8d = VSUB(T2Z, T30);
Chris@82 1781 STM2(&(xo[66]), T8d, ovs, &(xo[2]));
Chris@82 1782 STN2(&(xo[64]), T7n, T8d, ovs);
Chris@82 1783 T8e = VADD(T2Z, T30);
Chris@82 1784 STM2(&(xo[62]), T8e, ovs, &(xo[2]));
Chris@82 1785 STN2(&(xo[60]), T86, T8e, ovs);
Chris@82 1786 }
Chris@82 1787 {
Chris@82 1788 V T33, T36, T8f, T8g;
Chris@82 1789 T33 = VBYI(VSUB(T31, T32));
Chris@82 1790 T36 = VSUB(T34, T35);
Chris@82 1791 T8f = VADD(T33, T36);
Chris@82 1792 STM2(&(xo[34]), T8f, ovs, &(xo[2]));
Chris@82 1793 STN2(&(xo[32]), T7o, T8f, ovs);
Chris@82 1794 T8g = VSUB(T36, T33);
Chris@82 1795 STM2(&(xo[94]), T8g, ovs, &(xo[2]));
Chris@82 1796 STN2(&(xo[92]), T88, T8g, ovs);
Chris@82 1797 }
Chris@82 1798 }
Chris@82 1799 {
Chris@82 1800 V T3X, T4i, T4b, T4j, T44, T4f, T48, T4g;
Chris@82 1801 {
Chris@82 1802 V T3V, T3W, T49, T4a;
Chris@82 1803 T3V = VSUB(T39, T3a);
Chris@82 1804 T3W = VSUB(T3E, T3D);
Chris@82 1805 T3X = VADD(T3V, T3W);
Chris@82 1806 T4i = VSUB(T3V, T3W);
Chris@82 1807 T49 = VFNMS(LDK(KP471396736), T3Y, VMUL(LDK(KP881921264), T3Z));
Chris@82 1808 T4a = VFMA(LDK(KP471396736), T41, VMUL(LDK(KP881921264), T42));
Chris@82 1809 T4b = VADD(T49, T4a);
Chris@82 1810 T4j = VSUB(T4a, T49);
Chris@82 1811 }
Chris@82 1812 {
Chris@82 1813 V T40, T43, T46, T47;
Chris@82 1814 T40 = VFMA(LDK(KP881921264), T3Y, VMUL(LDK(KP471396736), T3Z));
Chris@82 1815 T43 = VFNMS(LDK(KP471396736), T42, VMUL(LDK(KP881921264), T41));
Chris@82 1816 T44 = VADD(T40, T43);
Chris@82 1817 T4f = VSUB(T43, T40);
Chris@82 1818 T46 = VSUB(T3B, T3A);
Chris@82 1819 T47 = VSUB(T3h, T3e);
Chris@82 1820 T48 = VADD(T46, T47);
Chris@82 1821 T4g = VSUB(T47, T46);
Chris@82 1822 }
Chris@82 1823 {
Chris@82 1824 V T45, T4c, T8h, T8i;
Chris@82 1825 T45 = VADD(T3X, T44);
Chris@82 1826 T4c = VBYI(VADD(T48, T4b));
Chris@82 1827 T8h = VSUB(T45, T4c);
Chris@82 1828 STM2(&(xo[118]), T8h, ovs, &(xo[2]));
Chris@82 1829 STN2(&(xo[116]), T7D, T8h, ovs);
Chris@82 1830 T8i = VADD(T45, T4c);
Chris@82 1831 STM2(&(xo[10]), T8i, ovs, &(xo[2]));
Chris@82 1832 STN2(&(xo[8]), T7w, T8i, ovs);
Chris@82 1833 }
Chris@82 1834 {
Chris@82 1835 V T4l, T4m, T8j, T8k;
Chris@82 1836 T4l = VBYI(VADD(T4g, T4f));
Chris@82 1837 T4m = VADD(T4i, T4j);
Chris@82 1838 T8j = VADD(T4l, T4m);
Chris@82 1839 STM2(&(xo[22]), T8j, ovs, &(xo[2]));
Chris@82 1840 STN2(&(xo[20]), T7F, T8j, ovs);
Chris@82 1841 T8k = VSUB(T4m, T4l);
Chris@82 1842 STM2(&(xo[106]), T8k, ovs, &(xo[2]));
Chris@82 1843 STN2(&(xo[104]), T7y, T8k, ovs);
Chris@82 1844 }
Chris@82 1845 {
Chris@82 1846 V T4d, T4e, T8l, T8m;
Chris@82 1847 T4d = VSUB(T3X, T44);
Chris@82 1848 T4e = VBYI(VSUB(T4b, T48));
Chris@82 1849 T8l = VSUB(T4d, T4e);
Chris@82 1850 STM2(&(xo[74]), T8l, ovs, &(xo[2]));
Chris@82 1851 STN2(&(xo[72]), T7z, T8l, ovs);
Chris@82 1852 T8m = VADD(T4d, T4e);
Chris@82 1853 STM2(&(xo[54]), T8m, ovs, &(xo[2]));
Chris@82 1854 STN2(&(xo[52]), T7I, T8m, ovs);
Chris@82 1855 }
Chris@82 1856 {
Chris@82 1857 V T4h, T4k, T8n, T8o;
Chris@82 1858 T4h = VBYI(VSUB(T4f, T4g));
Chris@82 1859 T4k = VSUB(T4i, T4j);
Chris@82 1860 T8n = VADD(T4h, T4k);
Chris@82 1861 STM2(&(xo[42]), T8n, ovs, &(xo[2]));
Chris@82 1862 STN2(&(xo[40]), T7B, T8n, ovs);
Chris@82 1863 T8o = VSUB(T4k, T4h);
Chris@82 1864 STM2(&(xo[86]), T8o, ovs, &(xo[2]));
Chris@82 1865 STN2(&(xo[84]), T7K, T8o, ovs);
Chris@82 1866 }
Chris@82 1867 }
Chris@82 1868 }
Chris@82 1869 }
Chris@82 1870 }
Chris@82 1871 }
Chris@82 1872 }
Chris@82 1873 VLEAVE();
Chris@82 1874 }
Chris@82 1875
/* Codelet descriptor: size-64 DFT named "n2fv_64"; the opcount triple
 * {404, 72, 52, 0} is genfft's generated add/mul/fma tally used by the
 * planner's cost model, and the trailing 0, 2, 0, 0 encode the genus
 * flags/output stride (ostride 2) expected by kdft_register.
 * NOTE(review): values are machine-generated — do not hand-edit. */
static const kdft_desc desc = { 64, XSIMD_STRING("n2fv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
Chris@82 1877
/* Registration entry point: installs the n2fv_64 SIMD codelet (and its
 * descriptor above) into the FFTW planner's DFT solver registry.  The
 * XSIMD() macro mangles the symbol name per the active SIMD ISA so that
 * multiple ISA builds of this codelet can coexist in one library. */
void XSIMD(codelet_n2fv_64) (planner *p) {
     X(kdft_register) (p, n2fv_64, &desc);
}
Chris@82 1881
Chris@82 1882 #endif