annotate src/fftw-3.3.8/dft/simd/common/n1bv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:58 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include dft/simd/n1b.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@82 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@82 33 * 108 stack variables, 15 constants, and 128 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n1b.h"
Chris@82 36
Chris@82 37 static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@82 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@82 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@82 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@82 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 50 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 51 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 54 {
Chris@82 55 INT i;
Chris@82 56 const R *xi;
Chris@82 57 R *xo;
Chris@82 58 xi = ii;
Chris@82 59 xo = io;
Chris@82 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 61 V T7, T26, T47, T69, T5k, T6A, T2V, T3z, Tm, T27, T5n, T6a, T2Y, T3M, T4e;
Chris@82 62 V T6B, TC, T29, T6e, T6D, T3i, T3A, T4o, T5p, TR, T2a, T6h, T6E, T3l, T3B;
Chris@82 63 V T4x, T5q, T1N, T2x, T6t, T71, T6w, T72, T1W, T2y, T39, T3H, T57, T5N, T5e;
Chris@82 64 V T5O, T3c, T3I, T1g, T2u, T6m, T6Y, T6p, T6Z, T1p, T2v, T32, T3E, T4M, T5K;
Chris@82 65 V T4T, T5L, T35, T3F;
Chris@82 66 {
Chris@82 67 V T3, T43, T25, T45, T6, T5i, T22, T44;
Chris@82 68 {
Chris@82 69 V T1, T2, T23, T24;
Chris@82 70 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 71 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 72 T3 = VADD(T1, T2);
Chris@82 73 T43 = VSUB(T1, T2);
Chris@82 74 T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 75 T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 76 T25 = VADD(T23, T24);
Chris@82 77 T45 = VSUB(T23, T24);
Chris@82 78 }
Chris@82 79 {
Chris@82 80 V T4, T5, T20, T21;
Chris@82 81 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 82 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 83 T6 = VADD(T4, T5);
Chris@82 84 T5i = VSUB(T4, T5);
Chris@82 85 T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 86 T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 87 T22 = VADD(T20, T21);
Chris@82 88 T44 = VSUB(T20, T21);
Chris@82 89 }
Chris@82 90 T7 = VSUB(T3, T6);
Chris@82 91 T26 = VSUB(T22, T25);
Chris@82 92 {
Chris@82 93 V T46, T5j, T2T, T2U;
Chris@82 94 T46 = VADD(T44, T45);
Chris@82 95 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@82 96 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@82 97 T5j = VSUB(T44, T45);
Chris@82 98 T5k = VFMA(LDK(KP707106781), T5j, T5i);
Chris@82 99 T6A = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@82 100 T2T = VADD(T3, T6);
Chris@82 101 T2U = VADD(T22, T25);
Chris@82 102 T2V = VADD(T2T, T2U);
Chris@82 103 T3z = VSUB(T2T, T2U);
Chris@82 104 }
Chris@82 105 }
Chris@82 106 {
Chris@82 107 V Ta, T48, Tk, T4c, Td, T49, Th, T4b;
Chris@82 108 {
Chris@82 109 V T8, T9, Ti, Tj;
Chris@82 110 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 111 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 112 Ta = VADD(T8, T9);
Chris@82 113 T48 = VSUB(T8, T9);
Chris@82 114 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 115 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 116 Tk = VADD(Ti, Tj);
Chris@82 117 T4c = VSUB(Tj, Ti);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 V Tb, Tc, Tf, Tg;
Chris@82 121 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 122 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 123 Td = VADD(Tb, Tc);
Chris@82 124 T49 = VSUB(Tb, Tc);
Chris@82 125 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 126 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 127 Th = VADD(Tf, Tg);
Chris@82 128 T4b = VSUB(Tf, Tg);
Chris@82 129 }
Chris@82 130 {
Chris@82 131 V Te, Tl, T5l, T5m;
Chris@82 132 Te = VSUB(Ta, Td);
Chris@82 133 Tl = VSUB(Th, Tk);
Chris@82 134 Tm = VADD(Te, Tl);
Chris@82 135 T27 = VSUB(Te, Tl);
Chris@82 136 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@82 137 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@82 138 T5n = VSUB(T5l, T5m);
Chris@82 139 T6a = VADD(T5l, T5m);
Chris@82 140 }
Chris@82 141 {
Chris@82 142 V T2W, T2X, T4a, T4d;
Chris@82 143 T2W = VADD(Ta, Td);
Chris@82 144 T2X = VADD(Th, Tk);
Chris@82 145 T2Y = VADD(T2W, T2X);
Chris@82 146 T3M = VSUB(T2W, T2X);
Chris@82 147 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@82 148 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@82 149 T4e = VADD(T4a, T4d);
Chris@82 150 T6B = VSUB(T4a, T4d);
Chris@82 151 }
Chris@82 152 }
Chris@82 153 {
Chris@82 154 V Tq, T4g, Tt, T4l, Tx, T4m, TA, T4j;
Chris@82 155 {
Chris@82 156 V To, Tp, Tr, Ts;
Chris@82 157 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 158 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 159 Tq = VADD(To, Tp);
Chris@82 160 T4g = VSUB(To, Tp);
Chris@82 161 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 162 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 163 Tt = VADD(Tr, Ts);
Chris@82 164 T4l = VSUB(Tr, Ts);
Chris@82 165 {
Chris@82 166 V Tv, Tw, T4h, Ty, Tz, T4i;
Chris@82 167 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 168 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 169 T4h = VSUB(Tv, Tw);
Chris@82 170 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 171 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 172 T4i = VSUB(Ty, Tz);
Chris@82 173 Tx = VADD(Tv, Tw);
Chris@82 174 T4m = VSUB(T4h, T4i);
Chris@82 175 TA = VADD(Ty, Tz);
Chris@82 176 T4j = VADD(T4h, T4i);
Chris@82 177 }
Chris@82 178 }
Chris@82 179 {
Chris@82 180 V Tu, TB, T6c, T6d;
Chris@82 181 Tu = VSUB(Tq, Tt);
Chris@82 182 TB = VSUB(Tx, TA);
Chris@82 183 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@82 184 T29 = VFMA(LDK(KP414213562), Tu, TB);
Chris@82 185 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@82 186 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@82 187 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@82 188 T6D = VFMA(LDK(KP668178637), T6c, T6d);
Chris@82 189 }
Chris@82 190 {
Chris@82 191 V T3g, T3h, T4k, T4n;
Chris@82 192 T3g = VADD(Tq, Tt);
Chris@82 193 T3h = VADD(Tx, TA);
Chris@82 194 T3i = VADD(T3g, T3h);
Chris@82 195 T3A = VSUB(T3g, T3h);
Chris@82 196 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@82 197 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@82 198 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@82 199 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@82 200 }
Chris@82 201 }
Chris@82 202 {
Chris@82 203 V TF, T4p, TI, T4u, TM, T4v, TP, T4s;
Chris@82 204 {
Chris@82 205 V TD, TE, TG, TH;
Chris@82 206 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 207 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 208 TF = VADD(TD, TE);
Chris@82 209 T4p = VSUB(TD, TE);
Chris@82 210 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 211 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 212 TI = VADD(TG, TH);
Chris@82 213 T4u = VSUB(TH, TG);
Chris@82 214 {
Chris@82 215 V TK, TL, T4r, TN, TO, T4q;
Chris@82 216 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 217 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 218 T4r = VSUB(TK, TL);
Chris@82 219 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 220 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 221 T4q = VSUB(TN, TO);
Chris@82 222 TM = VADD(TK, TL);
Chris@82 223 T4v = VSUB(T4r, T4q);
Chris@82 224 TP = VADD(TN, TO);
Chris@82 225 T4s = VADD(T4q, T4r);
Chris@82 226 }
Chris@82 227 }
Chris@82 228 {
Chris@82 229 V TJ, TQ, T6f, T6g;
Chris@82 230 TJ = VSUB(TF, TI);
Chris@82 231 TQ = VSUB(TM, TP);
Chris@82 232 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@82 233 T2a = VFMA(LDK(KP414213562), TJ, TQ);
Chris@82 234 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@82 235 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@82 236 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@82 237 T6E = VFMA(LDK(KP668178637), T6f, T6g);
Chris@82 238 }
Chris@82 239 {
Chris@82 240 V T3j, T3k, T4t, T4w;
Chris@82 241 T3j = VADD(TF, TI);
Chris@82 242 T3k = VADD(TP, TM);
Chris@82 243 T3l = VADD(T3j, T3k);
Chris@82 244 T3B = VSUB(T3j, T3k);
Chris@82 245 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@82 246 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@82 247 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@82 248 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@82 249 }
Chris@82 250 }
Chris@82 251 {
Chris@82 252 V T1t, T4V, T1w, T58, T1Q, T59, T1T, T4Y, T1A, T1D, T1E, T5b, T52, T1H, T1K;
Chris@82 253 V T1L, T5c, T55;
Chris@82 254 {
Chris@82 255 V T1r, T1s, T1u, T1v;
Chris@82 256 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 257 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 258 T1t = VADD(T1r, T1s);
Chris@82 259 T4V = VSUB(T1r, T1s);
Chris@82 260 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 261 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 262 T1w = VADD(T1u, T1v);
Chris@82 263 T58 = VSUB(T1v, T1u);
Chris@82 264 }
Chris@82 265 {
Chris@82 266 V T1O, T1P, T4X, T1R, T1S, T4W;
Chris@82 267 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 268 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 269 T4X = VSUB(T1O, T1P);
Chris@82 270 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 271 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 272 T4W = VSUB(T1R, T1S);
Chris@82 273 T1Q = VADD(T1O, T1P);
Chris@82 274 T59 = VSUB(T4X, T4W);
Chris@82 275 T1T = VADD(T1R, T1S);
Chris@82 276 T4Y = VADD(T4W, T4X);
Chris@82 277 }
Chris@82 278 {
Chris@82 279 V T50, T51, T53, T54;
Chris@82 280 {
Chris@82 281 V T1y, T1z, T1B, T1C;
Chris@82 282 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 283 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 284 T1A = VADD(T1y, T1z);
Chris@82 285 T50 = VSUB(T1y, T1z);
Chris@82 286 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 287 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 288 T1D = VADD(T1B, T1C);
Chris@82 289 T51 = VSUB(T1C, T1B);
Chris@82 290 }
Chris@82 291 T1E = VSUB(T1A, T1D);
Chris@82 292 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@82 293 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@82 294 {
Chris@82 295 V T1F, T1G, T1I, T1J;
Chris@82 296 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 297 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 298 T1H = VADD(T1F, T1G);
Chris@82 299 T53 = VSUB(T1F, T1G);
Chris@82 300 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 301 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 302 T1K = VADD(T1I, T1J);
Chris@82 303 T54 = VSUB(T1J, T1I);
Chris@82 304 }
Chris@82 305 T1L = VSUB(T1H, T1K);
Chris@82 306 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@82 307 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@82 308 }
Chris@82 309 {
Chris@82 310 V T1x, T1M, T6r, T6s;
Chris@82 311 T1x = VSUB(T1t, T1w);
Chris@82 312 T1M = VADD(T1E, T1L);
Chris@82 313 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@82 314 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@82 315 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@82 316 T6s = VSUB(T5c, T5b);
Chris@82 317 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@82 318 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@82 319 }
Chris@82 320 {
Chris@82 321 V T6u, T6v, T1U, T1V;
Chris@82 322 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@82 323 T6v = VSUB(T55, T52);
Chris@82 324 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@82 325 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@82 326 T1U = VSUB(T1Q, T1T);
Chris@82 327 T1V = VSUB(T1L, T1E);
Chris@82 328 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@82 329 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@82 330 }
Chris@82 331 {
Chris@82 332 V T37, T38, T4Z, T56;
Chris@82 333 T37 = VADD(T1t, T1w);
Chris@82 334 T38 = VADD(T1T, T1Q);
Chris@82 335 T39 = VADD(T37, T38);
Chris@82 336 T3H = VSUB(T37, T38);
Chris@82 337 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@82 338 T56 = VADD(T52, T55);
Chris@82 339 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@82 340 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@82 341 }
Chris@82 342 {
Chris@82 343 V T5a, T5d, T3a, T3b;
Chris@82 344 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@82 345 T5d = VADD(T5b, T5c);
Chris@82 346 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@82 347 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@82 348 T3a = VADD(T1A, T1D);
Chris@82 349 T3b = VADD(T1H, T1K);
Chris@82 350 T3c = VADD(T3a, T3b);
Chris@82 351 T3I = VSUB(T3b, T3a);
Chris@82 352 }
Chris@82 353 }
Chris@82 354 {
Chris@82 355 V TW, T4A, TZ, T4N, T1j, T4O, T1m, T4D, T13, T16, T17, T4Q, T4H, T1a, T1d;
Chris@82 356 V T1e, T4R, T4K;
Chris@82 357 {
Chris@82 358 V TU, TV, TX, TY;
Chris@82 359 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 360 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 361 TW = VADD(TU, TV);
Chris@82 362 T4A = VSUB(TU, TV);
Chris@82 363 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 364 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 365 TZ = VADD(TX, TY);
Chris@82 366 T4N = VSUB(TX, TY);
Chris@82 367 }
Chris@82 368 {
Chris@82 369 V T1h, T1i, T4B, T1k, T1l, T4C;
Chris@82 370 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 371 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 372 T4B = VSUB(T1h, T1i);
Chris@82 373 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 374 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 375 T4C = VSUB(T1k, T1l);
Chris@82 376 T1j = VADD(T1h, T1i);
Chris@82 377 T4O = VSUB(T4B, T4C);
Chris@82 378 T1m = VADD(T1k, T1l);
Chris@82 379 T4D = VADD(T4B, T4C);
Chris@82 380 }
Chris@82 381 {
Chris@82 382 V T4F, T4G, T4I, T4J;
Chris@82 383 {
Chris@82 384 V T11, T12, T14, T15;
Chris@82 385 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 386 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 387 T13 = VADD(T11, T12);
Chris@82 388 T4F = VSUB(T11, T12);
Chris@82 389 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 390 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 391 T16 = VADD(T14, T15);
Chris@82 392 T4G = VSUB(T14, T15);
Chris@82 393 }
Chris@82 394 T17 = VSUB(T13, T16);
Chris@82 395 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@82 396 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@82 397 {
Chris@82 398 V T18, T19, T1b, T1c;
Chris@82 399 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 400 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 401 T1a = VADD(T18, T19);
Chris@82 402 T4I = VSUB(T18, T19);
Chris@82 403 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 404 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 405 T1d = VADD(T1b, T1c);
Chris@82 406 T4J = VSUB(T1b, T1c);
Chris@82 407 }
Chris@82 408 T1e = VSUB(T1a, T1d);
Chris@82 409 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@82 410 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@82 411 }
Chris@82 412 {
Chris@82 413 V T10, T1f, T6k, T6l;
Chris@82 414 T10 = VSUB(TW, TZ);
Chris@82 415 T1f = VADD(T17, T1e);
Chris@82 416 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@82 417 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@82 418 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@82 419 T6l = VSUB(T4Q, T4R);
Chris@82 420 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@82 421 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@82 422 }
Chris@82 423 {
Chris@82 424 V T6n, T6o, T1n, T1o;
Chris@82 425 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@82 426 T6o = VSUB(T4H, T4K);
Chris@82 427 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@82 428 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@82 429 T1n = VSUB(T1j, T1m);
Chris@82 430 T1o = VSUB(T17, T1e);
Chris@82 431 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@82 432 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@82 433 }
Chris@82 434 {
Chris@82 435 V T30, T31, T4E, T4L;
Chris@82 436 T30 = VADD(TW, TZ);
Chris@82 437 T31 = VADD(T1j, T1m);
Chris@82 438 T32 = VADD(T30, T31);
Chris@82 439 T3E = VSUB(T30, T31);
Chris@82 440 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@82 441 T4L = VADD(T4H, T4K);
Chris@82 442 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@82 443 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@82 444 }
Chris@82 445 {
Chris@82 446 V T4P, T4S, T33, T34;
Chris@82 447 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@82 448 T4S = VADD(T4Q, T4R);
Chris@82 449 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@82 450 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@82 451 T33 = VADD(T13, T16);
Chris@82 452 T34 = VADD(T1a, T1d);
Chris@82 453 T35 = VADD(T33, T34);
Chris@82 454 T3F = VSUB(T33, T34);
Chris@82 455 }
Chris@82 456 }
Chris@82 457 {
Chris@82 458 V T3t, T3x, T3w, T3y;
Chris@82 459 {
Chris@82 460 V T3r, T3s, T3u, T3v;
Chris@82 461 T3r = VADD(T2V, T2Y);
Chris@82 462 T3s = VADD(T3i, T3l);
Chris@82 463 T3t = VSUB(T3r, T3s);
Chris@82 464 T3x = VADD(T3r, T3s);
Chris@82 465 T3u = VADD(T32, T35);
Chris@82 466 T3v = VADD(T39, T3c);
Chris@82 467 T3w = VSUB(T3u, T3v);
Chris@82 468 T3y = VADD(T3u, T3v);
Chris@82 469 }
Chris@82 470 ST(&(xo[WS(os, 48)]), VFNMSI(T3w, T3t), ovs, &(xo[0]));
Chris@82 471 ST(&(xo[0]), VADD(T3x, T3y), ovs, &(xo[0]));
Chris@82 472 ST(&(xo[WS(os, 16)]), VFMAI(T3w, T3t), ovs, &(xo[0]));
Chris@82 473 ST(&(xo[WS(os, 32)]), VSUB(T3x, T3y), ovs, &(xo[0]));
Chris@82 474 }
Chris@82 475 {
Chris@82 476 V T2Z, T3m, T3e, T3n, T36, T3d;
Chris@82 477 T2Z = VSUB(T2V, T2Y);
Chris@82 478 T3m = VSUB(T3i, T3l);
Chris@82 479 T36 = VSUB(T32, T35);
Chris@82 480 T3d = VSUB(T39, T3c);
Chris@82 481 T3e = VADD(T36, T3d);
Chris@82 482 T3n = VSUB(T36, T3d);
Chris@82 483 {
Chris@82 484 V T3f, T3o, T3p, T3q;
Chris@82 485 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@82 486 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@82 487 ST(&(xo[WS(os, 24)]), VFNMSI(T3o, T3f), ovs, &(xo[0]));
Chris@82 488 ST(&(xo[WS(os, 40)]), VFMAI(T3o, T3f), ovs, &(xo[0]));
Chris@82 489 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@82 490 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@82 491 ST(&(xo[WS(os, 8)]), VFMAI(T3q, T3p), ovs, &(xo[0]));
Chris@82 492 ST(&(xo[WS(os, 56)]), VFNMSI(T3q, T3p), ovs, &(xo[0]));
Chris@82 493 }
Chris@82 494 }
Chris@82 495 {
Chris@82 496 V T3D, T3V, T3O, T3Y, T3K, T3Z, T3R, T3W, T3C, T3N;
Chris@82 497 T3C = VADD(T3A, T3B);
Chris@82 498 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@82 499 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@82 500 T3N = VSUB(T3A, T3B);
Chris@82 501 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@82 502 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@82 503 {
Chris@82 504 V T3G, T3J, T3P, T3Q;
Chris@82 505 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@82 506 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@82 507 T3K = VADD(T3G, T3J);
Chris@82 508 T3Z = VSUB(T3G, T3J);
Chris@82 509 T3P = VFMA(LDK(KP414213562), T3E, T3F);
Chris@82 510 T3Q = VFMA(LDK(KP414213562), T3H, T3I);
Chris@82 511 T3R = VSUB(T3P, T3Q);
Chris@82 512 T3W = VADD(T3P, T3Q);
Chris@82 513 }
Chris@82 514 {
Chris@82 515 V T3L, T3S, T41, T42;
Chris@82 516 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@82 517 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@82 518 ST(&(xo[WS(os, 28)]), VFNMSI(T3S, T3L), ovs, &(xo[0]));
Chris@82 519 ST(&(xo[WS(os, 36)]), VFMAI(T3S, T3L), ovs, &(xo[0]));
Chris@82 520 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@82 521 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@82 522 ST(&(xo[WS(os, 12)]), VFNMSI(T42, T41), ovs, &(xo[0]));
Chris@82 523 ST(&(xo[WS(os, 52)]), VFMAI(T42, T41), ovs, &(xo[0]));
Chris@82 524 }
Chris@82 525 {
Chris@82 526 V T3T, T3U, T3X, T40;
Chris@82 527 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@82 528 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@82 529 ST(&(xo[WS(os, 60)]), VFNMSI(T3U, T3T), ovs, &(xo[0]));
Chris@82 530 ST(&(xo[WS(os, 4)]), VFMAI(T3U, T3T), ovs, &(xo[0]));
Chris@82 531 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@82 532 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@82 533 ST(&(xo[WS(os, 20)]), VFMAI(T40, T3X), ovs, &(xo[0]));
Chris@82 534 ST(&(xo[WS(os, 44)]), VFNMSI(T40, T3X), ovs, &(xo[0]));
Chris@82 535 }
Chris@82 536 }
Chris@82 537 {
Chris@82 538 V T6X, T7f, T7b, T7g, T74, T7j, T78, T7i;
Chris@82 539 {
Chris@82 540 V T6V, T6W, T79, T7a;
Chris@82 541 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@82 542 T6W = VADD(T6D, T6E);
Chris@82 543 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@82 544 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@82 545 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@82 546 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@82 547 T7b = VSUB(T79, T7a);
Chris@82 548 T7g = VADD(T79, T7a);
Chris@82 549 }
Chris@82 550 {
Chris@82 551 V T70, T73, T76, T77;
Chris@82 552 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@82 553 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@82 554 T74 = VADD(T70, T73);
Chris@82 555 T7j = VSUB(T70, T73);
Chris@82 556 T76 = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@82 557 T77 = VSUB(T6e, T6h);
Chris@82 558 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@82 559 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@82 560 }
Chris@82 561 {
Chris@82 562 V T75, T7c, T7l, T7m;
Chris@82 563 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@82 564 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@82 565 ST(&(xo[WS(os, 35)]), VFNMSI(T7c, T75), ovs, &(xo[WS(os, 1)]));
Chris@82 566 ST(&(xo[WS(os, 29)]), VFMAI(T7c, T75), ovs, &(xo[WS(os, 1)]));
Chris@82 567 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@82 568 T7m = VFMA(LDK(KP956940335), T7j, T7i);
Chris@82 569 ST(&(xo[WS(os, 13)]), VFMAI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
Chris@82 570 ST(&(xo[WS(os, 51)]), VFNMSI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
Chris@82 571 }
Chris@82 572 {
Chris@82 573 V T7d, T7e, T7h, T7k;
Chris@82 574 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@82 575 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@82 576 ST(&(xo[WS(os, 3)]), VFNMSI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
Chris@82 577 ST(&(xo[WS(os, 61)]), VFMAI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
Chris@82 578 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@82 579 T7k = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@82 580 ST(&(xo[WS(os, 19)]), VFNMSI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
Chris@82 581 ST(&(xo[WS(os, 45)]), VFMAI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
Chris@82 582 }
Chris@82 583 }
Chris@82 584 {
Chris@82 585 V TT, T2j, T2f, T2k, T1Y, T2n, T2c, T2m;
Chris@82 586 {
Chris@82 587 V Tn, TS, T2d, T2e;
Chris@82 588 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@82 589 TS = VADD(TC, TR);
Chris@82 590 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@82 591 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@82 592 T2d = VFMA(LDK(KP198912367), T1g, T1p);
Chris@82 593 T2e = VFMA(LDK(KP198912367), T1N, T1W);
Chris@82 594 T2f = VSUB(T2d, T2e);
Chris@82 595 T2k = VADD(T2d, T2e);
Chris@82 596 }
Chris@82 597 {
Chris@82 598 V T1q, T1X, T28, T2b;
Chris@82 599 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@82 600 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@82 601 T1Y = VADD(T1q, T1X);
Chris@82 602 T2n = VSUB(T1q, T1X);
Chris@82 603 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@82 604 T2b = VSUB(T29, T2a);
Chris@82 605 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@82 606 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@82 607 }
Chris@82 608 {
Chris@82 609 V T1Z, T2g, T2p, T2q;
Chris@82 610 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@82 611 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@82 612 ST(&(xo[WS(os, 30)]), VFNMSI(T2g, T1Z), ovs, &(xo[0]));
Chris@82 613 ST(&(xo[WS(os, 34)]), VFMAI(T2g, T1Z), ovs, &(xo[0]));
Chris@82 614 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@82 615 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@82 616 ST(&(xo[WS(os, 14)]), VFNMSI(T2q, T2p), ovs, &(xo[0]));
Chris@82 617 ST(&(xo[WS(os, 50)]), VFMAI(T2q, T2p), ovs, &(xo[0]));
Chris@82 618 }
Chris@82 619 {
Chris@82 620 V T2h, T2i, T2l, T2o;
Chris@82 621 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@82 622 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@82 623 ST(&(xo[WS(os, 62)]), VFNMSI(T2i, T2h), ovs, &(xo[0]));
Chris@82 624 ST(&(xo[WS(os, 2)]), VFMAI(T2i, T2h), ovs, &(xo[0]));
Chris@82 625 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@82 626 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@82 627 ST(&(xo[WS(os, 18)]), VFMAI(T2o, T2l), ovs, &(xo[0]));
Chris@82 628 ST(&(xo[WS(os, 46)]), VFNMSI(T2o, T2l), ovs, &(xo[0]));
Chris@82 629 }
Chris@82 630 }
Chris@82 631 {
Chris@82 632 V T4z, T5z, T5v, T5A, T5g, T5D, T5s, T5C;
Chris@82 633 {
Chris@82 634 V T4f, T4y, T5t, T5u;
Chris@82 635 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@82 636 T4y = VADD(T4o, T4x);
Chris@82 637 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@82 638 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@82 639 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@82 640 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@82 641 T5v = VSUB(T5t, T5u);
Chris@82 642 T5A = VADD(T5t, T5u);
Chris@82 643 }
Chris@82 644 {
Chris@82 645 V T4U, T5f, T5o, T5r;
Chris@82 646 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@82 647 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@82 648 T5g = VADD(T4U, T5f);
Chris@82 649 T5D = VSUB(T4U, T5f);
Chris@82 650 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@82 651 T5r = VSUB(T5p, T5q);
Chris@82 652 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@82 653 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@82 654 }
Chris@82 655 {
Chris@82 656 V T5h, T5w, T5F, T5G;
Chris@82 657 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@82 658 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@82 659 ST(&(xo[WS(os, 31)]), VFNMSI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
Chris@82 660 ST(&(xo[WS(os, 33)]), VFMAI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
Chris@82 661 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@82 662 T5G = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@82 663 ST(&(xo[WS(os, 15)]), VFNMSI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
Chris@82 664 ST(&(xo[WS(os, 49)]), VFMAI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
Chris@82 665 }
Chris@82 666 {
Chris@82 667 V T5x, T5y, T5B, T5E;
Chris@82 668 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@82 669 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@82 670 ST(&(xo[WS(os, 63)]), VFNMSI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
Chris@82 671 ST(&(xo[WS(os, 1)]), VFMAI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
Chris@82 672 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@82 673 T5E = VFMA(LDK(KP995184726), T5D, T5C);
Chris@82 674 ST(&(xo[WS(os, 17)]), VFMAI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
Chris@82 675 ST(&(xo[WS(os, 47)]), VFNMSI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
Chris@82 676 }
Chris@82 677 }
Chris@82 678 {
Chris@82 679 V T6j, T6N, T6J, T6O, T6y, T6R, T6G, T6Q;
Chris@82 680 {
Chris@82 681 V T6b, T6i, T6H, T6I;
Chris@82 682 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@82 683 T6i = VADD(T6e, T6h);
Chris@82 684 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@82 685 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@82 686 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@82 687 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@82 688 T6J = VSUB(T6H, T6I);
Chris@82 689 T6O = VADD(T6H, T6I);
Chris@82 690 }
Chris@82 691 {
Chris@82 692 V T6q, T6x, T6C, T6F;
Chris@82 693 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@82 694 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@82 695 T6y = VADD(T6q, T6x);
Chris@82 696 T6R = VSUB(T6q, T6x);
Chris@82 697 T6C = VFMA(LDK(KP923879532), T6B, T6A);
Chris@82 698 T6F = VSUB(T6D, T6E);
Chris@82 699 T6G = VFMA(LDK(KP831469612), T6F, T6C);
Chris@82 700 T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@82 701 }
Chris@82 702 {
Chris@82 703 V T6z, T6K, T6T, T6U;
Chris@82 704 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@82 705 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@82 706 ST(&(xo[WS(os, 27)]), VFNMSI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
Chris@82 707 ST(&(xo[WS(os, 37)]), VFMAI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
Chris@82 708 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@82 709 T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@82 710 ST(&(xo[WS(os, 11)]), VFNMSI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
Chris@82 711 ST(&(xo[WS(os, 53)]), VFMAI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
Chris@82 712 }
Chris@82 713 {
Chris@82 714 V T6L, T6M, T6P, T6S;
Chris@82 715 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@82 716 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@82 717 ST(&(xo[WS(os, 59)]), VFNMSI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
Chris@82 718 ST(&(xo[WS(os, 5)]), VFMAI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
Chris@82 719 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@82 720 T6S = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@82 721 ST(&(xo[WS(os, 21)]), VFMAI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
Chris@82 722 ST(&(xo[WS(os, 43)]), VFNMSI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
Chris@82 723 }
Chris@82 724 }
Chris@82 725 {
Chris@82 726 V T2t, T2L, T2H, T2M, T2A, T2P, T2E, T2O;
Chris@82 727 {
Chris@82 728 V T2r, T2s, T2F, T2G;
Chris@82 729 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@82 730 T2s = VADD(T29, T2a);
Chris@82 731 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@82 732 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@82 733 T2F = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@82 734 T2G = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@82 735 T2H = VSUB(T2F, T2G);
Chris@82 736 T2M = VADD(T2F, T2G);
Chris@82 737 }
Chris@82 738 {
Chris@82 739 V T2w, T2z, T2C, T2D;
Chris@82 740 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@82 741 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@82 742 T2A = VADD(T2w, T2z);
Chris@82 743 T2P = VSUB(T2w, T2z);
Chris@82 744 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@82 745 T2D = VSUB(TC, TR);
Chris@82 746 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@82 747 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@82 748 }
Chris@82 749 {
Chris@82 750 V T2B, T2I, T2R, T2S;
Chris@82 751 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@82 752 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@82 753 ST(&(xo[WS(os, 38)]), VFNMSI(T2I, T2B), ovs, &(xo[0]));
Chris@82 754 ST(&(xo[WS(os, 26)]), VFMAI(T2I, T2B), ovs, &(xo[0]));
Chris@82 755 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@82 756 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@82 757 ST(&(xo[WS(os, 10)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
Chris@82 758 ST(&(xo[WS(os, 54)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
Chris@82 759 }
Chris@82 760 {
Chris@82 761 V T2J, T2K, T2N, T2Q;
Chris@82 762 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@82 763 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@82 764 ST(&(xo[WS(os, 6)]), VFNMSI(T2K, T2J), ovs, &(xo[0]));
Chris@82 765 ST(&(xo[WS(os, 58)]), VFMAI(T2K, T2J), ovs, &(xo[0]));
Chris@82 766 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@82 767 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@82 768 ST(&(xo[WS(os, 22)]), VFNMSI(T2Q, T2N), ovs, &(xo[0]));
Chris@82 769 ST(&(xo[WS(os, 42)]), VFMAI(T2Q, T2N), ovs, &(xo[0]));
Chris@82 770 }
Chris@82 771 }
Chris@82 772 {
Chris@82 773 V T5J, T61, T5X, T62, T5Q, T65, T5U, T64;
Chris@82 774 {
Chris@82 775 V T5H, T5I, T5V, T5W;
Chris@82 776 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@82 777 T5I = VADD(T5p, T5q);
Chris@82 778 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@82 779 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@82 780 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@82 781 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@82 782 T5X = VSUB(T5V, T5W);
Chris@82 783 T62 = VADD(T5V, T5W);
Chris@82 784 }
Chris@82 785 {
Chris@82 786 V T5M, T5P, T5S, T5T;
Chris@82 787 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@82 788 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@82 789 T5Q = VADD(T5M, T5P);
Chris@82 790 T65 = VSUB(T5M, T5P);
Chris@82 791 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@82 792 T5T = VSUB(T4o, T4x);
Chris@82 793 T5U = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@82 794 T64 = VFMA(LDK(KP980785280), T5T, T5S);
Chris@82 795 }
Chris@82 796 {
Chris@82 797 V T5R, T5Y, T67, T68;
Chris@82 798 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@82 799 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@82 800 ST(&(xo[WS(os, 39)]), VFNMSI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
Chris@82 801 ST(&(xo[WS(os, 25)]), VFMAI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
Chris@82 802 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@82 803 T68 = VFMA(LDK(KP773010453), T65, T64);
Chris@82 804 ST(&(xo[WS(os, 9)]), VFMAI(T68, T67), ovs, &(xo[WS(os, 1)]));
Chris@82 805 ST(&(xo[WS(os, 55)]), VFNMSI(T68, T67), ovs, &(xo[WS(os, 1)]));
Chris@82 806 }
Chris@82 807 {
Chris@82 808 V T5Z, T60, T63, T66;
Chris@82 809 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@82 810 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@82 811 ST(&(xo[WS(os, 7)]), VFNMSI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
Chris@82 812 ST(&(xo[WS(os, 57)]), VFMAI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
Chris@82 813 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@82 814 T66 = VFNMS(LDK(KP773010453), T65, T64);
Chris@82 815 ST(&(xo[WS(os, 23)]), VFNMSI(T66, T63), ovs, &(xo[WS(os, 1)]));
Chris@82 816 ST(&(xo[WS(os, 41)]), VFMAI(T66, T63), ovs, &(xo[WS(os, 1)]));
Chris@82 817 }
Chris@82 818 }
Chris@82 819 }
Chris@82 820 }
Chris@82 821 VLEAVE();
Chris@82 822 }
Chris@82 823
Chris@82 824 static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {198, 0, 258, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 825
Chris@82 826 void XSIMD(codelet_n1bv_64) (planner *p) {
Chris@82 827 X(kdft_register) (p, n1bv_64, &desc);
Chris@82 828 }
Chris@82 829
Chris@82 830 #else
Chris@82 831
Chris@82 832 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include dft/simd/n1b.h */
Chris@82 833
Chris@82 834 /*
Chris@82 835 * This function contains 456 FP additions, 124 FP multiplications,
Chris@82 836 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@82 837 * 108 stack variables, 15 constants, and 128 memory accesses
Chris@82 838 */
Chris@82 839 #include "dft/simd/n1b.h"
Chris@82 840
Chris@82 841 static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 842 {
Chris@82 843 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@82 844 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@82 845 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@82 846 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@82 847 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@82 848 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@82 849 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@82 850 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@82 851 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 852 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 853 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 854 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 855 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 856 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 857 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 858 {
Chris@82 859 INT i;
Chris@82 860 const R *xi;
Chris@82 861 R *xo;
Chris@82 862 xi = ii;
Chris@82 863 xo = io;
Chris@82 864 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 865 V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
Chris@82 866 V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
Chris@82 867 V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
Chris@82 868 V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
Chris@82 869 V T53, T5P, T3e, T3W;
Chris@82 870 {
Chris@82 871 V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
Chris@82 872 {
Chris@82 873 V T1, T2, T2n, T2o;
Chris@82 874 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@82 875 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@82 876 T3 = VSUB(T1, T2);
Chris@82 877 T4n = VADD(T1, T2);
Chris@82 878 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@82 879 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@82 880 T2p = VSUB(T2n, T2o);
Chris@82 881 T4o = VADD(T2n, T2o);
Chris@82 882 }
Chris@82 883 {
Chris@82 884 V T4, T5, T7, T8;
Chris@82 885 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@82 886 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@82 887 T6 = VSUB(T4, T5);
Chris@82 888 T5s = VADD(T4, T5);
Chris@82 889 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@82 890 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@82 891 T9 = VSUB(T7, T8);
Chris@82 892 T5t = VADD(T7, T8);
Chris@82 893 }
Chris@82 894 T4p = VSUB(T4n, T4o);
Chris@82 895 T5u = VSUB(T5s, T5t);
Chris@82 896 {
Chris@82 897 V Ta, T2m, T6E, T6F;
Chris@82 898 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 899 Tb = VSUB(T3, Ta);
Chris@82 900 T3A = VADD(T3, Ta);
Chris@82 901 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@82 902 T2q = VSUB(T2m, T2p);
Chris@82 903 T3v = VADD(T2p, T2m);
Chris@82 904 T6E = VADD(T4n, T4o);
Chris@82 905 T6F = VADD(T5s, T5t);
Chris@82 906 T6G = VSUB(T6E, T6F);
Chris@82 907 T78 = VADD(T6E, T6F);
Chris@82 908 }
Chris@82 909 }
Chris@82 910 {
Chris@82 911 V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
Chris@82 912 {
Chris@82 913 V Tc, Td, Tm, Tn;
Chris@82 914 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@82 915 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@82 916 Te = VSUB(Tc, Td);
Chris@82 917 T4q = VADD(Tc, Td);
Chris@82 918 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@82 919 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@82 920 To = VSUB(Tm, Tn);
Chris@82 921 T4t = VADD(Tm, Tn);
Chris@82 922 }
Chris@82 923 {
Chris@82 924 V Tf, Tg, Tj, Tk;
Chris@82 925 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@82 926 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@82 927 Th = VSUB(Tf, Tg);
Chris@82 928 T4r = VADD(Tf, Tg);
Chris@82 929 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@82 930 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@82 931 Tl = VSUB(Tj, Tk);
Chris@82 932 T4u = VADD(Tj, Tk);
Chris@82 933 }
Chris@82 934 {
Chris@82 935 V Ti, Tp, T6z, T6A;
Chris@82 936 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@82 937 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@82 938 Tq = VSUB(Ti, Tp);
Chris@82 939 T3w = VADD(Ti, Tp);
Chris@82 940 T6z = VADD(T4q, T4r);
Chris@82 941 T6A = VADD(T4t, T4u);
Chris@82 942 T6B = VSUB(T6z, T6A);
Chris@82 943 T79 = VADD(T6z, T6A);
Chris@82 944 }
Chris@82 945 {
Chris@82 946 V T2j, T2k, T4s, T4v;
Chris@82 947 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@82 948 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@82 949 T2l = VSUB(T2j, T2k);
Chris@82 950 T3B = VADD(T2j, T2k);
Chris@82 951 T4s = VSUB(T4q, T4r);
Chris@82 952 T4v = VSUB(T4t, T4u);
Chris@82 953 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@82 954 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
Chris@82 955 }
Chris@82 956 }
Chris@82 957 {
Chris@82 958 V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
Chris@82 959 {
Chris@82 960 V Tz, TA, TD, TE;
Chris@82 961 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@82 962 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@82 963 TB = VSUB(Tz, TA);
Chris@82 964 T4z = VADD(Tz, TA);
Chris@82 965 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@82 966 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@82 967 TF = VSUB(TD, TE);
Chris@82 968 T4y = VADD(TD, TE);
Chris@82 969 {
Chris@82 970 V Ts, Tt, Tu, Tv, Tw, Tx;
Chris@82 971 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@82 972 Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@82 973 Tu = VSUB(Ts, Tt);
Chris@82 974 Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@82 975 Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@82 976 Tx = VSUB(Tv, Tw);
Chris@82 977 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@82 978 T4C = VADD(Tv, Tw);
Chris@82 979 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@82 980 T4B = VADD(Ts, Tt);
Chris@82 981 }
Chris@82 982 }
Chris@82 983 {
Chris@82 984 V TC, TH, T6s, T6t;
Chris@82 985 TC = VSUB(Ty, TB);
Chris@82 986 TH = VSUB(TF, TG);
Chris@82 987 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@82 988 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@82 989 T6s = VADD(T4y, T4z);
Chris@82 990 T6t = VADD(T4B, T4C);
Chris@82 991 T6u = VSUB(T6s, T6t);
Chris@82 992 T74 = VADD(T6s, T6t);
Chris@82 993 }
Chris@82 994 {
Chris@82 995 V T3o, T3p, T4A, T4D;
Chris@82 996 T3o = VADD(TB, Ty);
Chris@82 997 T3p = VADD(TF, TG);
Chris@82 998 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
Chris@82 999 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
Chris@82 1000 T4A = VSUB(T4y, T4z);
Chris@82 1001 T4D = VSUB(T4B, T4C);
Chris@82 1002 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@82 1003 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
Chris@82 1004 }
Chris@82 1005 }
Chris@82 1006 {
Chris@82 1007 V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
Chris@82 1008 {
Chris@82 1009 V TQ, TR, TU, TV;
Chris@82 1010 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@82 1011 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@82 1012 TS = VSUB(TQ, TR);
Chris@82 1013 T4J = VADD(TQ, TR);
Chris@82 1014 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@82 1015 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@82 1016 TW = VSUB(TU, TV);
Chris@82 1017 T4I = VADD(TU, TV);
Chris@82 1018 {
Chris@82 1019 V TJ, TK, TL, TM, TN, TO;
Chris@82 1020 TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@82 1021 TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@82 1022 TL = VSUB(TJ, TK);
Chris@82 1023 TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@82 1024 TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@82 1025 TO = VSUB(TM, TN);
Chris@82 1026 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@82 1027 T4G = VADD(TM, TN);
Chris@82 1028 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@82 1029 T4F = VADD(TJ, TK);
Chris@82 1030 }
Chris@82 1031 }
Chris@82 1032 {
Chris@82 1033 V TT, TY, T6v, T6w;
Chris@82 1034 TT = VSUB(TP, TS);
Chris@82 1035 TY = VSUB(TW, TX);
Chris@82 1036 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@82 1037 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@82 1038 T6v = VADD(T4I, T4J);
Chris@82 1039 T6w = VADD(T4F, T4G);
Chris@82 1040 T6x = VSUB(T6v, T6w);
Chris@82 1041 T75 = VADD(T6v, T6w);
Chris@82 1042 }
Chris@82 1043 {
Chris@82 1044 V T3r, T3s, T4H, T4K;
Chris@82 1045 T3r = VADD(TS, TP);
Chris@82 1046 T3s = VADD(TW, TX);
Chris@82 1047 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
Chris@82 1048 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
Chris@82 1049 T4H = VSUB(T4F, T4G);
Chris@82 1050 T4K = VSUB(T4I, T4J);
Chris@82 1051 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@82 1052 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
Chris@82 1053 }
Chris@82 1054 }
Chris@82 1055 {
Chris@82 1056 V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
Chris@82 1057 V T1Q, T5a, T2a;
Chris@82 1058 {
Chris@82 1059 V T1Z, T20, T24, T25;
Chris@82 1060 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1061 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1062 T21 = VSUB(T1Z, T20);
Chris@82 1063 T5h = VADD(T1Z, T20);
Chris@82 1064 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1065 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1066 T26 = VSUB(T24, T25);
Chris@82 1067 T5g = VADD(T24, T25);
Chris@82 1068 }
Chris@82 1069 {
Chris@82 1070 V T1S, T1T, T1U, T1V, T1W, T1X;
Chris@82 1071 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1072 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1073 T1U = VSUB(T1S, T1T);
Chris@82 1074 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1075 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1076 T1X = VSUB(T1V, T1W);
Chris@82 1077 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
Chris@82 1078 T5d = VADD(T1V, T1W);
Chris@82 1079 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
Chris@82 1080 T5c = VADD(T1S, T1T);
Chris@82 1081 }
Chris@82 1082 {
Chris@82 1083 V T1F, T1I, T1M, T1P;
Chris@82 1084 {
Chris@82 1085 V T1D, T1E, T1G, T1H;
Chris@82 1086 T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1087 T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1088 T1F = VSUB(T1D, T1E);
Chris@82 1089 T55 = VADD(T1D, T1E);
Chris@82 1090 T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1091 T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1092 T1I = VSUB(T1G, T1H);
Chris@82 1093 T56 = VADD(T1G, T1H);
Chris@82 1094 }
Chris@82 1095 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
Chris@82 1096 T57 = VSUB(T55, T56);
Chris@82 1097 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
Chris@82 1098 {
Chris@82 1099 V T1K, T1L, T1N, T1O;
Chris@82 1100 T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1101 T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1102 T1M = VSUB(T1K, T1L);
Chris@82 1103 T58 = VADD(T1K, T1L);
Chris@82 1104 T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1105 T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1106 T1P = VSUB(T1N, T1O);
Chris@82 1107 T59 = VADD(T1N, T1O);
Chris@82 1108 }
Chris@82 1109 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
Chris@82 1110 T5a = VSUB(T58, T59);
Chris@82 1111 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
Chris@82 1112 }
Chris@82 1113 {
Chris@82 1114 V T1R, T22, T6k, T6l;
Chris@82 1115 T1R = VSUB(T1J, T1Q);
Chris@82 1116 T22 = VSUB(T1Y, T21);
Chris@82 1117 T23 = VSUB(T1R, T22);
Chris@82 1118 T2N = VADD(T22, T1R);
Chris@82 1119 T6k = VADD(T5g, T5h);
Chris@82 1120 T6l = VADD(T5c, T5d);
Chris@82 1121 T6m = VSUB(T6k, T6l);
Chris@82 1122 T70 = VADD(T6k, T6l);
Chris@82 1123 }
Chris@82 1124 {
Chris@82 1125 V T6n, T6o, T28, T2b;
Chris@82 1126 T6n = VADD(T55, T56);
Chris@82 1127 T6o = VADD(T58, T59);
Chris@82 1128 T6p = VSUB(T6n, T6o);
Chris@82 1129 T71 = VADD(T6n, T6o);
Chris@82 1130 T28 = VSUB(T26, T27);
Chris@82 1131 T2b = VSUB(T29, T2a);
Chris@82 1132 T2c = VSUB(T28, T2b);
Chris@82 1133 T2O = VADD(T28, T2b);
Chris@82 1134 }
Chris@82 1135 {
Chris@82 1136 V T3g, T3h, T5b, T5e;
Chris@82 1137 T3g = VADD(T26, T27);
Chris@82 1138 T3h = VADD(T1J, T1Q);
Chris@82 1139 T3i = VADD(T3g, T3h);
Chris@82 1140 T3Y = VSUB(T3g, T3h);
Chris@82 1141 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
Chris@82 1142 T5e = VSUB(T5c, T5d);
Chris@82 1143 T5f = VSUB(T5b, T5e);
Chris@82 1144 T5R = VADD(T5e, T5b);
Chris@82 1145 }
Chris@82 1146 {
Chris@82 1147 V T5i, T5j, T3j, T3k;
Chris@82 1148 T5i = VSUB(T5g, T5h);
Chris@82 1149 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
Chris@82 1150 T5k = VSUB(T5i, T5j);
Chris@82 1151 T5S = VADD(T5i, T5j);
Chris@82 1152 T3j = VADD(T21, T1Y);
Chris@82 1153 T3k = VADD(T29, T2a);
Chris@82 1154 T3l = VADD(T3j, T3k);
Chris@82 1155 T3Z = VSUB(T3k, T3j);
Chris@82 1156 }
Chris@82 1157 }
Chris@82 1158 {
Chris@82 1159 V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
Chris@82 1160 V T1f, T4T, T1z;
Chris@82 1161 {
Chris@82 1162 V T1o, T1p, T1t, T1u;
Chris@82 1163 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1164 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1165 T1q = VSUB(T1o, T1p);
Chris@82 1166 T50 = VADD(T1o, T1p);
Chris@82 1167 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1168 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1169 T1v = VSUB(T1t, T1u);
Chris@82 1170 T4Z = VADD(T1t, T1u);
Chris@82 1171 }
Chris@82 1172 {
Chris@82 1173 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@82 1174 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1175 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1176 T1j = VSUB(T1h, T1i);
Chris@82 1177 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1178 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1179 T1m = VSUB(T1k, T1l);
Chris@82 1180 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@82 1181 T4W = VADD(T1k, T1l);
Chris@82 1182 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
Chris@82 1183 T4V = VADD(T1h, T1i);
Chris@82 1184 }
Chris@82 1185 {
Chris@82 1186 V T14, T17, T1b, T1e;
Chris@82 1187 {
Chris@82 1188 V T12, T13, T15, T16;
Chris@82 1189 T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1190 T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1191 T14 = VSUB(T12, T13);
Chris@82 1192 T4O = VADD(T12, T13);
Chris@82 1193 T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1194 T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1195 T17 = VSUB(T15, T16);
Chris@82 1196 T4P = VADD(T15, T16);
Chris@82 1197 }
Chris@82 1198 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
Chris@82 1199 T4Q = VSUB(T4O, T4P);
Chris@82 1200 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
Chris@82 1201 {
Chris@82 1202 V T19, T1a, T1c, T1d;
Chris@82 1203 T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1204 T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1205 T1b = VSUB(T19, T1a);
Chris@82 1206 T4R = VADD(T19, T1a);
Chris@82 1207 T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1208 T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@82 1209 T1e = VSUB(T1c, T1d);
Chris@82 1210 T4S = VADD(T1c, T1d);
Chris@82 1211 }
Chris@82 1212 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
Chris@82 1213 T4T = VSUB(T4R, T4S);
Chris@82 1214 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@82 1215 }
Chris@82 1216 {
Chris@82 1217 V T1g, T1r, T6d, T6e;
Chris@82 1218 T1g = VSUB(T18, T1f);
Chris@82 1219 T1r = VSUB(T1n, T1q);
Chris@82 1220 T1s = VSUB(T1g, T1r);
Chris@82 1221 T2K = VADD(T1r, T1g);
Chris@82 1222 T6d = VADD(T4Z, T50);
Chris@82 1223 T6e = VADD(T4V, T4W);
Chris@82 1224 T6f = VSUB(T6d, T6e);
Chris@82 1225 T6X = VADD(T6d, T6e);
Chris@82 1226 }
Chris@82 1227 {
Chris@82 1228 V T6g, T6h, T1x, T1A;
Chris@82 1229 T6g = VADD(T4O, T4P);
Chris@82 1230 T6h = VADD(T4R, T4S);
Chris@82 1231 T6i = VSUB(T6g, T6h);
Chris@82 1232 T6Y = VADD(T6g, T6h);
Chris@82 1233 T1x = VSUB(T1v, T1w);
Chris@82 1234 T1A = VSUB(T1y, T1z);
Chris@82 1235 T1B = VSUB(T1x, T1A);
Chris@82 1236 T2L = VADD(T1x, T1A);
Chris@82 1237 }
Chris@82 1238 {
Chris@82 1239 V T39, T3a, T4U, T4X;
Chris@82 1240 T39 = VADD(T1v, T1w);
Chris@82 1241 T3a = VADD(T18, T1f);
Chris@82 1242 T3b = VADD(T39, T3a);
Chris@82 1243 T3V = VSUB(T39, T3a);
Chris@82 1244 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
Chris@82 1245 T4X = VSUB(T4V, T4W);
Chris@82 1246 T4Y = VSUB(T4U, T4X);
Chris@82 1247 T5O = VADD(T4X, T4U);
Chris@82 1248 }
Chris@82 1249 {
Chris@82 1250 V T51, T52, T3c, T3d;
Chris@82 1251 T51 = VSUB(T4Z, T50);
Chris@82 1252 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
Chris@82 1253 T53 = VSUB(T51, T52);
Chris@82 1254 T5P = VADD(T51, T52);
Chris@82 1255 T3c = VADD(T1q, T1n);
Chris@82 1256 T3d = VADD(T1y, T1z);
Chris@82 1257 T3e = VADD(T3c, T3d);
Chris@82 1258 T3W = VSUB(T3d, T3c);
Chris@82 1259 }
Chris@82 1260 }
Chris@82 1261 {
Chris@82 1262 V T7h, T7l, T7k, T7m;
Chris@82 1263 {
Chris@82 1264 V T7f, T7g, T7i, T7j;
Chris@82 1265 T7f = VADD(T78, T79);
Chris@82 1266 T7g = VADD(T74, T75);
Chris@82 1267 T7h = VSUB(T7f, T7g);
Chris@82 1268 T7l = VADD(T7f, T7g);
Chris@82 1269 T7i = VADD(T6X, T6Y);
Chris@82 1270 T7j = VADD(T70, T71);
Chris@82 1271 T7k = VBYI(VSUB(T7i, T7j));
Chris@82 1272 T7m = VADD(T7i, T7j);
Chris@82 1273 }
Chris@82 1274 ST(&(xo[WS(os, 48)]), VSUB(T7h, T7k), ovs, &(xo[0]));
Chris@82 1275 ST(&(xo[0]), VADD(T7l, T7m), ovs, &(xo[0]));
Chris@82 1276 ST(&(xo[WS(os, 16)]), VADD(T7h, T7k), ovs, &(xo[0]));
Chris@82 1277 ST(&(xo[WS(os, 32)]), VSUB(T7l, T7m), ovs, &(xo[0]));
Chris@82 1278 }
Chris@82 1279 {
Chris@82 1280 V T76, T7a, T73, T7b, T6Z, T72;
Chris@82 1281 T76 = VSUB(T74, T75);
Chris@82 1282 T7a = VSUB(T78, T79);
Chris@82 1283 T6Z = VSUB(T6X, T6Y);
Chris@82 1284 T72 = VSUB(T70, T71);
Chris@82 1285 T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
Chris@82 1286 T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
Chris@82 1287 {
Chris@82 1288 V T77, T7c, T7d, T7e;
Chris@82 1289 T77 = VBYI(VSUB(T73, T76));
Chris@82 1290 T7c = VSUB(T7a, T7b);
Chris@82 1291 ST(&(xo[WS(os, 24)]), VADD(T77, T7c), ovs, &(xo[0]));
Chris@82 1292 ST(&(xo[WS(os, 40)]), VSUB(T7c, T77), ovs, &(xo[0]));
Chris@82 1293 T7d = VBYI(VADD(T76, T73));
Chris@82 1294 T7e = VADD(T7a, T7b);
Chris@82 1295 ST(&(xo[WS(os, 8)]), VADD(T7d, T7e), ovs, &(xo[0]));
Chris@82 1296 ST(&(xo[WS(os, 56)]), VSUB(T7e, T7d), ovs, &(xo[0]));
Chris@82 1297 }
Chris@82 1298 }
Chris@82 1299 {
Chris@82 1300 V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
Chris@82 1301 T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
Chris@82 1302 T6C = VSUB(T6y, T6B);
Chris@82 1303 T6S = VADD(T6B, T6y);
Chris@82 1304 T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
Chris@82 1305 T6I = VSUB(T6G, T6H);
Chris@82 1306 T6P = VADD(T6G, T6H);
Chris@82 1307 {
Chris@82 1308 V T6j, T6q, T6J, T6K;
Chris@82 1309 T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
Chris@82 1310 T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
Chris@82 1311 T6r = VSUB(T6j, T6q);
Chris@82 1312 T6Q = VADD(T6j, T6q);
Chris@82 1313 T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
Chris@82 1314 T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
Chris@82 1315 T6L = VSUB(T6J, T6K);
Chris@82 1316 T6T = VADD(T6J, T6K);
Chris@82 1317 }
Chris@82 1318 {
Chris@82 1319 V T6D, T6M, T6V, T6W;
Chris@82 1320 T6D = VBYI(VSUB(T6r, T6C));
Chris@82 1321 T6M = VSUB(T6I, T6L);
Chris@82 1322 ST(&(xo[WS(os, 20)]), VADD(T6D, T6M), ovs, &(xo[0]));
Chris@82 1323 ST(&(xo[WS(os, 44)]), VSUB(T6M, T6D), ovs, &(xo[0]));
Chris@82 1324 T6V = VSUB(T6P, T6Q);
Chris@82 1325 T6W = VBYI(VSUB(T6T, T6S));
Chris@82 1326 ST(&(xo[WS(os, 36)]), VSUB(T6V, T6W), ovs, &(xo[0]));
Chris@82 1327 ST(&(xo[WS(os, 28)]), VADD(T6V, T6W), ovs, &(xo[0]));
Chris@82 1328 }
Chris@82 1329 {
Chris@82 1330 V T6N, T6O, T6R, T6U;
Chris@82 1331 T6N = VBYI(VADD(T6C, T6r));
Chris@82 1332 T6O = VADD(T6I, T6L);
Chris@82 1333 ST(&(xo[WS(os, 12)]), VADD(T6N, T6O), ovs, &(xo[0]));
Chris@82 1334 ST(&(xo[WS(os, 52)]), VSUB(T6O, T6N), ovs, &(xo[0]));
Chris@82 1335 T6R = VADD(T6P, T6Q);
Chris@82 1336 T6U = VBYI(VADD(T6S, T6T));
Chris@82 1337 ST(&(xo[WS(os, 60)]), VSUB(T6R, T6U), ovs, &(xo[0]));
Chris@82 1338 ST(&(xo[WS(os, 4)]), VADD(T6R, T6U), ovs, &(xo[0]));
Chris@82 1339 }
Chris@82 1340 }
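/* Outputs 2, 14, 18, 30, 34, 46, 50 and 62 (k = +/-2 mod 16), using
   KP980785280 = cos(pi/16) and KP195090322 = sin(pi/16). */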
Chris@82 1341 {
Chris@82 1342 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@82 1343 {
Chris@82 1344 V T5L, T5M, T5Z, T60;
Chris@82 1345 T5L = VADD(T4p, T4w);
Chris@82 1346 T5M = VADD(T5o, T5p);
Chris@82 1347 T5N = VSUB(T5L, T5M);
Chris@82 1348 T68 = VADD(T5L, T5M);
Chris@82 1349 T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
Chris@82 1350 T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
Chris@82 1351 T61 = VSUB(T5Z, T60);
Chris@82 1352 T69 = VADD(T5Z, T60);
Chris@82 1353 }
Chris@82 1354 {
Chris@82 1355 V T5Q, T5T, T5W, T5X;
Chris@82 1356 T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
Chris@82 1357 T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
Chris@82 1358 T5U = VSUB(T5Q, T5T);
Chris@82 1359 T65 = VADD(T5Q, T5T);
Chris@82 1360 T5W = VADD(T4E, T4L);
Chris@82 1361 T5X = VADD(T5u, T5r);
Chris@82 1362 T5Y = VSUB(T5W, T5X);
Chris@82 1363 T66 = VADD(T5X, T5W);
Chris@82 1364 }
Chris@82 1365 {
Chris@82 1366 V T5V, T62, T6b, T6c;
Chris@82 1367 T5V = VADD(T5N, T5U);
Chris@82 1368 T62 = VBYI(VADD(T5Y, T61));
Chris@82 1369 ST(&(xo[WS(os, 50)]), VSUB(T5V, T62), ovs, &(xo[0]));
Chris@82 1370 ST(&(xo[WS(os, 14)]), VADD(T5V, T62), ovs, &(xo[0]));
Chris@82 1371 T6b = VBYI(VADD(T66, T65));
Chris@82 1372 T6c = VADD(T68, T69);
Chris@82 1373 ST(&(xo[WS(os, 2)]), VADD(T6b, T6c), ovs, &(xo[0]));
Chris@82 1374 ST(&(xo[WS(os, 62)]), VSUB(T6c, T6b), ovs, &(xo[0]));
Chris@82 1375 }
Chris@82 1376 {
Chris@82 1377 V T63, T64, T67, T6a;
Chris@82 1378 T63 = VSUB(T5N, T5U);
Chris@82 1379 T64 = VBYI(VSUB(T61, T5Y));
Chris@82 1380 ST(&(xo[WS(os, 46)]), VSUB(T63, T64), ovs, &(xo[0]));
Chris@82 1381 ST(&(xo[WS(os, 18)]), VADD(T63, T64), ovs, &(xo[0]));
Chris@82 1382 T67 = VBYI(VSUB(T65, T66));
Chris@82 1383 T6a = VSUB(T68, T69);
Chris@82 1384 ST(&(xo[WS(os, 30)]), VADD(T67, T6a), ovs, &(xo[0]));
Chris@82 1385 ST(&(xo[WS(os, 34)]), VSUB(T6a, T67), ovs, &(xo[0]));
Chris@82 1386 }
Chris@82 1387 }
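/* Odd outputs 5, 11, 21, 27, 37, 43, 53 and 59 (k = +/-5 mod 16), using
   KP881921264 = cos(5 pi/32) and KP471396736 = sin(5 pi/32). */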
Chris@82 1388 {
Chris@82 1389 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@82 1390 {
Chris@82 1391 V Tr, T10, T2t, T2u;
Chris@82 1392 Tr = VSUB(Tb, Tq);
Chris@82 1393 T10 = VSUB(TI, TZ);
Chris@82 1394 T11 = VSUB(Tr, T10);
Chris@82 1395 T2C = VADD(Tr, T10);
Chris@82 1396 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
Chris@82 1397 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
Chris@82 1398 T2v = VSUB(T2t, T2u);
Chris@82 1399 T2D = VADD(T2t, T2u);
Chris@82 1400 }
Chris@82 1401 {
Chris@82 1402 V T1C, T2d, T2i, T2r;
Chris@82 1403 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
Chris@82 1404 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
Chris@82 1405 T2e = VSUB(T1C, T2d);
Chris@82 1406 T2z = VADD(T1C, T2d);
Chris@82 1407 T2i = VSUB(T2g, T2h);
Chris@82 1408 T2r = VSUB(T2l, T2q);
Chris@82 1409 T2s = VSUB(T2i, T2r);
Chris@82 1410 T2A = VADD(T2r, T2i);
Chris@82 1411 }
Chris@82 1412 {
Chris@82 1413 V T2f, T2w, T2F, T2G;
Chris@82 1414 T2f = VADD(T11, T2e);
Chris@82 1415 T2w = VBYI(VADD(T2s, T2v));
Chris@82 1416 ST(&(xo[WS(os, 53)]), VSUB(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@82 1417 ST(&(xo[WS(os, 11)]), VADD(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@82 1418 T2F = VBYI(VADD(T2A, T2z));
Chris@82 1419 T2G = VADD(T2C, T2D);
Chris@82 1420 ST(&(xo[WS(os, 5)]), VADD(T2F, T2G), ovs, &(xo[WS(os, 1)]));
Chris@82 1421 ST(&(xo[WS(os, 59)]), VSUB(T2G, T2F), ovs, &(xo[WS(os, 1)]));
Chris@82 1422 }
Chris@82 1423 {
Chris@82 1424 V T2x, T2y, T2B, T2E;
Chris@82 1425 T2x = VSUB(T11, T2e);
Chris@82 1426 T2y = VBYI(VSUB(T2v, T2s));
Chris@82 1427 ST(&(xo[WS(os, 43)]), VSUB(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@82 1428 ST(&(xo[WS(os, 21)]), VADD(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@82 1429 T2B = VBYI(VSUB(T2z, T2A));
Chris@82 1430 T2E = VSUB(T2C, T2D);
Chris@82 1431 ST(&(xo[WS(os, 27)]), VADD(T2B, T2E), ovs, &(xo[WS(os, 1)]));
Chris@82 1432 ST(&(xo[WS(os, 37)]), VSUB(T2E, T2B), ovs, &(xo[WS(os, 1)]));
Chris@82 1433 }
Chris@82 1434 }
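/* Odd outputs 1, 15, 17, 31, 33, 47, 49 and 63 (k = +/-1 mod 16), using
   KP995184726 = cos(pi/32) and KP098017140 = sin(pi/32). */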
Chris@82 1435 {
Chris@82 1436 V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
Chris@82 1437 {
Chris@82 1438 V T3f, T3m, T3H, T3I;
Chris@82 1439 T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
Chris@82 1440 T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
Chris@82 1441 T3n = VSUB(T3f, T3m);
Chris@82 1442 T3O = VADD(T3f, T3m);
Chris@82 1443 T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
Chris@82 1444 T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
Chris@82 1445 T3J = VSUB(T3H, T3I);
Chris@82 1446 T3R = VADD(T3H, T3I);
Chris@82 1447 }
Chris@82 1448 {
Chris@82 1449 V T3u, T3x, T3C, T3F;
Chris@82 1450 T3u = VADD(T3q, T3t);
Chris@82 1451 T3x = VADD(T3v, T3w);
Chris@82 1452 T3y = VSUB(T3u, T3x);
Chris@82 1453 T3Q = VADD(T3x, T3u);
Chris@82 1454 T3C = VADD(T3A, T3B);
Chris@82 1455 T3F = VADD(T3D, T3E);
Chris@82 1456 T3G = VSUB(T3C, T3F);
Chris@82 1457 T3N = VADD(T3C, T3F);
Chris@82 1458 }
Chris@82 1459 {
Chris@82 1460 V T3z, T3K, T3T, T3U;
Chris@82 1461 T3z = VBYI(VSUB(T3n, T3y));
Chris@82 1462 T3K = VSUB(T3G, T3J);
Chris@82 1463 ST(&(xo[WS(os, 17)]), VADD(T3z, T3K), ovs, &(xo[WS(os, 1)]));
Chris@82 1464 ST(&(xo[WS(os, 47)]), VSUB(T3K, T3z), ovs, &(xo[WS(os, 1)]));
Chris@82 1465 T3T = VSUB(T3N, T3O);
Chris@82 1466 T3U = VBYI(VSUB(T3R, T3Q));
Chris@82 1467 ST(&(xo[WS(os, 33)]), VSUB(T3T, T3U), ovs, &(xo[WS(os, 1)]));
Chris@82 1468 ST(&(xo[WS(os, 31)]), VADD(T3T, T3U), ovs, &(xo[WS(os, 1)]));
Chris@82 1469 }
Chris@82 1470 {
Chris@82 1471 V T3L, T3M, T3P, T3S;
Chris@82 1472 T3L = VBYI(VADD(T3y, T3n));
Chris@82 1473 T3M = VADD(T3G, T3J);
Chris@82 1474 ST(&(xo[WS(os, 15)]), VADD(T3L, T3M), ovs, &(xo[WS(os, 1)]));
Chris@82 1475 ST(&(xo[WS(os, 49)]), VSUB(T3M, T3L), ovs, &(xo[WS(os, 1)]));
Chris@82 1476 T3P = VADD(T3N, T3O);
Chris@82 1477 T3S = VBYI(VADD(T3Q, T3R));
Chris@82 1478 ST(&(xo[WS(os, 63)]), VSUB(T3P, T3S), ovs, &(xo[WS(os, 1)]));
Chris@82 1479 ST(&(xo[WS(os, 1)]), VADD(T3P, T3S), ovs, &(xo[WS(os, 1)]));
Chris@82 1480 }
Chris@82 1481 }
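/* Outputs 6, 10, 22, 26, 38, 42, 54 and 58 (k = +/-6 mod 16), using
   KP831469612 = cos(3 pi/16) and KP555570233 = sin(3 pi/16). */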
Chris@82 1482 {
Chris@82 1483 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@82 1484 {
Chris@82 1485 V T4x, T4M, T5x, T5y;
Chris@82 1486 T4x = VSUB(T4p, T4w);
Chris@82 1487 T4M = VSUB(T4E, T4L);
Chris@82 1488 T4N = VSUB(T4x, T4M);
Chris@82 1489 T5G = VADD(T4x, T4M);
Chris@82 1490 T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
Chris@82 1491 T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
Chris@82 1492 T5z = VSUB(T5x, T5y);
Chris@82 1493 T5H = VADD(T5x, T5y);
Chris@82 1494 }
Chris@82 1495 {
Chris@82 1496 V T54, T5l, T5q, T5v;
Chris@82 1497 T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
Chris@82 1498 T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
Chris@82 1499 T5m = VSUB(T54, T5l);
Chris@82 1500 T5D = VADD(T54, T5l);
Chris@82 1501 T5q = VSUB(T5o, T5p);
Chris@82 1502 T5v = VSUB(T5r, T5u);
Chris@82 1503 T5w = VSUB(T5q, T5v);
Chris@82 1504 T5E = VADD(T5v, T5q);
Chris@82 1505 }
Chris@82 1506 {
Chris@82 1507 V T5n, T5A, T5J, T5K;
Chris@82 1508 T5n = VADD(T4N, T5m);
Chris@82 1509 T5A = VBYI(VADD(T5w, T5z));
Chris@82 1510 ST(&(xo[WS(os, 54)]), VSUB(T5n, T5A), ovs, &(xo[0]));
Chris@82 1511 ST(&(xo[WS(os, 10)]), VADD(T5n, T5A), ovs, &(xo[0]));
Chris@82 1512 T5J = VBYI(VADD(T5E, T5D));
Chris@82 1513 T5K = VADD(T5G, T5H);
Chris@82 1514 ST(&(xo[WS(os, 6)]), VADD(T5J, T5K), ovs, &(xo[0]));
Chris@82 1515 ST(&(xo[WS(os, 58)]), VSUB(T5K, T5J), ovs, &(xo[0]));
Chris@82 1516 }
Chris@82 1517 {
Chris@82 1518 V T5B, T5C, T5F, T5I;
Chris@82 1519 T5B = VSUB(T4N, T5m);
Chris@82 1520 T5C = VBYI(VSUB(T5z, T5w));
Chris@82 1521 ST(&(xo[WS(os, 42)]), VSUB(T5B, T5C), ovs, &(xo[0]));
Chris@82 1522 ST(&(xo[WS(os, 22)]), VADD(T5B, T5C), ovs, &(xo[0]));
Chris@82 1523 T5F = VBYI(VSUB(T5D, T5E));
Chris@82 1524 T5I = VSUB(T5G, T5H);
Chris@82 1525 ST(&(xo[WS(os, 26)]), VADD(T5F, T5I), ovs, &(xo[0]));
Chris@82 1526 ST(&(xo[WS(os, 38)]), VSUB(T5I, T5F), ovs, &(xo[0]));
Chris@82 1527 }
Chris@82 1528 }
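/* Odd outputs 3, 13, 19, 29, 35, 45, 51 and 61 (k = +/-3 mod 16), using
   KP956940335 = cos(3 pi/32) and KP290284677 = sin(3 pi/32). */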
Chris@82 1529 {
Chris@82 1530 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@82 1531 {
Chris@82 1532 V T2H, T2I, T2V, T2W;
Chris@82 1533 T2H = VADD(Tb, Tq);
Chris@82 1534 T2I = VADD(T2g, T2h);
Chris@82 1535 T2J = VSUB(T2H, T2I);
Chris@82 1536 T34 = VADD(T2H, T2I);
Chris@82 1537 T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
Chris@82 1538 T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
Chris@82 1539 T2X = VSUB(T2V, T2W);
Chris@82 1540 T35 = VADD(T2V, T2W);
Chris@82 1541 }
Chris@82 1542 {
Chris@82 1543 V T2M, T2P, T2S, T2T;
Chris@82 1544 T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
Chris@82 1545 T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
Chris@82 1546 T2Q = VSUB(T2M, T2P);
Chris@82 1547 T31 = VADD(T2M, T2P);
Chris@82 1548 T2S = VADD(TI, TZ);
Chris@82 1549 T2T = VADD(T2q, T2l);
Chris@82 1550 T2U = VSUB(T2S, T2T);
Chris@82 1551 T32 = VADD(T2T, T2S);
Chris@82 1552 }
Chris@82 1553 {
Chris@82 1554 V T2R, T2Y, T37, T38;
Chris@82 1555 T2R = VADD(T2J, T2Q);
Chris@82 1556 T2Y = VBYI(VADD(T2U, T2X));
Chris@82 1557 ST(&(xo[WS(os, 51)]), VSUB(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@82 1558 ST(&(xo[WS(os, 13)]), VADD(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@82 1559 T37 = VBYI(VADD(T32, T31));
Chris@82 1560 T38 = VADD(T34, T35);
Chris@82 1561 ST(&(xo[WS(os, 3)]), VADD(T37, T38), ovs, &(xo[WS(os, 1)]));
Chris@82 1562 ST(&(xo[WS(os, 61)]), VSUB(T38, T37), ovs, &(xo[WS(os, 1)]));
Chris@82 1563 }
Chris@82 1564 {
Chris@82 1565 V T2Z, T30, T33, T36;
Chris@82 1566 T2Z = VSUB(T2J, T2Q);
Chris@82 1567 T30 = VBYI(VSUB(T2X, T2U));
Chris@82 1568 ST(&(xo[WS(os, 45)]), VSUB(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@82 1569 ST(&(xo[WS(os, 19)]), VADD(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@82 1570 T33 = VBYI(VSUB(T31, T32));
Chris@82 1571 T36 = VSUB(T34, T35);
Chris@82 1572 ST(&(xo[WS(os, 29)]), VADD(T33, T36), ovs, &(xo[WS(os, 1)]));
Chris@82 1573 ST(&(xo[WS(os, 35)]), VSUB(T36, T33), ovs, &(xo[WS(os, 1)]));
Chris@82 1574 }
Chris@82 1575 }
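/* Odd outputs 7, 9, 23, 25, 39, 41, 55 and 57 (k = +/-7 mod 16), using
   KP773010453 = cos(7 pi/32) and KP634393284 = sin(7 pi/32). */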
Chris@82 1576 {
Chris@82 1577 V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
Chris@82 1578 {
Chris@82 1579 V T3X, T40, T49, T4a;
Chris@82 1580 T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
Chris@82 1581 T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
Chris@82 1582 T41 = VSUB(T3X, T40);
Chris@82 1583 T4g = VADD(T3X, T40);
Chris@82 1584 T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
Chris@82 1585 T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
Chris@82 1586 T4b = VSUB(T49, T4a);
Chris@82 1587 T4j = VADD(T49, T4a);
Chris@82 1588 }
Chris@82 1589 {
Chris@82 1590 V T42, T43, T46, T47;
Chris@82 1591 T42 = VSUB(T3D, T3E);
Chris@82 1592 T43 = VSUB(T3w, T3v);
Chris@82 1593 T44 = VSUB(T42, T43);
Chris@82 1594 T4i = VADD(T43, T42);
Chris@82 1595 T46 = VSUB(T3A, T3B);
Chris@82 1596 T47 = VSUB(T3q, T3t);
Chris@82 1597 T48 = VSUB(T46, T47);
Chris@82 1598 T4f = VADD(T46, T47);
Chris@82 1599 }
Chris@82 1600 {
Chris@82 1601 V T45, T4c, T4l, T4m;
Chris@82 1602 T45 = VBYI(VSUB(T41, T44));
Chris@82 1603 T4c = VSUB(T48, T4b);
Chris@82 1604 ST(&(xo[WS(os, 23)]), VADD(T45, T4c), ovs, &(xo[WS(os, 1)]));
Chris@82 1605 ST(&(xo[WS(os, 41)]), VSUB(T4c, T45), ovs, &(xo[WS(os, 1)]));
Chris@82 1606 T4l = VSUB(T4f, T4g);
Chris@82 1607 T4m = VBYI(VSUB(T4j, T4i));
Chris@82 1608 ST(&(xo[WS(os, 39)]), VSUB(T4l, T4m), ovs, &(xo[WS(os, 1)]));
Chris@82 1609 ST(&(xo[WS(os, 25)]), VADD(T4l, T4m), ovs, &(xo[WS(os, 1)]));
Chris@82 1610 }
Chris@82 1611 {
Chris@82 1612 V T4d, T4e, T4h, T4k;
Chris@82 1613 T4d = VBYI(VADD(T44, T41));
Chris@82 1614 T4e = VADD(T48, T4b);
Chris@82 1615 ST(&(xo[WS(os, 9)]), VADD(T4d, T4e), ovs, &(xo[WS(os, 1)]));
Chris@82 1616 ST(&(xo[WS(os, 55)]), VSUB(T4e, T4d), ovs, &(xo[WS(os, 1)]));
Chris@82 1617 T4h = VADD(T4f, T4g);
Chris@82 1618 T4k = VBYI(VADD(T4i, T4j));
Chris@82 1619 ST(&(xo[WS(os, 57)]), VSUB(T4h, T4k), ovs, &(xo[WS(os, 1)]));
Chris@82 1620 ST(&(xo[WS(os, 7)]), VADD(T4h, T4k), ovs, &(xo[WS(os, 1)]));
Chris@82 1621 }
Chris@82 1622 }
Chris@82 1623 }
Chris@82 1624 }
Chris@82 1625 VLEAVE();
Chris@82 1626 }
Chris@82 1627
Chris@82 1628 static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {404, 72, 52, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 1629
Chris@82 1630 void XSIMD(codelet_n1bv_64) (planner *p) {
Chris@82 1631 X(kdft_register) (p, n1bv_64, &desc);
Chris@82 1632 }
Chris@82 1633
Chris@82 1634 #endif
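
This codelet is not called directly by applications. XSIMD(codelet_n1bv_64) registers it with the planner through X(kdft_register), and the {404, 72, 52, 0} field of desc appears to record its operation counts (additions, multiplications, fused multiply-adds, other), which the planner consults when costing it against alternative plans. As a minimal sketch, assuming only the standard fftw3.h public API (fftw_plan_dft_1d, fftw_execute and friends) and a build in which this SIMD extension is enabled, a 64-point backward transform that could end up being served by this codelet might be planned as follows:

  #include <fftw3.h>

  int main(void)
  {
      /* 64-point complex backward transform (FFTW_BACKWARD, i.e. sign = +1),
         the same size and sign this codelet implements; whether n1bv_64 is
         actually chosen depends on the SIMD build and the planner. */
      fftw_complex *in = fftw_malloc(64 * sizeof(fftw_complex));
      fftw_complex *out = fftw_malloc(64 * sizeof(fftw_complex));
      fftw_plan p = fftw_plan_dft_1d(64, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
      /* ... fill in[0..63] with complex input data ... */
      fftw_execute(p);
      fftw_destroy_plan(p);
      fftw_free(in);
      fftw_free(out);
      return 0;
  }

Planning with FFTW_MEASURE instead of FFTW_ESTIMATE would let the planner time candidate 64-point strategies rather than rely on heuristics and recorded operation counts alone.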