annotate src/fftw-3.3.5/dft/simd/common/n1bv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:39:24 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include n1b.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@42 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@42 33 * 168 stack variables, 15 constants, and 128 memory accesses
Chris@42 34 */
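/*
 * A note on the vector shorthand used in this codelet (scalar equivalents,
 * assuming the macro definitions in FFTW's simd-support headers): LD/ST are
 * strided vector loads and stores, LDK loads one of the DVK constants declared
 * below, VFMA(a, b, c) computes a*b + c, VFNMS(a, b, c) computes c - a*b, and
 * VFMAI(b, c) / VFNMSI(b, c) compute c + i*b / c - i*b for the output
 * butterflies.  The "fused multiply/add" figure above counts these
 * VFMA/VFNMS-style operations.
 */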
Chris@42 35 #include "n1b.h"
Chris@42 36
Chris@42 37 static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@42 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@42 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@42 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@42 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 50 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 51 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
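/*
 * The constants above are trigonometric values for the size-64 split (angles
 * are multiples of pi/32); to the printed precision:
 *   KP707106781 = cos(pi/4),   KP923879532 = cos(pi/8),   KP980785280 = cos(pi/16),
 *   KP831469612 = cos(3pi/16), KP956940335 = cos(3pi/32), KP995184726 = cos(pi/32),
 *   KP881921264 = cos(5pi/32), KP773010453 = cos(7pi/32),
 *   KP414213562 = tan(pi/8),   KP198912367 = tan(pi/16),  KP668178637 = tan(3pi/16),
 *   KP098491403 = tan(pi/32),  KP303346683 = tan(3pi/32), KP534511135 = tan(5pi/32),
 *   KP820678790 = tan(7pi/32),
 * consistent with rotations evaluated as cos(t)*(x +/- tan(t)*y) via fused ops.
 */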
Chris@42 54 {
Chris@42 55 INT i;
Chris@42 56 const R *xi;
Chris@42 57 R *xo;
Chris@42 58 xi = ii;
Chris@42 59 xo = io;
Chris@42 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 61 V T5T, T5S, T5X, T65, T5Z, T5R, T67, T63, T5U, T64;
Chris@42 62 {
Chris@42 63 V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
Chris@42 64 V Tm, T3A, T3i, T29, TC, T5p, T4o, T6D, T6e, T3l, T3B, TR, T2a, T4x, T5q;
Chris@42 65 V T6h, T6E, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
Chris@42 66 V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
Chris@42 67 V T6m, T6Y, T5L, T4T;
Chris@42 68 {
Chris@42 69 V T4g, T4l, T3g, Tu, Tx, T4h, TA, T4i;
Chris@42 70 {
Chris@42 71 V T1, T2, T23, T24, T4, T5, T20, T21;
Chris@42 72 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 73 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 74 T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 75 T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 76 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 77 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 78 T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 79 T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 80 {
Chris@42 81 V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
Chris@42 82 {
Chris@42 83 V T8, T43, T3, T45, T25, T5i, T6, T44, T22, T9, Ti, Tj, Tb, Tc;
Chris@42 84 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 85 T43 = VSUB(T1, T2);
Chris@42 86 T3 = VADD(T1, T2);
Chris@42 87 T45 = VSUB(T23, T24);
Chris@42 88 T25 = VADD(T23, T24);
Chris@42 89 T5i = VSUB(T4, T5);
Chris@42 90 T6 = VADD(T4, T5);
Chris@42 91 T44 = VSUB(T20, T21);
Chris@42 92 T22 = VADD(T20, T21);
Chris@42 93 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 94 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 95 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 96 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 97 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 98 {
Chris@42 99 V T2T, T46, T5j, T2U;
Chris@42 100 T7 = VSUB(T3, T6);
Chris@42 101 T2T = VADD(T3, T6);
Chris@42 102 T46 = VADD(T44, T45);
Chris@42 103 T5j = VSUB(T44, T45);
Chris@42 104 T26 = VSUB(T22, T25);
Chris@42 105 T2U = VADD(T22, T25);
Chris@42 106 Ta = VADD(T8, T9);
Chris@42 107 T48 = VSUB(T8, T9);
Chris@42 108 Tk = VADD(Ti, Tj);
Chris@42 109 T4c = VSUB(Tj, Ti);
Chris@42 110 T5k = VFMA(LDK(KP707106781), T5j, T5i);
Chris@42 111 T6A = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@42 112 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@42 113 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@42 114 T2V = VADD(T2T, T2U);
Chris@42 115 T3z = VSUB(T2T, T2U);
Chris@42 116 T49 = VSUB(Tb, Tc);
Chris@42 117 Td = VADD(Tb, Tc);
Chris@42 118 }
Chris@42 119 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 120 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 121 }
Chris@42 122 {
Chris@42 123 V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
Chris@42 124 V Tp;
Chris@42 125 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 126 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 127 {
Chris@42 128 V Th, T4b, Tr, Ts;
Chris@42 129 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 130 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 131 Te = VSUB(Ta, Td);
Chris@42 132 T2W = VADD(Ta, Td);
Chris@42 133 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@42 134 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@42 135 Th = VADD(Tf, Tg);
Chris@42 136 T4b = VSUB(Tf, Tg);
Chris@42 137 Tq = VADD(To, Tp);
Chris@42 138 T4g = VSUB(To, Tp);
Chris@42 139 T4l = VSUB(Tr, Ts);
Chris@42 140 Tt = VADD(Tr, Ts);
Chris@42 141 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 142 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 143 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@42 144 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@42 145 Tl = VSUB(Th, Tk);
Chris@42 146 T2X = VADD(Th, Tk);
Chris@42 147 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 148 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 149 }
Chris@42 150 T3g = VADD(Tq, Tt);
Chris@42 151 Tu = VSUB(Tq, Tt);
Chris@42 152 Tx = VADD(Tv, Tw);
Chris@42 153 T4h = VSUB(Tv, Tw);
Chris@42 154 T6B = VSUB(T4a, T4d);
Chris@42 155 T4e = VADD(T4a, T4d);
Chris@42 156 T6a = VADD(T5l, T5m);
Chris@42 157 T5n = VSUB(T5l, T5m);
Chris@42 158 T3M = VSUB(T2W, T2X);
Chris@42 159 T2Y = VADD(T2W, T2X);
Chris@42 160 T27 = VSUB(Te, Tl);
Chris@42 161 Tm = VADD(Te, Tl);
Chris@42 162 TA = VADD(Ty, Tz);
Chris@42 163 T4i = VSUB(Ty, Tz);
Chris@42 164 }
Chris@42 165 }
Chris@42 166 }
Chris@42 167 {
Chris@42 168 V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3j, TJ, TF, TI;
Chris@42 169 {
Chris@42 170 V TD, TE, TG, TH;
Chris@42 171 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 172 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 173 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 174 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 175 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 176 {
Chris@42 177 V T3h, TB, T4j, T4m;
Chris@42 178 T3h = VADD(Tx, TA);
Chris@42 179 TB = VSUB(Tx, TA);
Chris@42 180 T4j = VADD(T4h, T4i);
Chris@42 181 T4m = VSUB(T4h, T4i);
Chris@42 182 T4p = VSUB(TD, TE);
Chris@42 183 TF = VADD(TD, TE);
Chris@42 184 T4u = VSUB(TH, TG);
Chris@42 185 TI = VADD(TG, TH);
Chris@42 186 T3A = VSUB(T3g, T3h);
Chris@42 187 T3i = VADD(T3g, T3h);
Chris@42 188 T29 = VFMA(LDK(KP414213562), Tu, TB);
Chris@42 189 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@42 190 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@42 191 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@42 192 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@42 193 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@42 194 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 195 }
Chris@42 196 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 197 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 198 }
Chris@42 199 T3j = VADD(TF, TI);
Chris@42 200 TJ = VSUB(TF, TI);
Chris@42 201 {
Chris@42 202 V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
Chris@42 203 {
Chris@42 204 V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
Chris@42 205 {
Chris@42 206 V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
Chris@42 207 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 208 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@42 209 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@42 210 T6D = VFMA(LDK(KP668178637), T6c, T6d);
Chris@42 211 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@42 212 TM = VADD(TK, TL);
Chris@42 213 T4r = VSUB(TK, TL);
Chris@42 214 TP = VADD(TN, TO);
Chris@42 215 T4q = VSUB(TN, TO);
Chris@42 216 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 217 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 219 {
Chris@42 220 V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
Chris@42 221 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 222 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 223 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 224 {
Chris@42 225 V T3k, TQ, T4s, T4v;
Chris@42 226 T3k = VADD(TP, TM);
Chris@42 227 TQ = VSUB(TM, TP);
Chris@42 228 T4s = VADD(T4q, T4r);
Chris@42 229 T4v = VSUB(T4r, T4q);
Chris@42 230 T4V = VSUB(T1r, T1s);
Chris@42 231 T1t = VADD(T1r, T1s);
Chris@42 232 T58 = VSUB(T1v, T1u);
Chris@42 233 T1w = VADD(T1u, T1v);
Chris@42 234 T4X = VSUB(T1O, T1P);
Chris@42 235 T1Q = VADD(T1O, T1P);
Chris@42 236 T3l = VADD(T3j, T3k);
Chris@42 237 T3B = VSUB(T3j, T3k);
Chris@42 238 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@42 239 T2a = VFMA(LDK(KP414213562), TJ, TQ);
Chris@42 240 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@42 241 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@42 242 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@42 243 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@42 244 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 245 }
Chris@42 246 {
Chris@42 247 V T4W, T1A, T50, T51, T1D, T1F, T1G;
Chris@42 248 {
Chris@42 249 V T1y, T1z, T1B, T1C;
Chris@42 250 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 251 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 252 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 253 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 254 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@42 255 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@42 256 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@42 257 T6E = VFMA(LDK(KP668178637), T6f, T6g);
Chris@42 258 T4W = VSUB(T1R, T1S);
Chris@42 259 T1T = VADD(T1R, T1S);
Chris@42 260 T1A = VADD(T1y, T1z);
Chris@42 261 T50 = VSUB(T1y, T1z);
Chris@42 262 T51 = VSUB(T1C, T1B);
Chris@42 263 T1D = VADD(T1B, T1C);
Chris@42 264 }
Chris@42 265 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 266 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 267 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 268 T4Y = VADD(T4W, T4X);
Chris@42 269 T59 = VSUB(T4X, T4W);
Chris@42 270 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 271 T3a = VADD(T1A, T1D);
Chris@42 272 T1E = VSUB(T1A, T1D);
Chris@42 273 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@42 274 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@42 275 T53 = VSUB(T1F, T1G);
Chris@42 276 T1H = VADD(T1F, T1G);
Chris@42 277 }
Chris@42 278 }
Chris@42 279 }
Chris@42 280 {
Chris@42 281 V T37, T54, T1K, T38;
Chris@42 282 T1x = VSUB(T1t, T1w);
Chris@42 283 T37 = VADD(T1t, T1w);
Chris@42 284 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@42 285 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@42 286 T54 = VSUB(T1J, T1I);
Chris@42 287 T1K = VADD(T1I, T1J);
Chris@42 288 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@42 289 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@42 290 T38 = VADD(T1T, T1Q);
Chris@42 291 T1U = VSUB(T1Q, T1T);
Chris@42 292 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@42 293 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@42 294 T1L = VSUB(T1H, T1K);
Chris@42 295 T3b = VADD(T1H, T1K);
Chris@42 296 T39 = VADD(T37, T38);
Chris@42 297 T3H = VSUB(T37, T38);
Chris@42 298 }
Chris@42 299 }
Chris@42 300 {
Chris@42 301 V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
Chris@42 302 V T1d;
Chris@42 303 {
Chris@42 304 V TU, TV, TX, TY, T56, T6v;
Chris@42 305 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 306 T56 = VADD(T52, T55);
Chris@42 307 T6v = VSUB(T55, T52);
Chris@42 308 {
Chris@42 309 V T5d, T6s, T1V, T1M;
Chris@42 310 T5d = VADD(T5b, T5c);
Chris@42 311 T6s = VSUB(T5c, T5b);
Chris@42 312 T1V = VSUB(T1L, T1E);
Chris@42 313 T1M = VADD(T1E, T1L);
Chris@42 314 T3I = VSUB(T3b, T3a);
Chris@42 315 T3c = VADD(T3a, T3b);
Chris@42 316 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@42 317 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@42 318 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@42 319 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@42 320 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@42 321 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@42 322 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@42 323 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@42 324 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@42 325 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@42 326 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@42 327 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@42 328 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 329 }
Chris@42 330 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 331 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 332 {
Chris@42 333 V T1h, T1i, T1k, T1l;
Chris@42 334 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 335 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 336 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 337 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 338 {
Chris@42 339 V T11, T4B, T4C, T12, T14, T15;
Chris@42 340 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 341 T4A = VSUB(TU, TV);
Chris@42 342 TW = VADD(TU, TV);
Chris@42 343 T4N = VSUB(TX, TY);
Chris@42 344 TZ = VADD(TX, TY);
Chris@42 345 T1j = VADD(T1h, T1i);
Chris@42 346 T4B = VSUB(T1h, T1i);
Chris@42 347 T1m = VADD(T1k, T1l);
Chris@42 348 T4C = VSUB(T1k, T1l);
Chris@42 349 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 350 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 351 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 352 {
Chris@42 353 V T18, T19, T1b, T1c;
Chris@42 354 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 355 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 356 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 357 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 358 T4O = VSUB(T4B, T4C);
Chris@42 359 T4D = VADD(T4B, T4C);
Chris@42 360 T13 = VADD(T11, T12);
Chris@42 361 T4F = VSUB(T11, T12);
Chris@42 362 T16 = VADD(T14, T15);
Chris@42 363 T4G = VSUB(T14, T15);
Chris@42 364 T1a = VADD(T18, T19);
Chris@42 365 T4I = VSUB(T18, T19);
Chris@42 366 T4J = VSUB(T1b, T1c);
Chris@42 367 T1d = VADD(T1b, T1c);
Chris@42 368 }
Chris@42 369 }
Chris@42 370 }
Chris@42 371 }
Chris@42 372 {
Chris@42 373 V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
Chris@42 374 T30 = VADD(TW, TZ);
Chris@42 375 T10 = VSUB(TW, TZ);
Chris@42 376 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@42 377 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@42 378 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@42 379 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@42 380 T33 = VADD(T13, T16);
Chris@42 381 T17 = VSUB(T13, T16);
Chris@42 382 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@42 383 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@42 384 T34 = VADD(T1a, T1d);
Chris@42 385 T1e = VSUB(T1a, T1d);
Chris@42 386 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@42 387 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@42 388 T1n = VSUB(T1j, T1m);
Chris@42 389 T31 = VADD(T1j, T1m);
Chris@42 390 {
Chris@42 391 V T1f, T1o, T6o, T4L, T4S, T6l;
Chris@42 392 T1f = VADD(T17, T1e);
Chris@42 393 T1o = VSUB(T17, T1e);
Chris@42 394 T6o = VSUB(T4H, T4K);
Chris@42 395 T4L = VADD(T4H, T4K);
Chris@42 396 T4S = VADD(T4Q, T4R);
Chris@42 397 T6l = VSUB(T4Q, T4R);
Chris@42 398 T3E = VSUB(T30, T31);
Chris@42 399 T32 = VADD(T30, T31);
Chris@42 400 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@42 401 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@42 402 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@42 403 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@42 404 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@42 405 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@42 406 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@42 407 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@42 408 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@42 409 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@42 410 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@42 411 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@42 412 }
Chris@42 413 }
Chris@42 414 }
Chris@42 415 }
Chris@42 416 }
Chris@42 417 }
Chris@42 418 {
Chris@42 419 V T6b, T6F, T7f, T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
Chris@42 420 {
Chris@42 421 V T2Z, T3r, T3s, T3m, T3d, T3v;
Chris@42 422 T2Z = VSUB(T2V, T2Y);
Chris@42 423 T3r = VADD(T2V, T2Y);
Chris@42 424 T3s = VADD(T3i, T3l);
Chris@42 425 T3m = VSUB(T3i, T3l);
Chris@42 426 T3d = VSUB(T39, T3c);
Chris@42 427 T3v = VADD(T39, T3c);
Chris@42 428 {
Chris@42 429 V T3x, T3t, T3Q, T3J, T3D, T3V, T3G, T3P, T3u, T36, T3O, T3Y, T6V, T6W;
Chris@42 430 {
Chris@42 431 V T3N, T3C, T3F, T35;
Chris@42 432 T3N = VSUB(T3A, T3B);
Chris@42 433 T3C = VADD(T3A, T3B);
Chris@42 434 T3F = VSUB(T33, T34);
Chris@42 435 T35 = VADD(T33, T34);
Chris@42 436 T3x = VADD(T3r, T3s);
Chris@42 437 T3t = VSUB(T3r, T3s);
Chris@42 438 T3Q = VFMA(LDK(KP414213562), T3H, T3I);
Chris@42 439 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@42 440 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@42 441 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@42 442 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@42 443 T3P = VFMA(LDK(KP414213562), T3E, T3F);
Chris@42 444 T3u = VADD(T32, T35);
Chris@42 445 T36 = VSUB(T32, T35);
Chris@42 446 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@42 447 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@42 448 }
Chris@42 449 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@42 450 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@42 451 T6W = VADD(T6D, T6E);
Chris@42 452 T6F = VSUB(T6D, T6E);
Chris@42 453 {
Chris@42 454 V T3R, T3W, T3K, T3Z;
Chris@42 455 T3R = VSUB(T3P, T3Q);
Chris@42 456 T3W = VADD(T3P, T3Q);
Chris@42 457 T3K = VADD(T3G, T3J);
Chris@42 458 T3Z = VSUB(T3G, T3J);
Chris@42 459 {
Chris@42 460 V T3e, T3n, T3w, T3y;
Chris@42 461 T3e = VADD(T36, T3d);
Chris@42 462 T3n = VSUB(T36, T3d);
Chris@42 463 T3w = VSUB(T3u, T3v);
Chris@42 464 T3y = VADD(T3u, T3v);
Chris@42 465 {
Chris@42 466 V T41, T3X, T3S, T3U;
Chris@42 467 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@42 468 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@42 469 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@42 470 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@42 471 {
Chris@42 472 V T42, T40, T3L, T3T;
Chris@42 473 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@42 474 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@42 475 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@42 476 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@42 477 {
Chris@42 478 V T3o, T3q, T3f, T3p;
Chris@42 479 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@42 480 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@42 481 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@42 482 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@42 483 ST(&(xo[WS(os, 32)]), VSUB(T3x, T3y), ovs, &(xo[0]));
Chris@42 484 ST(&(xo[0]), VADD(T3x, T3y), ovs, &(xo[0]));
Chris@42 485 ST(&(xo[WS(os, 16)]), VFMAI(T3w, T3t), ovs, &(xo[0]));
Chris@42 486 ST(&(xo[WS(os, 48)]), VFNMSI(T3w, T3t), ovs, &(xo[0]));
Chris@42 487 ST(&(xo[WS(os, 44)]), VFNMSI(T40, T3X), ovs, &(xo[0]));
Chris@42 488 ST(&(xo[WS(os, 20)]), VFMAI(T40, T3X), ovs, &(xo[0]));
Chris@42 489 ST(&(xo[WS(os, 52)]), VFMAI(T42, T41), ovs, &(xo[0]));
Chris@42 490 ST(&(xo[WS(os, 12)]), VFNMSI(T42, T41), ovs, &(xo[0]));
Chris@42 491 ST(&(xo[WS(os, 4)]), VFMAI(T3U, T3T), ovs, &(xo[0]));
Chris@42 492 ST(&(xo[WS(os, 60)]), VFNMSI(T3U, T3T), ovs, &(xo[0]));
Chris@42 493 ST(&(xo[WS(os, 36)]), VFMAI(T3S, T3L), ovs, &(xo[0]));
Chris@42 494 ST(&(xo[WS(os, 28)]), VFNMSI(T3S, T3L), ovs, &(xo[0]));
Chris@42 495 ST(&(xo[WS(os, 56)]), VFNMSI(T3q, T3p), ovs, &(xo[0]));
Chris@42 496 ST(&(xo[WS(os, 8)]), VFMAI(T3q, T3p), ovs, &(xo[0]));
Chris@42 497 ST(&(xo[WS(os, 40)]), VFMAI(T3o, T3f), ovs, &(xo[0]));
Chris@42 498 ST(&(xo[WS(os, 24)]), VFNMSI(T3o, T3f), ovs, &(xo[0]));
Chris@42 499 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@42 500 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@42 501 }
Chris@42 502 }
Chris@42 503 }
Chris@42 504 }
Chris@42 505 }
Chris@42 506 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@42 507 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@42 508 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@42 509 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@42 510 T6C = VFMA(LDK(KP923879532), T6B, T6A);
Chris@42 511 T76 = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@42 512 T77 = VSUB(T6e, T6h);
Chris@42 513 T6i = VADD(T6e, T6h);
Chris@42 514 }
Chris@42 515 }
Chris@42 516 {
Chris@42 517 V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T5r, T5I, T5x, T5h, T5F, T5B;
Chris@42 518 {
Chris@42 519 V TT, T2f, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
Chris@42 520 {
Chris@42 521 V T1q, T2d, T7h, T7l, T2e, T1X, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
Chris@42 522 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@42 523 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@42 524 TS = VADD(TC, TR);
Chris@42 525 T2D = VSUB(TC, TR);
Chris@42 526 {
Chris@42 527 V T7b, T7j, T74, T7i, T78, T7g;
Chris@42 528 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@42 529 T2d = VFMA(LDK(KP198912367), T1g, T1p);
Chris@42 530 T7g = VADD(T79, T7a);
Chris@42 531 T7b = VSUB(T79, T7a);
Chris@42 532 T7j = VSUB(T70, T73);
Chris@42 533 T74 = VADD(T70, T73);
Chris@42 534 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@42 535 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@42 536 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@42 537 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@42 538 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@42 539 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@42 540 T2e = VFMA(LDK(KP198912367), T1N, T1W);
Chris@42 541 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@42 542 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@42 543 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@42 544 T7m = VFMA(LDK(KP956940335), T7j, T7i);
Chris@42 545 T7k = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@42 546 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@42 547 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@42 548 }
Chris@42 549 T2k = VADD(T2d, T2e);
Chris@42 550 T2f = VSUB(T2d, T2e);
Chris@42 551 ST(&(xo[WS(os, 45)]), VFMAI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
Chris@42 552 ST(&(xo[WS(os, 19)]), VFNMSI(T7k, T7h), ovs, &(xo[WS(os, 1)]));
Chris@42 553 ST(&(xo[WS(os, 51)]), VFNMSI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
Chris@42 554 ST(&(xo[WS(os, 13)]), VFMAI(T7m, T7l), ovs, &(xo[WS(os, 1)]));
Chris@42 555 ST(&(xo[WS(os, 61)]), VFMAI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
Chris@42 556 ST(&(xo[WS(os, 3)]), VFNMSI(T7e, T7d), ovs, &(xo[WS(os, 1)]));
Chris@42 557 ST(&(xo[WS(os, 29)]), VFMAI(T7c, T75), ovs, &(xo[WS(os, 1)]));
Chris@42 558 ST(&(xo[WS(os, 35)]), VFNMSI(T7c, T75), ovs, &(xo[WS(os, 1)]));
Chris@42 559 T2n = VSUB(T1q, T1X);
Chris@42 560 T1Y = VADD(T1q, T1X);
Chris@42 561 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@42 562 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@42 563 T2b = VSUB(T29, T2a);
Chris@42 564 T2s = VADD(T29, T2a);
Chris@42 565 }
Chris@42 566 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@42 567 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@42 568 {
Chris@42 569 V T5z, T4z, T5A, T5g;
Chris@42 570 {
Chris@42 571 V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
Chris@42 572 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@42 573 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@42 574 T4y = VADD(T4o, T4x);
Chris@42 575 T5T = VSUB(T4o, T4x);
Chris@42 576 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@42 577 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@42 578 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@42 579 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@42 580 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@42 581 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@42 582 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@42 583 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@42 584 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@42 585 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@42 586 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@42 587 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@42 588 {
Chris@42 589 V T2o, T2q, T2i, T2g;
Chris@42 590 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@42 591 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@42 592 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@42 593 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@42 594 T5A = VADD(T5t, T5u);
Chris@42 595 T5v = VSUB(T5t, T5u);
Chris@42 596 T5D = VSUB(T4U, T5f);
Chris@42 597 T5g = VADD(T4U, T5f);
Chris@42 598 ST(&(xo[WS(os, 46)]), VFNMSI(T2o, T2l), ovs, &(xo[0]));
Chris@42 599 ST(&(xo[WS(os, 18)]), VFMAI(T2o, T2l), ovs, &(xo[0]));
Chris@42 600 ST(&(xo[WS(os, 50)]), VFMAI(T2q, T2p), ovs, &(xo[0]));
Chris@42 601 ST(&(xo[WS(os, 14)]), VFNMSI(T2q, T2p), ovs, &(xo[0]));
Chris@42 602 ST(&(xo[WS(os, 2)]), VFMAI(T2i, T2h), ovs, &(xo[0]));
Chris@42 603 ST(&(xo[WS(os, 62)]), VFNMSI(T2i, T2h), ovs, &(xo[0]));
Chris@42 604 ST(&(xo[WS(os, 34)]), VFMAI(T2g, T1Z), ovs, &(xo[0]));
Chris@42 605 ST(&(xo[WS(os, 30)]), VFNMSI(T2g, T1Z), ovs, &(xo[0]));
Chris@42 606 T5r = VSUB(T5p, T5q);
Chris@42 607 T5I = VADD(T5p, T5q);
Chris@42 608 }
Chris@42 609 }
Chris@42 610 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@42 611 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@42 612 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@42 613 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@42 614 }
Chris@42 615 }
Chris@42 616 {
Chris@42 617 V T6J, T6R, T6L, T6z, T6T, T6P;
Chris@42 618 {
Chris@42 619 V T6N, T6j, T6O, T6y;
Chris@42 620 {
Chris@42 621 V T6q, T6H, T5C, T5s, T6I, T6x;
Chris@42 622 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@42 623 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@42 624 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@42 625 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@42 626 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@42 627 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@42 628 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@42 629 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@42 630 {
Chris@42 631 V T5E, T5G, T5y, T5w;
Chris@42 632 T5E = VFMA(LDK(KP995184726), T5D, T5C);
Chris@42 633 T5G = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@42 634 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@42 635 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@42 636 T6O = VADD(T6H, T6I);
Chris@42 637 T6J = VSUB(T6H, T6I);
Chris@42 638 T6R = VSUB(T6q, T6x);
Chris@42 639 T6y = VADD(T6q, T6x);
Chris@42 640 ST(&(xo[WS(os, 47)]), VFNMSI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
Chris@42 641 ST(&(xo[WS(os, 17)]), VFMAI(T5E, T5B), ovs, &(xo[WS(os, 1)]));
Chris@42 642 ST(&(xo[WS(os, 49)]), VFMAI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
Chris@42 643 ST(&(xo[WS(os, 15)]), VFNMSI(T5G, T5F), ovs, &(xo[WS(os, 1)]));
Chris@42 644 ST(&(xo[WS(os, 1)]), VFMAI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
Chris@42 645 ST(&(xo[WS(os, 63)]), VFNMSI(T5y, T5x), ovs, &(xo[WS(os, 1)]));
Chris@42 646 ST(&(xo[WS(os, 33)]), VFMAI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
Chris@42 647 ST(&(xo[WS(os, 31)]), VFNMSI(T5w, T5h), ovs, &(xo[WS(os, 1)]));
Chris@42 648 }
Chris@42 649 }
Chris@42 650 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@42 651 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@42 652 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@42 653 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@42 654 }
Chris@42 655 {
Chris@42 656 V T2H, T2P, T2J, T2B, T2R, T2N;
Chris@42 657 {
Chris@42 658 V T2L, T2t, T2M, T2A;
Chris@42 659 {
Chris@42 660 V T2w, T2F, T6Q, T6G, T2G, T2z;
Chris@42 661 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@42 662 T2F = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@42 663 T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@42 664 T6G = VFMA(LDK(KP831469612), T6F, T6C);
Chris@42 665 T2G = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@42 666 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@42 667 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@42 668 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@42 669 {
Chris@42 670 V T6S, T6U, T6M, T6K;
Chris@42 671 T6S = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@42 672 T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@42 673 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@42 674 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@42 675 T2M = VADD(T2F, T2G);
Chris@42 676 T2H = VSUB(T2F, T2G);
Chris@42 677 T2P = VSUB(T2w, T2z);
Chris@42 678 T2A = VADD(T2w, T2z);
Chris@42 679 ST(&(xo[WS(os, 43)]), VFNMSI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
Chris@42 680 ST(&(xo[WS(os, 21)]), VFMAI(T6S, T6P), ovs, &(xo[WS(os, 1)]));
Chris@42 681 ST(&(xo[WS(os, 53)]), VFMAI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
Chris@42 682 ST(&(xo[WS(os, 11)]), VFNMSI(T6U, T6T), ovs, &(xo[WS(os, 1)]));
Chris@42 683 ST(&(xo[WS(os, 5)]), VFMAI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
Chris@42 684 ST(&(xo[WS(os, 59)]), VFNMSI(T6M, T6L), ovs, &(xo[WS(os, 1)]));
Chris@42 685 ST(&(xo[WS(os, 37)]), VFMAI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
Chris@42 686 ST(&(xo[WS(os, 27)]), VFNMSI(T6K, T6z), ovs, &(xo[WS(os, 1)]));
Chris@42 687 }
Chris@42 688 }
Chris@42 689 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@42 690 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@42 691 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@42 692 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@42 693 }
Chris@42 694 {
Chris@42 695 V T61, T5J, T62, T5Q;
Chris@42 696 {
Chris@42 697 V T5M, T5V, T2O, T2E, T5W, T5P;
Chris@42 698 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@42 699 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@42 700 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@42 701 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@42 702 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@42 703 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@42 704 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@42 705 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@42 706 {
Chris@42 707 V T2Q, T2S, T2K, T2I;
Chris@42 708 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@42 709 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@42 710 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@42 711 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@42 712 T62 = VADD(T5V, T5W);
Chris@42 713 T5X = VSUB(T5V, T5W);
Chris@42 714 T65 = VSUB(T5M, T5P);
Chris@42 715 T5Q = VADD(T5M, T5P);
Chris@42 716 ST(&(xo[WS(os, 42)]), VFMAI(T2Q, T2N), ovs, &(xo[0]));
Chris@42 717 ST(&(xo[WS(os, 22)]), VFNMSI(T2Q, T2N), ovs, &(xo[0]));
Chris@42 718 ST(&(xo[WS(os, 54)]), VFNMSI(T2S, T2R), ovs, &(xo[0]));
Chris@42 719 ST(&(xo[WS(os, 10)]), VFMAI(T2S, T2R), ovs, &(xo[0]));
Chris@42 720 ST(&(xo[WS(os, 58)]), VFMAI(T2K, T2J), ovs, &(xo[0]));
Chris@42 721 ST(&(xo[WS(os, 6)]), VFNMSI(T2K, T2J), ovs, &(xo[0]));
Chris@42 722 ST(&(xo[WS(os, 26)]), VFMAI(T2I, T2B), ovs, &(xo[0]));
Chris@42 723 ST(&(xo[WS(os, 38)]), VFNMSI(T2I, T2B), ovs, &(xo[0]));
Chris@42 724 }
Chris@42 725 }
Chris@42 726 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@42 727 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@42 728 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@42 729 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@42 730 }
Chris@42 731 }
Chris@42 732 }
Chris@42 733 }
Chris@42 734 }
Chris@42 735 }
Chris@42 736 T5U = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@42 737 T64 = VFMA(LDK(KP980785280), T5T, T5S);
Chris@42 738 {
Chris@42 739 V T68, T66, T5Y, T60;
Chris@42 740 T68 = VFMA(LDK(KP773010453), T65, T64);
Chris@42 741 T66 = VFNMS(LDK(KP773010453), T65, T64);
Chris@42 742 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@42 743 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@42 744 ST(&(xo[WS(os, 41)]), VFMAI(T66, T63), ovs, &(xo[WS(os, 1)]));
Chris@42 745 ST(&(xo[WS(os, 23)]), VFNMSI(T66, T63), ovs, &(xo[WS(os, 1)]));
Chris@42 746 ST(&(xo[WS(os, 55)]), VFNMSI(T68, T67), ovs, &(xo[WS(os, 1)]));
Chris@42 747 ST(&(xo[WS(os, 9)]), VFMAI(T68, T67), ovs, &(xo[WS(os, 1)]));
Chris@42 748 ST(&(xo[WS(os, 57)]), VFMAI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
Chris@42 749 ST(&(xo[WS(os, 7)]), VFNMSI(T60, T5Z), ovs, &(xo[WS(os, 1)]));
Chris@42 750 ST(&(xo[WS(os, 25)]), VFMAI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
Chris@42 751 ST(&(xo[WS(os, 39)]), VFNMSI(T5Y, T5R), ovs, &(xo[WS(os, 1)]));
Chris@42 752 }
Chris@42 753 }
Chris@42 754 }
Chris@42 755 VLEAVE();
Chris@42 756 }
Chris@42 757
Chris@42 758 static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {198, 0, 258, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 759
Chris@42 760 void XSIMD(codelet_n1bv_64) (planner *p) {
Chris@42 761 X(kdft_register) (p, n1bv_64, &desc);
Chris@42 762 }
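A minimal usage sketch (outside this file, assuming the standard FFTW 3 public API): a 64-point backward complex DFT planned through the usual interface, which is how the planner can end up dispatching to this codelet; whether n1bv_64 is actually selected depends on SIMD support and planner flags.

#include <fftw3.h>

int main(void)
{
    fftw_complex *in = fftw_alloc_complex(64);
    fftw_complex *out = fftw_alloc_complex(64);
    for (int i = 0; i < 64; ++i) {      /* unit impulse as test input */
        in[i][0] = (i == 0);
        in[i][1] = 0.0;
    }
    /* FFTW_BACKWARD is sign +1, matching the "-sign 1" genfft flag above. */
    fftw_plan p = fftw_plan_dft_1d(64, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
    fftw_execute(p);
    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}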
Chris@42 763
Chris@42 764 #else /* HAVE_FMA */
Chris@42 765
Chris@42 766 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n1bv_64 -include n1b.h */
Chris@42 767
Chris@42 768 /*
Chris@42 769 * This function contains 456 FP additions, 124 FP multiplications,
Chris@42 770 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@42 771 * 108 stack variables, 15 constants, and 128 memory accesses
Chris@42 772 */
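/*
 * Both variants reconcile to the same totals: with FMA, 198 + 258 fused = 456
 * additions and all 258 multiplications are fused; without FMA, 404 + 52 fused
 * = 456 additions and 72 + 52 = 124 multiplications.
 */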
Chris@42 773 #include "n1b.h"
Chris@42 774
Chris@42 775 static void n1bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 776 {
Chris@42 777 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@42 778 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 779 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@42 780 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 781 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@42 782 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 783 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@42 784 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 785 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 786 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 787 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 788 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 789 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 790 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 791 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
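/*
 * The constants new to this non-FMA variant are the matching sines
 * (KP382683432 = sin(pi/8), KP555570233 = sin(3pi/16), KP195090322 = sin(pi/16),
 * KP098017140 = sin(pi/32), KP290284677 = sin(3pi/32), KP471396736 = sin(5pi/32),
 * KP634393284 = sin(7pi/32)); the cosine values repeat from the FMA variant
 * above, here used in explicit VMUL/VFMA cos-sin rotations rather than
 * tangent-scaled fused ones.
 */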
Chris@42 792 {
Chris@42 793 INT i;
Chris@42 794 const R *xi;
Chris@42 795 R *xo;
Chris@42 796 xi = ii;
Chris@42 797 xo = io;
Chris@42 798 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 799 V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
Chris@42 800 V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
Chris@42 801 V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
Chris@42 802 V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
Chris@42 803 V T53, T5P, T3e, T3W;
Chris@42 804 {
Chris@42 805 V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
Chris@42 806 {
Chris@42 807 V T1, T2, T2n, T2o;
Chris@42 808 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 809 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 810 T3 = VSUB(T1, T2);
Chris@42 811 T4n = VADD(T1, T2);
Chris@42 812 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 813 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 814 T2p = VSUB(T2n, T2o);
Chris@42 815 T4o = VADD(T2n, T2o);
Chris@42 816 }
Chris@42 817 {
Chris@42 818 V T4, T5, T7, T8;
Chris@42 819 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 820 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 821 T6 = VSUB(T4, T5);
Chris@42 822 T5s = VADD(T4, T5);
Chris@42 823 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 824 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 825 T9 = VSUB(T7, T8);
Chris@42 826 T5t = VADD(T7, T8);
Chris@42 827 }
Chris@42 828 T4p = VSUB(T4n, T4o);
Chris@42 829 T5u = VSUB(T5s, T5t);
Chris@42 830 {
Chris@42 831 V Ta, T2m, T6E, T6F;
Chris@42 832 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@42 833 Tb = VSUB(T3, Ta);
Chris@42 834 T3A = VADD(T3, Ta);
Chris@42 835 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@42 836 T2q = VSUB(T2m, T2p);
Chris@42 837 T3v = VADD(T2p, T2m);
Chris@42 838 T6E = VADD(T4n, T4o);
Chris@42 839 T6F = VADD(T5s, T5t);
Chris@42 840 T6G = VSUB(T6E, T6F);
Chris@42 841 T78 = VADD(T6E, T6F);
Chris@42 842 }
Chris@42 843 }
Chris@42 844 {
Chris@42 845 V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
Chris@42 846 {
Chris@42 847 V Tc, Td, Tm, Tn;
Chris@42 848 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 849 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 850 Te = VSUB(Tc, Td);
Chris@42 851 T4q = VADD(Tc, Td);
Chris@42 852 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 853 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 854 To = VSUB(Tm, Tn);
Chris@42 855 T4t = VADD(Tm, Tn);
Chris@42 856 }
Chris@42 857 {
Chris@42 858 V Tf, Tg, Tj, Tk;
Chris@42 859 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 860 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 861 Th = VSUB(Tf, Tg);
Chris@42 862 T4r = VADD(Tf, Tg);
Chris@42 863 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 864 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 865 Tl = VSUB(Tj, Tk);
Chris@42 866 T4u = VADD(Tj, Tk);
Chris@42 867 }
Chris@42 868 {
Chris@42 869 V Ti, Tp, T6z, T6A;
Chris@42 870 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@42 871 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@42 872 Tq = VSUB(Ti, Tp);
Chris@42 873 T3w = VADD(Ti, Tp);
Chris@42 874 T6z = VADD(T4q, T4r);
Chris@42 875 T6A = VADD(T4t, T4u);
Chris@42 876 T6B = VSUB(T6z, T6A);
Chris@42 877 T79 = VADD(T6z, T6A);
Chris@42 878 }
Chris@42 879 {
Chris@42 880 V T2j, T2k, T4s, T4v;
Chris@42 881 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@42 882 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@42 883 T2l = VSUB(T2j, T2k);
Chris@42 884 T3B = VADD(T2j, T2k);
Chris@42 885 T4s = VSUB(T4q, T4r);
Chris@42 886 T4v = VSUB(T4t, T4u);
Chris@42 887 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@42 888 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
Chris@42 889 }
Chris@42 890 }
Chris@42 891 {
Chris@42 892 V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
Chris@42 893 {
Chris@42 894 V Tz, TA, TD, TE;
Chris@42 895 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 896 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 897 TB = VSUB(Tz, TA);
Chris@42 898 T4z = VADD(Tz, TA);
Chris@42 899 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 900 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 901 TF = VSUB(TD, TE);
Chris@42 902 T4y = VADD(TD, TE);
Chris@42 903 {
Chris@42 904 V Ts, Tt, Tu, Tv, Tw, Tx;
Chris@42 905 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 906 Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 907 Tu = VSUB(Ts, Tt);
Chris@42 908 Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 909 Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 910 Tx = VSUB(Tv, Tw);
Chris@42 911 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@42 912 T4C = VADD(Tv, Tw);
Chris@42 913 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@42 914 T4B = VADD(Ts, Tt);
Chris@42 915 }
Chris@42 916 }
Chris@42 917 {
Chris@42 918 V TC, TH, T6s, T6t;
Chris@42 919 TC = VSUB(Ty, TB);
Chris@42 920 TH = VSUB(TF, TG);
Chris@42 921 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@42 922 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@42 923 T6s = VADD(T4y, T4z);
Chris@42 924 T6t = VADD(T4B, T4C);
Chris@42 925 T6u = VSUB(T6s, T6t);
Chris@42 926 T74 = VADD(T6s, T6t);
Chris@42 927 }
Chris@42 928 {
Chris@42 929 V T3o, T3p, T4A, T4D;
Chris@42 930 T3o = VADD(TB, Ty);
Chris@42 931 T3p = VADD(TF, TG);
Chris@42 932 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
Chris@42 933 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
Chris@42 934 T4A = VSUB(T4y, T4z);
Chris@42 935 T4D = VSUB(T4B, T4C);
Chris@42 936 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@42 937 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
Chris@42 938 }
Chris@42 939 }
Chris@42 940 {
Chris@42 941 V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
Chris@42 942 {
Chris@42 943 V TQ, TR, TU, TV;
Chris@42 944 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 945 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 946 TS = VSUB(TQ, TR);
Chris@42 947 T4J = VADD(TQ, TR);
Chris@42 948 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 949 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 950 TW = VSUB(TU, TV);
Chris@42 951 T4I = VADD(TU, TV);
Chris@42 952 {
Chris@42 953 V TJ, TK, TL, TM, TN, TO;
Chris@42 954 TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 955 TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 956 TL = VSUB(TJ, TK);
Chris@42 957 TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 958 TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 959 TO = VSUB(TM, TN);
Chris@42 960 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@42 961 T4G = VADD(TM, TN);
Chris@42 962 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@42 963 T4F = VADD(TJ, TK);
Chris@42 964 }
Chris@42 965 }
Chris@42 966 {
Chris@42 967 V TT, TY, T6v, T6w;
Chris@42 968 TT = VSUB(TP, TS);
Chris@42 969 TY = VSUB(TW, TX);
Chris@42 970 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@42 971 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@42 972 T6v = VADD(T4I, T4J);
Chris@42 973 T6w = VADD(T4F, T4G);
Chris@42 974 T6x = VSUB(T6v, T6w);
Chris@42 975 T75 = VADD(T6v, T6w);
Chris@42 976 }
Chris@42 977 {
Chris@42 978 V T3r, T3s, T4H, T4K;
Chris@42 979 T3r = VADD(TS, TP);
Chris@42 980 T3s = VADD(TW, TX);
Chris@42 981 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
Chris@42 982 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
Chris@42 983 T4H = VSUB(T4F, T4G);
Chris@42 984 T4K = VSUB(T4I, T4J);
Chris@42 985 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@42 986 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
Chris@42 987 }
Chris@42 988 }
Chris@42 989 {
Chris@42 990 V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
Chris@42 991 V T1Q, T5a, T2a;
Chris@42 992 {
Chris@42 993 V T1Z, T20, T24, T25;
Chris@42 994 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 995 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 996 T21 = VSUB(T1Z, T20);
Chris@42 997 T5h = VADD(T1Z, T20);
Chris@42 998 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 999 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1000 T26 = VSUB(T24, T25);
Chris@42 1001 T5g = VADD(T24, T25);
Chris@42 1002 }
Chris@42 1003 {
Chris@42 1004 V T1S, T1T, T1U, T1V, T1W, T1X;
Chris@42 1005 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1006 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1007 T1U = VSUB(T1S, T1T);
Chris@42 1008 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1009 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1010 T1X = VSUB(T1V, T1W);
Chris@42 1011 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
Chris@42 1012 T5d = VADD(T1V, T1W);
Chris@42 1013 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
Chris@42 1014 T5c = VADD(T1S, T1T);
Chris@42 1015 }
Chris@42 1016 {
Chris@42 1017 V T1F, T1I, T1M, T1P;
Chris@42 1018 {
Chris@42 1019 V T1D, T1E, T1G, T1H;
Chris@42 1020 T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1021 T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1022 T1F = VSUB(T1D, T1E);
Chris@42 1023 T55 = VADD(T1D, T1E);
Chris@42 1024 T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1025 T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1026 T1I = VSUB(T1G, T1H);
Chris@42 1027 T56 = VADD(T1G, T1H);
Chris@42 1028 }
Chris@42 1029 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
Chris@42 1030 T57 = VSUB(T55, T56);
Chris@42 1031 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
Chris@42 1032 {
Chris@42 1033 V T1K, T1L, T1N, T1O;
Chris@42 1034 T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1035 T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1036 T1M = VSUB(T1K, T1L);
Chris@42 1037 T58 = VADD(T1K, T1L);
Chris@42 1038 T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1039 T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1040 T1P = VSUB(T1N, T1O);
Chris@42 1041 T59 = VADD(T1N, T1O);
Chris@42 1042 }
Chris@42 1043 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
Chris@42 1044 T5a = VSUB(T58, T59);
Chris@42 1045 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
Chris@42 1046 }
Chris@42 1047 {
Chris@42 1048 V T1R, T22, T6k, T6l;
Chris@42 1049 T1R = VSUB(T1J, T1Q);
Chris@42 1050 T22 = VSUB(T1Y, T21);
Chris@42 1051 T23 = VSUB(T1R, T22);
Chris@42 1052 T2N = VADD(T22, T1R);
Chris@42 1053 T6k = VADD(T5g, T5h);
Chris@42 1054 T6l = VADD(T5c, T5d);
Chris@42 1055 T6m = VSUB(T6k, T6l);
Chris@42 1056 T70 = VADD(T6k, T6l);
Chris@42 1057 }
Chris@42 1058 {
Chris@42 1059 V T6n, T6o, T28, T2b;
Chris@42 1060 T6n = VADD(T55, T56);
Chris@42 1061 T6o = VADD(T58, T59);
Chris@42 1062 T6p = VSUB(T6n, T6o);
Chris@42 1063 T71 = VADD(T6n, T6o);
Chris@42 1064 T28 = VSUB(T26, T27);
Chris@42 1065 T2b = VSUB(T29, T2a);
Chris@42 1066 T2c = VSUB(T28, T2b);
Chris@42 1067 T2O = VADD(T28, T2b);
Chris@42 1068 }
Chris@42 1069 {
Chris@42 1070 V T3g, T3h, T5b, T5e;
Chris@42 1071 T3g = VADD(T26, T27);
Chris@42 1072 T3h = VADD(T1J, T1Q);
Chris@42 1073 T3i = VADD(T3g, T3h);
Chris@42 1074 T3Y = VSUB(T3g, T3h);
Chris@42 1075 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
Chris@42 1076 T5e = VSUB(T5c, T5d);
Chris@42 1077 T5f = VSUB(T5b, T5e);
Chris@42 1078 T5R = VADD(T5e, T5b);
Chris@42 1079 }
Chris@42 1080 {
Chris@42 1081 V T5i, T5j, T3j, T3k;
Chris@42 1082 T5i = VSUB(T5g, T5h);
Chris@42 1083 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
Chris@42 1084 T5k = VSUB(T5i, T5j);
Chris@42 1085 T5S = VADD(T5i, T5j);
Chris@42 1086 T3j = VADD(T21, T1Y);
Chris@42 1087 T3k = VADD(T29, T2a);
Chris@42 1088 T3l = VADD(T3j, T3k);
Chris@42 1089 T3Z = VSUB(T3k, T3j);
Chris@42 1090 }
Chris@42 1091 }
Chris@42 1092 {
Chris@42 1093 V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
Chris@42 1094 V T1f, T4T, T1z;
Chris@42 1095 {
Chris@42 1096 V T1o, T1p, T1t, T1u;
Chris@42 1097 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1098 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1099 T1q = VSUB(T1o, T1p);
Chris@42 1100 T50 = VADD(T1o, T1p);
Chris@42 1101 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1102 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1103 T1v = VSUB(T1t, T1u);
Chris@42 1104 T4Z = VADD(T1t, T1u);
Chris@42 1105 }
Chris@42 1106 {
Chris@42 1107 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@42 1108 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1109 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1110 T1j = VSUB(T1h, T1i);
Chris@42 1111 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1112 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1113 T1m = VSUB(T1k, T1l);
Chris@42 1114 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@42 1115 T4W = VADD(T1k, T1l);
Chris@42 1116 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
Chris@42 1117 T4V = VADD(T1h, T1i);
Chris@42 1118 }
Chris@42 1119 {
Chris@42 1120 V T14, T17, T1b, T1e;
Chris@42 1121 {
Chris@42 1122 V T12, T13, T15, T16;
Chris@42 1123 T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1124 T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1125 T14 = VSUB(T12, T13);
Chris@42 1126 T4O = VADD(T12, T13);
Chris@42 1127 T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1128 T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1129 T17 = VSUB(T15, T16);
Chris@42 1130 T4P = VADD(T15, T16);
Chris@42 1131 }
Chris@42 1132 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
Chris@42 1133 T4Q = VSUB(T4O, T4P);
Chris@42 1134 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
Chris@42 1135 {
Chris@42 1136 V T19, T1a, T1c, T1d;
Chris@42 1137 T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1138 T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1139 T1b = VSUB(T19, T1a);
Chris@42 1140 T4R = VADD(T19, T1a);
Chris@42 1141 T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1142 T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1143 T1e = VSUB(T1c, T1d);
Chris@42 1144 T4S = VADD(T1c, T1d);
Chris@42 1145 }
Chris@42 1146 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
Chris@42 1147 T4T = VSUB(T4R, T4S);
Chris@42 1148 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@42 1149 }
Chris@42 1150 {
Chris@42 1151 V T1g, T1r, T6d, T6e;
Chris@42 1152 T1g = VSUB(T18, T1f);
Chris@42 1153 T1r = VSUB(T1n, T1q);
Chris@42 1154 T1s = VSUB(T1g, T1r);
Chris@42 1155 T2K = VADD(T1r, T1g);
Chris@42 1156 T6d = VADD(T4Z, T50);
Chris@42 1157 T6e = VADD(T4V, T4W);
Chris@42 1158 T6f = VSUB(T6d, T6e);
Chris@42 1159 T6X = VADD(T6d, T6e);
Chris@42 1160 }
Chris@42 1161 {
Chris@42 1162 V T6g, T6h, T1x, T1A;
Chris@42 1163 T6g = VADD(T4O, T4P);
Chris@42 1164 T6h = VADD(T4R, T4S);
Chris@42 1165 T6i = VSUB(T6g, T6h);
Chris@42 1166 T6Y = VADD(T6g, T6h);
Chris@42 1167 T1x = VSUB(T1v, T1w);
Chris@42 1168 T1A = VSUB(T1y, T1z);
Chris@42 1169 T1B = VSUB(T1x, T1A);
Chris@42 1170 T2L = VADD(T1x, T1A);
Chris@42 1171 }
Chris@42 1172 {
Chris@42 1173 V T39, T3a, T4U, T4X;
Chris@42 1174 T39 = VADD(T1v, T1w);
Chris@42 1175 T3a = VADD(T18, T1f);
Chris@42 1176 T3b = VADD(T39, T3a);
Chris@42 1177 T3V = VSUB(T39, T3a);
Chris@42 1178 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
Chris@42 1179 T4X = VSUB(T4V, T4W);
Chris@42 1180 T4Y = VSUB(T4U, T4X);
Chris@42 1181 T5O = VADD(T4X, T4U);
Chris@42 1182 }
Chris@42 1183 {
Chris@42 1184 V T51, T52, T3c, T3d;
Chris@42 1185 T51 = VSUB(T4Z, T50);
Chris@42 1186 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
Chris@42 1187 T53 = VSUB(T51, T52);
Chris@42 1188 T5P = VADD(T51, T52);
Chris@42 1189 T3c = VADD(T1q, T1n);
Chris@42 1190 T3d = VADD(T1y, T1z);
Chris@42 1191 T3e = VADD(T3c, T3d);
Chris@42 1192 T3W = VSUB(T3d, T3c);
Chris@42 1193 }
Chris@42 1194 }
Chris@42 1195 {
Chris@42 1196 V T7h, T7l, T7k, T7m;
Chris@42 1197 {
Chris@42 1198 V T7f, T7g, T7i, T7j;
Chris@42 1199 T7f = VADD(T78, T79);
Chris@42 1200 T7g = VADD(T74, T75);
Chris@42 1201 T7h = VSUB(T7f, T7g);
Chris@42 1202 T7l = VADD(T7f, T7g);
Chris@42 1203 T7i = VADD(T6X, T6Y);
Chris@42 1204 T7j = VADD(T70, T71);
Chris@42 1205 T7k = VBYI(VSUB(T7i, T7j));
Chris@42 1206 T7m = VADD(T7i, T7j);
Chris@42 1207 }
Chris@42 1208 ST(&(xo[WS(os, 48)]), VSUB(T7h, T7k), ovs, &(xo[0]));
Chris@42 1209 ST(&(xo[0]), VADD(T7l, T7m), ovs, &(xo[0]));
Chris@42 1210 ST(&(xo[WS(os, 16)]), VADD(T7h, T7k), ovs, &(xo[0]));
Chris@42 1211 ST(&(xo[WS(os, 32)]), VSUB(T7l, T7m), ovs, &(xo[0]));
Chris@42 1212 }
Chris@42 1213 {
Chris@42 1214 V T76, T7a, T73, T7b, T6Z, T72;
Chris@42 1215 T76 = VSUB(T74, T75);
Chris@42 1216 T7a = VSUB(T78, T79);
Chris@42 1217 T6Z = VSUB(T6X, T6Y);
Chris@42 1218 T72 = VSUB(T70, T71);
Chris@42 1219 T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
Chris@42 1220 T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
Chris@42 1221 {
Chris@42 1222 V T77, T7c, T7d, T7e;
Chris@42 1223 T77 = VBYI(VSUB(T73, T76));
Chris@42 1224 T7c = VSUB(T7a, T7b);
Chris@42 1225 ST(&(xo[WS(os, 24)]), VADD(T77, T7c), ovs, &(xo[0]));
Chris@42 1226 ST(&(xo[WS(os, 40)]), VSUB(T7c, T77), ovs, &(xo[0]));
Chris@42 1227 T7d = VBYI(VADD(T76, T73));
Chris@42 1228 T7e = VADD(T7a, T7b);
Chris@42 1229 ST(&(xo[WS(os, 8)]), VADD(T7d, T7e), ovs, &(xo[0]));
Chris@42 1230 ST(&(xo[WS(os, 56)]), VSUB(T7e, T7d), ovs, &(xo[0]));
Chris@42 1231 }
Chris@42 1232 }
Chris@42 1233 {
Chris@42 1234 V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
Chris@42 1235 T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
Chris@42 1236 T6C = VSUB(T6y, T6B);
Chris@42 1237 T6S = VADD(T6B, T6y);
Chris@42 1238 T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
Chris@42 1239 T6I = VSUB(T6G, T6H);
Chris@42 1240 T6P = VADD(T6G, T6H);
Chris@42 1241 {
Chris@42 1242 V T6j, T6q, T6J, T6K;
Chris@42 1243 T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
Chris@42 1244 T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
Chris@42 1245 T6r = VSUB(T6j, T6q);
Chris@42 1246 T6Q = VADD(T6j, T6q);
Chris@42 1247 T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
Chris@42 1248 T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
Chris@42 1249 T6L = VSUB(T6J, T6K);
Chris@42 1250 T6T = VADD(T6J, T6K);
Chris@42 1251 }
Chris@42 1252 {
Chris@42 1253 V T6D, T6M, T6V, T6W;
Chris@42 1254 T6D = VBYI(VSUB(T6r, T6C));
Chris@42 1255 T6M = VSUB(T6I, T6L);
Chris@42 1256 ST(&(xo[WS(os, 20)]), VADD(T6D, T6M), ovs, &(xo[0]));
Chris@42 1257 ST(&(xo[WS(os, 44)]), VSUB(T6M, T6D), ovs, &(xo[0]));
Chris@42 1258 T6V = VSUB(T6P, T6Q);
Chris@42 1259 T6W = VBYI(VSUB(T6T, T6S));
Chris@42 1260 ST(&(xo[WS(os, 36)]), VSUB(T6V, T6W), ovs, &(xo[0]));
Chris@42 1261 ST(&(xo[WS(os, 28)]), VADD(T6V, T6W), ovs, &(xo[0]));
Chris@42 1262 }
Chris@42 1263 {
Chris@42 1264 V T6N, T6O, T6R, T6U;
Chris@42 1265 T6N = VBYI(VADD(T6C, T6r));
Chris@42 1266 T6O = VADD(T6I, T6L);
Chris@42 1267 ST(&(xo[WS(os, 12)]), VADD(T6N, T6O), ovs, &(xo[0]));
Chris@42 1268 ST(&(xo[WS(os, 52)]), VSUB(T6O, T6N), ovs, &(xo[0]));
Chris@42 1269 T6R = VADD(T6P, T6Q);
Chris@42 1270 T6U = VBYI(VADD(T6S, T6T));
Chris@42 1271 ST(&(xo[WS(os, 60)]), VSUB(T6R, T6U), ovs, &(xo[0]));
Chris@42 1272 ST(&(xo[WS(os, 4)]), VADD(T6R, T6U), ovs, &(xo[0]));
Chris@42 1273 }
Chris@42 1274 }
Chris@42 1275 {
Chris@42 1276 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@42 1277 {
Chris@42 1278 V T5L, T5M, T5Z, T60;
Chris@42 1279 T5L = VADD(T4p, T4w);
Chris@42 1280 T5M = VADD(T5o, T5p);
Chris@42 1281 T5N = VSUB(T5L, T5M);
Chris@42 1282 T68 = VADD(T5L, T5M);
Chris@42 1283 T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
Chris@42 1284 T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
Chris@42 1285 T61 = VSUB(T5Z, T60);
Chris@42 1286 T69 = VADD(T5Z, T60);
Chris@42 1287 }
Chris@42 1288 {
Chris@42 1289 V T5Q, T5T, T5W, T5X;
Chris@42 1290 T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
Chris@42 1291 T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
Chris@42 1292 T5U = VSUB(T5Q, T5T);
Chris@42 1293 T65 = VADD(T5Q, T5T);
Chris@42 1294 T5W = VADD(T4E, T4L);
Chris@42 1295 T5X = VADD(T5u, T5r);
Chris@42 1296 T5Y = VSUB(T5W, T5X);
Chris@42 1297 T66 = VADD(T5X, T5W);
Chris@42 1298 }
Chris@42 1299 {
Chris@42 1300 V T5V, T62, T6b, T6c;
Chris@42 1301 T5V = VADD(T5N, T5U);
Chris@42 1302 T62 = VBYI(VADD(T5Y, T61));
Chris@42 1303 ST(&(xo[WS(os, 50)]), VSUB(T5V, T62), ovs, &(xo[0]));
Chris@42 1304 ST(&(xo[WS(os, 14)]), VADD(T5V, T62), ovs, &(xo[0]));
Chris@42 1305 T6b = VBYI(VADD(T66, T65));
Chris@42 1306 T6c = VADD(T68, T69);
Chris@42 1307 ST(&(xo[WS(os, 2)]), VADD(T6b, T6c), ovs, &(xo[0]));
Chris@42 1308 ST(&(xo[WS(os, 62)]), VSUB(T6c, T6b), ovs, &(xo[0]));
Chris@42 1309 }
Chris@42 1310 {
Chris@42 1311 V T63, T64, T67, T6a;
Chris@42 1312 T63 = VSUB(T5N, T5U);
Chris@42 1313 T64 = VBYI(VSUB(T61, T5Y));
Chris@42 1314 ST(&(xo[WS(os, 46)]), VSUB(T63, T64), ovs, &(xo[0]));
Chris@42 1315 ST(&(xo[WS(os, 18)]), VADD(T63, T64), ovs, &(xo[0]));
Chris@42 1316 T67 = VBYI(VSUB(T65, T66));
Chris@42 1317 T6a = VSUB(T68, T69);
Chris@42 1318 ST(&(xo[WS(os, 30)]), VADD(T67, T6a), ovs, &(xo[0]));
Chris@42 1319 ST(&(xo[WS(os, 34)]), VSUB(T6a, T67), ovs, &(xo[0]));
Chris@42 1320 }
Chris@42 1321 }
Chris@42 1322 {
Chris@42 1323 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@42 1324 {
Chris@42 1325 V Tr, T10, T2t, T2u;
Chris@42 1326 Tr = VSUB(Tb, Tq);
Chris@42 1327 T10 = VSUB(TI, TZ);
Chris@42 1328 T11 = VSUB(Tr, T10);
Chris@42 1329 T2C = VADD(Tr, T10);
Chris@42 1330 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
Chris@42 1331 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
Chris@42 1332 T2v = VSUB(T2t, T2u);
Chris@42 1333 T2D = VADD(T2t, T2u);
Chris@42 1334 }
Chris@42 1335 {
Chris@42 1336 V T1C, T2d, T2i, T2r;
Chris@42 1337 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
Chris@42 1338 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
Chris@42 1339 T2e = VSUB(T1C, T2d);
Chris@42 1340 T2z = VADD(T1C, T2d);
Chris@42 1341 T2i = VSUB(T2g, T2h);
Chris@42 1342 T2r = VSUB(T2l, T2q);
Chris@42 1343 T2s = VSUB(T2i, T2r);
Chris@42 1344 T2A = VADD(T2r, T2i);
Chris@42 1345 }
Chris@42 1346 {
Chris@42 1347 V T2f, T2w, T2F, T2G;
Chris@42 1348 T2f = VADD(T11, T2e);
Chris@42 1349 T2w = VBYI(VADD(T2s, T2v));
Chris@42 1350 ST(&(xo[WS(os, 53)]), VSUB(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@42 1351 ST(&(xo[WS(os, 11)]), VADD(T2f, T2w), ovs, &(xo[WS(os, 1)]));
Chris@42 1352 T2F = VBYI(VADD(T2A, T2z));
Chris@42 1353 T2G = VADD(T2C, T2D);
Chris@42 1354 ST(&(xo[WS(os, 5)]), VADD(T2F, T2G), ovs, &(xo[WS(os, 1)]));
Chris@42 1355 ST(&(xo[WS(os, 59)]), VSUB(T2G, T2F), ovs, &(xo[WS(os, 1)]));
Chris@42 1356 }
Chris@42 1357 {
Chris@42 1358 V T2x, T2y, T2B, T2E;
Chris@42 1359 T2x = VSUB(T11, T2e);
Chris@42 1360 T2y = VBYI(VSUB(T2v, T2s));
Chris@42 1361 ST(&(xo[WS(os, 43)]), VSUB(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@42 1362 ST(&(xo[WS(os, 21)]), VADD(T2x, T2y), ovs, &(xo[WS(os, 1)]));
Chris@42 1363 T2B = VBYI(VSUB(T2z, T2A));
Chris@42 1364 T2E = VSUB(T2C, T2D);
Chris@42 1365 ST(&(xo[WS(os, 27)]), VADD(T2B, T2E), ovs, &(xo[WS(os, 1)]));
Chris@42 1366 ST(&(xo[WS(os, 37)]), VSUB(T2E, T2B), ovs, &(xo[WS(os, 1)]));
Chris@42 1367 }
Chris@42 1368 }
Chris@42 1369 {
Chris@42 1370 V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
Chris@42 1371 {
Chris@42 1372 V T3f, T3m, T3H, T3I;
Chris@42 1373 T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
Chris@42 1374 T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
Chris@42 1375 T3n = VSUB(T3f, T3m);
Chris@42 1376 T3O = VADD(T3f, T3m);
Chris@42 1377 T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
Chris@42 1378 T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
Chris@42 1379 T3J = VSUB(T3H, T3I);
Chris@42 1380 T3R = VADD(T3H, T3I);
Chris@42 1381 }
Chris@42 1382 {
Chris@42 1383 V T3u, T3x, T3C, T3F;
Chris@42 1384 T3u = VADD(T3q, T3t);
Chris@42 1385 T3x = VADD(T3v, T3w);
Chris@42 1386 T3y = VSUB(T3u, T3x);
Chris@42 1387 T3Q = VADD(T3x, T3u);
Chris@42 1388 T3C = VADD(T3A, T3B);
Chris@42 1389 T3F = VADD(T3D, T3E);
Chris@42 1390 T3G = VSUB(T3C, T3F);
Chris@42 1391 T3N = VADD(T3C, T3F);
Chris@42 1392 }
Chris@42 1393 {
Chris@42 1394 V T3z, T3K, T3T, T3U;
Chris@42 1395 T3z = VBYI(VSUB(T3n, T3y));
Chris@42 1396 T3K = VSUB(T3G, T3J);
Chris@42 1397 ST(&(xo[WS(os, 17)]), VADD(T3z, T3K), ovs, &(xo[WS(os, 1)]));
Chris@42 1398 ST(&(xo[WS(os, 47)]), VSUB(T3K, T3z), ovs, &(xo[WS(os, 1)]));
Chris@42 1399 T3T = VSUB(T3N, T3O);
Chris@42 1400 T3U = VBYI(VSUB(T3R, T3Q));
Chris@42 1401 ST(&(xo[WS(os, 33)]), VSUB(T3T, T3U), ovs, &(xo[WS(os, 1)]));
Chris@42 1402 ST(&(xo[WS(os, 31)]), VADD(T3T, T3U), ovs, &(xo[WS(os, 1)]));
Chris@42 1403 }
Chris@42 1404 {
Chris@42 1405 V T3L, T3M, T3P, T3S;
Chris@42 1406 T3L = VBYI(VADD(T3y, T3n));
Chris@42 1407 T3M = VADD(T3G, T3J);
Chris@42 1408 ST(&(xo[WS(os, 15)]), VADD(T3L, T3M), ovs, &(xo[WS(os, 1)]));
Chris@42 1409 ST(&(xo[WS(os, 49)]), VSUB(T3M, T3L), ovs, &(xo[WS(os, 1)]));
Chris@42 1410 T3P = VADD(T3N, T3O);
Chris@42 1411 T3S = VBYI(VADD(T3Q, T3R));
Chris@42 1412 ST(&(xo[WS(os, 63)]), VSUB(T3P, T3S), ovs, &(xo[WS(os, 1)]));
Chris@42 1413 ST(&(xo[WS(os, 1)]), VADD(T3P, T3S), ovs, &(xo[WS(os, 1)]));
Chris@42 1414 }
Chris@42 1415 }
Chris@42 1416 {
Chris@42 1417 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@42 1418 {
Chris@42 1419 V T4x, T4M, T5x, T5y;
Chris@42 1420 T4x = VSUB(T4p, T4w);
Chris@42 1421 T4M = VSUB(T4E, T4L);
Chris@42 1422 T4N = VSUB(T4x, T4M);
Chris@42 1423 T5G = VADD(T4x, T4M);
Chris@42 1424 T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
Chris@42 1425 T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
Chris@42 1426 T5z = VSUB(T5x, T5y);
Chris@42 1427 T5H = VADD(T5x, T5y);
Chris@42 1428 }
Chris@42 1429 {
Chris@42 1430 V T54, T5l, T5q, T5v;
Chris@42 1431 T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
Chris@42 1432 T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
Chris@42 1433 T5m = VSUB(T54, T5l);
Chris@42 1434 T5D = VADD(T54, T5l);
Chris@42 1435 T5q = VSUB(T5o, T5p);
Chris@42 1436 T5v = VSUB(T5r, T5u);
Chris@42 1437 T5w = VSUB(T5q, T5v);
Chris@42 1438 T5E = VADD(T5v, T5q);
Chris@42 1439 }
Chris@42 1440 {
Chris@42 1441 V T5n, T5A, T5J, T5K;
Chris@42 1442 T5n = VADD(T4N, T5m);
Chris@42 1443 T5A = VBYI(VADD(T5w, T5z));
Chris@42 1444 ST(&(xo[WS(os, 54)]), VSUB(T5n, T5A), ovs, &(xo[0]));
Chris@42 1445 ST(&(xo[WS(os, 10)]), VADD(T5n, T5A), ovs, &(xo[0]));
Chris@42 1446 T5J = VBYI(VADD(T5E, T5D));
Chris@42 1447 T5K = VADD(T5G, T5H);
Chris@42 1448 ST(&(xo[WS(os, 6)]), VADD(T5J, T5K), ovs, &(xo[0]));
Chris@42 1449 ST(&(xo[WS(os, 58)]), VSUB(T5K, T5J), ovs, &(xo[0]));
Chris@42 1450 }
Chris@42 1451 {
Chris@42 1452 V T5B, T5C, T5F, T5I;
Chris@42 1453 T5B = VSUB(T4N, T5m);
Chris@42 1454 T5C = VBYI(VSUB(T5z, T5w));
Chris@42 1455 ST(&(xo[WS(os, 42)]), VSUB(T5B, T5C), ovs, &(xo[0]));
Chris@42 1456 ST(&(xo[WS(os, 22)]), VADD(T5B, T5C), ovs, &(xo[0]));
Chris@42 1457 T5F = VBYI(VSUB(T5D, T5E));
Chris@42 1458 T5I = VSUB(T5G, T5H);
Chris@42 1459 ST(&(xo[WS(os, 26)]), VADD(T5F, T5I), ovs, &(xo[0]));
Chris@42 1460 ST(&(xo[WS(os, 38)]), VSUB(T5I, T5F), ovs, &(xo[0]));
Chris@42 1461 }
Chris@42 1462 }
Chris@42 1463 {
Chris@42 1464 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@42 1465 {
Chris@42 1466 V T2H, T2I, T2V, T2W;
Chris@42 1467 T2H = VADD(Tb, Tq);
Chris@42 1468 T2I = VADD(T2g, T2h);
Chris@42 1469 T2J = VSUB(T2H, T2I);
Chris@42 1470 T34 = VADD(T2H, T2I);
Chris@42 1471 T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
Chris@42 1472 T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
Chris@42 1473 T2X = VSUB(T2V, T2W);
Chris@42 1474 T35 = VADD(T2V, T2W);
Chris@42 1475 }
Chris@42 1476 {
Chris@42 1477 V T2M, T2P, T2S, T2T;
Chris@42 1478 T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
Chris@42 1479 T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
Chris@42 1480 T2Q = VSUB(T2M, T2P);
Chris@42 1481 T31 = VADD(T2M, T2P);
Chris@42 1482 T2S = VADD(TI, TZ);
Chris@42 1483 T2T = VADD(T2q, T2l);
Chris@42 1484 T2U = VSUB(T2S, T2T);
Chris@42 1485 T32 = VADD(T2T, T2S);
Chris@42 1486 }
Chris@42 1487 {
Chris@42 1488 V T2R, T2Y, T37, T38;
Chris@42 1489 T2R = VADD(T2J, T2Q);
Chris@42 1490 T2Y = VBYI(VADD(T2U, T2X));
Chris@42 1491 ST(&(xo[WS(os, 51)]), VSUB(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@42 1492 ST(&(xo[WS(os, 13)]), VADD(T2R, T2Y), ovs, &(xo[WS(os, 1)]));
Chris@42 1493 T37 = VBYI(VADD(T32, T31));
Chris@42 1494 T38 = VADD(T34, T35);
Chris@42 1495 ST(&(xo[WS(os, 3)]), VADD(T37, T38), ovs, &(xo[WS(os, 1)]));
Chris@42 1496 ST(&(xo[WS(os, 61)]), VSUB(T38, T37), ovs, &(xo[WS(os, 1)]));
Chris@42 1497 }
Chris@42 1498 {
Chris@42 1499 V T2Z, T30, T33, T36;
Chris@42 1500 T2Z = VSUB(T2J, T2Q);
Chris@42 1501 T30 = VBYI(VSUB(T2X, T2U));
Chris@42 1502 ST(&(xo[WS(os, 45)]), VSUB(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@42 1503 ST(&(xo[WS(os, 19)]), VADD(T2Z, T30), ovs, &(xo[WS(os, 1)]));
Chris@42 1504 T33 = VBYI(VSUB(T31, T32));
Chris@42 1505 T36 = VSUB(T34, T35);
Chris@42 1506 ST(&(xo[WS(os, 29)]), VADD(T33, T36), ovs, &(xo[WS(os, 1)]));
Chris@42 1507 ST(&(xo[WS(os, 35)]), VSUB(T36, T33), ovs, &(xo[WS(os, 1)]));
Chris@42 1508 }
Chris@42 1509 }
Chris@42 1510 {
Chris@42 1511 V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
Chris@42 1512 {
Chris@42 1513 V T3X, T40, T49, T4a;
Chris@42 1514 T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
Chris@42 1515 T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
Chris@42 1516 T41 = VSUB(T3X, T40);
Chris@42 1517 T4g = VADD(T3X, T40);
Chris@42 1518 T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
Chris@42 1519 T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
Chris@42 1520 T4b = VSUB(T49, T4a);
Chris@42 1521 T4j = VADD(T49, T4a);
Chris@42 1522 }
Chris@42 1523 {
Chris@42 1524 V T42, T43, T46, T47;
Chris@42 1525 T42 = VSUB(T3D, T3E);
Chris@42 1526 T43 = VSUB(T3w, T3v);
Chris@42 1527 T44 = VSUB(T42, T43);
Chris@42 1528 T4i = VADD(T43, T42);
Chris@42 1529 T46 = VSUB(T3A, T3B);
Chris@42 1530 T47 = VSUB(T3q, T3t);
Chris@42 1531 T48 = VSUB(T46, T47);
Chris@42 1532 T4f = VADD(T46, T47);
Chris@42 1533 }
Chris@42 1534 {
Chris@42 1535 V T45, T4c, T4l, T4m;
Chris@42 1536 T45 = VBYI(VSUB(T41, T44));
Chris@42 1537 T4c = VSUB(T48, T4b);
Chris@42 1538 ST(&(xo[WS(os, 23)]), VADD(T45, T4c), ovs, &(xo[WS(os, 1)]));
Chris@42 1539 ST(&(xo[WS(os, 41)]), VSUB(T4c, T45), ovs, &(xo[WS(os, 1)]));
Chris@42 1540 T4l = VSUB(T4f, T4g);
Chris@42 1541 T4m = VBYI(VSUB(T4j, T4i));
Chris@42 1542 ST(&(xo[WS(os, 39)]), VSUB(T4l, T4m), ovs, &(xo[WS(os, 1)]));
Chris@42 1543 ST(&(xo[WS(os, 25)]), VADD(T4l, T4m), ovs, &(xo[WS(os, 1)]));
Chris@42 1544 }
Chris@42 1545 {
Chris@42 1546 V T4d, T4e, T4h, T4k;
Chris@42 1547 T4d = VBYI(VADD(T44, T41));
Chris@42 1548 T4e = VADD(T48, T4b);
Chris@42 1549 ST(&(xo[WS(os, 9)]), VADD(T4d, T4e), ovs, &(xo[WS(os, 1)]));
Chris@42 1550 ST(&(xo[WS(os, 55)]), VSUB(T4e, T4d), ovs, &(xo[WS(os, 1)]));
Chris@42 1551 T4h = VADD(T4f, T4g);
Chris@42 1552 T4k = VBYI(VADD(T4i, T4j));
Chris@42 1553 ST(&(xo[WS(os, 57)]), VSUB(T4h, T4k), ovs, &(xo[WS(os, 1)]));
Chris@42 1554 ST(&(xo[WS(os, 7)]), VADD(T4h, T4k), ovs, &(xo[WS(os, 1)]));
Chris@42 1555 }
Chris@42 1556 }
Chris@42 1557 }
Chris@42 1558 }
Chris@42 1559 VLEAVE();
Chris@42 1560 }
Chris@42 1561
Chris@42 1562 static const kdft_desc desc = { 64, XSIMD_STRING("n1bv_64"), {404, 72, 52, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 1563
Chris@42 1564 void XSIMD(codelet_n1bv_64) (planner *p) {
Chris@42 1565 X(kdft_register) (p, n1bv_64, &desc);
Chris@42 1566 }
Chris@42 1567
Chris@42 1568 #endif /* HAVE_FMA */