annotate src/fftw-3.3.5/dft/simd/common/n2bv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:40:56 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 456 FP additions, 258 FP multiplications,
Chris@42 32 * (or, 198 additions, 0 multiplications, 258 fused multiply/add),
Chris@42 33 * 178 stack variables, 15 constants, and 160 memory accesses
Chris@42 34 */
Chris@42 35 #include "n2b.h"
Chris@42 36
Chris@42 37 static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
Chris@42 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 40 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@42 41 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 42 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@42 43 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 44 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@42 45 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 46 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 47 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@42 48 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 49 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 50 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 51 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
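(To the precision printed, the DVK constants above are all cosines and tangents of k·π/32 — the twiddle values needed by the size-64 split: KP707106781 = cos(π/4) = 1/√2, KP923879532 = cos(π/8), KP414213562 = tan(π/8) = √2 − 1, KP198912367 = tan(π/16), and so on. The FMA variant favours the tangent form, presumably so the remaining multiplications fold into fused multiply-adds, whereas the non-FMA variant further down uses the plain sin/cos pairs such as KP382683432 = sin(π/8) and KP195090322 = sin(π/16).)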
Chris@42 54 {
Chris@42 55 INT i;
Chris@42 56 const R *xi;
Chris@42 57 R *xo;
Chris@42 58 xi = ii;
Chris@42 59 xo = io;
Chris@42 60 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 61 V T7z, T7A, T7B, T7C, T5T, T5S, T5X, T65, T8a, T8b, T8e, T8g, T5Z, T5R, T67;
Chris@42 62 V T63, T5U, T64;
Chris@42 63 {
Chris@42 64 V T7, T26, T5k, T6A, T47, T69, T2V, T3z, T6B, T4e, T6a, T5n, T3M, T2Y, T27;
Chris@42 65 V Tm, T3A, T3i, T29, TC, T5p, T4o, T6D, T6e, T3l, T3B, TR, T2a, T4x, T5q;
Chris@42 66 V T6h, T6E, T39, T3H, T3I, T3c, T5N, T57, T72, T6w, T5O, T5e, T71, T6t, T2y;
Chris@42 67 V T1W, T2x, T1N, T33, T34, T3E, T32, T1p, T2v, T1g, T2u, T4M, T5K, T6p, T6Z;
Chris@42 68 V T6m, T6Y, T5L, T4T;
Chris@42 69 {
Chris@42 70 V T4g, T4l, T3g, Tu, Tx, T4h, TA, T4i;
Chris@42 71 {
Chris@42 72 V T1, T2, T23, T24, T4, T5, T20, T21;
Chris@42 73 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 74 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 75 T23 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 76 T24 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 77 T4 = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 78 T5 = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 79 T20 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 80 T21 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 81 {
Chris@42 82 V Ta, T48, Tk, T4c, T49, Td, Tf, Tg;
Chris@42 83 {
Chris@42 84 V T8, T43, T3, T45, T25, T5i, T6, T44, T22, T9, Ti, Tj, Tb, Tc;
Chris@42 85 T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 86 T43 = VSUB(T1, T2);
Chris@42 87 T3 = VADD(T1, T2);
Chris@42 88 T45 = VSUB(T23, T24);
Chris@42 89 T25 = VADD(T23, T24);
Chris@42 90 T5i = VSUB(T4, T5);
Chris@42 91 T6 = VADD(T4, T5);
Chris@42 92 T44 = VSUB(T20, T21);
Chris@42 93 T22 = VADD(T20, T21);
Chris@42 94 T9 = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 95 Ti = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 96 Tj = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 97 Tb = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 98 Tc = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 99 {
Chris@42 100 V T2T, T46, T5j, T2U;
Chris@42 101 T7 = VSUB(T3, T6);
Chris@42 102 T2T = VADD(T3, T6);
Chris@42 103 T46 = VADD(T44, T45);
Chris@42 104 T5j = VSUB(T44, T45);
Chris@42 105 T26 = VSUB(T22, T25);
Chris@42 106 T2U = VADD(T22, T25);
Chris@42 107 Ta = VADD(T8, T9);
Chris@42 108 T48 = VSUB(T8, T9);
Chris@42 109 Tk = VADD(Ti, Tj);
Chris@42 110 T4c = VSUB(Tj, Ti);
Chris@42 111 T5k = VFMA(LDK(KP707106781), T5j, T5i);
Chris@42 112 T6A = VFNMS(LDK(KP707106781), T5j, T5i);
Chris@42 113 T47 = VFMA(LDK(KP707106781), T46, T43);
Chris@42 114 T69 = VFNMS(LDK(KP707106781), T46, T43);
Chris@42 115 T2V = VADD(T2T, T2U);
Chris@42 116 T3z = VSUB(T2T, T2U);
Chris@42 117 T49 = VSUB(Tb, Tc);
Chris@42 118 Td = VADD(Tb, Tc);
Chris@42 119 }
Chris@42 120 Tf = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 121 Tg = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 122 }
Chris@42 123 {
Chris@42 124 V Te, T2W, T5l, T4a, Tq, Tt, Tv, Tw, T5m, T4d, Tl, T2X, Ty, Tz, To;
Chris@42 125 V Tp;
Chris@42 126 To = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 127 Tp = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 128 {
Chris@42 129 V Th, T4b, Tr, Ts;
Chris@42 130 Tr = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 131 Ts = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 132 Te = VSUB(Ta, Td);
Chris@42 133 T2W = VADD(Ta, Td);
Chris@42 134 T5l = VFMA(LDK(KP414213562), T48, T49);
Chris@42 135 T4a = VFNMS(LDK(KP414213562), T49, T48);
Chris@42 136 Th = VADD(Tf, Tg);
Chris@42 137 T4b = VSUB(Tf, Tg);
Chris@42 138 Tq = VADD(To, Tp);
Chris@42 139 T4g = VSUB(To, Tp);
Chris@42 140 T4l = VSUB(Tr, Ts);
Chris@42 141 Tt = VADD(Tr, Ts);
Chris@42 142 Tv = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 143 Tw = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 144 T5m = VFMA(LDK(KP414213562), T4b, T4c);
Chris@42 145 T4d = VFNMS(LDK(KP414213562), T4c, T4b);
Chris@42 146 Tl = VSUB(Th, Tk);
Chris@42 147 T2X = VADD(Th, Tk);
Chris@42 148 Ty = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 149 Tz = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 150 }
Chris@42 151 T3g = VADD(Tq, Tt);
Chris@42 152 Tu = VSUB(Tq, Tt);
Chris@42 153 Tx = VADD(Tv, Tw);
Chris@42 154 T4h = VSUB(Tv, Tw);
Chris@42 155 T6B = VSUB(T4a, T4d);
Chris@42 156 T4e = VADD(T4a, T4d);
Chris@42 157 T6a = VADD(T5l, T5m);
Chris@42 158 T5n = VSUB(T5l, T5m);
Chris@42 159 T3M = VSUB(T2W, T2X);
Chris@42 160 T2Y = VADD(T2W, T2X);
Chris@42 161 T27 = VSUB(Te, Tl);
Chris@42 162 Tm = VADD(Te, Tl);
Chris@42 163 TA = VADD(Ty, Tz);
Chris@42 164 T4i = VSUB(Ty, Tz);
Chris@42 165 }
Chris@42 166 }
Chris@42 167 }
Chris@42 168 {
Chris@42 169 V TK, T4p, T4u, T4k, T6d, T4n, T6c, TL, TN, TO, T3j, TJ, TF, TI;
Chris@42 170 {
Chris@42 171 V TD, TE, TG, TH;
Chris@42 172 TD = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 173 TE = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 174 TG = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 175 TH = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 176 TK = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 177 {
Chris@42 178 V T3h, TB, T4j, T4m;
Chris@42 179 T3h = VADD(Tx, TA);
Chris@42 180 TB = VSUB(Tx, TA);
Chris@42 181 T4j = VADD(T4h, T4i);
Chris@42 182 T4m = VSUB(T4h, T4i);
Chris@42 183 T4p = VSUB(TD, TE);
Chris@42 184 TF = VADD(TD, TE);
Chris@42 185 T4u = VSUB(TH, TG);
Chris@42 186 TI = VADD(TG, TH);
Chris@42 187 T3A = VSUB(T3g, T3h);
Chris@42 188 T3i = VADD(T3g, T3h);
Chris@42 189 T29 = VFMA(LDK(KP414213562), Tu, TB);
Chris@42 190 TC = VFNMS(LDK(KP414213562), TB, Tu);
Chris@42 191 T4k = VFMA(LDK(KP707106781), T4j, T4g);
Chris@42 192 T6d = VFNMS(LDK(KP707106781), T4j, T4g);
Chris@42 193 T4n = VFMA(LDK(KP707106781), T4m, T4l);
Chris@42 194 T6c = VFNMS(LDK(KP707106781), T4m, T4l);
Chris@42 195 TL = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 196 }
Chris@42 197 TN = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 198 TO = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 199 }
Chris@42 200 T3j = VADD(TF, TI);
Chris@42 201 TJ = VSUB(TF, TI);
Chris@42 202 {
Chris@42 203 V T3a, T1E, T52, T5b, T1x, T4Z, T6r, T6u, T5a, T1U, T55, T5c, T1L, T3b;
Chris@42 204 {
Chris@42 205 V T4V, T1t, T58, T1w, T1Q, T1T, T1I, T4Y, T59, T1J, T53, T1H;
Chris@42 206 {
Chris@42 207 V T1r, TM, T4r, TP, T4q, T1s, T1u, T1v;
Chris@42 208 T1r = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 209 T5p = VFMA(LDK(KP198912367), T4k, T4n);
Chris@42 210 T4o = VFNMS(LDK(KP198912367), T4n, T4k);
Chris@42 211 T6D = VFMA(LDK(KP668178637), T6c, T6d);
Chris@42 212 T6e = VFNMS(LDK(KP668178637), T6d, T6c);
Chris@42 213 TM = VADD(TK, TL);
Chris@42 214 T4r = VSUB(TK, TL);
Chris@42 215 TP = VADD(TN, TO);
Chris@42 216 T4q = VSUB(TN, TO);
Chris@42 217 T1s = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 218 T1u = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 219 T1v = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 220 {
Chris@42 221 V T1R, T4X, T6g, T4t, T6f, T4w, T1S, T1O, T1P;
Chris@42 222 T1O = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 223 T1P = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 224 T1R = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 225 {
Chris@42 226 V T3k, TQ, T4s, T4v;
Chris@42 227 T3k = VADD(TP, TM);
Chris@42 228 TQ = VSUB(TM, TP);
Chris@42 229 T4s = VADD(T4q, T4r);
Chris@42 230 T4v = VSUB(T4r, T4q);
Chris@42 231 T4V = VSUB(T1r, T1s);
Chris@42 232 T1t = VADD(T1r, T1s);
Chris@42 233 T58 = VSUB(T1v, T1u);
Chris@42 234 T1w = VADD(T1u, T1v);
Chris@42 235 T4X = VSUB(T1O, T1P);
Chris@42 236 T1Q = VADD(T1O, T1P);
Chris@42 237 T3l = VADD(T3j, T3k);
Chris@42 238 T3B = VSUB(T3j, T3k);
Chris@42 239 TR = VFNMS(LDK(KP414213562), TQ, TJ);
Chris@42 240 T2a = VFMA(LDK(KP414213562), TJ, TQ);
Chris@42 241 T6g = VFNMS(LDK(KP707106781), T4s, T4p);
Chris@42 242 T4t = VFMA(LDK(KP707106781), T4s, T4p);
Chris@42 243 T6f = VFNMS(LDK(KP707106781), T4v, T4u);
Chris@42 244 T4w = VFMA(LDK(KP707106781), T4v, T4u);
Chris@42 245 T1S = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 246 }
Chris@42 247 {
Chris@42 248 V T4W, T1A, T50, T51, T1D, T1F, T1G;
Chris@42 249 {
Chris@42 250 V T1y, T1z, T1B, T1C;
Chris@42 251 T1y = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 252 T1z = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 253 T1B = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 254 T1C = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 255 T4x = VFNMS(LDK(KP198912367), T4w, T4t);
Chris@42 256 T5q = VFMA(LDK(KP198912367), T4t, T4w);
Chris@42 257 T6h = VFNMS(LDK(KP668178637), T6g, T6f);
Chris@42 258 T6E = VFMA(LDK(KP668178637), T6f, T6g);
Chris@42 259 T4W = VSUB(T1R, T1S);
Chris@42 260 T1T = VADD(T1R, T1S);
Chris@42 261 T1A = VADD(T1y, T1z);
Chris@42 262 T50 = VSUB(T1y, T1z);
Chris@42 263 T51 = VSUB(T1C, T1B);
Chris@42 264 T1D = VADD(T1B, T1C);
Chris@42 265 }
Chris@42 266 T1F = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 267 T1G = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 268 T1I = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 269 T4Y = VADD(T4W, T4X);
Chris@42 270 T59 = VSUB(T4X, T4W);
Chris@42 271 T1J = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 272 T3a = VADD(T1A, T1D);
Chris@42 273 T1E = VSUB(T1A, T1D);
Chris@42 274 T52 = VFMA(LDK(KP414213562), T51, T50);
Chris@42 275 T5b = VFNMS(LDK(KP414213562), T50, T51);
Chris@42 276 T53 = VSUB(T1F, T1G);
Chris@42 277 T1H = VADD(T1F, T1G);
Chris@42 278 }
Chris@42 279 }
Chris@42 280 }
Chris@42 281 {
Chris@42 282 V T37, T54, T1K, T38;
Chris@42 283 T1x = VSUB(T1t, T1w);
Chris@42 284 T37 = VADD(T1t, T1w);
Chris@42 285 T4Z = VFMA(LDK(KP707106781), T4Y, T4V);
Chris@42 286 T6r = VFNMS(LDK(KP707106781), T4Y, T4V);
Chris@42 287 T54 = VSUB(T1J, T1I);
Chris@42 288 T1K = VADD(T1I, T1J);
Chris@42 289 T6u = VFNMS(LDK(KP707106781), T59, T58);
Chris@42 290 T5a = VFMA(LDK(KP707106781), T59, T58);
Chris@42 291 T38 = VADD(T1T, T1Q);
Chris@42 292 T1U = VSUB(T1Q, T1T);
Chris@42 293 T55 = VFNMS(LDK(KP414213562), T54, T53);
Chris@42 294 T5c = VFMA(LDK(KP414213562), T53, T54);
Chris@42 295 T1L = VSUB(T1H, T1K);
Chris@42 296 T3b = VADD(T1H, T1K);
Chris@42 297 T39 = VADD(T37, T38);
Chris@42 298 T3H = VSUB(T37, T38);
Chris@42 299 }
Chris@42 300 }
Chris@42 301 {
Chris@42 302 V T4A, TW, T4N, TZ, T1j, T1m, T4O, T4D, T13, T4F, T16, T4G, T1a, T4I, T4J;
Chris@42 303 V T1d;
Chris@42 304 {
Chris@42 305 V TU, TV, TX, TY, T56, T6v;
Chris@42 306 TU = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 307 T56 = VADD(T52, T55);
Chris@42 308 T6v = VSUB(T55, T52);
Chris@42 309 {
Chris@42 310 V T5d, T6s, T1V, T1M;
Chris@42 311 T5d = VADD(T5b, T5c);
Chris@42 312 T6s = VSUB(T5c, T5b);
Chris@42 313 T1V = VSUB(T1L, T1E);
Chris@42 314 T1M = VADD(T1E, T1L);
Chris@42 315 T3I = VSUB(T3b, T3a);
Chris@42 316 T3c = VADD(T3a, T3b);
Chris@42 317 T5N = VFNMS(LDK(KP923879532), T56, T4Z);
Chris@42 318 T57 = VFMA(LDK(KP923879532), T56, T4Z);
Chris@42 319 T72 = VFNMS(LDK(KP923879532), T6v, T6u);
Chris@42 320 T6w = VFMA(LDK(KP923879532), T6v, T6u);
Chris@42 321 T5O = VFNMS(LDK(KP923879532), T5d, T5a);
Chris@42 322 T5e = VFMA(LDK(KP923879532), T5d, T5a);
Chris@42 323 T71 = VFMA(LDK(KP923879532), T6s, T6r);
Chris@42 324 T6t = VFNMS(LDK(KP923879532), T6s, T6r);
Chris@42 325 T2y = VFNMS(LDK(KP707106781), T1V, T1U);
Chris@42 326 T1W = VFMA(LDK(KP707106781), T1V, T1U);
Chris@42 327 T2x = VFNMS(LDK(KP707106781), T1M, T1x);
Chris@42 328 T1N = VFMA(LDK(KP707106781), T1M, T1x);
Chris@42 329 TV = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 330 }
Chris@42 331 TX = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 332 TY = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 333 {
Chris@42 334 V T1h, T1i, T1k, T1l;
Chris@42 335 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 336 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 337 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 338 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 339 {
Chris@42 340 V T11, T4B, T4C, T12, T14, T15;
Chris@42 341 T11 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 342 T4A = VSUB(TU, TV);
Chris@42 343 TW = VADD(TU, TV);
Chris@42 344 T4N = VSUB(TX, TY);
Chris@42 345 TZ = VADD(TX, TY);
Chris@42 346 T1j = VADD(T1h, T1i);
Chris@42 347 T4B = VSUB(T1h, T1i);
Chris@42 348 T1m = VADD(T1k, T1l);
Chris@42 349 T4C = VSUB(T1k, T1l);
Chris@42 350 T12 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 351 T14 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 352 T15 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 353 {
Chris@42 354 V T18, T19, T1b, T1c;
Chris@42 355 T18 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 356 T19 = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 357 T1b = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 358 T1c = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 359 T4O = VSUB(T4B, T4C);
Chris@42 360 T4D = VADD(T4B, T4C);
Chris@42 361 T13 = VADD(T11, T12);
Chris@42 362 T4F = VSUB(T11, T12);
Chris@42 363 T16 = VADD(T14, T15);
Chris@42 364 T4G = VSUB(T14, T15);
Chris@42 365 T1a = VADD(T18, T19);
Chris@42 366 T4I = VSUB(T18, T19);
Chris@42 367 T4J = VSUB(T1b, T1c);
Chris@42 368 T1d = VADD(T1b, T1c);
Chris@42 369 }
Chris@42 370 }
Chris@42 371 }
Chris@42 372 }
Chris@42 373 {
Chris@42 374 V T30, T10, T6k, T4E, T4Q, T4H, T17, T6n, T4P, T1e, T4K, T4R, T1n, T31;
Chris@42 375 T30 = VADD(TW, TZ);
Chris@42 376 T10 = VSUB(TW, TZ);
Chris@42 377 T6k = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@42 378 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@42 379 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
Chris@42 380 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@42 381 T33 = VADD(T13, T16);
Chris@42 382 T17 = VSUB(T13, T16);
Chris@42 383 T6n = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@42 384 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@42 385 T34 = VADD(T1a, T1d);
Chris@42 386 T1e = VSUB(T1a, T1d);
Chris@42 387 T4K = VFMA(LDK(KP414213562), T4J, T4I);
Chris@42 388 T4R = VFNMS(LDK(KP414213562), T4I, T4J);
Chris@42 389 T1n = VSUB(T1j, T1m);
Chris@42 390 T31 = VADD(T1j, T1m);
Chris@42 391 {
Chris@42 392 V T1f, T1o, T6o, T4L, T4S, T6l;
Chris@42 393 T1f = VADD(T17, T1e);
Chris@42 394 T1o = VSUB(T17, T1e);
Chris@42 395 T6o = VSUB(T4H, T4K);
Chris@42 396 T4L = VADD(T4H, T4K);
Chris@42 397 T4S = VADD(T4Q, T4R);
Chris@42 398 T6l = VSUB(T4Q, T4R);
Chris@42 399 T3E = VSUB(T30, T31);
Chris@42 400 T32 = VADD(T30, T31);
Chris@42 401 T1p = VFMA(LDK(KP707106781), T1o, T1n);
Chris@42 402 T2v = VFNMS(LDK(KP707106781), T1o, T1n);
Chris@42 403 T1g = VFMA(LDK(KP707106781), T1f, T10);
Chris@42 404 T2u = VFNMS(LDK(KP707106781), T1f, T10);
Chris@42 405 T4M = VFMA(LDK(KP923879532), T4L, T4E);
Chris@42 406 T5K = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@42 407 T6p = VFMA(LDK(KP923879532), T6o, T6n);
Chris@42 408 T6Z = VFNMS(LDK(KP923879532), T6o, T6n);
Chris@42 409 T6m = VFNMS(LDK(KP923879532), T6l, T6k);
Chris@42 410 T6Y = VFMA(LDK(KP923879532), T6l, T6k);
Chris@42 411 T5L = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@42 412 T4T = VFMA(LDK(KP923879532), T4S, T4P);
Chris@42 413 }
Chris@42 414 }
Chris@42 415 }
Chris@42 416 }
Chris@42 417 }
Chris@42 418 }
Chris@42 419 {
Chris@42 420 V T6b, T6F, T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7f;
Chris@42 421 V T6X, T70, T79, T7a, T73, T6C, T76, T77, T6i;
Chris@42 422 {
Chris@42 423 V T2Z, T3r, T3s, T3m, T3d, T3v;
Chris@42 424 T2Z = VSUB(T2V, T2Y);
Chris@42 425 T3r = VADD(T2V, T2Y);
Chris@42 426 T3s = VADD(T3i, T3l);
Chris@42 427 T3m = VSUB(T3i, T3l);
Chris@42 428 T3d = VSUB(T39, T3c);
Chris@42 429 T3v = VADD(T39, T3c);
Chris@42 430 {
Chris@42 431 V T3x, T3t, T3Q, T3J, T3D, T3V, T3G, T3P, T3u, T36, T3O, T3Y, T6V, T6W;
Chris@42 432 {
Chris@42 433 V T3N, T3C, T3F, T35;
Chris@42 434 T3N = VSUB(T3A, T3B);
Chris@42 435 T3C = VADD(T3A, T3B);
Chris@42 436 T3F = VSUB(T33, T34);
Chris@42 437 T35 = VADD(T33, T34);
Chris@42 438 T3x = VADD(T3r, T3s);
Chris@42 439 T3t = VSUB(T3r, T3s);
Chris@42 440 T3Q = VFMA(LDK(KP414213562), T3H, T3I);
Chris@42 441 T3J = VFNMS(LDK(KP414213562), T3I, T3H);
Chris@42 442 T3D = VFMA(LDK(KP707106781), T3C, T3z);
Chris@42 443 T3V = VFNMS(LDK(KP707106781), T3C, T3z);
Chris@42 444 T3G = VFNMS(LDK(KP414213562), T3F, T3E);
Chris@42 445 T3P = VFMA(LDK(KP414213562), T3E, T3F);
Chris@42 446 T3u = VADD(T32, T35);
Chris@42 447 T36 = VSUB(T32, T35);
Chris@42 448 T3O = VFMA(LDK(KP707106781), T3N, T3M);
Chris@42 449 T3Y = VFNMS(LDK(KP707106781), T3N, T3M);
Chris@42 450 }
Chris@42 451 T6b = VFNMS(LDK(KP923879532), T6a, T69);
Chris@42 452 T6V = VFMA(LDK(KP923879532), T6a, T69);
Chris@42 453 T6W = VADD(T6D, T6E);
Chris@42 454 T6F = VSUB(T6D, T6E);
Chris@42 455 {
Chris@42 456 V T3R, T3W, T3K, T3Z;
Chris@42 457 T3R = VSUB(T3P, T3Q);
Chris@42 458 T3W = VADD(T3P, T3Q);
Chris@42 459 T3K = VADD(T3G, T3J);
Chris@42 460 T3Z = VSUB(T3G, T3J);
Chris@42 461 {
Chris@42 462 V T3e, T3n, T3w, T3y;
Chris@42 463 T3e = VADD(T36, T3d);
Chris@42 464 T3n = VSUB(T36, T3d);
Chris@42 465 T3w = VSUB(T3u, T3v);
Chris@42 466 T3y = VADD(T3u, T3v);
Chris@42 467 {
Chris@42 468 V T41, T3X, T3S, T3U;
Chris@42 469 T41 = VFMA(LDK(KP923879532), T3W, T3V);
Chris@42 470 T3X = VFNMS(LDK(KP923879532), T3W, T3V);
Chris@42 471 T3S = VFNMS(LDK(KP923879532), T3R, T3O);
Chris@42 472 T3U = VFMA(LDK(KP923879532), T3R, T3O);
Chris@42 473 {
Chris@42 474 V T42, T40, T3L, T3T;
Chris@42 475 T42 = VFNMS(LDK(KP923879532), T3Z, T3Y);
Chris@42 476 T40 = VFMA(LDK(KP923879532), T3Z, T3Y);
Chris@42 477 T3L = VFNMS(LDK(KP923879532), T3K, T3D);
Chris@42 478 T3T = VFMA(LDK(KP923879532), T3K, T3D);
Chris@42 479 {
Chris@42 480 V T3o, T3q, T3f, T3p;
Chris@42 481 T3o = VFNMS(LDK(KP707106781), T3n, T3m);
Chris@42 482 T3q = VFMA(LDK(KP707106781), T3n, T3m);
Chris@42 483 T3f = VFNMS(LDK(KP707106781), T3e, T2Z);
Chris@42 484 T3p = VFMA(LDK(KP707106781), T3e, T2Z);
Chris@42 485 T7n = VSUB(T3x, T3y);
Chris@42 486 STM2(&(xo[64]), T7n, ovs, &(xo[0]));
Chris@42 487 T7o = VADD(T3x, T3y);
Chris@42 488 STM2(&(xo[0]), T7o, ovs, &(xo[0]));
Chris@42 489 T7p = VFMAI(T3w, T3t);
Chris@42 490 STM2(&(xo[32]), T7p, ovs, &(xo[0]));
Chris@42 491 T7q = VFNMSI(T3w, T3t);
Chris@42 492 STM2(&(xo[96]), T7q, ovs, &(xo[0]));
Chris@42 493 T7r = VFNMSI(T40, T3X);
Chris@42 494 STM2(&(xo[88]), T7r, ovs, &(xo[0]));
Chris@42 495 T7s = VFMAI(T40, T3X);
Chris@42 496 STM2(&(xo[40]), T7s, ovs, &(xo[0]));
Chris@42 497 T7t = VFMAI(T42, T41);
Chris@42 498 STM2(&(xo[104]), T7t, ovs, &(xo[0]));
Chris@42 499 T7u = VFNMSI(T42, T41);
Chris@42 500 STM2(&(xo[24]), T7u, ovs, &(xo[0]));
Chris@42 501 T7v = VFMAI(T3U, T3T);
Chris@42 502 STM2(&(xo[8]), T7v, ovs, &(xo[0]));
Chris@42 503 T7w = VFNMSI(T3U, T3T);
Chris@42 504 STM2(&(xo[120]), T7w, ovs, &(xo[0]));
Chris@42 505 T7x = VFMAI(T3S, T3L);
Chris@42 506 STM2(&(xo[72]), T7x, ovs, &(xo[0]));
Chris@42 507 T7y = VFNMSI(T3S, T3L);
Chris@42 508 STM2(&(xo[56]), T7y, ovs, &(xo[0]));
Chris@42 509 T7z = VFNMSI(T3q, T3p);
Chris@42 510 STM2(&(xo[112]), T7z, ovs, &(xo[0]));
Chris@42 511 T7A = VFMAI(T3q, T3p);
Chris@42 512 STM2(&(xo[16]), T7A, ovs, &(xo[0]));
Chris@42 513 T7B = VFMAI(T3o, T3f);
Chris@42 514 STM2(&(xo[80]), T7B, ovs, &(xo[0]));
Chris@42 515 T7C = VFNMSI(T3o, T3f);
Chris@42 516 STM2(&(xo[48]), T7C, ovs, &(xo[0]));
Chris@42 517 T7f = VFNMS(LDK(KP831469612), T6W, T6V);
Chris@42 518 T6X = VFMA(LDK(KP831469612), T6W, T6V);
Chris@42 519 }
Chris@42 520 }
Chris@42 521 }
Chris@42 522 }
Chris@42 523 }
Chris@42 524 T70 = VFMA(LDK(KP303346683), T6Z, T6Y);
Chris@42 525 T79 = VFNMS(LDK(KP303346683), T6Y, T6Z);
Chris@42 526 T7a = VFNMS(LDK(KP303346683), T71, T72);
Chris@42 527 T73 = VFMA(LDK(KP303346683), T72, T71);
Chris@42 528 T6C = VFMA(LDK(KP923879532), T6B, T6A);
Chris@42 529 T76 = VFNMS(LDK(KP923879532), T6B, T6A);
Chris@42 530 T77 = VSUB(T6e, T6h);
Chris@42 531 T6i = VADD(T6e, T6h);
Chris@42 532 }
Chris@42 533 }
Chris@42 534 {
Chris@42 535 V T2r, T2D, T2C, T2s, T5H, T5o, T5v, T5D, T7L, T7O, T7Q, T7S, T5r, T5I, T5x;
Chris@42 536 V T5h, T5F, T5B;
Chris@42 537 {
Chris@42 538 V TT, T2f, T7E, T7F, T7I, T7K, T2n, T1Y, T28, T2b, T2l, T2p, T2j, T2k;
Chris@42 539 {
Chris@42 540 V T1q, T2d, T7h, T7l, T2e, T1X, T75, T7d, T7m, T7k, T7c, T7e, Tn, TS;
Chris@42 541 T2r = VFNMS(LDK(KP707106781), Tm, T7);
Chris@42 542 Tn = VFMA(LDK(KP707106781), Tm, T7);
Chris@42 543 TS = VADD(TC, TR);
Chris@42 544 T2D = VSUB(TC, TR);
Chris@42 545 {
Chris@42 546 V T7b, T7j, T74, T7i, T78, T7g;
Chris@42 547 T1q = VFNMS(LDK(KP198912367), T1p, T1g);
Chris@42 548 T2d = VFMA(LDK(KP198912367), T1g, T1p);
Chris@42 549 T7g = VADD(T79, T7a);
Chris@42 550 T7b = VSUB(T79, T7a);
Chris@42 551 T7j = VSUB(T70, T73);
Chris@42 552 T74 = VADD(T70, T73);
Chris@42 553 T7i = VFNMS(LDK(KP831469612), T77, T76);
Chris@42 554 T78 = VFMA(LDK(KP831469612), T77, T76);
Chris@42 555 T2j = VFNMS(LDK(KP923879532), TS, Tn);
Chris@42 556 TT = VFMA(LDK(KP923879532), TS, Tn);
Chris@42 557 T7h = VFMA(LDK(KP956940335), T7g, T7f);
Chris@42 558 T7l = VFNMS(LDK(KP956940335), T7g, T7f);
Chris@42 559 T2e = VFMA(LDK(KP198912367), T1N, T1W);
Chris@42 560 T1X = VFNMS(LDK(KP198912367), T1W, T1N);
Chris@42 561 T75 = VFNMS(LDK(KP956940335), T74, T6X);
Chris@42 562 T7d = VFMA(LDK(KP956940335), T74, T6X);
Chris@42 563 T7m = VFMA(LDK(KP956940335), T7j, T7i);
Chris@42 564 T7k = VFNMS(LDK(KP956940335), T7j, T7i);
Chris@42 565 T7c = VFNMS(LDK(KP956940335), T7b, T78);
Chris@42 566 T7e = VFMA(LDK(KP956940335), T7b, T78);
Chris@42 567 }
Chris@42 568 T2k = VADD(T2d, T2e);
Chris@42 569 T2f = VSUB(T2d, T2e);
Chris@42 570 {
Chris@42 571 V T7D, T7G, T7H, T7J;
Chris@42 572 T7D = VFMAI(T7k, T7h);
Chris@42 573 STM2(&(xo[90]), T7D, ovs, &(xo[2]));
Chris@42 574 STN2(&(xo[88]), T7r, T7D, ovs);
Chris@42 575 T7E = VFNMSI(T7k, T7h);
Chris@42 576 STM2(&(xo[38]), T7E, ovs, &(xo[2]));
Chris@42 577 T7F = VFNMSI(T7m, T7l);
Chris@42 578 STM2(&(xo[102]), T7F, ovs, &(xo[2]));
Chris@42 579 T7G = VFMAI(T7m, T7l);
Chris@42 580 STM2(&(xo[26]), T7G, ovs, &(xo[2]));
Chris@42 581 STN2(&(xo[24]), T7u, T7G, ovs);
Chris@42 582 T7H = VFMAI(T7e, T7d);
Chris@42 583 STM2(&(xo[122]), T7H, ovs, &(xo[2]));
Chris@42 584 STN2(&(xo[120]), T7w, T7H, ovs);
Chris@42 585 T7I = VFNMSI(T7e, T7d);
Chris@42 586 STM2(&(xo[6]), T7I, ovs, &(xo[2]));
Chris@42 587 T7J = VFMAI(T7c, T75);
Chris@42 588 STM2(&(xo[58]), T7J, ovs, &(xo[2]));
Chris@42 589 STN2(&(xo[56]), T7y, T7J, ovs);
Chris@42 590 T7K = VFNMSI(T7c, T75);
Chris@42 591 STM2(&(xo[70]), T7K, ovs, &(xo[2]));
Chris@42 592 T2n = VSUB(T1q, T1X);
Chris@42 593 T1Y = VADD(T1q, T1X);
Chris@42 594 }
Chris@42 595 T2C = VFNMS(LDK(KP707106781), T27, T26);
Chris@42 596 T28 = VFMA(LDK(KP707106781), T27, T26);
Chris@42 597 T2b = VSUB(T29, T2a);
Chris@42 598 T2s = VADD(T29, T2a);
Chris@42 599 }
Chris@42 600 T2l = VFNMS(LDK(KP980785280), T2k, T2j);
Chris@42 601 T2p = VFMA(LDK(KP980785280), T2k, T2j);
Chris@42 602 {
Chris@42 603 V T5z, T4z, T5A, T5g;
Chris@42 604 {
Chris@42 605 V T4f, T4y, T1Z, T2h, T4U, T5t, T2m, T2c, T5u, T5f;
Chris@42 606 T5H = VFNMS(LDK(KP923879532), T4e, T47);
Chris@42 607 T4f = VFMA(LDK(KP923879532), T4e, T47);
Chris@42 608 T4y = VADD(T4o, T4x);
Chris@42 609 T5T = VSUB(T4o, T4x);
Chris@42 610 T1Z = VFNMS(LDK(KP980785280), T1Y, TT);
Chris@42 611 T2h = VFMA(LDK(KP980785280), T1Y, TT);
Chris@42 612 T4U = VFNMS(LDK(KP098491403), T4T, T4M);
Chris@42 613 T5t = VFMA(LDK(KP098491403), T4M, T4T);
Chris@42 614 T2m = VFNMS(LDK(KP923879532), T2b, T28);
Chris@42 615 T2c = VFMA(LDK(KP923879532), T2b, T28);
Chris@42 616 T5u = VFMA(LDK(KP098491403), T57, T5e);
Chris@42 617 T5f = VFNMS(LDK(KP098491403), T5e, T57);
Chris@42 618 T5z = VFNMS(LDK(KP980785280), T4y, T4f);
Chris@42 619 T4z = VFMA(LDK(KP980785280), T4y, T4f);
Chris@42 620 T5S = VFNMS(LDK(KP923879532), T5n, T5k);
Chris@42 621 T5o = VFMA(LDK(KP923879532), T5n, T5k);
Chris@42 622 {
Chris@42 623 V T2o, T2q, T2i, T2g;
Chris@42 624 T2o = VFMA(LDK(KP980785280), T2n, T2m);
Chris@42 625 T2q = VFNMS(LDK(KP980785280), T2n, T2m);
Chris@42 626 T2i = VFMA(LDK(KP980785280), T2f, T2c);
Chris@42 627 T2g = VFNMS(LDK(KP980785280), T2f, T2c);
Chris@42 628 T5A = VADD(T5t, T5u);
Chris@42 629 T5v = VSUB(T5t, T5u);
Chris@42 630 T5D = VSUB(T4U, T5f);
Chris@42 631 T5g = VADD(T4U, T5f);
Chris@42 632 T7L = VFNMSI(T2o, T2l);
Chris@42 633 STM2(&(xo[92]), T7L, ovs, &(xo[0]));
Chris@42 634 {
Chris@42 635 V T7M, T7N, T7P, T7R;
Chris@42 636 T7M = VFMAI(T2o, T2l);
Chris@42 637 STM2(&(xo[36]), T7M, ovs, &(xo[0]));
Chris@42 638 STN2(&(xo[36]), T7M, T7E, ovs);
Chris@42 639 T7N = VFMAI(T2q, T2p);
Chris@42 640 STM2(&(xo[100]), T7N, ovs, &(xo[0]));
Chris@42 641 STN2(&(xo[100]), T7N, T7F, ovs);
Chris@42 642 T7O = VFNMSI(T2q, T2p);
Chris@42 643 STM2(&(xo[28]), T7O, ovs, &(xo[0]));
Chris@42 644 T7P = VFMAI(T2i, T2h);
Chris@42 645 STM2(&(xo[4]), T7P, ovs, &(xo[0]));
Chris@42 646 STN2(&(xo[4]), T7P, T7I, ovs);
Chris@42 647 T7Q = VFNMSI(T2i, T2h);
Chris@42 648 STM2(&(xo[124]), T7Q, ovs, &(xo[0]));
Chris@42 649 T7R = VFMAI(T2g, T1Z);
Chris@42 650 STM2(&(xo[68]), T7R, ovs, &(xo[0]));
Chris@42 651 STN2(&(xo[68]), T7R, T7K, ovs);
Chris@42 652 T7S = VFNMSI(T2g, T1Z);
Chris@42 653 STM2(&(xo[60]), T7S, ovs, &(xo[0]));
Chris@42 654 T5r = VSUB(T5p, T5q);
Chris@42 655 T5I = VADD(T5p, T5q);
Chris@42 656 }
Chris@42 657 }
Chris@42 658 }
Chris@42 659 T5x = VFMA(LDK(KP995184726), T5g, T4z);
Chris@42 660 T5h = VFNMS(LDK(KP995184726), T5g, T4z);
Chris@42 661 T5F = VFMA(LDK(KP995184726), T5A, T5z);
Chris@42 662 T5B = VFNMS(LDK(KP995184726), T5A, T5z);
Chris@42 663 }
Chris@42 664 }
Chris@42 665 {
Chris@42 666 V T6J, T6R, T6L, T6z, T6T, T6P;
Chris@42 667 {
Chris@42 668 V T6N, T6j, T6O, T6y;
Chris@42 669 {
Chris@42 670 V T6q, T6H, T5C, T5s, T6I, T6x;
Chris@42 671 T6q = VFNMS(LDK(KP534511135), T6p, T6m);
Chris@42 672 T6H = VFMA(LDK(KP534511135), T6m, T6p);
Chris@42 673 T5C = VFNMS(LDK(KP980785280), T5r, T5o);
Chris@42 674 T5s = VFMA(LDK(KP980785280), T5r, T5o);
Chris@42 675 T6I = VFMA(LDK(KP534511135), T6t, T6w);
Chris@42 676 T6x = VFNMS(LDK(KP534511135), T6w, T6t);
Chris@42 677 T6N = VFMA(LDK(KP831469612), T6i, T6b);
Chris@42 678 T6j = VFNMS(LDK(KP831469612), T6i, T6b);
Chris@42 679 {
Chris@42 680 V T5E, T5G, T5y, T5w;
Chris@42 681 T5E = VFMA(LDK(KP995184726), T5D, T5C);
Chris@42 682 T5G = VFNMS(LDK(KP995184726), T5D, T5C);
Chris@42 683 T5y = VFMA(LDK(KP995184726), T5v, T5s);
Chris@42 684 T5w = VFNMS(LDK(KP995184726), T5v, T5s);
Chris@42 685 T6O = VADD(T6H, T6I);
Chris@42 686 T6J = VSUB(T6H, T6I);
Chris@42 687 T6R = VSUB(T6q, T6x);
Chris@42 688 T6y = VADD(T6q, T6x);
Chris@42 689 {
Chris@42 690 V T7T, T7U, T7V, T7W;
Chris@42 691 T7T = VFNMSI(T5E, T5B);
Chris@42 692 STM2(&(xo[94]), T7T, ovs, &(xo[2]));
Chris@42 693 STN2(&(xo[92]), T7L, T7T, ovs);
Chris@42 694 T7U = VFMAI(T5E, T5B);
Chris@42 695 STM2(&(xo[34]), T7U, ovs, &(xo[2]));
Chris@42 696 STN2(&(xo[32]), T7p, T7U, ovs);
Chris@42 697 T7V = VFMAI(T5G, T5F);
Chris@42 698 STM2(&(xo[98]), T7V, ovs, &(xo[2]));
Chris@42 699 STN2(&(xo[96]), T7q, T7V, ovs);
Chris@42 700 T7W = VFNMSI(T5G, T5F);
Chris@42 701 STM2(&(xo[30]), T7W, ovs, &(xo[2]));
Chris@42 702 STN2(&(xo[28]), T7O, T7W, ovs);
Chris@42 703 {
Chris@42 704 V T7X, T7Y, T7Z, T80;
Chris@42 705 T7X = VFMAI(T5y, T5x);
Chris@42 706 STM2(&(xo[2]), T7X, ovs, &(xo[2]));
Chris@42 707 STN2(&(xo[0]), T7o, T7X, ovs);
Chris@42 708 T7Y = VFNMSI(T5y, T5x);
Chris@42 709 STM2(&(xo[126]), T7Y, ovs, &(xo[2]));
Chris@42 710 STN2(&(xo[124]), T7Q, T7Y, ovs);
Chris@42 711 T7Z = VFMAI(T5w, T5h);
Chris@42 712 STM2(&(xo[66]), T7Z, ovs, &(xo[2]));
Chris@42 713 STN2(&(xo[64]), T7n, T7Z, ovs);
Chris@42 714 T80 = VFNMSI(T5w, T5h);
Chris@42 715 STM2(&(xo[62]), T80, ovs, &(xo[2]));
Chris@42 716 STN2(&(xo[60]), T7S, T80, ovs);
Chris@42 717 }
Chris@42 718 }
Chris@42 719 }
Chris@42 720 }
Chris@42 721 T6L = VFMA(LDK(KP881921264), T6y, T6j);
Chris@42 722 T6z = VFNMS(LDK(KP881921264), T6y, T6j);
Chris@42 723 T6T = VFMA(LDK(KP881921264), T6O, T6N);
Chris@42 724 T6P = VFNMS(LDK(KP881921264), T6O, T6N);
Chris@42 725 }
Chris@42 726 {
Chris@42 727 V T2H, T2P, T81, T84, T86, T88, T2J, T2B, T2R, T2N;
Chris@42 728 {
Chris@42 729 V T2L, T2t, T2M, T2A;
Chris@42 730 {
Chris@42 731 V T2w, T2F, T6Q, T6G, T2G, T2z;
Chris@42 732 T2w = VFMA(LDK(KP668178637), T2v, T2u);
Chris@42 733 T2F = VFNMS(LDK(KP668178637), T2u, T2v);
Chris@42 734 T6Q = VFNMS(LDK(KP831469612), T6F, T6C);
Chris@42 735 T6G = VFMA(LDK(KP831469612), T6F, T6C);
Chris@42 736 T2G = VFNMS(LDK(KP668178637), T2x, T2y);
Chris@42 737 T2z = VFMA(LDK(KP668178637), T2y, T2x);
Chris@42 738 T2L = VFNMS(LDK(KP923879532), T2s, T2r);
Chris@42 739 T2t = VFMA(LDK(KP923879532), T2s, T2r);
Chris@42 740 {
Chris@42 741 V T6S, T6U, T6M, T6K;
Chris@42 742 T6S = VFMA(LDK(KP881921264), T6R, T6Q);
Chris@42 743 T6U = VFNMS(LDK(KP881921264), T6R, T6Q);
Chris@42 744 T6M = VFMA(LDK(KP881921264), T6J, T6G);
Chris@42 745 T6K = VFNMS(LDK(KP881921264), T6J, T6G);
Chris@42 746 T2M = VADD(T2F, T2G);
Chris@42 747 T2H = VSUB(T2F, T2G);
Chris@42 748 T2P = VSUB(T2w, T2z);
Chris@42 749 T2A = VADD(T2w, T2z);
Chris@42 750 T81 = VFNMSI(T6S, T6P);
Chris@42 751 STM2(&(xo[86]), T81, ovs, &(xo[2]));
Chris@42 752 {
Chris@42 753 V T82, T83, T85, T87;
Chris@42 754 T82 = VFMAI(T6S, T6P);
Chris@42 755 STM2(&(xo[42]), T82, ovs, &(xo[2]));
Chris@42 756 STN2(&(xo[40]), T7s, T82, ovs);
Chris@42 757 T83 = VFMAI(T6U, T6T);
Chris@42 758 STM2(&(xo[106]), T83, ovs, &(xo[2]));
Chris@42 759 STN2(&(xo[104]), T7t, T83, ovs);
Chris@42 760 T84 = VFNMSI(T6U, T6T);
Chris@42 761 STM2(&(xo[22]), T84, ovs, &(xo[2]));
Chris@42 762 T85 = VFMAI(T6M, T6L);
Chris@42 763 STM2(&(xo[10]), T85, ovs, &(xo[2]));
Chris@42 764 STN2(&(xo[8]), T7v, T85, ovs);
Chris@42 765 T86 = VFNMSI(T6M, T6L);
Chris@42 766 STM2(&(xo[118]), T86, ovs, &(xo[2]));
Chris@42 767 T87 = VFMAI(T6K, T6z);
Chris@42 768 STM2(&(xo[74]), T87, ovs, &(xo[2]));
Chris@42 769 STN2(&(xo[72]), T7x, T87, ovs);
Chris@42 770 T88 = VFNMSI(T6K, T6z);
Chris@42 771 STM2(&(xo[54]), T88, ovs, &(xo[2]));
Chris@42 772 }
Chris@42 773 }
Chris@42 774 }
Chris@42 775 T2J = VFMA(LDK(KP831469612), T2A, T2t);
Chris@42 776 T2B = VFNMS(LDK(KP831469612), T2A, T2t);
Chris@42 777 T2R = VFNMS(LDK(KP831469612), T2M, T2L);
Chris@42 778 T2N = VFMA(LDK(KP831469612), T2M, T2L);
Chris@42 779 }
Chris@42 780 {
Chris@42 781 V T61, T5J, T62, T5Q;
Chris@42 782 {
Chris@42 783 V T5M, T5V, T2O, T2E, T5W, T5P;
Chris@42 784 T5M = VFMA(LDK(KP820678790), T5L, T5K);
Chris@42 785 T5V = VFNMS(LDK(KP820678790), T5K, T5L);
Chris@42 786 T2O = VFMA(LDK(KP923879532), T2D, T2C);
Chris@42 787 T2E = VFNMS(LDK(KP923879532), T2D, T2C);
Chris@42 788 T5W = VFNMS(LDK(KP820678790), T5N, T5O);
Chris@42 789 T5P = VFMA(LDK(KP820678790), T5O, T5N);
Chris@42 790 T61 = VFNMS(LDK(KP980785280), T5I, T5H);
Chris@42 791 T5J = VFMA(LDK(KP980785280), T5I, T5H);
Chris@42 792 {
Chris@42 793 V T2Q, T2S, T2K, T2I;
Chris@42 794 T2Q = VFNMS(LDK(KP831469612), T2P, T2O);
Chris@42 795 T2S = VFMA(LDK(KP831469612), T2P, T2O);
Chris@42 796 T2K = VFMA(LDK(KP831469612), T2H, T2E);
Chris@42 797 T2I = VFNMS(LDK(KP831469612), T2H, T2E);
Chris@42 798 T62 = VADD(T5V, T5W);
Chris@42 799 T5X = VSUB(T5V, T5W);
Chris@42 800 T65 = VSUB(T5M, T5P);
Chris@42 801 T5Q = VADD(T5M, T5P);
Chris@42 802 {
Chris@42 803 V T89, T8c, T8d, T8f;
Chris@42 804 T89 = VFMAI(T2Q, T2N);
Chris@42 805 STM2(&(xo[84]), T89, ovs, &(xo[0]));
Chris@42 806 STN2(&(xo[84]), T89, T81, ovs);
Chris@42 807 T8a = VFNMSI(T2Q, T2N);
Chris@42 808 STM2(&(xo[44]), T8a, ovs, &(xo[0]));
Chris@42 809 T8b = VFNMSI(T2S, T2R);
Chris@42 810 STM2(&(xo[108]), T8b, ovs, &(xo[0]));
Chris@42 811 T8c = VFMAI(T2S, T2R);
Chris@42 812 STM2(&(xo[20]), T8c, ovs, &(xo[0]));
Chris@42 813 STN2(&(xo[20]), T8c, T84, ovs);
Chris@42 814 T8d = VFMAI(T2K, T2J);
Chris@42 815 STM2(&(xo[116]), T8d, ovs, &(xo[0]));
Chris@42 816 STN2(&(xo[116]), T8d, T86, ovs);
Chris@42 817 T8e = VFNMSI(T2K, T2J);
Chris@42 818 STM2(&(xo[12]), T8e, ovs, &(xo[0]));
Chris@42 819 T8f = VFMAI(T2I, T2B);
Chris@42 820 STM2(&(xo[52]), T8f, ovs, &(xo[0]));
Chris@42 821 STN2(&(xo[52]), T8f, T88, ovs);
Chris@42 822 T8g = VFNMSI(T2I, T2B);
Chris@42 823 STM2(&(xo[76]), T8g, ovs, &(xo[0]));
Chris@42 824 }
Chris@42 825 }
Chris@42 826 }
Chris@42 827 T5Z = VFMA(LDK(KP773010453), T5Q, T5J);
Chris@42 828 T5R = VFNMS(LDK(KP773010453), T5Q, T5J);
Chris@42 829 T67 = VFNMS(LDK(KP773010453), T62, T61);
Chris@42 830 T63 = VFMA(LDK(KP773010453), T62, T61);
Chris@42 831 }
Chris@42 832 }
Chris@42 833 }
Chris@42 834 }
Chris@42 835 }
Chris@42 836 }
Chris@42 837 T5U = VFNMS(LDK(KP980785280), T5T, T5S);
Chris@42 838 T64 = VFMA(LDK(KP980785280), T5T, T5S);
Chris@42 839 {
Chris@42 840 V T68, T66, T5Y, T60;
Chris@42 841 T68 = VFMA(LDK(KP773010453), T65, T64);
Chris@42 842 T66 = VFNMS(LDK(KP773010453), T65, T64);
Chris@42 843 T5Y = VFNMS(LDK(KP773010453), T5X, T5U);
Chris@42 844 T60 = VFMA(LDK(KP773010453), T5X, T5U);
Chris@42 845 {
Chris@42 846 V T8h, T8i, T8j, T8k;
Chris@42 847 T8h = VFMAI(T66, T63);
Chris@42 848 STM2(&(xo[82]), T8h, ovs, &(xo[2]));
Chris@42 849 STN2(&(xo[80]), T7B, T8h, ovs);
Chris@42 850 T8i = VFNMSI(T66, T63);
Chris@42 851 STM2(&(xo[46]), T8i, ovs, &(xo[2]));
Chris@42 852 STN2(&(xo[44]), T8a, T8i, ovs);
Chris@42 853 T8j = VFNMSI(T68, T67);
Chris@42 854 STM2(&(xo[110]), T8j, ovs, &(xo[2]));
Chris@42 855 STN2(&(xo[108]), T8b, T8j, ovs);
Chris@42 856 T8k = VFMAI(T68, T67);
Chris@42 857 STM2(&(xo[18]), T8k, ovs, &(xo[2]));
Chris@42 858 STN2(&(xo[16]), T7A, T8k, ovs);
Chris@42 859 {
Chris@42 860 V T8l, T8m, T8n, T8o;
Chris@42 861 T8l = VFMAI(T60, T5Z);
Chris@42 862 STM2(&(xo[114]), T8l, ovs, &(xo[2]));
Chris@42 863 STN2(&(xo[112]), T7z, T8l, ovs);
Chris@42 864 T8m = VFNMSI(T60, T5Z);
Chris@42 865 STM2(&(xo[14]), T8m, ovs, &(xo[2]));
Chris@42 866 STN2(&(xo[12]), T8e, T8m, ovs);
Chris@42 867 T8n = VFMAI(T5Y, T5R);
Chris@42 868 STM2(&(xo[50]), T8n, ovs, &(xo[2]));
Chris@42 869 STN2(&(xo[48]), T7C, T8n, ovs);
Chris@42 870 T8o = VFNMSI(T5Y, T5R);
Chris@42 871 STM2(&(xo[78]), T8o, ovs, &(xo[2]));
Chris@42 872 STN2(&(xo[76]), T8g, T8o, ovs);
Chris@42 873 }
Chris@42 874 }
Chris@42 875 }
Chris@42 876 }
Chris@42 877 }
Chris@42 878 VLEAVE();
Chris@42 879 }
Chris@42 880
Chris@42 881 static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {198, 0, 258, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 882
Chris@42 883 void XSIMD(codelet_n2bv_64) (planner *p) {
Chris@42 884 X(kdft_register) (p, n2bv_64, &desc);
Chris@42 885 }
Chris@42 886
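For context, this codelet is never called directly: XSIMD(codelet_n2bv_64) hands n2bv_64 and its descriptor to the planner through X(kdft_register), and the planner may select it when a matching size-64 backward plan is built. The following is only a minimal sketch, assuming nothing beyond the standard FFTW 3 double-precision public API; whether this particular codelet ends up in the plan depends on how the library was configured (SIMD/FMA support) and on the planning flags.

    #include <fftw3.h>

    int main(void)
    {
        fftw_complex *in  = fftw_malloc(sizeof(fftw_complex) * 64);
        fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * 64);

        /* 64-point backward (sign = +1) complex DFT -- the size and
           direction that this codelet implements. */
        fftw_plan p = fftw_plan_dft_1d(64, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

        /* ... fill `in` with data ... */
        fftw_execute(p);

        fftw_destroy_plan(p);
        fftw_free(in);
        fftw_free(out);
        return 0;
    }

Build with -lfftw3; the planner is free to satisfy the plan with this size-64 no-twiddle codelet or with some other decomposition.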
Chris@42 887 #else /* HAVE_FMA */
Chris@42 888
Chris@42 889 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 64 -name n2bv_64 -with-ostride 2 -include n2b.h -store-multiple 2 */
Chris@42 890
Chris@42 891 /*
Chris@42 892 * This function contains 456 FP additions, 124 FP multiplications,
Chris@42 893 * (or, 404 additions, 72 multiplications, 52 fused multiply/add),
Chris@42 894 * 128 stack variables, 15 constants, and 160 memory accesses
Chris@42 895 */
Chris@42 896 #include "n2b.h"
Chris@42 897
Chris@42 898 static void n2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 899 {
Chris@42 900 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@42 901 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@42 902 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@42 903 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@42 904 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@42 905 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@42 906 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@42 907 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@42 908 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 909 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 910 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 911 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 912 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 913 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 914 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 915 {
Chris@42 916 INT i;
Chris@42 917 const R *xi;
Chris@42 918 R *xo;
Chris@42 919 xi = ii;
Chris@42 920 xo = io;
Chris@42 921 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@42 922 V T4p, T5u, Tb, T3A, T2q, T3v, T6G, T78, Tq, T3w, T6B, T79, T2l, T3B, T4w;
Chris@42 923 V T5r, TI, T2g, T6u, T74, T3q, T3D, T4E, T5o, TZ, T2h, T6x, T75, T3t, T3E;
Chris@42 924 V T4L, T5p, T23, T2N, T6m, T70, T6p, T71, T2c, T2O, T3i, T3Y, T5f, T5R, T5k;
Chris@42 925 V T5S, T3l, T3Z, T1s, T2K, T6f, T6X, T6i, T6Y, T1B, T2L, T3b, T3V, T4Y, T5O;
Chris@42 926 V T53, T5P, T3e, T3W;
Chris@42 927 {
Chris@42 928 V T3, T4n, T2p, T4o, T6, T5s, T9, T5t;
Chris@42 929 {
Chris@42 930 V T1, T2, T2n, T2o;
Chris@42 931 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 932 T2 = LD(&(xi[WS(is, 32)]), ivs, &(xi[0]));
Chris@42 933 T3 = VSUB(T1, T2);
Chris@42 934 T4n = VADD(T1, T2);
Chris@42 935 T2n = LD(&(xi[WS(is, 16)]), ivs, &(xi[0]));
Chris@42 936 T2o = LD(&(xi[WS(is, 48)]), ivs, &(xi[0]));
Chris@42 937 T2p = VSUB(T2n, T2o);
Chris@42 938 T4o = VADD(T2n, T2o);
Chris@42 939 }
Chris@42 940 {
Chris@42 941 V T4, T5, T7, T8;
Chris@42 942 T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 943 T5 = LD(&(xi[WS(is, 40)]), ivs, &(xi[0]));
Chris@42 944 T6 = VSUB(T4, T5);
Chris@42 945 T5s = VADD(T4, T5);
Chris@42 946 T7 = LD(&(xi[WS(is, 56)]), ivs, &(xi[0]));
Chris@42 947 T8 = LD(&(xi[WS(is, 24)]), ivs, &(xi[0]));
Chris@42 948 T9 = VSUB(T7, T8);
Chris@42 949 T5t = VADD(T7, T8);
Chris@42 950 }
Chris@42 951 T4p = VSUB(T4n, T4o);
Chris@42 952 T5u = VSUB(T5s, T5t);
Chris@42 953 {
Chris@42 954 V Ta, T2m, T6E, T6F;
Chris@42 955 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@42 956 Tb = VSUB(T3, Ta);
Chris@42 957 T3A = VADD(T3, Ta);
Chris@42 958 T2m = VMUL(LDK(KP707106781), VSUB(T6, T9));
Chris@42 959 T2q = VSUB(T2m, T2p);
Chris@42 960 T3v = VADD(T2p, T2m);
Chris@42 961 T6E = VADD(T4n, T4o);
Chris@42 962 T6F = VADD(T5s, T5t);
Chris@42 963 T6G = VSUB(T6E, T6F);
Chris@42 964 T78 = VADD(T6E, T6F);
Chris@42 965 }
Chris@42 966 }
Chris@42 967 {
Chris@42 968 V Te, T4q, To, T4t, Th, T4r, Tl, T4u;
Chris@42 969 {
Chris@42 970 V Tc, Td, Tm, Tn;
Chris@42 971 Tc = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 972 Td = LD(&(xi[WS(is, 36)]), ivs, &(xi[0]));
Chris@42 973 Te = VSUB(Tc, Td);
Chris@42 974 T4q = VADD(Tc, Td);
Chris@42 975 Tm = LD(&(xi[WS(is, 60)]), ivs, &(xi[0]));
Chris@42 976 Tn = LD(&(xi[WS(is, 28)]), ivs, &(xi[0]));
Chris@42 977 To = VSUB(Tm, Tn);
Chris@42 978 T4t = VADD(Tm, Tn);
Chris@42 979 }
Chris@42 980 {
Chris@42 981 V Tf, Tg, Tj, Tk;
Chris@42 982 Tf = LD(&(xi[WS(is, 20)]), ivs, &(xi[0]));
Chris@42 983 Tg = LD(&(xi[WS(is, 52)]), ivs, &(xi[0]));
Chris@42 984 Th = VSUB(Tf, Tg);
Chris@42 985 T4r = VADD(Tf, Tg);
Chris@42 986 Tj = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 987 Tk = LD(&(xi[WS(is, 44)]), ivs, &(xi[0]));
Chris@42 988 Tl = VSUB(Tj, Tk);
Chris@42 989 T4u = VADD(Tj, Tk);
Chris@42 990 }
Chris@42 991 {
Chris@42 992 V Ti, Tp, T6z, T6A;
Chris@42 993 Ti = VFMA(LDK(KP382683432), Te, VMUL(LDK(KP923879532), Th));
Chris@42 994 Tp = VFNMS(LDK(KP382683432), To, VMUL(LDK(KP923879532), Tl));
Chris@42 995 Tq = VSUB(Ti, Tp);
Chris@42 996 T3w = VADD(Ti, Tp);
Chris@42 997 T6z = VADD(T4q, T4r);
Chris@42 998 T6A = VADD(T4t, T4u);
Chris@42 999 T6B = VSUB(T6z, T6A);
Chris@42 1000 T79 = VADD(T6z, T6A);
Chris@42 1001 }
Chris@42 1002 {
Chris@42 1003 V T2j, T2k, T4s, T4v;
Chris@42 1004 T2j = VFNMS(LDK(KP382683432), Th, VMUL(LDK(KP923879532), Te));
Chris@42 1005 T2k = VFMA(LDK(KP923879532), To, VMUL(LDK(KP382683432), Tl));
Chris@42 1006 T2l = VSUB(T2j, T2k);
Chris@42 1007 T3B = VADD(T2j, T2k);
Chris@42 1008 T4s = VSUB(T4q, T4r);
Chris@42 1009 T4v = VSUB(T4t, T4u);
Chris@42 1010 T4w = VMUL(LDK(KP707106781), VADD(T4s, T4v));
Chris@42 1011 T5r = VMUL(LDK(KP707106781), VSUB(T4s, T4v));
Chris@42 1012 }
Chris@42 1013 }
Chris@42 1014 {
Chris@42 1015 V TB, T4z, TF, T4y, Ty, T4C, TG, T4B;
Chris@42 1016 {
Chris@42 1017 V Tz, TA, TD, TE;
Chris@42 1018 Tz = LD(&(xi[WS(is, 18)]), ivs, &(xi[0]));
Chris@42 1019 TA = LD(&(xi[WS(is, 50)]), ivs, &(xi[0]));
Chris@42 1020 TB = VSUB(Tz, TA);
Chris@42 1021 T4z = VADD(Tz, TA);
Chris@42 1022 TD = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 1023 TE = LD(&(xi[WS(is, 34)]), ivs, &(xi[0]));
Chris@42 1024 TF = VSUB(TD, TE);
Chris@42 1025 T4y = VADD(TD, TE);
Chris@42 1026 {
Chris@42 1027 V Ts, Tt, Tu, Tv, Tw, Tx;
Chris@42 1028 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 1029 Tt = LD(&(xi[WS(is, 42)]), ivs, &(xi[0]));
Chris@42 1030 Tu = VSUB(Ts, Tt);
Chris@42 1031 Tv = LD(&(xi[WS(is, 58)]), ivs, &(xi[0]));
Chris@42 1032 Tw = LD(&(xi[WS(is, 26)]), ivs, &(xi[0]));
Chris@42 1033 Tx = VSUB(Tv, Tw);
Chris@42 1034 Ty = VMUL(LDK(KP707106781), VSUB(Tu, Tx));
Chris@42 1035 T4C = VADD(Tv, Tw);
Chris@42 1036 TG = VMUL(LDK(KP707106781), VADD(Tu, Tx));
Chris@42 1037 T4B = VADD(Ts, Tt);
Chris@42 1038 }
Chris@42 1039 }
Chris@42 1040 {
Chris@42 1041 V TC, TH, T6s, T6t;
Chris@42 1042 TC = VSUB(Ty, TB);
Chris@42 1043 TH = VSUB(TF, TG);
Chris@42 1044 TI = VFMA(LDK(KP831469612), TC, VMUL(LDK(KP555570233), TH));
Chris@42 1045 T2g = VFNMS(LDK(KP555570233), TC, VMUL(LDK(KP831469612), TH));
Chris@42 1046 T6s = VADD(T4y, T4z);
Chris@42 1047 T6t = VADD(T4B, T4C);
Chris@42 1048 T6u = VSUB(T6s, T6t);
Chris@42 1049 T74 = VADD(T6s, T6t);
Chris@42 1050 }
Chris@42 1051 {
Chris@42 1052 V T3o, T3p, T4A, T4D;
Chris@42 1053 T3o = VADD(TB, Ty);
Chris@42 1054 T3p = VADD(TF, TG);
Chris@42 1055 T3q = VFMA(LDK(KP980785280), T3o, VMUL(LDK(KP195090322), T3p));
Chris@42 1056 T3D = VFNMS(LDK(KP195090322), T3o, VMUL(LDK(KP980785280), T3p));
Chris@42 1057 T4A = VSUB(T4y, T4z);
Chris@42 1058 T4D = VSUB(T4B, T4C);
Chris@42 1059 T4E = VFMA(LDK(KP382683432), T4A, VMUL(LDK(KP923879532), T4D));
Chris@42 1060 T5o = VFNMS(LDK(KP382683432), T4D, VMUL(LDK(KP923879532), T4A));
Chris@42 1061 }
Chris@42 1062 }
Chris@42 1063 {
Chris@42 1064 V TS, T4J, TW, T4I, TP, T4G, TX, T4F;
Chris@42 1065 {
Chris@42 1066 V TQ, TR, TU, TV;
Chris@42 1067 TQ = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 1068 TR = LD(&(xi[WS(is, 46)]), ivs, &(xi[0]));
Chris@42 1069 TS = VSUB(TQ, TR);
Chris@42 1070 T4J = VADD(TQ, TR);
Chris@42 1071 TU = LD(&(xi[WS(is, 62)]), ivs, &(xi[0]));
Chris@42 1072 TV = LD(&(xi[WS(is, 30)]), ivs, &(xi[0]));
Chris@42 1073 TW = VSUB(TU, TV);
Chris@42 1074 T4I = VADD(TU, TV);
Chris@42 1075 {
Chris@42 1076 V TJ, TK, TL, TM, TN, TO;
Chris@42 1077 TJ = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 1078 TK = LD(&(xi[WS(is, 38)]), ivs, &(xi[0]));
Chris@42 1079 TL = VSUB(TJ, TK);
Chris@42 1080 TM = LD(&(xi[WS(is, 54)]), ivs, &(xi[0]));
Chris@42 1081 TN = LD(&(xi[WS(is, 22)]), ivs, &(xi[0]));
Chris@42 1082 TO = VSUB(TM, TN);
Chris@42 1083 TP = VMUL(LDK(KP707106781), VSUB(TL, TO));
Chris@42 1084 T4G = VADD(TM, TN);
Chris@42 1085 TX = VMUL(LDK(KP707106781), VADD(TL, TO));
Chris@42 1086 T4F = VADD(TJ, TK);
Chris@42 1087 }
Chris@42 1088 }
Chris@42 1089 {
Chris@42 1090 V TT, TY, T6v, T6w;
Chris@42 1091 TT = VSUB(TP, TS);
Chris@42 1092 TY = VSUB(TW, TX);
Chris@42 1093 TZ = VFNMS(LDK(KP555570233), TY, VMUL(LDK(KP831469612), TT));
Chris@42 1094 T2h = VFMA(LDK(KP555570233), TT, VMUL(LDK(KP831469612), TY));
Chris@42 1095 T6v = VADD(T4I, T4J);
Chris@42 1096 T6w = VADD(T4F, T4G);
Chris@42 1097 T6x = VSUB(T6v, T6w);
Chris@42 1098 T75 = VADD(T6v, T6w);
Chris@42 1099 }
Chris@42 1100 {
Chris@42 1101 V T3r, T3s, T4H, T4K;
Chris@42 1102 T3r = VADD(TS, TP);
Chris@42 1103 T3s = VADD(TW, TX);
Chris@42 1104 T3t = VFNMS(LDK(KP195090322), T3s, VMUL(LDK(KP980785280), T3r));
Chris@42 1105 T3E = VFMA(LDK(KP195090322), T3r, VMUL(LDK(KP980785280), T3s));
Chris@42 1106 T4H = VSUB(T4F, T4G);
Chris@42 1107 T4K = VSUB(T4I, T4J);
Chris@42 1108 T4L = VFNMS(LDK(KP382683432), T4K, VMUL(LDK(KP923879532), T4H));
Chris@42 1109 T5p = VFMA(LDK(KP923879532), T4K, VMUL(LDK(KP382683432), T4H));
Chris@42 1110 }
Chris@42 1111 }
Chris@42 1112 {
Chris@42 1113 V T21, T5h, T26, T5g, T1Y, T5d, T27, T5c, T55, T56, T1J, T57, T29, T58, T59;
Chris@42 1114 V T1Q, T5a, T2a;
Chris@42 1115 {
Chris@42 1116 V T1Z, T20, T24, T25;
Chris@42 1117 T1Z = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1118 T20 = LD(&(xi[WS(is, 47)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1119 T21 = VSUB(T1Z, T20);
Chris@42 1120 T5h = VADD(T1Z, T20);
Chris@42 1121 T24 = LD(&(xi[WS(is, 63)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1122 T25 = LD(&(xi[WS(is, 31)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1123 T26 = VSUB(T24, T25);
Chris@42 1124 T5g = VADD(T24, T25);
Chris@42 1125 }
Chris@42 1126 {
Chris@42 1127 V T1S, T1T, T1U, T1V, T1W, T1X;
Chris@42 1128 T1S = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1129 T1T = LD(&(xi[WS(is, 39)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1130 T1U = VSUB(T1S, T1T);
Chris@42 1131 T1V = LD(&(xi[WS(is, 55)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1132 T1W = LD(&(xi[WS(is, 23)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1133 T1X = VSUB(T1V, T1W);
Chris@42 1134 T1Y = VMUL(LDK(KP707106781), VSUB(T1U, T1X));
Chris@42 1135 T5d = VADD(T1V, T1W);
Chris@42 1136 T27 = VMUL(LDK(KP707106781), VADD(T1U, T1X));
Chris@42 1137 T5c = VADD(T1S, T1T);
Chris@42 1138 }
Chris@42 1139 {
Chris@42 1140 V T1F, T1I, T1M, T1P;
Chris@42 1141 {
Chris@42 1142 V T1D, T1E, T1G, T1H;
Chris@42 1143 T1D = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1144 T1E = LD(&(xi[WS(is, 35)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1145 T1F = VSUB(T1D, T1E);
Chris@42 1146 T55 = VADD(T1D, T1E);
Chris@42 1147 T1G = LD(&(xi[WS(is, 19)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1148 T1H = LD(&(xi[WS(is, 51)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1149 T1I = VSUB(T1G, T1H);
Chris@42 1150 T56 = VADD(T1G, T1H);
Chris@42 1151 }
Chris@42 1152 T1J = VFNMS(LDK(KP382683432), T1I, VMUL(LDK(KP923879532), T1F));
Chris@42 1153 T57 = VSUB(T55, T56);
Chris@42 1154 T29 = VFMA(LDK(KP382683432), T1F, VMUL(LDK(KP923879532), T1I));
Chris@42 1155 {
Chris@42 1156 V T1K, T1L, T1N, T1O;
Chris@42 1157 T1K = LD(&(xi[WS(is, 59)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1158 T1L = LD(&(xi[WS(is, 27)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1159 T1M = VSUB(T1K, T1L);
Chris@42 1160 T58 = VADD(T1K, T1L);
Chris@42 1161 T1N = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1162 T1O = LD(&(xi[WS(is, 43)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1163 T1P = VSUB(T1N, T1O);
Chris@42 1164 T59 = VADD(T1N, T1O);
Chris@42 1165 }
Chris@42 1166 T1Q = VFMA(LDK(KP923879532), T1M, VMUL(LDK(KP382683432), T1P));
Chris@42 1167 T5a = VSUB(T58, T59);
Chris@42 1168 T2a = VFNMS(LDK(KP382683432), T1M, VMUL(LDK(KP923879532), T1P));
Chris@42 1169 }
Chris@42 1170 {
Chris@42 1171 V T1R, T22, T6k, T6l;
Chris@42 1172 T1R = VSUB(T1J, T1Q);
Chris@42 1173 T22 = VSUB(T1Y, T21);
Chris@42 1174 T23 = VSUB(T1R, T22);
Chris@42 1175 T2N = VADD(T22, T1R);
Chris@42 1176 T6k = VADD(T5g, T5h);
Chris@42 1177 T6l = VADD(T5c, T5d);
Chris@42 1178 T6m = VSUB(T6k, T6l);
Chris@42 1179 T70 = VADD(T6k, T6l);
Chris@42 1180 }
Chris@42 1181 {
Chris@42 1182 V T6n, T6o, T28, T2b;
Chris@42 1183 T6n = VADD(T55, T56);
Chris@42 1184 T6o = VADD(T58, T59);
Chris@42 1185 T6p = VSUB(T6n, T6o);
Chris@42 1186 T71 = VADD(T6n, T6o);
Chris@42 1187 T28 = VSUB(T26, T27);
Chris@42 1188 T2b = VSUB(T29, T2a);
Chris@42 1189 T2c = VSUB(T28, T2b);
Chris@42 1190 T2O = VADD(T28, T2b);
Chris@42 1191 }
Chris@42 1192 {
Chris@42 1193 V T3g, T3h, T5b, T5e;
Chris@42 1194 T3g = VADD(T26, T27);
Chris@42 1195 T3h = VADD(T1J, T1Q);
Chris@42 1196 T3i = VADD(T3g, T3h);
Chris@42 1197 T3Y = VSUB(T3g, T3h);
Chris@42 1198 T5b = VMUL(LDK(KP707106781), VSUB(T57, T5a));
Chris@42 1199 T5e = VSUB(T5c, T5d);
Chris@42 1200 T5f = VSUB(T5b, T5e);
Chris@42 1201 T5R = VADD(T5e, T5b);
Chris@42 1202 }
Chris@42 1203 {
Chris@42 1204 V T5i, T5j, T3j, T3k;
Chris@42 1205 T5i = VSUB(T5g, T5h);
Chris@42 1206 T5j = VMUL(LDK(KP707106781), VADD(T57, T5a));
Chris@42 1207 T5k = VSUB(T5i, T5j);
Chris@42 1208 T5S = VADD(T5i, T5j);
Chris@42 1209 T3j = VADD(T21, T1Y);
Chris@42 1210 T3k = VADD(T29, T2a);
Chris@42 1211 T3l = VADD(T3j, T3k);
Chris@42 1212 T3Z = VSUB(T3k, T3j);
Chris@42 1213 }
Chris@42 1214 }
Chris@42 1215 {
Chris@42 1216 V T1q, T50, T1v, T4Z, T1n, T4W, T1w, T4V, T4O, T4P, T18, T4Q, T1y, T4R, T4S;
Chris@42 1217 V T1f, T4T, T1z;
Chris@42 1218 {
Chris@42 1219 V T1o, T1p, T1t, T1u;
Chris@42 1220 T1o = LD(&(xi[WS(is, 17)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1221 T1p = LD(&(xi[WS(is, 49)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1222 T1q = VSUB(T1o, T1p);
Chris@42 1223 T50 = VADD(T1o, T1p);
Chris@42 1224 T1t = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1225 T1u = LD(&(xi[WS(is, 33)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1226 T1v = VSUB(T1t, T1u);
Chris@42 1227 T4Z = VADD(T1t, T1u);
Chris@42 1228 }
Chris@42 1229 {
Chris@42 1230 V T1h, T1i, T1j, T1k, T1l, T1m;
Chris@42 1231 T1h = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1232 T1i = LD(&(xi[WS(is, 41)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1233 T1j = VSUB(T1h, T1i);
Chris@42 1234 T1k = LD(&(xi[WS(is, 57)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1235 T1l = LD(&(xi[WS(is, 25)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1236 T1m = VSUB(T1k, T1l);
Chris@42 1237 T1n = VMUL(LDK(KP707106781), VSUB(T1j, T1m));
Chris@42 1238 T4W = VADD(T1k, T1l);
Chris@42 1239 T1w = VMUL(LDK(KP707106781), VADD(T1j, T1m));
Chris@42 1240 T4V = VADD(T1h, T1i);
Chris@42 1241 }
Chris@42 1242 {
Chris@42 1243 V T14, T17, T1b, T1e;
Chris@42 1244 {
Chris@42 1245 V T12, T13, T15, T16;
Chris@42 1246 T12 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1247 T13 = LD(&(xi[WS(is, 37)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1248 T14 = VSUB(T12, T13);
Chris@42 1249 T4O = VADD(T12, T13);
Chris@42 1250 T15 = LD(&(xi[WS(is, 21)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1251 T16 = LD(&(xi[WS(is, 53)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1252 T17 = VSUB(T15, T16);
Chris@42 1253 T4P = VADD(T15, T16);
Chris@42 1254 }
Chris@42 1255 T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
Chris@42 1256 T4Q = VSUB(T4O, T4P);
Chris@42 1257 T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));
Chris@42 1258 {
Chris@42 1259 V T19, T1a, T1c, T1d;
Chris@42 1260 T19 = LD(&(xi[WS(is, 61)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1261 T1a = LD(&(xi[WS(is, 29)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1262 T1b = VSUB(T19, T1a);
Chris@42 1263 T4R = VADD(T19, T1a);
Chris@42 1264 T1c = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1265 T1d = LD(&(xi[WS(is, 45)]), ivs, &(xi[WS(is, 1)]));
Chris@42 1266 T1e = VSUB(T1c, T1d);
Chris@42 1267 T4S = VADD(T1c, T1d);
Chris@42 1268 }
Chris@42 1269 T1f = VFMA(LDK(KP923879532), T1b, VMUL(LDK(KP382683432), T1e));
Chris@42 1270 T4T = VSUB(T4R, T4S);
Chris@42 1271 T1z = VFNMS(LDK(KP382683432), T1b, VMUL(LDK(KP923879532), T1e));
Chris@42 1272 }
Chris@42 1273 {
Chris@42 1274 V T1g, T1r, T6d, T6e;
Chris@42 1275 T1g = VSUB(T18, T1f);
Chris@42 1276 T1r = VSUB(T1n, T1q);
Chris@42 1277 T1s = VSUB(T1g, T1r);
Chris@42 1278 T2K = VADD(T1r, T1g);
Chris@42 1279 T6d = VADD(T4Z, T50);
Chris@42 1280 T6e = VADD(T4V, T4W);
Chris@42 1281 T6f = VSUB(T6d, T6e);
Chris@42 1282 T6X = VADD(T6d, T6e);
Chris@42 1283 }
Chris@42 1284 {
Chris@42 1285 V T6g, T6h, T1x, T1A;
Chris@42 1286 T6g = VADD(T4O, T4P);
Chris@42 1287 T6h = VADD(T4R, T4S);
Chris@42 1288 T6i = VSUB(T6g, T6h);
Chris@42 1289 T6Y = VADD(T6g, T6h);
Chris@42 1290 T1x = VSUB(T1v, T1w);
Chris@42 1291 T1A = VSUB(T1y, T1z);
Chris@42 1292 T1B = VSUB(T1x, T1A);
Chris@42 1293 T2L = VADD(T1x, T1A);
Chris@42 1294 }
Chris@42 1295 {
Chris@42 1296 V T39, T3a, T4U, T4X;
Chris@42 1297 T39 = VADD(T1v, T1w);
Chris@42 1298 T3a = VADD(T18, T1f);
Chris@42 1299 T3b = VADD(T39, T3a);
Chris@42 1300 T3V = VSUB(T39, T3a);
Chris@42 1301 T4U = VMUL(LDK(KP707106781), VSUB(T4Q, T4T));
Chris@42 1302 T4X = VSUB(T4V, T4W);
Chris@42 1303 T4Y = VSUB(T4U, T4X);
Chris@42 1304 T5O = VADD(T4X, T4U);
Chris@42 1305 }
Chris@42 1306 {
Chris@42 1307 V T51, T52, T3c, T3d;
Chris@42 1308 T51 = VSUB(T4Z, T50);
Chris@42 1309 T52 = VMUL(LDK(KP707106781), VADD(T4Q, T4T));
Chris@42 1310 T53 = VSUB(T51, T52);
Chris@42 1311 T5P = VADD(T51, T52);
Chris@42 1312 T3c = VADD(T1q, T1n);
Chris@42 1313 T3d = VADD(T1y, T1z);
Chris@42 1314 T3e = VADD(T3c, T3d);
Chris@42 1315 T3W = VSUB(T3d, T3c);
Chris@42 1316 }
Chris@42 1317 }
Chris@42 1318 {
Chris@42 1319 V T7n, T7o, T7p, T7q, T7r, T7s, T7t, T7u, T7v, T7w, T7x, T7y, T7z, T7A, T7B;
Chris@42 1320 V T7C, T7D, T7E, T7F, T7G, T7H, T7I, T7J, T7K;
Chris@42 1321 {
Chris@42 1322 V T7h, T7l, T7k, T7m;
Chris@42 1323 {
Chris@42 1324 V T7f, T7g, T7i, T7j;
Chris@42 1325 T7f = VADD(T78, T79);
Chris@42 1326 T7g = VADD(T74, T75);
Chris@42 1327 T7h = VSUB(T7f, T7g);
Chris@42 1328 T7l = VADD(T7f, T7g);
Chris@42 1329 T7i = VADD(T6X, T6Y);
Chris@42 1330 T7j = VADD(T70, T71);
Chris@42 1331 T7k = VBYI(VSUB(T7i, T7j));
Chris@42 1332 T7m = VADD(T7i, T7j);
Chris@42 1333 }
Chris@42 1334 T7n = VSUB(T7h, T7k);
Chris@42 1335 STM2(&(xo[96]), T7n, ovs, &(xo[0]));
Chris@42 1336 T7o = VADD(T7l, T7m);
Chris@42 1337 STM2(&(xo[0]), T7o, ovs, &(xo[0]));
Chris@42 1338 T7p = VADD(T7h, T7k);
Chris@42 1339 STM2(&(xo[32]), T7p, ovs, &(xo[0]));
Chris@42 1340 T7q = VSUB(T7l, T7m);
Chris@42 1341 STM2(&(xo[64]), T7q, ovs, &(xo[0]));
Chris@42 1342 }
Chris@42 1343 {
Chris@42 1344 V T76, T7a, T73, T7b, T6Z, T72;
Chris@42 1345 T76 = VSUB(T74, T75);
Chris@42 1346 T7a = VSUB(T78, T79);
Chris@42 1347 T6Z = VSUB(T6X, T6Y);
Chris@42 1348 T72 = VSUB(T70, T71);
Chris@42 1349 T73 = VMUL(LDK(KP707106781), VSUB(T6Z, T72));
Chris@42 1350 T7b = VMUL(LDK(KP707106781), VADD(T6Z, T72));
Chris@42 1351 {
Chris@42 1352 V T77, T7c, T7d, T7e;
Chris@42 1353 T77 = VBYI(VSUB(T73, T76));
Chris@42 1354 T7c = VSUB(T7a, T7b);
Chris@42 1355 T7r = VADD(T77, T7c);
Chris@42 1356 STM2(&(xo[48]), T7r, ovs, &(xo[0]));
Chris@42 1357 T7s = VSUB(T7c, T77);
Chris@42 1358 STM2(&(xo[80]), T7s, ovs, &(xo[0]));
Chris@42 1359 T7d = VBYI(VADD(T76, T73));
Chris@42 1360 T7e = VADD(T7a, T7b);
Chris@42 1361 T7t = VADD(T7d, T7e);
Chris@42 1362 STM2(&(xo[16]), T7t, ovs, &(xo[0]));
Chris@42 1363 T7u = VSUB(T7e, T7d);
Chris@42 1364 STM2(&(xo[112]), T7u, ovs, &(xo[0]));
Chris@42 1365 }
Chris@42 1366 }
Chris@42 1367 {
Chris@42 1368 V T6C, T6S, T6I, T6P, T6r, T6Q, T6L, T6T, T6y, T6H;
Chris@42 1369 T6y = VMUL(LDK(KP707106781), VSUB(T6u, T6x));
Chris@42 1370 T6C = VSUB(T6y, T6B);
Chris@42 1371 T6S = VADD(T6B, T6y);
Chris@42 1372 T6H = VMUL(LDK(KP707106781), VADD(T6u, T6x));
Chris@42 1373 T6I = VSUB(T6G, T6H);
Chris@42 1374 T6P = VADD(T6G, T6H);
Chris@42 1375 {
Chris@42 1376 V T6j, T6q, T6J, T6K;
Chris@42 1377 T6j = VFNMS(LDK(KP382683432), T6i, VMUL(LDK(KP923879532), T6f));
Chris@42 1378 T6q = VFMA(LDK(KP923879532), T6m, VMUL(LDK(KP382683432), T6p));
Chris@42 1379 T6r = VSUB(T6j, T6q);
Chris@42 1380 T6Q = VADD(T6j, T6q);
Chris@42 1381 T6J = VFMA(LDK(KP382683432), T6f, VMUL(LDK(KP923879532), T6i));
Chris@42 1382 T6K = VFNMS(LDK(KP382683432), T6m, VMUL(LDK(KP923879532), T6p));
Chris@42 1383 T6L = VSUB(T6J, T6K);
Chris@42 1384 T6T = VADD(T6J, T6K);
Chris@42 1385 }
Chris@42 1386 {
Chris@42 1387 V T6D, T6M, T6V, T6W;
Chris@42 1388 T6D = VBYI(VSUB(T6r, T6C));
Chris@42 1389 T6M = VSUB(T6I, T6L);
Chris@42 1390 T7v = VADD(T6D, T6M);
Chris@42 1391 STM2(&(xo[40]), T7v, ovs, &(xo[0]));
Chris@42 1392 T7w = VSUB(T6M, T6D);
Chris@42 1393 STM2(&(xo[88]), T7w, ovs, &(xo[0]));
Chris@42 1394 T6V = VSUB(T6P, T6Q);
Chris@42 1395 T6W = VBYI(VSUB(T6T, T6S));
Chris@42 1396 T7x = VSUB(T6V, T6W);
Chris@42 1397 STM2(&(xo[72]), T7x, ovs, &(xo[0]));
Chris@42 1398 T7y = VADD(T6V, T6W);
Chris@42 1399 STM2(&(xo[56]), T7y, ovs, &(xo[0]));
Chris@42 1400 }
Chris@42 1401 {
Chris@42 1402 V T6N, T6O, T6R, T6U;
Chris@42 1403 T6N = VBYI(VADD(T6C, T6r));
Chris@42 1404 T6O = VADD(T6I, T6L);
Chris@42 1405 T7z = VADD(T6N, T6O);
Chris@42 1406 STM2(&(xo[24]), T7z, ovs, &(xo[0]));
Chris@42 1407 T7A = VSUB(T6O, T6N);
Chris@42 1408 STM2(&(xo[104]), T7A, ovs, &(xo[0]));
Chris@42 1409 T6R = VADD(T6P, T6Q);
Chris@42 1410 T6U = VBYI(VADD(T6S, T6T));
Chris@42 1411 T7B = VSUB(T6R, T6U);
Chris@42 1412 STM2(&(xo[120]), T7B, ovs, &(xo[0]));
Chris@42 1413 T7C = VADD(T6R, T6U);
Chris@42 1414 STM2(&(xo[8]), T7C, ovs, &(xo[0]));
Chris@42 1415 }
Chris@42 1416 }
Chris@42 1417 {
Chris@42 1418 V T5N, T68, T61, T69, T5U, T65, T5Y, T66;
Chris@42 1419 {
Chris@42 1420 V T5L, T5M, T5Z, T60;
Chris@42 1421 T5L = VADD(T4p, T4w);
Chris@42 1422 T5M = VADD(T5o, T5p);
Chris@42 1423 T5N = VSUB(T5L, T5M);
Chris@42 1424 T68 = VADD(T5L, T5M);
Chris@42 1425 T5Z = VFNMS(LDK(KP195090322), T5O, VMUL(LDK(KP980785280), T5P));
Chris@42 1426 T60 = VFMA(LDK(KP195090322), T5R, VMUL(LDK(KP980785280), T5S));
Chris@42 1427 T61 = VSUB(T5Z, T60);
Chris@42 1428 T69 = VADD(T5Z, T60);
Chris@42 1429 }
Chris@42 1430 {
Chris@42 1431 V T5Q, T5T, T5W, T5X;
Chris@42 1432 T5Q = VFMA(LDK(KP980785280), T5O, VMUL(LDK(KP195090322), T5P));
Chris@42 1433 T5T = VFNMS(LDK(KP195090322), T5S, VMUL(LDK(KP980785280), T5R));
Chris@42 1434 T5U = VSUB(T5Q, T5T);
Chris@42 1435 T65 = VADD(T5Q, T5T);
Chris@42 1436 T5W = VADD(T4E, T4L);
Chris@42 1437 T5X = VADD(T5u, T5r);
Chris@42 1438 T5Y = VSUB(T5W, T5X);
Chris@42 1439 T66 = VADD(T5X, T5W);
Chris@42 1440 }
Chris@42 1441 {
Chris@42 1442 V T5V, T62, T6b, T6c;
Chris@42 1443 T5V = VADD(T5N, T5U);
Chris@42 1444 T62 = VBYI(VADD(T5Y, T61));
Chris@42 1445 T7D = VSUB(T5V, T62);
Chris@42 1446 STM2(&(xo[100]), T7D, ovs, &(xo[0]));
Chris@42 1447 T7E = VADD(T5V, T62);
Chris@42 1448 STM2(&(xo[28]), T7E, ovs, &(xo[0]));
Chris@42 1449 T6b = VBYI(VADD(T66, T65));
Chris@42 1450 T6c = VADD(T68, T69);
Chris@42 1451 T7F = VADD(T6b, T6c);
Chris@42 1452 STM2(&(xo[4]), T7F, ovs, &(xo[0]));
Chris@42 1453 T7G = VSUB(T6c, T6b);
Chris@42 1454 STM2(&(xo[124]), T7G, ovs, &(xo[0]));
Chris@42 1455 }
Chris@42 1456 {
Chris@42 1457 V T63, T64, T67, T6a;
Chris@42 1458 T63 = VSUB(T5N, T5U);
Chris@42 1459 T64 = VBYI(VSUB(T61, T5Y));
Chris@42 1460 T7H = VSUB(T63, T64);
Chris@42 1461 STM2(&(xo[92]), T7H, ovs, &(xo[0]));
Chris@42 1462 T7I = VADD(T63, T64);
Chris@42 1463 STM2(&(xo[36]), T7I, ovs, &(xo[0]));
Chris@42 1464 T67 = VBYI(VSUB(T65, T66));
Chris@42 1465 T6a = VSUB(T68, T69);
Chris@42 1466 T7J = VADD(T67, T6a);
Chris@42 1467 STM2(&(xo[60]), T7J, ovs, &(xo[0]));
Chris@42 1468 T7K = VSUB(T6a, T67);
Chris@42 1469 STM2(&(xo[68]), T7K, ovs, &(xo[0]));
Chris@42 1470 }
Chris@42 1471 }
Chris@42 1472 {
Chris@42 1473 V T7M, T7O, T7P, T7R;
Chris@42 1474 {
Chris@42 1475 V T11, T2C, T2v, T2D, T2e, T2z, T2s, T2A;
Chris@42 1476 {
Chris@42 1477 V Tr, T10, T2t, T2u;
Chris@42 1478 Tr = VSUB(Tb, Tq);
Chris@42 1479 T10 = VSUB(TI, TZ);
Chris@42 1480 T11 = VSUB(Tr, T10);
Chris@42 1481 T2C = VADD(Tr, T10);
Chris@42 1482 T2t = VFNMS(LDK(KP471396736), T1s, VMUL(LDK(KP881921264), T1B));
Chris@42 1483 T2u = VFMA(LDK(KP471396736), T23, VMUL(LDK(KP881921264), T2c));
Chris@42 1484 T2v = VSUB(T2t, T2u);
Chris@42 1485 T2D = VADD(T2t, T2u);
Chris@42 1486 }
Chris@42 1487 {
Chris@42 1488 V T1C, T2d, T2i, T2r;
Chris@42 1489 T1C = VFMA(LDK(KP881921264), T1s, VMUL(LDK(KP471396736), T1B));
Chris@42 1490 T2d = VFNMS(LDK(KP471396736), T2c, VMUL(LDK(KP881921264), T23));
Chris@42 1491 T2e = VSUB(T1C, T2d);
Chris@42 1492 T2z = VADD(T1C, T2d);
Chris@42 1493 T2i = VSUB(T2g, T2h);
Chris@42 1494 T2r = VSUB(T2l, T2q);
Chris@42 1495 T2s = VSUB(T2i, T2r);
Chris@42 1496 T2A = VADD(T2r, T2i);
Chris@42 1497 }
Chris@42 1498 {
Chris@42 1499 V T2f, T2w, T7L, T2F, T2G, T7N;
Chris@42 1500 T2f = VADD(T11, T2e);
Chris@42 1501 T2w = VBYI(VADD(T2s, T2v));
Chris@42 1502 T7L = VSUB(T2f, T2w);
Chris@42 1503 STM2(&(xo[106]), T7L, ovs, &(xo[2]));
Chris@42 1504 STN2(&(xo[104]), T7A, T7L, ovs);
Chris@42 1505 T7M = VADD(T2f, T2w);
Chris@42 1506 STM2(&(xo[22]), T7M, ovs, &(xo[2]));
Chris@42 1507 T2F = VBYI(VADD(T2A, T2z));
Chris@42 1508 T2G = VADD(T2C, T2D);
Chris@42 1509 T7N = VADD(T2F, T2G);
Chris@42 1510 STM2(&(xo[10]), T7N, ovs, &(xo[2]));
Chris@42 1511 STN2(&(xo[8]), T7C, T7N, ovs);
Chris@42 1512 T7O = VSUB(T2G, T2F);
Chris@42 1513 STM2(&(xo[118]), T7O, ovs, &(xo[2]));
Chris@42 1514 }
Chris@42 1515 {
Chris@42 1516 V T2x, T2y, T7Q, T2B, T2E, T7S;
Chris@42 1517 T2x = VSUB(T11, T2e);
Chris@42 1518 T2y = VBYI(VSUB(T2v, T2s));
Chris@42 1519 T7P = VSUB(T2x, T2y);
Chris@42 1520 STM2(&(xo[86]), T7P, ovs, &(xo[2]));
Chris@42 1521 T7Q = VADD(T2x, T2y);
Chris@42 1522 STM2(&(xo[42]), T7Q, ovs, &(xo[2]));
Chris@42 1523 STN2(&(xo[40]), T7v, T7Q, ovs);
Chris@42 1524 T2B = VBYI(VSUB(T2z, T2A));
Chris@42 1525 T2E = VSUB(T2C, T2D);
Chris@42 1526 T7R = VADD(T2B, T2E);
Chris@42 1527 STM2(&(xo[54]), T7R, ovs, &(xo[2]));
Chris@42 1528 T7S = VSUB(T2E, T2B);
Chris@42 1529 STM2(&(xo[74]), T7S, ovs, &(xo[2]));
Chris@42 1530 STN2(&(xo[72]), T7x, T7S, ovs);
Chris@42 1531 }
Chris@42 1532 }
Chris@42 1533 {
Chris@42 1534 V T3n, T3O, T3J, T3R, T3y, T3Q, T3G, T3N;
Chris@42 1535 {
Chris@42 1536 V T3f, T3m, T3H, T3I;
Chris@42 1537 T3f = VFNMS(LDK(KP098017140), T3e, VMUL(LDK(KP995184726), T3b));
Chris@42 1538 T3m = VFMA(LDK(KP995184726), T3i, VMUL(LDK(KP098017140), T3l));
Chris@42 1539 T3n = VSUB(T3f, T3m);
Chris@42 1540 T3O = VADD(T3f, T3m);
Chris@42 1541 T3H = VFMA(LDK(KP098017140), T3b, VMUL(LDK(KP995184726), T3e));
Chris@42 1542 T3I = VFNMS(LDK(KP098017140), T3i, VMUL(LDK(KP995184726), T3l));
Chris@42 1543 T3J = VSUB(T3H, T3I);
Chris@42 1544 T3R = VADD(T3H, T3I);
Chris@42 1545 }
Chris@42 1546 {
Chris@42 1547 V T3u, T3x, T3C, T3F;
Chris@42 1548 T3u = VADD(T3q, T3t);
Chris@42 1549 T3x = VADD(T3v, T3w);
Chris@42 1550 T3y = VSUB(T3u, T3x);
Chris@42 1551 T3Q = VADD(T3x, T3u);
Chris@42 1552 T3C = VADD(T3A, T3B);
Chris@42 1553 T3F = VADD(T3D, T3E);
Chris@42 1554 T3G = VSUB(T3C, T3F);
Chris@42 1555 T3N = VADD(T3C, T3F);
Chris@42 1556 }
Chris@42 1557 {
Chris@42 1558 V T3z, T3K, T7T, T7U;
Chris@42 1559 T3z = VBYI(VSUB(T3n, T3y));
Chris@42 1560 T3K = VSUB(T3G, T3J);
Chris@42 1561 T7T = VADD(T3z, T3K);
Chris@42 1562 STM2(&(xo[34]), T7T, ovs, &(xo[2]));
Chris@42 1563 STN2(&(xo[32]), T7p, T7T, ovs);
Chris@42 1564 T7U = VSUB(T3K, T3z);
Chris@42 1565 STM2(&(xo[94]), T7U, ovs, &(xo[2]));
Chris@42 1566 STN2(&(xo[92]), T7H, T7U, ovs);
Chris@42 1567 }
Chris@42 1568 {
Chris@42 1569 V T3T, T3U, T7V, T7W;
Chris@42 1570 T3T = VSUB(T3N, T3O);
Chris@42 1571 T3U = VBYI(VSUB(T3R, T3Q));
Chris@42 1572 T7V = VSUB(T3T, T3U);
Chris@42 1573 STM2(&(xo[66]), T7V, ovs, &(xo[2]));
Chris@42 1574 STN2(&(xo[64]), T7q, T7V, ovs);
Chris@42 1575 T7W = VADD(T3T, T3U);
Chris@42 1576 STM2(&(xo[62]), T7W, ovs, &(xo[2]));
Chris@42 1577 STN2(&(xo[60]), T7J, T7W, ovs);
Chris@42 1578 }
Chris@42 1579 {
Chris@42 1580 V T3L, T3M, T7X, T7Y;
Chris@42 1581 T3L = VBYI(VADD(T3y, T3n));
Chris@42 1582 T3M = VADD(T3G, T3J);
Chris@42 1583 T7X = VADD(T3L, T3M);
Chris@42 1584 STM2(&(xo[30]), T7X, ovs, &(xo[2]));
Chris@42 1585 STN2(&(xo[28]), T7E, T7X, ovs);
Chris@42 1586 T7Y = VSUB(T3M, T3L);
Chris@42 1587 STM2(&(xo[98]), T7Y, ovs, &(xo[2]));
Chris@42 1588 STN2(&(xo[96]), T7n, T7Y, ovs);
Chris@42 1589 }
Chris@42 1590 {
Chris@42 1591 V T3P, T3S, T7Z, T80;
Chris@42 1592 T3P = VADD(T3N, T3O);
Chris@42 1593 T3S = VBYI(VADD(T3Q, T3R));
Chris@42 1594 T7Z = VSUB(T3P, T3S);
Chris@42 1595 STM2(&(xo[126]), T7Z, ovs, &(xo[2]));
Chris@42 1596 STN2(&(xo[124]), T7G, T7Z, ovs);
Chris@42 1597 T80 = VADD(T3P, T3S);
Chris@42 1598 STM2(&(xo[2]), T80, ovs, &(xo[2]));
Chris@42 1599 STN2(&(xo[0]), T7o, T80, ovs);
Chris@42 1600 }
Chris@42 1601 }
Chris@42 1602 {
Chris@42 1603 V T81, T83, T86, T88;
Chris@42 1604 {
Chris@42 1605 V T4N, T5G, T5z, T5H, T5m, T5D, T5w, T5E;
Chris@42 1606 {
Chris@42 1607 V T4x, T4M, T5x, T5y;
Chris@42 1608 T4x = VSUB(T4p, T4w);
Chris@42 1609 T4M = VSUB(T4E, T4L);
Chris@42 1610 T4N = VSUB(T4x, T4M);
Chris@42 1611 T5G = VADD(T4x, T4M);
Chris@42 1612 T5x = VFNMS(LDK(KP555570233), T4Y, VMUL(LDK(KP831469612), T53));
Chris@42 1613 T5y = VFMA(LDK(KP555570233), T5f, VMUL(LDK(KP831469612), T5k));
Chris@42 1614 T5z = VSUB(T5x, T5y);
Chris@42 1615 T5H = VADD(T5x, T5y);
Chris@42 1616 }
Chris@42 1617 {
Chris@42 1618 V T54, T5l, T5q, T5v;
Chris@42 1619 T54 = VFMA(LDK(KP831469612), T4Y, VMUL(LDK(KP555570233), T53));
Chris@42 1620 T5l = VFNMS(LDK(KP555570233), T5k, VMUL(LDK(KP831469612), T5f));
Chris@42 1621 T5m = VSUB(T54, T5l);
Chris@42 1622 T5D = VADD(T54, T5l);
Chris@42 1623 T5q = VSUB(T5o, T5p);
Chris@42 1624 T5v = VSUB(T5r, T5u);
Chris@42 1625 T5w = VSUB(T5q, T5v);
Chris@42 1626 T5E = VADD(T5v, T5q);
Chris@42 1627 }
Chris@42 1628 {
Chris@42 1629 V T5n, T5A, T82, T5J, T5K, T84;
Chris@42 1630 T5n = VADD(T4N, T5m);
Chris@42 1631 T5A = VBYI(VADD(T5w, T5z));
Chris@42 1632 T81 = VSUB(T5n, T5A);
Chris@42 1633 STM2(&(xo[108]), T81, ovs, &(xo[0]));
Chris@42 1634 T82 = VADD(T5n, T5A);
Chris@42 1635 STM2(&(xo[20]), T82, ovs, &(xo[0]));
Chris@42 1636 STN2(&(xo[20]), T82, T7M, ovs);
Chris@42 1637 T5J = VBYI(VADD(T5E, T5D));
Chris@42 1638 T5K = VADD(T5G, T5H);
Chris@42 1639 T83 = VADD(T5J, T5K);
Chris@42 1640 STM2(&(xo[12]), T83, ovs, &(xo[0]));
Chris@42 1641 T84 = VSUB(T5K, T5J);
Chris@42 1642 STM2(&(xo[116]), T84, ovs, &(xo[0]));
Chris@42 1643 STN2(&(xo[116]), T84, T7O, ovs);
Chris@42 1644 }
Chris@42 1645 {
Chris@42 1646 V T5B, T5C, T85, T5F, T5I, T87;
Chris@42 1647 T5B = VSUB(T4N, T5m);
Chris@42 1648 T5C = VBYI(VSUB(T5z, T5w));
Chris@42 1649 T85 = VSUB(T5B, T5C);
Chris@42 1650 STM2(&(xo[84]), T85, ovs, &(xo[0]));
Chris@42 1651 STN2(&(xo[84]), T85, T7P, ovs);
Chris@42 1652 T86 = VADD(T5B, T5C);
Chris@42 1653 STM2(&(xo[44]), T86, ovs, &(xo[0]));
Chris@42 1654 T5F = VBYI(VSUB(T5D, T5E));
Chris@42 1655 T5I = VSUB(T5G, T5H);
Chris@42 1656 T87 = VADD(T5F, T5I);
Chris@42 1657 STM2(&(xo[52]), T87, ovs, &(xo[0]));
Chris@42 1658 STN2(&(xo[52]), T87, T7R, ovs);
Chris@42 1659 T88 = VSUB(T5I, T5F);
Chris@42 1660 STM2(&(xo[76]), T88, ovs, &(xo[0]));
Chris@42 1661 }
Chris@42 1662 }
Chris@42 1663 {
Chris@42 1664 V T2J, T34, T2X, T35, T2Q, T31, T2U, T32;
Chris@42 1665 {
Chris@42 1666 V T2H, T2I, T2V, T2W;
Chris@42 1667 T2H = VADD(Tb, Tq);
Chris@42 1668 T2I = VADD(T2g, T2h);
Chris@42 1669 T2J = VSUB(T2H, T2I);
Chris@42 1670 T34 = VADD(T2H, T2I);
Chris@42 1671 T2V = VFNMS(LDK(KP290284677), T2K, VMUL(LDK(KP956940335), T2L));
Chris@42 1672 T2W = VFMA(LDK(KP290284677), T2N, VMUL(LDK(KP956940335), T2O));
Chris@42 1673 T2X = VSUB(T2V, T2W);
Chris@42 1674 T35 = VADD(T2V, T2W);
Chris@42 1675 }
Chris@42 1676 {
Chris@42 1677 V T2M, T2P, T2S, T2T;
Chris@42 1678 T2M = VFMA(LDK(KP956940335), T2K, VMUL(LDK(KP290284677), T2L));
Chris@42 1679 T2P = VFNMS(LDK(KP290284677), T2O, VMUL(LDK(KP956940335), T2N));
Chris@42 1680 T2Q = VSUB(T2M, T2P);
Chris@42 1681 T31 = VADD(T2M, T2P);
Chris@42 1682 T2S = VADD(TI, TZ);
Chris@42 1683 T2T = VADD(T2q, T2l);
Chris@42 1684 T2U = VSUB(T2S, T2T);
Chris@42 1685 T32 = VADD(T2T, T2S);
Chris@42 1686 }
Chris@42 1687 {
Chris@42 1688 V T2R, T2Y, T89, T8a;
Chris@42 1689 T2R = VADD(T2J, T2Q);
Chris@42 1690 T2Y = VBYI(VADD(T2U, T2X));
Chris@42 1691 T89 = VSUB(T2R, T2Y);
Chris@42 1692 STM2(&(xo[102]), T89, ovs, &(xo[2]));
Chris@42 1693 STN2(&(xo[100]), T7D, T89, ovs);
Chris@42 1694 T8a = VADD(T2R, T2Y);
Chris@42 1695 STM2(&(xo[26]), T8a, ovs, &(xo[2]));
Chris@42 1696 STN2(&(xo[24]), T7z, T8a, ovs);
Chris@42 1697 }
Chris@42 1698 {
Chris@42 1699 V T37, T38, T8b, T8c;
Chris@42 1700 T37 = VBYI(VADD(T32, T31));
Chris@42 1701 T38 = VADD(T34, T35);
Chris@42 1702 T8b = VADD(T37, T38);
Chris@42 1703 STM2(&(xo[6]), T8b, ovs, &(xo[2]));
Chris@42 1704 STN2(&(xo[4]), T7F, T8b, ovs);
Chris@42 1705 T8c = VSUB(T38, T37);
Chris@42 1706 STM2(&(xo[122]), T8c, ovs, &(xo[2]));
Chris@42 1707 STN2(&(xo[120]), T7B, T8c, ovs);
Chris@42 1708 }
Chris@42 1709 {
Chris@42 1710 V T2Z, T30, T8d, T8e;
Chris@42 1711 T2Z = VSUB(T2J, T2Q);
Chris@42 1712 T30 = VBYI(VSUB(T2X, T2U));
Chris@42 1713 T8d = VSUB(T2Z, T30);
Chris@42 1714 STM2(&(xo[90]), T8d, ovs, &(xo[2]));
Chris@42 1715 STN2(&(xo[88]), T7w, T8d, ovs);
Chris@42 1716 T8e = VADD(T2Z, T30);
Chris@42 1717 STM2(&(xo[38]), T8e, ovs, &(xo[2]));
Chris@42 1718 STN2(&(xo[36]), T7I, T8e, ovs);
Chris@42 1719 }
Chris@42 1720 {
Chris@42 1721 V T33, T36, T8f, T8g;
Chris@42 1722 T33 = VBYI(VSUB(T31, T32));
Chris@42 1723 T36 = VSUB(T34, T35);
Chris@42 1724 T8f = VADD(T33, T36);
Chris@42 1725 STM2(&(xo[58]), T8f, ovs, &(xo[2]));
Chris@42 1726 STN2(&(xo[56]), T7y, T8f, ovs);
Chris@42 1727 T8g = VSUB(T36, T33);
Chris@42 1728 STM2(&(xo[70]), T8g, ovs, &(xo[2]));
Chris@42 1729 STN2(&(xo[68]), T7K, T8g, ovs);
Chris@42 1730 }
Chris@42 1731 }
Chris@42 1732 {
Chris@42 1733 V T41, T4g, T4b, T4j, T44, T4i, T48, T4f;
Chris@42 1734 {
Chris@42 1735 V T3X, T40, T49, T4a;
Chris@42 1736 T3X = VFNMS(LDK(KP634393284), T3W, VMUL(LDK(KP773010453), T3V));
Chris@42 1737 T40 = VFMA(LDK(KP773010453), T3Y, VMUL(LDK(KP634393284), T3Z));
Chris@42 1738 T41 = VSUB(T3X, T40);
Chris@42 1739 T4g = VADD(T3X, T40);
Chris@42 1740 T49 = VFMA(LDK(KP634393284), T3V, VMUL(LDK(KP773010453), T3W));
Chris@42 1741 T4a = VFNMS(LDK(KP634393284), T3Y, VMUL(LDK(KP773010453), T3Z));
Chris@42 1742 T4b = VSUB(T49, T4a);
Chris@42 1743 T4j = VADD(T49, T4a);
Chris@42 1744 }
Chris@42 1745 {
Chris@42 1746 V T42, T43, T46, T47;
Chris@42 1747 T42 = VSUB(T3D, T3E);
Chris@42 1748 T43 = VSUB(T3w, T3v);
Chris@42 1749 T44 = VSUB(T42, T43);
Chris@42 1750 T4i = VADD(T43, T42);
Chris@42 1751 T46 = VSUB(T3A, T3B);
Chris@42 1752 T47 = VSUB(T3q, T3t);
Chris@42 1753 T48 = VSUB(T46, T47);
Chris@42 1754 T4f = VADD(T46, T47);
Chris@42 1755 }
Chris@42 1756 {
Chris@42 1757 V T45, T4c, T8h, T8i;
Chris@42 1758 T45 = VBYI(VSUB(T41, T44));
Chris@42 1759 T4c = VSUB(T48, T4b);
Chris@42 1760 T8h = VADD(T45, T4c);
Chris@42 1761 STM2(&(xo[46]), T8h, ovs, &(xo[2]));
Chris@42 1762 STN2(&(xo[44]), T86, T8h, ovs);
Chris@42 1763 T8i = VSUB(T4c, T45);
Chris@42 1764 STM2(&(xo[82]), T8i, ovs, &(xo[2]));
Chris@42 1765 STN2(&(xo[80]), T7s, T8i, ovs);
Chris@42 1766 }
Chris@42 1767 {
Chris@42 1768 V T4l, T4m, T8j, T8k;
Chris@42 1769 T4l = VSUB(T4f, T4g);
Chris@42 1770 T4m = VBYI(VSUB(T4j, T4i));
Chris@42 1771 T8j = VSUB(T4l, T4m);
Chris@42 1772 STM2(&(xo[78]), T8j, ovs, &(xo[2]));
Chris@42 1773 STN2(&(xo[76]), T88, T8j, ovs);
Chris@42 1774 T8k = VADD(T4l, T4m);
Chris@42 1775 STM2(&(xo[50]), T8k, ovs, &(xo[2]));
Chris@42 1776 STN2(&(xo[48]), T7r, T8k, ovs);
Chris@42 1777 }
Chris@42 1778 {
Chris@42 1779 V T4d, T4e, T8l, T8m;
Chris@42 1780 T4d = VBYI(VADD(T44, T41));
Chris@42 1781 T4e = VADD(T48, T4b);
Chris@42 1782 T8l = VADD(T4d, T4e);
Chris@42 1783 STM2(&(xo[18]), T8l, ovs, &(xo[2]));
Chris@42 1784 STN2(&(xo[16]), T7t, T8l, ovs);
Chris@42 1785 T8m = VSUB(T4e, T4d);
Chris@42 1786 STM2(&(xo[110]), T8m, ovs, &(xo[2]));
Chris@42 1787 STN2(&(xo[108]), T81, T8m, ovs);
Chris@42 1788 }
Chris@42 1789 {
Chris@42 1790 V T4h, T4k, T8n, T8o;
Chris@42 1791 T4h = VADD(T4f, T4g);
Chris@42 1792 T4k = VBYI(VADD(T4i, T4j));
Chris@42 1793 T8n = VSUB(T4h, T4k);
Chris@42 1794 STM2(&(xo[114]), T8n, ovs, &(xo[2]));
Chris@42 1795 STN2(&(xo[112]), T7u, T8n, ovs);
Chris@42 1796 T8o = VADD(T4h, T4k);
Chris@42 1797 STM2(&(xo[14]), T8o, ovs, &(xo[2]));
Chris@42 1798 STN2(&(xo[12]), T83, T8o, ovs);
Chris@42 1799 }
Chris@42 1800 }
Chris@42 1801 }
Chris@42 1802 }
Chris@42 1803 }
Chris@42 1804 }
Chris@42 1805 }
Chris@42 1806 VLEAVE();
Chris@42 1807 }
Chris@42 1808
Chris@42 1809 static const kdft_desc desc = { 64, XSIMD_STRING("n2bv_64"), {404, 72, 52, 0}, &GENUS, 0, 2, 0, 0 };
Chris@42 1810
Chris@42 1811 void XSIMD(codelet_n2bv_64) (planner *p) {
Chris@42 1812 X(kdft_register) (p, n2bv_64, &desc);
Chris@42 1813 }
Chris@42 1814
Chris@42 1815 #endif /* HAVE_FMA */
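
The VFNMS/VFMA pairs that recur throughout the codelet body, for example

    T18 = VFNMS(LDK(KP382683432), T17, VMUL(LDK(KP923879532), T14));
    T1y = VFMA(LDK(KP382683432), T14, VMUL(LDK(KP923879532), T17));

compute the two components of a complex rotation, here by pi/8 (0.92387953... = cos(pi/8), 0.38268343... = sin(pi/8)). What follows is a minimal scalar sketch of that identity in plain C, independent of the generated file and of the SIMD layer; the SFMA/SFNMS helpers are hypothetical stand-ins that assume the usual FFTW scalar semantics FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b.

    #include <stdio.h>
    #include <math.h>

    /* Assumed FFTW-style scalar semantics: FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b. */
    #define SFMA(a, b, c)  ((a) * (b) + (c))
    #define SFNMS(a, b, c) ((c) - (a) * (b))

    int main(void)
    {
        const double c8 = 0.923879532511286756;  /* KP923879532 = cos(pi/8) */
        const double s8 = 0.382683432365089772;  /* KP382683432 = sin(pi/8) */
        const double th = atan(1.0) / 2.0;       /* pi/8, for the reference values */
        double x = 0.25, y = -1.5;               /* arbitrary test point */

        /* The rotation written the way the codelet writes it. */
        double re = SFNMS(s8, y, c8 * x);        /* cos*x - sin*y */
        double im = SFMA(s8, x, c8 * y);         /* sin*x + cos*y */

        /* The same rotation written directly with cos/sin of pi/8. */
        double re_ref = cos(th) * x - sin(th) * y;
        double im_ref = sin(th) * x + cos(th) * y;

        printf("codelet form : (%+.15f, %+.15f)\n", re, im);
        printf("reference    : (%+.15f, %+.15f)\n", re_ref, im_ref);
        return 0;
    }

Built with something like "cc sketch.c -lm", the two printed pairs should agree to machine precision, which is all the sketch is meant to show: each constant pair in the listing above is the cosine/sine of one fixed twiddle angle, and the surrounding adds and subtracts assemble the size-64 transform from those rotations.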