annotate src/fftw-3.3.3/dft/simd/common/t1fv_64.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:38:10 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1fv_64 -include t1f.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 519 FP additions, 384 FP multiplications,
Chris@10 32 * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
Chris@10 33 * 187 stack variables, 15 constants, and 128 memory accesses
Chris@10 34 */
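/* (Accounting note on the totals above: each fused multiply/add is counted
 * in both columns, so 261 + 258 = 519 additions and
 * 126 + 258 = 384 multiplications.) */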
Chris@10 35 #include "t1f.h"
Chris@10 36
Chris@10 37 static void t1fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@10 40 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@10 41 DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@10 42 DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@10 43 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@10 44 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@10 45 DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@10 46 DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@10 47 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 48 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 49 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 50 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 51 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 52 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 53 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
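/* The DVK constants above are cosines and tangents of multiples of pi/32,
 * e.g. KP707106781 = cos(pi/4), KP923879532 = cos(pi/8), and
 * KP414213562 = tan(pi/8); tangents appear here because this FMA variant
 * factors each rotation into fused multiply/adds. */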
Chris@10 54 {
Chris@10 55 INT m;
Chris@10 56 R *x;
Chris@10 57 x = ri;
Chris@10 58 for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 59 V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h;
Chris@10 60 V T6E;
Chris@10 61 {
Chris@10 62 V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43;
Chris@10 63 V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3a;
Chris@10 64 V T4j, T4C, T7e, T5l, T7d, T5o, T3b, TV, T4B, T4m, T3X, T3Y, T6o, T7b, T5f;
Chris@10 65 V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25;
Chris@10 66 V T1S, T2q, T2u, T2w;
Chris@10 67 {
Chris@10 68 V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e;
Chris@10 69 {
Chris@10 70 V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21;
Chris@10 71 {
Chris@10 72 V T1, T2, T7, T5, T32, T34, T2X, T2Z;
Chris@10 73 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 74 T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
Chris@10 75 T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
Chris@10 76 T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@10 77 T32 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 78 T34 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
Chris@10 79 T2X = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
Chris@10 80 T2Z = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@10 81 {
Chris@10 82 V T1m, T54, T6j, T36, T55, T31, T56, T1n, T1q, T1s, T4, T9;
Chris@10 83 {
Chris@10 84 V T3, T8, T6, T33, T35, T2Y, T30, T1l;
Chris@10 85 T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 86 T3 = BYTWJ(&(W[TWVL * 62]), T2);
Chris@10 87 T8 = BYTWJ(&(W[TWVL * 94]), T7);
Chris@10 88 T6 = BYTWJ(&(W[TWVL * 30]), T5);
Chris@10 89 T33 = BYTWJ(&(W[TWVL * 14]), T32);
Chris@10 90 T35 = BYTWJ(&(W[TWVL * 78]), T34);
Chris@10 91 T2Y = BYTWJ(&(W[TWVL * 110]), T2X);
Chris@10 92 T30 = BYTWJ(&(W[TWVL * 46]), T2Z);
Chris@10 93 T1m = BYTWJ(&(W[0]), T1l);
Chris@10 94 T54 = VSUB(T1, T3);
Chris@10 95 T4 = VADD(T1, T3);
Chris@10 96 T6j = VSUB(T6, T8);
Chris@10 97 T9 = VADD(T6, T8);
Chris@10 98 T36 = VADD(T33, T35);
Chris@10 99 T55 = VSUB(T33, T35);
Chris@10 100 T31 = VADD(T2Y, T30);
Chris@10 101 T56 = VSUB(T2Y, T30);
Chris@10 102 T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
Chris@10 103 }
Chris@10 104 T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@10 105 T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
Chris@10 106 Ta = VSUB(T4, T9);
Chris@10 107 T3U = VADD(T4, T9);
Chris@10 108 {
Chris@10 109 V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z;
Chris@10 110 T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 111 T3V = VADD(T36, T31);
Chris@10 112 T37 = VSUB(T31, T36);
Chris@10 113 T57 = VADD(T55, T56);
Chris@10 114 T6k = VSUB(T56, T55);
Chris@10 115 T1o = BYTWJ(&(W[TWVL * 64]), T1n);
Chris@10 116 T1r = BYTWJ(&(W[TWVL * 32]), T1q);
Chris@10 117 T1t = BYTWJ(&(W[TWVL * 96]), T1s);
Chris@10 118 T1V = BYTWJ(&(W[TWVL * 16]), T1U);
Chris@10 119 T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
Chris@10 120 T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
Chris@10 121 T7a = VFNMS(LDK(KP707106781), T57, T54);
Chris@10 122 T58 = VFMA(LDK(KP707106781), T57, T54);
Chris@10 123 T7B = VFMA(LDK(KP707106781), T6k, T6j);
Chris@10 124 T6l = VFNMS(LDK(KP707106781), T6k, T6j);
Chris@10 125 T1p = VADD(T1m, T1o);
Chris@10 126 T5B = VSUB(T1m, T1o);
Chris@10 127 T5O = VSUB(T1r, T1t);
Chris@10 128 T1u = VADD(T1r, T1t);
Chris@10 129 T1X = BYTWJ(&(W[TWVL * 80]), T1W);
Chris@10 130 T20 = BYTWJ(&(W[TWVL * 112]), T1Z);
Chris@10 131 T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@10 132 }
Chris@10 133 }
Chris@10 134 }
Chris@10 135 {
Chris@10 136 V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h;
Chris@10 137 {
Chris@10 138 V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J;
Chris@10 139 T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
Chris@10 140 T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@10 141 T1v = VSUB(T1p, T1u);
Chris@10 142 T41 = VADD(T1p, T1u);
Chris@10 143 T1Y = VADD(T1V, T1X);
Chris@10 144 T5C = VSUB(T1V, T1X);
Chris@10 145 T22 = BYTWJ(&(W[TWVL * 48]), T21);
Chris@10 146 T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@10 147 T29 = BYTWJ(&(W[TWVL * 124]), T28);
Chris@10 148 T2b = BYTWJ(&(W[TWVL * 60]), T2a);
Chris@10 149 T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
Chris@10 150 T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
Chris@10 151 T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@10 152 {
Chris@10 153 V T23, T5D, T2e, T2g, T2I, T2K, T2M;
Chris@10 154 T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 155 T23 = VADD(T20, T22);
Chris@10 156 T5D = VSUB(T20, T22);
Chris@10 157 T2e = BYTWJ(&(W[TWVL * 28]), T2d);
Chris@10 158 T2c = VADD(T29, T2b);
Chris@10 159 T5W = VSUB(T29, T2b);
Chris@10 160 T2g = BYTWJ(&(W[TWVL * 92]), T2f);
Chris@10 161 T2I = BYTWJ(&(W[TWVL * 108]), T2H);
Chris@10 162 T2K = BYTWJ(&(W[TWVL * 44]), T2J);
Chris@10 163 T2N = BYTWJ(&(W[TWVL * 12]), T2M);
Chris@10 164 {
Chris@10 165 V T5E, T5P, T42, T2O;
Chris@10 166 T5E = VADD(T5C, T5D);
Chris@10 167 T5P = VSUB(T5C, T5D);
Chris@10 168 T24 = VSUB(T1Y, T23);
Chris@10 169 T42 = VADD(T1Y, T23);
Chris@10 170 T69 = VSUB(T2g, T2e);
Chris@10 171 T2h = VADD(T2e, T2g);
Chris@10 172 T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
Chris@10 173 T2L = VADD(T2I, T2K);
Chris@10 174 T5Y = VSUB(T2I, T2K);
Chris@10 175 T5Q = VFMA(LDK(KP707106781), T5P, T5O);
Chris@10 176 T7o = VFNMS(LDK(KP707106781), T5P, T5O);
Chris@10 177 T5F = VFMA(LDK(KP707106781), T5E, T5B);
Chris@10 178 T7l = VFNMS(LDK(KP707106781), T5E, T5B);
Chris@10 179 T43 = VADD(T41, T42);
Chris@10 180 T4F = VSUB(T41, T42);
Chris@10 181 T2P = BYTWJ(&(W[TWVL * 76]), T2O);
Chris@10 182 }
Chris@10 183 }
Chris@10 184 }
Chris@10 185 T2i = VSUB(T2c, T2h);
Chris@10 186 T48 = VADD(T2c, T2h);
Chris@10 187 {
Chris@10 188 V TW, TY, T11, T2Q, T5X, T13;
Chris@10 189 TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
Chris@10 190 TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@10 191 T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@10 192 T2Q = VADD(T2N, T2P);
Chris@10 193 T5X = VSUB(T2N, T2P);
Chris@10 194 T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
Chris@10 195 {
Chris@10 196 V T12, T5Z, T6a, T49, T14, T18, T1a;
Chris@10 197 {
Chris@10 198 V T17, T19, TX, TZ;
Chris@10 199 T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
Chris@10 200 T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@10 201 TX = BYTWJ(&(W[TWVL * 122]), TW);
Chris@10 202 TZ = BYTWJ(&(W[TWVL * 58]), TY);
Chris@10 203 T12 = BYTWJ(&(W[TWVL * 26]), T11);
Chris@10 204 T5Z = VADD(T5X, T5Y);
Chris@10 205 T6a = VSUB(T5Y, T5X);
Chris@10 206 T2R = VSUB(T2L, T2Q);
Chris@10 207 T49 = VADD(T2Q, T2L);
Chris@10 208 T14 = BYTWJ(&(W[TWVL * 90]), T13);
Chris@10 209 T18 = BYTWJ(&(W[TWVL * 106]), T17);
Chris@10 210 T5q = VSUB(TX, TZ);
Chris@10 211 T10 = VADD(TX, TZ);
Chris@10 212 T1a = BYTWJ(&(W[TWVL * 42]), T19);
Chris@10 213 }
Chris@10 214 T6b = VFMA(LDK(KP707106781), T6a, T69);
Chris@10 215 T7v = VFNMS(LDK(KP707106781), T6a, T69);
Chris@10 216 T60 = VFMA(LDK(KP707106781), T5Z, T5W);
Chris@10 217 T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
Chris@10 218 T4a = VADD(T48, T49);
Chris@10 219 T4I = VSUB(T48, T49);
Chris@10 220 T5v = VSUB(T14, T12);
Chris@10 221 T15 = VADD(T12, T14);
Chris@10 222 T1b = VADD(T18, T1a);
Chris@10 223 T5s = VSUB(T18, T1a);
Chris@10 224 }
Chris@10 225 T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 226 T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
Chris@10 227 }
Chris@10 228 }
Chris@10 229 }
Chris@10 230 {
Chris@10 231 V Th, T59, Tf, Tv, T5d, Tj, Tm, To;
Chris@10 232 {
Chris@10 233 V T5h, TQ, T5m, T5i, TO, TS, TJ, T4k, TD, TI;
Chris@10 234 {
Chris@10 235 V T4h, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC;
Chris@10 236 Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 237 T4h = VADD(T10, T15);
Chris@10 238 T16 = VSUB(T10, T15);
Chris@10 239 TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
Chris@10 240 T1d = BYTWJ(&(W[TWVL * 10]), T1c);
Chris@10 241 T1f = BYTWJ(&(W[TWVL * 74]), T1e);
Chris@10 242 TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@10 243 TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
Chris@10 244 TA = BYTWJ(&(W[TWVL * 2]), Tz);
Chris@10 245 TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 246 TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
Chris@10 247 TC = BYTWJ(&(W[TWVL * 66]), TB);
Chris@10 248 {
Chris@10 249 V T1g, T5r, TF, TH, TL, TN, TP;
Chris@10 250 TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
Chris@10 251 T1g = VADD(T1d, T1f);
Chris@10 252 T5r = VSUB(T1d, T1f);
Chris@10 253 TF = BYTWJ(&(W[TWVL * 34]), TE);
Chris@10 254 TH = BYTWJ(&(W[TWVL * 98]), TG);
Chris@10 255 TL = BYTWJ(&(W[TWVL * 18]), TK);
Chris@10 256 TN = BYTWJ(&(W[TWVL * 82]), TM);
Chris@10 257 T5h = VSUB(TA, TC);
Chris@10 258 TD = VADD(TA, TC);
Chris@10 259 TQ = BYTWJ(&(W[TWVL * 114]), TP);
Chris@10 260 {
Chris@10 261 V T5w, T5t, T4i, T1h, TR;
Chris@10 262 T5w = VSUB(T5s, T5r);
Chris@10 263 T5t = VADD(T5r, T5s);
Chris@10 264 T4i = VADD(T1g, T1b);
Chris@10 265 T1h = VSUB(T1b, T1g);
Chris@10 266 T5m = VSUB(TF, TH);
Chris@10 267 TI = VADD(TF, TH);
Chris@10 268 T5i = VSUB(TL, TN);
Chris@10 269 TO = VADD(TL, TN);
Chris@10 270 TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@10 271 T5u = VFMA(LDK(KP707106781), T5t, T5q);
Chris@10 272 T7h = VFNMS(LDK(KP707106781), T5t, T5q);
Chris@10 273 T5x = VFMA(LDK(KP707106781), T5w, T5v);
Chris@10 274 T7g = VFNMS(LDK(KP707106781), T5w, T5v);
Chris@10 275 T1i = VFNMS(LDK(KP414213562), T1h, T16);
Chris@10 276 T3a = VFMA(LDK(KP414213562), T16, T1h);
Chris@10 277 T4j = VADD(T4h, T4i);
Chris@10 278 T4C = VSUB(T4h, T4i);
Chris@10 279 TS = BYTWJ(&(W[TWVL * 50]), TR);
Chris@10 280 }
Chris@10 281 }
Chris@10 282 }
Chris@10 283 TJ = VSUB(TD, TI);
Chris@10 284 T4k = VADD(TD, TI);
Chris@10 285 {
Chris@10 286 V Tb, Td, Tr, T5j, TT, Tt, Tg;
Chris@10 287 Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 288 Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
Chris@10 289 Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@10 290 T5j = VSUB(TQ, TS);
Chris@10 291 TT = VADD(TQ, TS);
Chris@10 292 Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
Chris@10 293 Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@10 294 {
Chris@10 295 V Ti, Tc, Te, Ts;
Chris@10 296 Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
Chris@10 297 Tc = BYTWJ(&(W[TWVL * 6]), Tb);
Chris@10 298 Te = BYTWJ(&(W[TWVL * 70]), Td);
Chris@10 299 Ts = BYTWJ(&(W[TWVL * 22]), Tr);
Chris@10 300 {
Chris@10 301 V T5k, T5n, TU, T4l, Tu;
Chris@10 302 T5k = VADD(T5i, T5j);
Chris@10 303 T5n = VSUB(T5i, T5j);
Chris@10 304 TU = VSUB(TO, TT);
Chris@10 305 T4l = VADD(TO, TT);
Chris@10 306 Tu = BYTWJ(&(W[TWVL * 86]), Tt);
Chris@10 307 Th = BYTWJ(&(W[TWVL * 38]), Tg);
Chris@10 308 T59 = VSUB(Tc, Te);
Chris@10 309 Tf = VADD(Tc, Te);
Chris@10 310 T7e = VFNMS(LDK(KP707106781), T5k, T5h);
Chris@10 311 T5l = VFMA(LDK(KP707106781), T5k, T5h);
Chris@10 312 T7d = VFNMS(LDK(KP707106781), T5n, T5m);
Chris@10 313 T5o = VFMA(LDK(KP707106781), T5n, T5m);
Chris@10 314 T3b = VFMA(LDK(KP414213562), TJ, TU);
Chris@10 315 TV = VFNMS(LDK(KP414213562), TU, TJ);
Chris@10 316 T4B = VSUB(T4k, T4l);
Chris@10 317 T4m = VADD(T4k, T4l);
Chris@10 318 Tv = VADD(Ts, Tu);
Chris@10 319 T5d = VSUB(Tu, Ts);
Chris@10 320 Tj = BYTWJ(&(W[TWVL * 102]), Ti);
Chris@10 321 }
Chris@10 322 }
Chris@10 323 Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
Chris@10 324 To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@10 325 }
Chris@10 326 }
Chris@10 327 {
Chris@10 328 V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J;
Chris@10 329 {
Chris@10 330 V T1w, T1y, T1M, T1O, Tq, T5c, T1B;
Chris@10 331 T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 332 T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
Chris@10 333 T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@10 334 T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
Chris@10 335 T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@10 336 {
Chris@10 337 V Tk, T5a, Tn, Tp;
Chris@10 338 Tk = VADD(Th, Tj);
Chris@10 339 T5a = VSUB(Th, Tj);
Chris@10 340 Tn = BYTWJ(&(W[TWVL * 118]), Tm);
Chris@10 341 Tp = BYTWJ(&(W[TWVL * 54]), To);
Chris@10 342 {
Chris@10 343 V T1x, T1z, T1N, T1P;
Chris@10 344 T1x = BYTWJ(&(W[TWVL * 8]), T1w);
Chris@10 345 T1z = BYTWJ(&(W[TWVL * 72]), T1y);
Chris@10 346 T1N = BYTWJ(&(W[TWVL * 24]), T1M);
Chris@10 347 T1P = BYTWJ(&(W[TWVL * 88]), T1O);
Chris@10 348 T5b = VFNMS(LDK(KP414213562), T5a, T59);
Chris@10 349 T6m = VFMA(LDK(KP414213562), T59, T5a);
Chris@10 350 T3X = VADD(Tf, Tk);
Chris@10 351 Tl = VSUB(Tf, Tk);
Chris@10 352 Tq = VADD(Tn, Tp);
Chris@10 353 T5c = VSUB(Tn, Tp);
Chris@10 354 T1A = VADD(T1x, T1z);
Chris@10 355 T5G = VSUB(T1x, T1z);
Chris@10 356 T1Q = VADD(T1N, T1P);
Chris@10 357 T5K = VSUB(T1N, T1P);
Chris@10 358 T1C = BYTWJ(&(W[TWVL * 40]), T1B);
Chris@10 359 }
Chris@10 360 }
Chris@10 361 T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
Chris@10 362 T5e = VFNMS(LDK(KP414213562), T5d, T5c);
Chris@10 363 T6n = VFMA(LDK(KP414213562), T5c, T5d);
Chris@10 364 T3Y = VADD(Tq, Tv);
Chris@10 365 Tw = VSUB(Tq, Tv);
Chris@10 366 T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
Chris@10 367 T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@10 368 }
Chris@10 369 {
Chris@10 370 V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E;
Chris@10 371 T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 372 T1E = BYTWJ(&(W[TWVL * 104]), T1D);
Chris@10 373 T6o = VSUB(T6m, T6n);
Chris@10 374 T7b = VADD(T6m, T6n);
Chris@10 375 T5f = VADD(T5b, T5e);
Chris@10 376 T7C = VSUB(T5e, T5b);
Chris@10 377 Tx = VADD(Tl, Tw);
Chris@10 378 T38 = VSUB(Tw, Tl);
Chris@10 379 T1I = BYTWJ(&(W[TWVL * 120]), T1H);
Chris@10 380 T1K = BYTWJ(&(W[TWVL * 56]), T1J);
Chris@10 381 T1F = VADD(T1C, T1E);
Chris@10 382 T5H = VSUB(T1C, T1E);
Chris@10 383 T2k = BYTWJ(&(W[TWVL * 4]), T2j);
Chris@10 384 T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
Chris@10 385 T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 386 T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
Chris@10 387 {
Chris@10 388 V T5I, T5R, T44, T1G, T2m, T2A, T2C, T5S, T5L, T1R, T45, T2o, T5J, T1L;
Chris@10 389 T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@10 390 T5J = VSUB(T1I, T1K);
Chris@10 391 T1L = VADD(T1I, T1K);
Chris@10 392 T5I = VFNMS(LDK(KP414213562), T5H, T5G);
Chris@10 393 T5R = VFMA(LDK(KP414213562), T5G, T5H);
Chris@10 394 T44 = VADD(T1A, T1F);
Chris@10 395 T1G = VSUB(T1A, T1F);
Chris@10 396 T2m = BYTWJ(&(W[TWVL * 68]), T2l);
Chris@10 397 T2A = BYTWJ(&(W[TWVL * 20]), T2z);
Chris@10 398 T2C = BYTWJ(&(W[TWVL * 84]), T2B);
Chris@10 399 T5S = VFNMS(LDK(KP414213562), T5J, T5K);
Chris@10 400 T5L = VFMA(LDK(KP414213562), T5K, T5J);
Chris@10 401 T1R = VSUB(T1L, T1Q);
Chris@10 402 T45 = VADD(T1L, T1Q);
Chris@10 403 T2p = BYTWJ(&(W[TWVL * 36]), T2o);
Chris@10 404 T61 = VSUB(T2k, T2m);
Chris@10 405 T2n = VADD(T2k, T2m);
Chris@10 406 T65 = VSUB(T2C, T2A);
Chris@10 407 T2D = VADD(T2A, T2C);
Chris@10 408 T7p = VSUB(T5I, T5L);
Chris@10 409 T5M = VADD(T5I, T5L);
Chris@10 410 T7m = VSUB(T5R, T5S);
Chris@10 411 T5T = VADD(T5R, T5S);
Chris@10 412 T4G = VSUB(T44, T45);
Chris@10 413 T46 = VADD(T44, T45);
Chris@10 414 T25 = VSUB(T1G, T1R);
Chris@10 415 T1S = VADD(T1G, T1R);
Chris@10 416 T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
Chris@10 417 }
Chris@10 418 T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
Chris@10 419 T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@10 420 }
Chris@10 421 }
Chris@10 422 }
Chris@10 423 }
Chris@10 424 {
Chris@10 425 V T67, T7w, T6e, T7t, T3s, T3E, T39, T3D, T1k, T3k, T3t, T3c, T1T, T3v, T3w;
Chris@10 426 V T26, T2G, T3y, T3z, T2T;
Chris@10 427 {
Chris@10 428 V T4A, T4N, T47, T4v, T2r, T2v, T2x, T4s, T40, T3W, T3Z;
Chris@10 429 T4A = VSUB(T3U, T3V);
Chris@10 430 T3W = VADD(T3U, T3V);
Chris@10 431 T3Z = VADD(T3X, T3Y);
Chris@10 432 T4N = VSUB(T3Y, T3X);
Chris@10 433 T47 = VSUB(T43, T46);
Chris@10 434 T4v = VADD(T43, T46);
Chris@10 435 T2r = BYTWJ(&(W[TWVL * 100]), T2q);
Chris@10 436 T2v = BYTWJ(&(W[TWVL * 116]), T2u);
Chris@10 437 T2x = BYTWJ(&(W[TWVL * 52]), T2w);
Chris@10 438 T4s = VADD(T3W, T3Z);
Chris@10 439 T40 = VSUB(T3W, T3Z);
Chris@10 440 {
Chris@10 441 V T4O, T4n, T4R, T4H, T4E, T4W, T4u, T4y, T4d, T4J, T2F, T2S;
Chris@10 442 {
Chris@10 443 V T6c, T63, T2t, T4b, T6d, T66, T2E, T4c;
Chris@10 444 {
Chris@10 445 V T4D, T62, T2s, T64, T2y, T4t;
Chris@10 446 T4O = VSUB(T4C, T4B);
Chris@10 447 T4D = VADD(T4B, T4C);
Chris@10 448 T62 = VSUB(T2r, T2p);
Chris@10 449 T2s = VADD(T2p, T2r);
Chris@10 450 T64 = VSUB(T2v, T2x);
Chris@10 451 T2y = VADD(T2v, T2x);
Chris@10 452 T4t = VADD(T4m, T4j);
Chris@10 453 T4n = VSUB(T4j, T4m);
Chris@10 454 T4R = VFMA(LDK(KP414213562), T4F, T4G);
Chris@10 455 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
Chris@10 456 T4E = VFMA(LDK(KP707106781), T4D, T4A);
Chris@10 457 T4W = VFNMS(LDK(KP707106781), T4D, T4A);
Chris@10 458 T6c = VFNMS(LDK(KP414213562), T61, T62);
Chris@10 459 T63 = VFMA(LDK(KP414213562), T62, T61);
Chris@10 460 T2t = VSUB(T2n, T2s);
Chris@10 461 T4b = VADD(T2n, T2s);
Chris@10 462 T6d = VFMA(LDK(KP414213562), T64, T65);
Chris@10 463 T66 = VFNMS(LDK(KP414213562), T65, T64);
Chris@10 464 T2E = VSUB(T2y, T2D);
Chris@10 465 T4c = VADD(T2y, T2D);
Chris@10 466 T4u = VADD(T4s, T4t);
Chris@10 467 T4y = VSUB(T4s, T4t);
Chris@10 468 }
Chris@10 469 T67 = VADD(T63, T66);
Chris@10 470 T7w = VSUB(T66, T63);
Chris@10 471 T6e = VADD(T6c, T6d);
Chris@10 472 T7t = VSUB(T6d, T6c);
Chris@10 473 T4d = VADD(T4b, T4c);
Chris@10 474 T4J = VSUB(T4c, T4b);
Chris@10 475 T2F = VADD(T2t, T2E);
Chris@10 476 T2S = VSUB(T2E, T2t);
Chris@10 477 }
Chris@10 478 {
Chris@10 479 V Ty, T1j, T4Q, T4K;
Chris@10 480 Ty = VFMA(LDK(KP707106781), Tx, Ta);
Chris@10 481 T3s = VFNMS(LDK(KP707106781), Tx, Ta);
Chris@10 482 T3E = VSUB(T1i, TV);
Chris@10 483 T1j = VADD(TV, T1i);
Chris@10 484 T39 = VFMA(LDK(KP707106781), T38, T37);
Chris@10 485 T3D = VFNMS(LDK(KP707106781), T38, T37);
Chris@10 486 T4Q = VFMA(LDK(KP414213562), T4I, T4J);
Chris@10 487 T4K = VFNMS(LDK(KP414213562), T4J, T4I);
Chris@10 488 {
Chris@10 489 V T4w, T4e, T4P, T4Z;
Chris@10 490 T4w = VADD(T4a, T4d);
Chris@10 491 T4e = VSUB(T4a, T4d);
Chris@10 492 T4P = VFMA(LDK(KP707106781), T4O, T4N);
Chris@10 493 T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
Chris@10 494 T1k = VFMA(LDK(KP923879532), T1j, Ty);
Chris@10 495 T3k = VFNMS(LDK(KP923879532), T1j, Ty);
Chris@10 496 {
Chris@10 497 V T4L, T50, T4S, T4X;
Chris@10 498 T4L = VADD(T4H, T4K);
Chris@10 499 T50 = VSUB(T4K, T4H);
Chris@10 500 T4S = VSUB(T4Q, T4R);
Chris@10 501 T4X = VADD(T4R, T4Q);
Chris@10 502 {
Chris@10 503 V T4f, T4o, T4x, T4z;
Chris@10 504 T4f = VADD(T47, T4e);
Chris@10 505 T4o = VSUB(T4e, T47);
Chris@10 506 T4x = VADD(T4v, T4w);
Chris@10 507 T4z = VSUB(T4w, T4v);
Chris@10 508 {
Chris@10 509 V T53, T51, T4M, T4U;
Chris@10 510 T53 = VFNMS(LDK(KP923879532), T50, T4Z);
Chris@10 511 T51 = VFMA(LDK(KP923879532), T50, T4Z);
Chris@10 512 T4M = VFNMS(LDK(KP923879532), T4L, T4E);
Chris@10 513 T4U = VFMA(LDK(KP923879532), T4L, T4E);
Chris@10 514 {
Chris@10 515 V T52, T4Y, T4T, T4V;
Chris@10 516 T52 = VFMA(LDK(KP923879532), T4X, T4W);
Chris@10 517 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
Chris@10 518 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
Chris@10 519 T4V = VFMA(LDK(KP923879532), T4S, T4P);
Chris@10 520 {
Chris@10 521 V T4p, T4r, T4g, T4q;
Chris@10 522 T4p = VFNMS(LDK(KP707106781), T4o, T4n);
Chris@10 523 T4r = VFMA(LDK(KP707106781), T4o, T4n);
Chris@10 524 T4g = VFNMS(LDK(KP707106781), T4f, T40);
Chris@10 525 T4q = VFMA(LDK(KP707106781), T4f, T40);
Chris@10 526 ST(&(x[WS(rs, 16)]), VFMAI(T4z, T4y), ms, &(x[0]));
Chris@10 527 ST(&(x[WS(rs, 48)]), VFNMSI(T4z, T4y), ms, &(x[0]));
Chris@10 528 ST(&(x[0]), VADD(T4u, T4x), ms, &(x[0]));
Chris@10 529 ST(&(x[WS(rs, 32)]), VSUB(T4u, T4x), ms, &(x[0]));
Chris@10 530 ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
Chris@10 531 ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
Chris@10 532 ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
Chris@10 533 ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
Chris@10 534 ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
Chris@10 535 ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
Chris@10 536 ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
Chris@10 537 ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
Chris@10 538 ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
Chris@10 539 ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
Chris@10 540 ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
Chris@10 541 ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
Chris@10 542 T3t = VADD(T3b, T3a);
Chris@10 543 T3c = VSUB(T3a, T3b);
Chris@10 544 }
Chris@10 545 }
Chris@10 546 }
Chris@10 547 }
Chris@10 548 }
Chris@10 549 }
Chris@10 550 T1T = VFMA(LDK(KP707106781), T1S, T1v);
Chris@10 551 T3v = VFNMS(LDK(KP707106781), T1S, T1v);
Chris@10 552 T3w = VFNMS(LDK(KP707106781), T25, T24);
Chris@10 553 T26 = VFMA(LDK(KP707106781), T25, T24);
Chris@10 554 T2G = VFMA(LDK(KP707106781), T2F, T2i);
Chris@10 555 T3y = VFNMS(LDK(KP707106781), T2F, T2i);
Chris@10 556 T3z = VFNMS(LDK(KP707106781), T2S, T2R);
Chris@10 557 T2T = VFMA(LDK(KP707106781), T2S, T2R);
Chris@10 558 }
Chris@10 559 }
Chris@10 560 }
Chris@10 561 {
Chris@10 562 V T3u, T3M, T3F, T3P, T3x, T3H, T3q, T3m, T3h, T3j, T3r, T3p, T2W, T3i;
Chris@10 563 {
Chris@10 564 V T3d, T3n, T27, T3f, T2U, T3e;
Chris@10 565 T3d = VFMA(LDK(KP923879532), T3c, T39);
Chris@10 566 T3n = VFNMS(LDK(KP923879532), T3c, T39);
Chris@10 567 T27 = VFNMS(LDK(KP198912367), T26, T1T);
Chris@10 568 T3f = VFMA(LDK(KP198912367), T1T, T26);
Chris@10 569 T2U = VFNMS(LDK(KP198912367), T2T, T2G);
Chris@10 570 T3e = VFMA(LDK(KP198912367), T2G, T2T);
Chris@10 571 T3u = VFMA(LDK(KP923879532), T3t, T3s);
Chris@10 572 T3M = VFNMS(LDK(KP923879532), T3t, T3s);
Chris@10 573 {
Chris@10 574 V T3g, T3l, T2V, T3o;
Chris@10 575 T3g = VSUB(T3e, T3f);
Chris@10 576 T3l = VADD(T3f, T3e);
Chris@10 577 T2V = VADD(T27, T2U);
Chris@10 578 T3o = VSUB(T2U, T27);
Chris@10 579 T3F = VFNMS(LDK(KP923879532), T3E, T3D);
Chris@10 580 T3P = VFMA(LDK(KP923879532), T3E, T3D);
Chris@10 581 T3x = VFMA(LDK(KP668178637), T3w, T3v);
Chris@10 582 T3H = VFNMS(LDK(KP668178637), T3v, T3w);
Chris@10 583 T3q = VFMA(LDK(KP980785280), T3l, T3k);
Chris@10 584 T3m = VFNMS(LDK(KP980785280), T3l, T3k);
Chris@10 585 T3h = VFNMS(LDK(KP980785280), T3g, T3d);
Chris@10 586 T3j = VFMA(LDK(KP980785280), T3g, T3d);
Chris@10 587 T3r = VFNMS(LDK(KP980785280), T3o, T3n);
Chris@10 588 T3p = VFMA(LDK(KP980785280), T3o, T3n);
Chris@10 589 T2W = VFNMS(LDK(KP980785280), T2V, T1k);
Chris@10 590 T3i = VFMA(LDK(KP980785280), T2V, T1k);
Chris@10 591 }
Chris@10 592 }
Chris@10 593 {
Chris@10 594 V T7n, T7Z, T8j, T89, T7k, T7O, T8g, T7Y, T7H, T7R, T80, T7q, T7u, T82, T83;
Chris@10 595 V T7x;
Chris@10 596 {
Chris@10 597 V T7c, T7W, T7D, T87, T7f, T7F, T3A, T3G, T7E, T7i;
Chris@10 598 T7c = VFNMS(LDK(KP923879532), T7b, T7a);
Chris@10 599 T7W = VFMA(LDK(KP923879532), T7b, T7a);
Chris@10 600 T7D = VFNMS(LDK(KP923879532), T7C, T7B);
Chris@10 601 T87 = VFMA(LDK(KP923879532), T7C, T7B);
Chris@10 602 T7f = VFNMS(LDK(KP668178637), T7e, T7d);
Chris@10 603 T7F = VFMA(LDK(KP668178637), T7d, T7e);
Chris@10 604 ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
Chris@10 605 ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
Chris@10 606 ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
Chris@10 607 ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
Chris@10 608 ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
Chris@10 609 ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
Chris@10 610 ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
Chris@10 611 ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
Chris@10 612 T3A = VFMA(LDK(KP668178637), T3z, T3y);
Chris@10 613 T3G = VFNMS(LDK(KP668178637), T3y, T3z);
Chris@10 614 T7E = VFMA(LDK(KP668178637), T7g, T7h);
Chris@10 615 T7i = VFNMS(LDK(KP668178637), T7h, T7g);
Chris@10 616 T7n = VFNMS(LDK(KP923879532), T7m, T7l);
Chris@10 617 T7Z = VFMA(LDK(KP923879532), T7m, T7l);
Chris@10 618 {
Chris@10 619 V T3I, T3N, T3B, T3Q;
Chris@10 620 T3I = VSUB(T3G, T3H);
Chris@10 621 T3N = VADD(T3H, T3G);
Chris@10 622 T3B = VADD(T3x, T3A);
Chris@10 623 T3Q = VSUB(T3A, T3x);
Chris@10 624 {
Chris@10 625 V T7j, T88, T7G, T7X;
Chris@10 626 T7j = VADD(T7f, T7i);
Chris@10 627 T88 = VSUB(T7f, T7i);
Chris@10 628 T7G = VSUB(T7E, T7F);
Chris@10 629 T7X = VADD(T7F, T7E);
Chris@10 630 {
Chris@10 631 V T3S, T3O, T3J, T3L;
Chris@10 632 T3S = VFNMS(LDK(KP831469612), T3N, T3M);
Chris@10 633 T3O = VFMA(LDK(KP831469612), T3N, T3M);
Chris@10 634 T3J = VFNMS(LDK(KP831469612), T3I, T3F);
Chris@10 635 T3L = VFMA(LDK(KP831469612), T3I, T3F);
Chris@10 636 {
Chris@10 637 V T3T, T3R, T3C, T3K;
Chris@10 638 T3T = VFMA(LDK(KP831469612), T3Q, T3P);
Chris@10 639 T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
Chris@10 640 T3C = VFNMS(LDK(KP831469612), T3B, T3u);
Chris@10 641 T3K = VFMA(LDK(KP831469612), T3B, T3u);
Chris@10 642 T8j = VFNMS(LDK(KP831469612), T88, T87);
Chris@10 643 T89 = VFMA(LDK(KP831469612), T88, T87);
Chris@10 644 T7k = VFNMS(LDK(KP831469612), T7j, T7c);
Chris@10 645 T7O = VFMA(LDK(KP831469612), T7j, T7c);
Chris@10 646 T8g = VFNMS(LDK(KP831469612), T7X, T7W);
Chris@10 647 T7Y = VFMA(LDK(KP831469612), T7X, T7W);
Chris@10 648 T7H = VFNMS(LDK(KP831469612), T7G, T7D);
Chris@10 649 T7R = VFMA(LDK(KP831469612), T7G, T7D);
Chris@10 650 ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
Chris@10 651 ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
Chris@10 652 ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
Chris@10 653 ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
Chris@10 654 ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
Chris@10 655 ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
Chris@10 656 ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
Chris@10 657 ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
Chris@10 658 T80 = VFNMS(LDK(KP923879532), T7p, T7o);
Chris@10 659 T7q = VFMA(LDK(KP923879532), T7p, T7o);
Chris@10 660 }
Chris@10 661 }
Chris@10 662 }
Chris@10 663 }
Chris@10 664 T7u = VFNMS(LDK(KP923879532), T7t, T7s);
Chris@10 665 T82 = VFMA(LDK(KP923879532), T7t, T7s);
Chris@10 666 T83 = VFNMS(LDK(KP923879532), T7w, T7v);
Chris@10 667 T7x = VFMA(LDK(KP923879532), T7w, T7v);
Chris@10 668 }
Chris@10 669 {
Chris@10 670 V T5g, T6I, T6p, T6T, T5p, T6q, T6r, T5y;
Chris@10 671 T5g = VFMA(LDK(KP923879532), T5f, T58);
Chris@10 672 T6I = VFNMS(LDK(KP923879532), T5f, T58);
Chris@10 673 {
Chris@10 674 V T7r, T7I, T7y, T7J;
Chris@10 675 T7r = VFNMS(LDK(KP534511135), T7q, T7n);
Chris@10 676 T7I = VFMA(LDK(KP534511135), T7n, T7q);
Chris@10 677 T7y = VFNMS(LDK(KP534511135), T7x, T7u);
Chris@10 678 T7J = VFMA(LDK(KP534511135), T7u, T7x);
Chris@10 679 {
Chris@10 680 V T81, T8a, T84, T8b;
Chris@10 681 T81 = VFMA(LDK(KP303346683), T80, T7Z);
Chris@10 682 T8a = VFNMS(LDK(KP303346683), T7Z, T80);
Chris@10 683 T84 = VFMA(LDK(KP303346683), T83, T82);
Chris@10 684 T8b = VFNMS(LDK(KP303346683), T82, T83);
Chris@10 685 T6p = VFMA(LDK(KP923879532), T6o, T6l);
Chris@10 686 T6T = VFNMS(LDK(KP923879532), T6o, T6l);
Chris@10 687 T5p = VFNMS(LDK(KP198912367), T5o, T5l);
Chris@10 688 T6q = VFMA(LDK(KP198912367), T5l, T5o);
Chris@10 689 {
Chris@10 690 V T7K, T7P, T7z, T7S;
Chris@10 691 T7K = VSUB(T7I, T7J);
Chris@10 692 T7P = VADD(T7I, T7J);
Chris@10 693 T7z = VADD(T7r, T7y);
Chris@10 694 T7S = VSUB(T7y, T7r);
Chris@10 695 {
Chris@10 696 V T8c, T8h, T85, T8k;
Chris@10 697 T8c = VSUB(T8a, T8b);
Chris@10 698 T8h = VADD(T8a, T8b);
Chris@10 699 T85 = VADD(T81, T84);
Chris@10 700 T8k = VSUB(T84, T81);
Chris@10 701 {
Chris@10 702 V T7Q, T7U, T7L, T7N;
Chris@10 703 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
Chris@10 704 T7U = VFMA(LDK(KP881921264), T7P, T7O);
Chris@10 705 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
Chris@10 706 T7N = VFMA(LDK(KP881921264), T7K, T7H);
Chris@10 707 {
Chris@10 708 V T7T, T7V, T7A, T7M;
Chris@10 709 T7T = VFNMS(LDK(KP881921264), T7S, T7R);
Chris@10 710 T7V = VFMA(LDK(KP881921264), T7S, T7R);
Chris@10 711 T7A = VFNMS(LDK(KP881921264), T7z, T7k);
Chris@10 712 T7M = VFMA(LDK(KP881921264), T7z, T7k);
Chris@10 713 {
Chris@10 714 V T8i, T8m, T8d, T8f;
Chris@10 715 T8i = VFMA(LDK(KP956940335), T8h, T8g);
Chris@10 716 T8m = VFNMS(LDK(KP956940335), T8h, T8g);
Chris@10 717 T8d = VFNMS(LDK(KP956940335), T8c, T89);
Chris@10 718 T8f = VFMA(LDK(KP956940335), T8c, T89);
Chris@10 719 {
Chris@10 720 V T8l, T8n, T86, T8e;
Chris@10 721 T8l = VFMA(LDK(KP956940335), T8k, T8j);
Chris@10 722 T8n = VFNMS(LDK(KP956940335), T8k, T8j);
Chris@10 723 T86 = VFNMS(LDK(KP956940335), T85, T7Y);
Chris@10 724 T8e = VFMA(LDK(KP956940335), T85, T7Y);
Chris@10 725 ST(&(x[WS(rs, 53)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
Chris@10 726 ST(&(x[WS(rs, 11)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
Chris@10 727 ST(&(x[WS(rs, 43)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
Chris@10 728 ST(&(x[WS(rs, 21)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
Chris@10 729 ST(&(x[WS(rs, 59)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
Chris@10 730 ST(&(x[WS(rs, 5)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
Chris@10 731 ST(&(x[WS(rs, 27)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
Chris@10 732 ST(&(x[WS(rs, 37)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
Chris@10 733 ST(&(x[WS(rs, 51)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
Chris@10 734 ST(&(x[WS(rs, 13)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
Chris@10 735 ST(&(x[WS(rs, 45)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
Chris@10 736 ST(&(x[WS(rs, 19)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
Chris@10 737 ST(&(x[WS(rs, 3)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
Chris@10 738 ST(&(x[WS(rs, 61)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
Chris@10 739 ST(&(x[WS(rs, 35)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
Chris@10 740 ST(&(x[WS(rs, 29)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
Chris@10 741 T6r = VFMA(LDK(KP198912367), T5u, T5x);
Chris@10 742 T5y = VFNMS(LDK(KP198912367), T5x, T5u);
Chris@10 743 }
Chris@10 744 }
Chris@10 745 }
Chris@10 746 }
Chris@10 747 }
Chris@10 748 }
Chris@10 749 }
Chris@10 750 }
Chris@10 751 {
Chris@10 752 V T5N, T5U, T68, T5z, T6U, T6f;
Chris@10 753 T5N = VFMA(LDK(KP923879532), T5M, T5F);
Chris@10 754 T6L = VFNMS(LDK(KP923879532), T5M, T5F);
Chris@10 755 T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
Chris@10 756 T5U = VFMA(LDK(KP923879532), T5T, T5Q);
Chris@10 757 T68 = VFMA(LDK(KP923879532), T67, T60);
Chris@10 758 T6O = VFNMS(LDK(KP923879532), T67, T60);
Chris@10 759 T5z = VADD(T5p, T5y);
Chris@10 760 T6U = VSUB(T5y, T5p);
Chris@10 761 T6P = VFNMS(LDK(KP923879532), T6e, T6b);
Chris@10 762 T6f = VFMA(LDK(KP923879532), T6e, T6b);
Chris@10 763 {
Chris@10 764 V T5V, T6u, T6g, T6v, T6s, T6J;
Chris@10 765 T6s = VSUB(T6q, T6r);
Chris@10 766 T6J = VADD(T6q, T6r);
Chris@10 767 T5V = VFNMS(LDK(KP098491403), T5U, T5N);
Chris@10 768 T6u = VFMA(LDK(KP098491403), T5N, T5U);
Chris@10 769 T75 = VFNMS(LDK(KP980785280), T6U, T6T);
Chris@10 770 T6V = VFMA(LDK(KP980785280), T6U, T6T);
Chris@10 771 T5A = VFMA(LDK(KP980785280), T5z, T5g);
Chris@10 772 T6A = VFNMS(LDK(KP980785280), T5z, T5g);
Chris@10 773 T6g = VFNMS(LDK(KP098491403), T6f, T68);
Chris@10 774 T6v = VFMA(LDK(KP098491403), T68, T6f);
Chris@10 775 T72 = VFNMS(LDK(KP980785280), T6J, T6I);
Chris@10 776 T6K = VFMA(LDK(KP980785280), T6J, T6I);
Chris@10 777 T6t = VFMA(LDK(KP980785280), T6s, T6p);
Chris@10 778 T6D = VFNMS(LDK(KP980785280), T6s, T6p);
Chris@10 779 T6w = VSUB(T6u, T6v);
Chris@10 780 T6B = VADD(T6u, T6v);
Chris@10 781 T6h = VADD(T5V, T6g);
Chris@10 782 T6E = VSUB(T6g, T5V);
Chris@10 783 }
Chris@10 784 }
Chris@10 785 }
Chris@10 786 }
Chris@10 787 }
Chris@10 788 }
Chris@10 789 }
Chris@10 790 {
Chris@10 791 V T6W, T6N, T6G, T6C, T6z, T6x, T6H, T6F, T6y, T6i, T6X, T6Q;
Chris@10 792 T6W = VFNMS(LDK(KP820678790), T6L, T6M);
Chris@10 793 T6N = VFMA(LDK(KP820678790), T6M, T6L);
Chris@10 794 T6G = VFMA(LDK(KP995184726), T6B, T6A);
Chris@10 795 T6C = VFNMS(LDK(KP995184726), T6B, T6A);
Chris@10 796 T6z = VFMA(LDK(KP995184726), T6w, T6t);
Chris@10 797 T6x = VFNMS(LDK(KP995184726), T6w, T6t);
Chris@10 798 T6H = VFMA(LDK(KP995184726), T6E, T6D);
Chris@10 799 T6F = VFNMS(LDK(KP995184726), T6E, T6D);
Chris@10 800 T6y = VFMA(LDK(KP995184726), T6h, T5A);
Chris@10 801 T6i = VFNMS(LDK(KP995184726), T6h, T5A);
Chris@10 802 T6X = VFNMS(LDK(KP820678790), T6O, T6P);
Chris@10 803 T6Q = VFMA(LDK(KP820678790), T6P, T6O);
Chris@10 804 {
Chris@10 805 V T73, T6Y, T76, T6R;
Chris@10 806 ST(&(x[WS(rs, 49)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
Chris@10 807 ST(&(x[WS(rs, 15)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
Chris@10 808 ST(&(x[WS(rs, 47)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
Chris@10 809 ST(&(x[WS(rs, 17)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
Chris@10 810 ST(&(x[WS(rs, 63)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
Chris@10 811 ST(&(x[WS(rs, 1)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
Chris@10 812 ST(&(x[WS(rs, 31)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
Chris@10 813 ST(&(x[WS(rs, 33)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
Chris@10 814 T73 = VADD(T6W, T6X);
Chris@10 815 T6Y = VSUB(T6W, T6X);
Chris@10 816 T76 = VSUB(T6Q, T6N);
Chris@10 817 T6R = VADD(T6N, T6Q);
Chris@10 818 {
Chris@10 819 V T78, T74, T71, T6Z, T79, T77, T70, T6S;
Chris@10 820 T78 = VFNMS(LDK(KP773010453), T73, T72);
Chris@10 821 T74 = VFMA(LDK(KP773010453), T73, T72);
Chris@10 822 T71 = VFMA(LDK(KP773010453), T6Y, T6V);
Chris@10 823 T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
Chris@10 824 T79 = VFNMS(LDK(KP773010453), T76, T75);
Chris@10 825 T77 = VFMA(LDK(KP773010453), T76, T75);
Chris@10 826 T70 = VFMA(LDK(KP773010453), T6R, T6K);
Chris@10 827 T6S = VFNMS(LDK(KP773010453), T6R, T6K);
Chris@10 828 ST(&(x[WS(rs, 55)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
Chris@10 829 ST(&(x[WS(rs, 9)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
Chris@10 830 ST(&(x[WS(rs, 41)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
Chris@10 831 ST(&(x[WS(rs, 23)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
Chris@10 832 ST(&(x[WS(rs, 7)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
Chris@10 833 ST(&(x[WS(rs, 57)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
Chris@10 834 ST(&(x[WS(rs, 39)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
Chris@10 835 ST(&(x[WS(rs, 25)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
Chris@10 836 }
Chris@10 837 }
Chris@10 838 }
Chris@10 839 }
Chris@10 840 }
Chris@10 841 VLEAVE();
Chris@10 842 }
Chris@10 843
Chris@10 844 static const tw_instr twinstr[] = {
Chris@10 845 VTW(0, 1),
Chris@10 846 VTW(0, 2),
Chris@10 847 VTW(0, 3),
Chris@10 848 VTW(0, 4),
Chris@10 849 VTW(0, 5),
Chris@10 850 VTW(0, 6),
Chris@10 851 VTW(0, 7),
Chris@10 852 VTW(0, 8),
Chris@10 853 VTW(0, 9),
Chris@10 854 VTW(0, 10),
Chris@10 855 VTW(0, 11),
Chris@10 856 VTW(0, 12),
Chris@10 857 VTW(0, 13),
Chris@10 858 VTW(0, 14),
Chris@10 859 VTW(0, 15),
Chris@10 860 VTW(0, 16),
Chris@10 861 VTW(0, 17),
Chris@10 862 VTW(0, 18),
Chris@10 863 VTW(0, 19),
Chris@10 864 VTW(0, 20),
Chris@10 865 VTW(0, 21),
Chris@10 866 VTW(0, 22),
Chris@10 867 VTW(0, 23),
Chris@10 868 VTW(0, 24),
Chris@10 869 VTW(0, 25),
Chris@10 870 VTW(0, 26),
Chris@10 871 VTW(0, 27),
Chris@10 872 VTW(0, 28),
Chris@10 873 VTW(0, 29),
Chris@10 874 VTW(0, 30),
Chris@10 875 VTW(0, 31),
Chris@10 876 VTW(0, 32),
Chris@10 877 VTW(0, 33),
Chris@10 878 VTW(0, 34),
Chris@10 879 VTW(0, 35),
Chris@10 880 VTW(0, 36),
Chris@10 881 VTW(0, 37),
Chris@10 882 VTW(0, 38),
Chris@10 883 VTW(0, 39),
Chris@10 884 VTW(0, 40),
Chris@10 885 VTW(0, 41),
Chris@10 886 VTW(0, 42),
Chris@10 887 VTW(0, 43),
Chris@10 888 VTW(0, 44),
Chris@10 889 VTW(0, 45),
Chris@10 890 VTW(0, 46),
Chris@10 891 VTW(0, 47),
Chris@10 892 VTW(0, 48),
Chris@10 893 VTW(0, 49),
Chris@10 894 VTW(0, 50),
Chris@10 895 VTW(0, 51),
Chris@10 896 VTW(0, 52),
Chris@10 897 VTW(0, 53),
Chris@10 898 VTW(0, 54),
Chris@10 899 VTW(0, 55),
Chris@10 900 VTW(0, 56),
Chris@10 901 VTW(0, 57),
Chris@10 902 VTW(0, 58),
Chris@10 903 VTW(0, 59),
Chris@10 904 VTW(0, 60),
Chris@10 905 VTW(0, 61),
Chris@10 906 VTW(0, 62),
Chris@10 907 VTW(0, 63),
Chris@10 908 {TW_NEXT, VL, 0}
Chris@10 909 };
Chris@10 910
Chris@10 911 static const ct_desc desc = { 64, XSIMD_STRING("t1fv_64"), twinstr, &GENUS, {261, 126, 258, 0}, 0, 0, 0 };
Chris@10 912
Chris@10 913 void XSIMD(codelet_t1fv_64) (planner *p) {
Chris@10 914 X(kdft_dit_register) (p, t1fv_64, &desc);
Chris@10 915 }
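/*
 * For reference, a minimal sketch of how a size-64 transform that may use
 * this codelet is planned through the public fftw3 API (standard fftw3
 * entry points, not part of this file); whether the planner actually
 * selects t1fv_64 depends on the machine and the planning flags:
 *
 *   #include <fftw3.h>
 *
 *   fftw_complex in[64], out[64];
 *   fftw_plan p = fftw_plan_dft_1d(64, in, out, FFTW_FORWARD, FFTW_MEASURE);
 *   fftw_execute(p);
 *   fftw_destroy_plan(p);
 */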
Chris@10 916 #else /* HAVE_FMA */
Chris@10 917
Chris@10 918 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t1fv_64 -include t1f.h */
Chris@10 919
Chris@10 920 /*
Chris@10 921 * This function contains 519 FP additions, 250 FP multiplications,
Chris@10 922 * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
Chris@10 923 * 107 stack variables, 15 constants, and 128 memory accesses
Chris@10 924 */
Chris@10 925 #include "t1f.h"
Chris@10 926
Chris@10 927 static void t1fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 928 {
Chris@10 929 DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
Chris@10 930 DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
Chris@10 931 DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
Chris@10 932 DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
Chris@10 933 DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
Chris@10 934 DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
Chris@10 935 DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
Chris@10 936 DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
Chris@10 937 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 938 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 939 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 940 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 941 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 942 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 943 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
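/* In this non-FMA variant the constants are plain sine/cosine pairs
 * (e.g. KP382683432 = sin(pi/8), KP923879532 = cos(pi/8)) rather than
 * the tangent form used by the FMA code above. */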
Chris@10 944 {
Chris@10 945 INT m;
Chris@10 946 R *x;
Chris@10 947 x = ri;
Chris@10 948 for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 949 V Tg, T4a, T6r, T7f, T3o, T4B, T5q, T7e, T5R, T62, T28, T4o, T2g, T4l, T7n;
Chris@10 950 V T7Z, T68, T6j, T2C, T4s, T3a, T4v, T7u, T82, T7E, T7F, T7V, T5F, T6u, T1k;
Chris@10 951 V T4e, T1r, T4d, T7B, T7C, T7W, T5M, T6v, TV, T4g, T12, T4h, T7h, T7i, TD;
Chris@10 952 V T4C, T3h, T4b, T5x, T6s, T1R, T4m, T7q, T80, T2j, T4p, T5Y, T63, T2Z, T4w;
Chris@10 953 V T7x, T83, T33, T4t, T6f, T6k;
Chris@10 954 {
Chris@10 955 V T1, T3, T3m, T3k, Tb, Td, Te, T6, T8, T9, T2, T3l, T3j;
Chris@10 956 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 957 T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
Chris@10 958 T3 = BYTWJ(&(W[TWVL * 62]), T2);
Chris@10 959 T3l = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
Chris@10 960 T3m = BYTWJ(&(W[TWVL * 94]), T3l);
Chris@10 961 T3j = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@10 962 T3k = BYTWJ(&(W[TWVL * 30]), T3j);
Chris@10 963 {
Chris@10 964 V Ta, Tc, T5, T7;
Chris@10 965 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
Chris@10 966 Tb = BYTWJ(&(W[TWVL * 110]), Ta);
Chris@10 967 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@10 968 Td = BYTWJ(&(W[TWVL * 46]), Tc);
Chris@10 969 Te = VSUB(Tb, Td);
Chris@10 970 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@10 971 T6 = BYTWJ(&(W[TWVL * 14]), T5);
Chris@10 972 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
Chris@10 973 T8 = BYTWJ(&(W[TWVL * 78]), T7);
Chris@10 974 T9 = VSUB(T6, T8);
Chris@10 975 }
Chris@10 976 {
Chris@10 977 V T4, Tf, T6p, T6q;
Chris@10 978 T4 = VSUB(T1, T3);
Chris@10 979 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
Chris@10 980 Tg = VADD(T4, Tf);
Chris@10 981 T4a = VSUB(T4, Tf);
Chris@10 982 T6p = VADD(Tb, Td);
Chris@10 983 T6q = VADD(T6, T8);
Chris@10 984 T6r = VSUB(T6p, T6q);
Chris@10 985 T7f = VADD(T6q, T6p);
Chris@10 986 }
Chris@10 987 {
Chris@10 988 V T3i, T3n, T5o, T5p;
Chris@10 989 T3i = VMUL(LDK(KP707106781), VSUB(Te, T9));
Chris@10 990 T3n = VSUB(T3k, T3m);
Chris@10 991 T3o = VSUB(T3i, T3n);
Chris@10 992 T4B = VADD(T3n, T3i);
Chris@10 993 T5o = VADD(T1, T3);
Chris@10 994 T5p = VADD(T3k, T3m);
Chris@10 995 T5q = VSUB(T5o, T5p);
Chris@10 996 T7e = VADD(T5o, T5p);
Chris@10 997 }
Chris@10 998 }
Chris@10 999 {
Chris@10 1000 V T24, T26, T5Q, T2b, T2d, T5P, T1W, T60, T21, T61, T22, T27;
Chris@10 1001 {
Chris@10 1002 V T23, T25, T2a, T2c;
Chris@10 1003 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@10 1004 T24 = BYTWJ(&(W[TWVL * 32]), T23);
Chris@10 1005 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
Chris@10 1006 T26 = BYTWJ(&(W[TWVL * 96]), T25);
Chris@10 1007 T5Q = VADD(T24, T26);
Chris@10 1008 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 1009 T2b = BYTWJ(&(W[0]), T2a);
Chris@10 1010 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
Chris@10 1011 T2d = BYTWJ(&(W[TWVL * 64]), T2c);
Chris@10 1012 T5P = VADD(T2b, T2d);
Chris@10 1013 }
Chris@10 1014 {
Chris@10 1015 V T1T, T1V, T1S, T1U;
Chris@10 1016 T1S = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
Chris@10 1017 T1T = BYTWJ(&(W[TWVL * 112]), T1S);
Chris@10 1018 T1U = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
Chris@10 1019 T1V = BYTWJ(&(W[TWVL * 48]), T1U);
Chris@10 1020 T1W = VSUB(T1T, T1V);
Chris@10 1021 T60 = VADD(T1T, T1V);
Chris@10 1022 }
Chris@10 1023 {
Chris@10 1024 V T1Y, T20, T1X, T1Z;
Chris@10 1025 T1X = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@10 1026 T1Y = BYTWJ(&(W[TWVL * 16]), T1X);
Chris@10 1027 T1Z = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
Chris@10 1028 T20 = BYTWJ(&(W[TWVL * 80]), T1Z);
Chris@10 1029 T21 = VSUB(T1Y, T20);
Chris@10 1030 T61 = VADD(T1Y, T20);
Chris@10 1031 }
Chris@10 1032 T5R = VSUB(T5P, T5Q);
Chris@10 1033 T62 = VSUB(T60, T61);
Chris@10 1034 T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
Chris@10 1035 T27 = VSUB(T24, T26);
Chris@10 1036 T28 = VSUB(T22, T27);
Chris@10 1037 T4o = VADD(T27, T22);
Chris@10 1038 {
Chris@10 1039 V T2e, T2f, T7l, T7m;
Chris@10 1040 T2e = VSUB(T2b, T2d);
Chris@10 1041 T2f = VMUL(LDK(KP707106781), VADD(T21, T1W));
Chris@10 1042 T2g = VADD(T2e, T2f);
Chris@10 1043 T4l = VSUB(T2e, T2f);
Chris@10 1044 T7l = VADD(T5P, T5Q);
Chris@10 1045 T7m = VADD(T61, T60);
Chris@10 1046 T7n = VADD(T7l, T7m);
Chris@10 1047 T7Z = VSUB(T7l, T7m);
Chris@10 1048 }
Chris@10 1049 }
Chris@10 1050 {
Chris@10 1051 V T2n, T2p, T66, T36, T38, T67, T2v, T6i, T2A, T6h, T2q, T2B;
Chris@10 1052 {
Chris@10 1053 V T2m, T2o, T35, T37;
Chris@10 1054 T2m = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
Chris@10 1055 T2n = BYTWJ(&(W[TWVL * 124]), T2m);
Chris@10 1056 T2o = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
Chris@10 1057 T2p = BYTWJ(&(W[TWVL * 60]), T2o);
Chris@10 1058 T66 = VADD(T2n, T2p);
Chris@10 1059 T35 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@10 1060 T36 = BYTWJ(&(W[TWVL * 28]), T35);
Chris@10 1061 T37 = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
Chris@10 1062 T38 = BYTWJ(&(W[TWVL * 92]), T37);
Chris@10 1063 T67 = VADD(T36, T38);
Chris@10 1064 }
Chris@10 1065 {
Chris@10 1066 V T2s, T2u, T2r, T2t;
Chris@10 1067 T2r = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 1068 T2s = BYTWJ(&(W[TWVL * 12]), T2r);
Chris@10 1069 T2t = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
Chris@10 1070 T2u = BYTWJ(&(W[TWVL * 76]), T2t);
Chris@10 1071 T2v = VSUB(T2s, T2u);
Chris@10 1072 T6i = VADD(T2s, T2u);
Chris@10 1073 }
Chris@10 1074 {
Chris@10 1075 V T2x, T2z, T2w, T2y;
Chris@10 1076 T2w = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
Chris@10 1077 T2x = BYTWJ(&(W[TWVL * 108]), T2w);
Chris@10 1078 T2y = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@10 1079 T2z = BYTWJ(&(W[TWVL * 44]), T2y);
Chris@10 1080 T2A = VSUB(T2x, T2z);
Chris@10 1081 T6h = VADD(T2x, T2z);
Chris@10 1082 }
Chris@10 1083 T68 = VSUB(T66, T67);
Chris@10 1084 T6j = VSUB(T6h, T6i);
Chris@10 1085 T2q = VSUB(T2n, T2p);
Chris@10 1086 T2B = VMUL(LDK(KP707106781), VADD(T2v, T2A));
Chris@10 1087 T2C = VADD(T2q, T2B);
Chris@10 1088 T4s = VSUB(T2q, T2B);
Chris@10 1089 {
Chris@10 1090 V T34, T39, T7s, T7t;
Chris@10 1091 T34 = VMUL(LDK(KP707106781), VSUB(T2A, T2v));
Chris@10 1092 T39 = VSUB(T36, T38);
Chris@10 1093 T3a = VSUB(T34, T39);
Chris@10 1094 T4v = VADD(T39, T34);
Chris@10 1095 T7s = VADD(T66, T67);
Chris@10 1096 T7t = VADD(T6i, T6h);
Chris@10 1097 T7u = VADD(T7s, T7t);
Chris@10 1098 T82 = VSUB(T7s, T7t);
Chris@10 1099 }
Chris@10 1100 }
Chris@10 1101 {
Chris@10 1102 V T1g, T1i, T5A, T1m, T1o, T5z, T18, T5C, T1d, T5D, T5B, T5E;
Chris@10 1103 {
Chris@10 1104 V T1f, T1h, T1l, T1n;
Chris@10 1105 T1f = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@10 1106 T1g = BYTWJ(&(W[TWVL * 34]), T1f);
Chris@10 1107 T1h = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
Chris@10 1108 T1i = BYTWJ(&(W[TWVL * 98]), T1h);
Chris@10 1109 T5A = VADD(T1g, T1i);
Chris@10 1110 T1l = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 1111 T1m = BYTWJ(&(W[TWVL * 2]), T1l);
Chris@10 1112 T1n = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
Chris@10 1113 T1o = BYTWJ(&(W[TWVL * 66]), T1n);
Chris@10 1114 T5z = VADD(T1m, T1o);
Chris@10 1115 }
Chris@10 1116 {
Chris@10 1117 V T15, T17, T14, T16;
Chris@10 1118 T14 = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
Chris@10 1119 T15 = BYTWJ(&(W[TWVL * 114]), T14);
Chris@10 1120 T16 = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
Chris@10 1121 T17 = BYTWJ(&(W[TWVL * 50]), T16);
Chris@10 1122 T18 = VSUB(T15, T17);
Chris@10 1123 T5C = VADD(T15, T17);
Chris@10 1124 }
Chris@10 1125 {
Chris@10 1126 V T1a, T1c, T19, T1b;
Chris@10 1127 T19 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@10 1128 T1a = BYTWJ(&(W[TWVL * 18]), T19);
Chris@10 1129 T1b = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
Chris@10 1130 T1c = BYTWJ(&(W[TWVL * 82]), T1b);
Chris@10 1131 T1d = VSUB(T1a, T1c);
Chris@10 1132 T5D = VADD(T1a, T1c);
Chris@10 1133 }
Chris@10 1134 T7E = VADD(T5z, T5A);
Chris@10 1135 T7F = VADD(T5D, T5C);
Chris@10 1136 T7V = VSUB(T7E, T7F);
Chris@10 1137 T5B = VSUB(T5z, T5A);
Chris@10 1138 T5E = VSUB(T5C, T5D);
Chris@10 1139 T5F = VFMA(LDK(KP923879532), T5B, VMUL(LDK(KP382683432), T5E));
Chris@10 1140 T6u = VFNMS(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
Chris@10 1141 {
Chris@10 1142 V T1e, T1j, T1p, T1q;
Chris@10 1143 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
Chris@10 1144 T1j = VSUB(T1g, T1i);
Chris@10 1145 T1k = VSUB(T1e, T1j);
Chris@10 1146 T4e = VADD(T1j, T1e);
Chris@10 1147 T1p = VSUB(T1m, T1o);
Chris@10 1148 T1q = VMUL(LDK(KP707106781), VADD(T1d, T18));
Chris@10 1149 T1r = VADD(T1p, T1q);
Chris@10 1150 T4d = VSUB(T1p, T1q);
Chris@10 1151 }
Chris@10 1152 }
Chris@10 1153 {
Chris@10 1154 V TG, TI, T5G, TY, T10, T5H, TO, T5K, TT, T5J, T5I, T5L;
Chris@10 1155 {
Chris@10 1156 V TF, TH, TX, TZ;
Chris@10 1157 TF = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
Chris@10 1158 TG = BYTWJ(&(W[TWVL * 122]), TF);
Chris@10 1159 TH = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
Chris@10 1160 TI = BYTWJ(&(W[TWVL * 58]), TH);
Chris@10 1161 T5G = VADD(TG, TI);
Chris@10 1162 TX = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@10 1163 TY = BYTWJ(&(W[TWVL * 26]), TX);
Chris@10 1164 TZ = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
Chris@10 1165 T10 = BYTWJ(&(W[TWVL * 90]), TZ);
Chris@10 1166 T5H = VADD(TY, T10);
Chris@10 1167 }
Chris@10 1168 {
Chris@10 1169 V TL, TN, TK, TM;
Chris@10 1170 TK = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 1171 TL = BYTWJ(&(W[TWVL * 10]), TK);
Chris@10 1172 TM = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
Chris@10 1173 TN = BYTWJ(&(W[TWVL * 74]), TM);
Chris@10 1174 TO = VSUB(TL, TN);
Chris@10 1175 T5K = VADD(TL, TN);
Chris@10 1176 }
Chris@10 1177 {
Chris@10 1178 V TQ, TS, TP, TR;
Chris@10 1179 TP = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
Chris@10 1180 TQ = BYTWJ(&(W[TWVL * 106]), TP);
Chris@10 1181 TR = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@10 1182 TS = BYTWJ(&(W[TWVL * 42]), TR);
Chris@10 1183 TT = VSUB(TQ, TS);
Chris@10 1184 T5J = VADD(TQ, TS);
Chris@10 1185 }
Chris@10 1186 T7B = VADD(T5G, T5H);
Chris@10 1187 T7C = VADD(T5K, T5J);
Chris@10 1188 T7W = VSUB(T7B, T7C);
Chris@10 1189 T5I = VSUB(T5G, T5H);
Chris@10 1190 T5L = VSUB(T5J, T5K);
Chris@10 1191 T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
Chris@10 1192 T6v = VFMA(LDK(KP382683432), T5I, VMUL(LDK(KP923879532), T5L));
Chris@10 1193 {
Chris@10 1194 V TJ, TU, TW, T11;
Chris@10 1195 TJ = VSUB(TG, TI);
Chris@10 1196 TU = VMUL(LDK(KP707106781), VADD(TO, TT));
Chris@10 1197 TV = VADD(TJ, TU);
Chris@10 1198 T4g = VSUB(TJ, TU);
Chris@10 1199 TW = VMUL(LDK(KP707106781), VSUB(TT, TO));
Chris@10 1200 T11 = VSUB(TY, T10);
Chris@10 1201 T12 = VSUB(TW, T11);
Chris@10 1202 T4h = VADD(T11, TW);
Chris@10 1203 }
Chris@10 1204 }
Chris@10 1205 {
Chris@10 1206 V Tl, T5r, TB, T5v, Tq, T5s, Tw, T5u, Tr, TC;
Chris@10 1207 {
Chris@10 1208 V Ti, Tk, Th, Tj;
Chris@10 1209 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 1210 Ti = BYTWJ(&(W[TWVL * 6]), Th);
Chris@10 1211 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
Chris@10 1212 Tk = BYTWJ(&(W[TWVL * 70]), Tj);
Chris@10 1213 Tl = VSUB(Ti, Tk);
Chris@10 1214 T5r = VADD(Ti, Tk);
Chris@10 1215 }
Chris@10 1216 {
Chris@10 1217 V Ty, TA, Tx, Tz;
Chris@10 1218 Tx = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@10 1219 Ty = BYTWJ(&(W[TWVL * 22]), Tx);
Chris@10 1220 Tz = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
Chris@10 1221 TA = BYTWJ(&(W[TWVL * 86]), Tz);
Chris@10 1222 TB = VSUB(Ty, TA);
Chris@10 1223 T5v = VADD(Ty, TA);
Chris@10 1224 }
Chris@10 1225 {
Chris@10 1226 V Tn, Tp, Tm, To;
Chris@10 1227 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@10 1228 Tn = BYTWJ(&(W[TWVL * 38]), Tm);
Chris@10 1229 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
Chris@10 1230 Tp = BYTWJ(&(W[TWVL * 102]), To);
Chris@10 1231 Tq = VSUB(Tn, Tp);
Chris@10 1232 T5s = VADD(Tn, Tp);
Chris@10 1233 }
Chris@10 1234 {
Chris@10 1235 V Tt, Tv, Ts, Tu;
Chris@10 1236 Ts = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
Chris@10 1237 Tt = BYTWJ(&(W[TWVL * 118]), Ts);
Chris@10 1238 Tu = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
Chris@10 1239 Tv = BYTWJ(&(W[TWVL * 54]), Tu);
Chris@10 1240 Tw = VSUB(Tt, Tv);
Chris@10 1241 T5u = VADD(Tt, Tv);
Chris@10 1242 }
Chris@10 1243 T7h = VADD(T5r, T5s);
Chris@10 1244 T7i = VADD(T5u, T5v);
Chris@10 1245 Tr = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
Chris@10 1246 TC = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), TB));
Chris@10 1247 TD = VADD(Tr, TC);
Chris@10 1248 T4C = VSUB(TC, Tr);
Chris@10 1249 {
Chris@10 1250 V T3f, T3g, T5t, T5w;
Chris@10 1251 T3f = VFNMS(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
Chris@10 1252 T3g = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
Chris@10 1253 T3h = VSUB(T3f, T3g);
Chris@10 1254 T4b = VADD(T3g, T3f);
Chris@10 1255 T5t = VSUB(T5r, T5s);
Chris@10 1256 T5w = VSUB(T5u, T5v);
Chris@10 1257 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
Chris@10 1258 T6s = VMUL(LDK(KP707106781), VSUB(T5w, T5t));
Chris@10 1259 }
Chris@10 1260 }
Chris@10 1261 {
Chris@10 1262 V T1z, T5V, T1P, T5T, T1E, T5W, T1K, T5S;
Chris@10 1263 {
Chris@10 1264 V T1w, T1y, T1v, T1x;
Chris@10 1265 T1v = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
Chris@10 1266 T1w = BYTWJ(&(W[TWVL * 120]), T1v);
Chris@10 1267 T1x = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
Chris@10 1268 T1y = BYTWJ(&(W[TWVL * 56]), T1x);
Chris@10 1269 T1z = VSUB(T1w, T1y);
Chris@10 1270 T5V = VADD(T1w, T1y);
Chris@10 1271 }
Chris@10 1272 {
Chris@10 1273 V T1M, T1O, T1L, T1N;
Chris@10 1274 T1L = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@10 1275 T1M = BYTWJ(&(W[TWVL * 40]), T1L);
Chris@10 1276 T1N = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
Chris@10 1277 T1O = BYTWJ(&(W[TWVL * 104]), T1N);
Chris@10 1278 T1P = VSUB(T1M, T1O);
Chris@10 1279 T5T = VADD(T1M, T1O);
Chris@10 1280 }
Chris@10 1281 {
Chris@10 1282 V T1B, T1D, T1A, T1C;
Chris@10 1283 T1A = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@10 1284 T1B = BYTWJ(&(W[TWVL * 24]), T1A);
Chris@10 1285 T1C = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
Chris@10 1286 T1D = BYTWJ(&(W[TWVL * 88]), T1C);
Chris@10 1287 T1E = VSUB(T1B, T1D);
Chris@10 1288 T5W = VADD(T1B, T1D);
Chris@10 1289 }
Chris@10 1290 {
Chris@10 1291 V T1H, T1J, T1G, T1I;
Chris@10 1292 T1G = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 1293 T1H = BYTWJ(&(W[TWVL * 8]), T1G);
Chris@10 1294 T1I = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
Chris@10 1295 T1J = BYTWJ(&(W[TWVL * 72]), T1I);
Chris@10 1296 T1K = VSUB(T1H, T1J);
Chris@10 1297 T5S = VADD(T1H, T1J);
Chris@10 1298 }
Chris@10 1299 {
Chris@10 1300 V T1F, T1Q, T7o, T7p;
Chris@10 1301 T1F = VFNMS(LDK(KP923879532), T1E, VMUL(LDK(KP382683432), T1z));
Chris@10 1302 T1Q = VFMA(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
Chris@10 1303 T1R = VSUB(T1F, T1Q);
Chris@10 1304 T4m = VADD(T1Q, T1F);
Chris@10 1305 T7o = VADD(T5S, T5T);
Chris@10 1306 T7p = VADD(T5V, T5W);
Chris@10 1307 T7q = VADD(T7o, T7p);
Chris@10 1308 T80 = VSUB(T7p, T7o);
Chris@10 1309 }
Chris@10 1310 {
Chris@10 1311 V T2h, T2i, T5U, T5X;
Chris@10 1312 T2h = VFNMS(LDK(KP382683432), T1P, VMUL(LDK(KP923879532), T1K));
Chris@10 1313 T2i = VFMA(LDK(KP923879532), T1z, VMUL(LDK(KP382683432), T1E));
Chris@10 1314 T2j = VADD(T2h, T2i);
Chris@10 1315 T4p = VSUB(T2i, T2h);
Chris@10 1316 T5U = VSUB(T5S, T5T);
Chris@10 1317 T5X = VSUB(T5V, T5W);
Chris@10 1318 T5Y = VMUL(LDK(KP707106781), VADD(T5U, T5X));
Chris@10 1319 T63 = VMUL(LDK(KP707106781), VSUB(T5X, T5U));
Chris@10 1320 }
Chris@10 1321 }
Chris@10 1322 {
Chris@10 1323 V T2H, T69, T2X, T6d, T2M, T6a, T2S, T6c;
Chris@10 1324 {
Chris@10 1325 V T2E, T2G, T2D, T2F;
Chris@10 1326 T2D = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 1327 T2E = BYTWJ(&(W[TWVL * 4]), T2D);
Chris@10 1328 T2F = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
Chris@10 1329 T2G = BYTWJ(&(W[TWVL * 68]), T2F);
Chris@10 1330 T2H = VSUB(T2E, T2G);
Chris@10 1331 T69 = VADD(T2E, T2G);
Chris@10 1332 }
Chris@10 1333 {
Chris@10 1334 V T2U, T2W, T2T, T2V;
Chris@10 1335 T2T = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@10 1336 T2U = BYTWJ(&(W[TWVL * 20]), T2T);
Chris@10 1337 T2V = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
Chris@10 1338 T2W = BYTWJ(&(W[TWVL * 84]), T2V);
Chris@10 1339 T2X = VSUB(T2U, T2W);
Chris@10 1340 T6d = VADD(T2U, T2W);
Chris@10 1341 }
Chris@10 1342 {
Chris@10 1343 V T2J, T2L, T2I, T2K;
Chris@10 1344 T2I = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@10 1345 T2J = BYTWJ(&(W[TWVL * 36]), T2I);
Chris@10 1346 T2K = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
Chris@10 1347 T2L = BYTWJ(&(W[TWVL * 100]), T2K);
Chris@10 1348 T2M = VSUB(T2J, T2L);
Chris@10 1349 T6a = VADD(T2J, T2L);
Chris@10 1350 }
Chris@10 1351 {
Chris@10 1352 V T2P, T2R, T2O, T2Q;
Chris@10 1353 T2O = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
Chris@10 1354 T2P = BYTWJ(&(W[TWVL * 116]), T2O);
Chris@10 1355 T2Q = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
Chris@10 1356 T2R = BYTWJ(&(W[TWVL * 52]), T2Q);
Chris@10 1357 T2S = VSUB(T2P, T2R);
Chris@10 1358 T6c = VADD(T2P, T2R);
Chris@10 1359 }
Chris@10 1360 {
Chris@10 1361 V T2N, T2Y, T7v, T7w;
Chris@10 1362 T2N = VFNMS(LDK(KP382683432), T2M, VMUL(LDK(KP923879532), T2H));
Chris@10 1363 T2Y = VFMA(LDK(KP923879532), T2S, VMUL(LDK(KP382683432), T2X));
Chris@10 1364 T2Z = VADD(T2N, T2Y);
Chris@10 1365 T4w = VSUB(T2Y, T2N);
Chris@10 1366 T7v = VADD(T69, T6a);
Chris@10 1367 T7w = VADD(T6c, T6d);
Chris@10 1368 T7x = VADD(T7v, T7w);
Chris@10 1369 T83 = VSUB(T7w, T7v);
Chris@10 1370 }
Chris@10 1371 {
Chris@10 1372 V T31, T32, T6b, T6e;
Chris@10 1373 T31 = VFNMS(LDK(KP923879532), T2X, VMUL(LDK(KP382683432), T2S));
Chris@10 1374 T32 = VFMA(LDK(KP382683432), T2H, VMUL(LDK(KP923879532), T2M));
Chris@10 1375 T33 = VSUB(T31, T32);
Chris@10 1376 T4t = VADD(T32, T31);
Chris@10 1377 T6b = VSUB(T69, T6a);
Chris@10 1378 T6e = VSUB(T6c, T6d);
Chris@10 1379 T6f = VMUL(LDK(KP707106781), VADD(T6b, T6e));
Chris@10 1380 T6k = VMUL(LDK(KP707106781), VSUB(T6e, T6b));
Chris@10 1381 }
Chris@10 1382 }
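	 /* First store block: writes the eight outputs whose index is a
	    multiple of 8 (x[0], x[8], ..., x[56]); VBYI supplies the
	    multiplication by i for the imaginary cross terms. */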
Chris@10 1383 {
Chris@10 1384 V T7k, T7M, T7R, T7T, T7z, T7I, T7H, T7N, T7O, T7S;
Chris@10 1385 {
Chris@10 1386 V T7g, T7j, T7P, T7Q;
Chris@10 1387 T7g = VADD(T7e, T7f);
Chris@10 1388 T7j = VADD(T7h, T7i);
Chris@10 1389 T7k = VSUB(T7g, T7j);
Chris@10 1390 T7M = VADD(T7g, T7j);
Chris@10 1391 T7P = VADD(T7n, T7q);
Chris@10 1392 T7Q = VADD(T7u, T7x);
Chris@10 1393 T7R = VADD(T7P, T7Q);
Chris@10 1394 T7T = VBYI(VSUB(T7Q, T7P));
Chris@10 1395 }
Chris@10 1396 {
Chris@10 1397 V T7r, T7y, T7D, T7G;
Chris@10 1398 T7r = VSUB(T7n, T7q);
Chris@10 1399 T7y = VSUB(T7u, T7x);
Chris@10 1400 T7z = VMUL(LDK(KP707106781), VADD(T7r, T7y));
Chris@10 1401 T7I = VMUL(LDK(KP707106781), VSUB(T7y, T7r));
Chris@10 1402 T7D = VADD(T7B, T7C);
Chris@10 1403 T7G = VADD(T7E, T7F);
Chris@10 1404 T7H = VSUB(T7D, T7G);
Chris@10 1405 T7N = VADD(T7G, T7D);
Chris@10 1406 }
Chris@10 1407 T7O = VADD(T7M, T7N);
Chris@10 1408 ST(&(x[WS(rs, 32)]), VSUB(T7O, T7R), ms, &(x[0]));
Chris@10 1409 ST(&(x[0]), VADD(T7O, T7R), ms, &(x[0]));
Chris@10 1410 T7S = VSUB(T7M, T7N);
Chris@10 1411 ST(&(x[WS(rs, 48)]), VSUB(T7S, T7T), ms, &(x[0]));
Chris@10 1412 ST(&(x[WS(rs, 16)]), VADD(T7S, T7T), ms, &(x[0]));
Chris@10 1413 {
Chris@10 1414 V T7A, T7J, T7K, T7L;
Chris@10 1415 T7A = VADD(T7k, T7z);
Chris@10 1416 T7J = VBYI(VADD(T7H, T7I));
Chris@10 1417 ST(&(x[WS(rs, 56)]), VSUB(T7A, T7J), ms, &(x[0]));
Chris@10 1418 ST(&(x[WS(rs, 8)]), VADD(T7A, T7J), ms, &(x[0]));
Chris@10 1419 T7K = VSUB(T7k, T7z);
Chris@10 1420 T7L = VBYI(VSUB(T7I, T7H));
Chris@10 1421 ST(&(x[WS(rs, 40)]), VSUB(T7K, T7L), ms, &(x[0]));
Chris@10 1422 ST(&(x[WS(rs, 24)]), VADD(T7K, T7L), ms, &(x[0]));
Chris@10 1423 }
Chris@10 1424 }
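	 /* Store block for outputs with index congruent to 4 mod 8:
	    x[4], x[12], x[20], x[28], x[36], x[44], x[52], x[60]. */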
Chris@10 1425 {
Chris@10 1426 V T7Y, T8j, T8c, T8k, T85, T8g, T89, T8h;
Chris@10 1427 {
Chris@10 1428 V T7U, T7X, T8a, T8b;
Chris@10 1429 T7U = VSUB(T7e, T7f);
Chris@10 1430 T7X = VMUL(LDK(KP707106781), VADD(T7V, T7W));
Chris@10 1431 T7Y = VADD(T7U, T7X);
Chris@10 1432 T8j = VSUB(T7U, T7X);
Chris@10 1433 T8a = VFNMS(LDK(KP382683432), T7Z, VMUL(LDK(KP923879532), T80));
Chris@10 1434 T8b = VFMA(LDK(KP382683432), T82, VMUL(LDK(KP923879532), T83));
Chris@10 1435 T8c = VADD(T8a, T8b);
Chris@10 1436 T8k = VSUB(T8b, T8a);
Chris@10 1437 }
Chris@10 1438 {
Chris@10 1439 V T81, T84, T87, T88;
Chris@10 1440 T81 = VFMA(LDK(KP923879532), T7Z, VMUL(LDK(KP382683432), T80));
Chris@10 1441 T84 = VFNMS(LDK(KP382683432), T83, VMUL(LDK(KP923879532), T82));
Chris@10 1442 T85 = VADD(T81, T84);
Chris@10 1443 T8g = VSUB(T84, T81);
Chris@10 1444 T87 = VSUB(T7i, T7h);
Chris@10 1445 T88 = VMUL(LDK(KP707106781), VSUB(T7W, T7V));
Chris@10 1446 T89 = VADD(T87, T88);
Chris@10 1447 T8h = VSUB(T88, T87);
Chris@10 1448 }
Chris@10 1449 {
Chris@10 1450 V T86, T8d, T8m, T8n;
Chris@10 1451 T86 = VADD(T7Y, T85);
Chris@10 1452 T8d = VBYI(VADD(T89, T8c));
Chris@10 1453 ST(&(x[WS(rs, 60)]), VSUB(T86, T8d), ms, &(x[0]));
Chris@10 1454 ST(&(x[WS(rs, 4)]), VADD(T86, T8d), ms, &(x[0]));
Chris@10 1455 T8m = VBYI(VADD(T8h, T8g));
Chris@10 1456 T8n = VADD(T8j, T8k);
Chris@10 1457 ST(&(x[WS(rs, 12)]), VADD(T8m, T8n), ms, &(x[0]));
Chris@10 1458 ST(&(x[WS(rs, 52)]), VSUB(T8n, T8m), ms, &(x[0]));
Chris@10 1459 }
Chris@10 1460 {
Chris@10 1461 V T8e, T8f, T8i, T8l;
Chris@10 1462 T8e = VSUB(T7Y, T85);
Chris@10 1463 T8f = VBYI(VSUB(T8c, T89));
Chris@10 1464 ST(&(x[WS(rs, 36)]), VSUB(T8e, T8f), ms, &(x[0]));
Chris@10 1465 ST(&(x[WS(rs, 28)]), VADD(T8e, T8f), ms, &(x[0]));
Chris@10 1466 T8i = VBYI(VSUB(T8g, T8h));
Chris@10 1467 T8l = VSUB(T8j, T8k);
Chris@10 1468 ST(&(x[WS(rs, 20)]), VADD(T8i, T8l), ms, &(x[0]));
Chris@10 1469 ST(&(x[WS(rs, 44)]), VSUB(T8l, T8i), ms, &(x[0]));
Chris@10 1470 }
Chris@10 1471 }
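	 /* Store block for outputs with index congruent to +/-2 mod 16:
	    x[2], x[14], x[18], x[30], x[34], x[46], x[50], x[62]. */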
Chris@10 1472 {
Chris@10 1473 V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
Chris@10 1474 {
Chris@10 1475 V T5y, T5N, T6t, T6w;
Chris@10 1476 T5y = VADD(T5q, T5x);
Chris@10 1477 T5N = VADD(T5F, T5M);
Chris@10 1478 T5O = VADD(T5y, T5N);
Chris@10 1479 T6H = VSUB(T5y, T5N);
Chris@10 1480 T6t = VADD(T6r, T6s);
Chris@10 1481 T6w = VADD(T6u, T6v);
Chris@10 1482 T6x = VADD(T6t, T6w);
Chris@10 1483 T6F = VSUB(T6w, T6t);
Chris@10 1484 {
Chris@10 1485 V T65, T6y, T6m, T6z;
Chris@10 1486 {
Chris@10 1487 V T5Z, T64, T6g, T6l;
Chris@10 1488 T5Z = VADD(T5R, T5Y);
Chris@10 1489 T64 = VADD(T62, T63);
Chris@10 1490 T65 = VFMA(LDK(KP980785280), T5Z, VMUL(LDK(KP195090322), T64));
Chris@10 1491 T6y = VFNMS(LDK(KP195090322), T5Z, VMUL(LDK(KP980785280), T64));
Chris@10 1492 T6g = VADD(T68, T6f);
Chris@10 1493 T6l = VADD(T6j, T6k);
Chris@10 1494 T6m = VFNMS(LDK(KP195090322), T6l, VMUL(LDK(KP980785280), T6g));
Chris@10 1495 T6z = VFMA(LDK(KP195090322), T6g, VMUL(LDK(KP980785280), T6l));
Chris@10 1496 }
Chris@10 1497 T6n = VADD(T65, T6m);
Chris@10 1498 T6I = VSUB(T6z, T6y);
Chris@10 1499 T6A = VADD(T6y, T6z);
Chris@10 1500 T6E = VSUB(T6m, T65);
Chris@10 1501 }
Chris@10 1502 }
Chris@10 1503 {
Chris@10 1504 V T6o, T6B, T6K, T6L;
Chris@10 1505 T6o = VADD(T5O, T6n);
Chris@10 1506 T6B = VBYI(VADD(T6x, T6A));
Chris@10 1507 ST(&(x[WS(rs, 62)]), VSUB(T6o, T6B), ms, &(x[0]));
Chris@10 1508 ST(&(x[WS(rs, 2)]), VADD(T6o, T6B), ms, &(x[0]));
Chris@10 1509 T6K = VBYI(VADD(T6F, T6E));
Chris@10 1510 T6L = VADD(T6H, T6I);
Chris@10 1511 ST(&(x[WS(rs, 14)]), VADD(T6K, T6L), ms, &(x[0]));
Chris@10 1512 ST(&(x[WS(rs, 50)]), VSUB(T6L, T6K), ms, &(x[0]));
Chris@10 1513 }
Chris@10 1514 {
Chris@10 1515 V T6C, T6D, T6G, T6J;
Chris@10 1516 T6C = VSUB(T5O, T6n);
Chris@10 1517 T6D = VBYI(VSUB(T6A, T6x));
Chris@10 1518 ST(&(x[WS(rs, 34)]), VSUB(T6C, T6D), ms, &(x[0]));
Chris@10 1519 ST(&(x[WS(rs, 30)]), VADD(T6C, T6D), ms, &(x[0]));
Chris@10 1520 T6G = VBYI(VSUB(T6E, T6F));
Chris@10 1521 T6J = VSUB(T6H, T6I);
Chris@10 1522 ST(&(x[WS(rs, 18)]), VADD(T6G, T6J), ms, &(x[0]));
Chris@10 1523 ST(&(x[WS(rs, 46)]), VSUB(T6J, T6G), ms, &(x[0]));
Chris@10 1524 }
Chris@10 1525 }
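	 /* Store block for outputs with index congruent to +/-6 mod 16:
	    x[6], x[10], x[22], x[26], x[38], x[42], x[54], x[58]. */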
Chris@10 1526 {
Chris@10 1527 V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
Chris@10 1528 {
Chris@10 1529 V T6M, T6N, T6X, T6Y;
Chris@10 1530 T6M = VSUB(T5q, T5x);
Chris@10 1531 T6N = VSUB(T6v, T6u);
Chris@10 1532 T6O = VADD(T6M, T6N);
Chris@10 1533 T79 = VSUB(T6M, T6N);
Chris@10 1534 T6X = VSUB(T6s, T6r);
Chris@10 1535 T6Y = VSUB(T5M, T5F);
Chris@10 1536 T6Z = VADD(T6X, T6Y);
Chris@10 1537 T77 = VSUB(T6Y, T6X);
Chris@10 1538 {
Chris@10 1539 V T6R, T70, T6U, T71;
Chris@10 1540 {
Chris@10 1541 V T6P, T6Q, T6S, T6T;
Chris@10 1542 T6P = VSUB(T5R, T5Y);
Chris@10 1543 T6Q = VSUB(T63, T62);
Chris@10 1544 T6R = VFMA(LDK(KP831469612), T6P, VMUL(LDK(KP555570233), T6Q));
Chris@10 1545 T70 = VFNMS(LDK(KP555570233), T6P, VMUL(LDK(KP831469612), T6Q));
Chris@10 1546 T6S = VSUB(T68, T6f);
Chris@10 1547 T6T = VSUB(T6k, T6j);
Chris@10 1548 T6U = VFNMS(LDK(KP555570233), T6T, VMUL(LDK(KP831469612), T6S));
Chris@10 1549 T71 = VFMA(LDK(KP555570233), T6S, VMUL(LDK(KP831469612), T6T));
Chris@10 1550 }
Chris@10 1551 T6V = VADD(T6R, T6U);
Chris@10 1552 T7a = VSUB(T71, T70);
Chris@10 1553 T72 = VADD(T70, T71);
Chris@10 1554 T76 = VSUB(T6U, T6R);
Chris@10 1555 }
Chris@10 1556 }
Chris@10 1557 {
Chris@10 1558 V T6W, T73, T7c, T7d;
Chris@10 1559 T6W = VADD(T6O, T6V);
Chris@10 1560 T73 = VBYI(VADD(T6Z, T72));
Chris@10 1561 ST(&(x[WS(rs, 58)]), VSUB(T6W, T73), ms, &(x[0]));
Chris@10 1562 ST(&(x[WS(rs, 6)]), VADD(T6W, T73), ms, &(x[0]));
Chris@10 1563 T7c = VBYI(VADD(T77, T76));
Chris@10 1564 T7d = VADD(T79, T7a);
Chris@10 1565 ST(&(x[WS(rs, 10)]), VADD(T7c, T7d), ms, &(x[0]));
Chris@10 1566 ST(&(x[WS(rs, 54)]), VSUB(T7d, T7c), ms, &(x[0]));
Chris@10 1567 }
Chris@10 1568 {
Chris@10 1569 V T74, T75, T78, T7b;
Chris@10 1570 T74 = VSUB(T6O, T6V);
Chris@10 1571 T75 = VBYI(VSUB(T72, T6Z));
Chris@10 1572 ST(&(x[WS(rs, 38)]), VSUB(T74, T75), ms, &(x[0]));
Chris@10 1573 ST(&(x[WS(rs, 26)]), VADD(T74, T75), ms, &(x[0]));
Chris@10 1574 T78 = VBYI(VSUB(T76, T77));
Chris@10 1575 T7b = VSUB(T79, T7a);
Chris@10 1576 ST(&(x[WS(rs, 22)]), VADD(T78, T7b), ms, &(x[0]));
Chris@10 1577 ST(&(x[WS(rs, 42)]), VSUB(T7b, T78), ms, &(x[0]));
Chris@10 1578 }
Chris@10 1579 }
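	 /* Store block for the odd outputs congruent to +/-3 mod 8:
	    x[3], x[5], x[11], x[13], x[19], x[21], x[27], x[29],
	    x[35], x[37], x[43], x[45], x[51], x[53], x[59], x[61]. */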
Chris@10 1580 {
Chris@10 1581 V T4k, T5h, T4R, T59, T4H, T5j, T4P, T4Y, T4z, T4S, T4K, T4O, T55, T5k, T5c;
Chris@10 1582 V T5g;
Chris@10 1583 {
Chris@10 1584 V T4c, T57, T4j, T58, T4f, T4i;
Chris@10 1585 T4c = VADD(T4a, T4b);
Chris@10 1586 T57 = VSUB(T4C, T4B);
Chris@10 1587 T4f = VFMA(LDK(KP831469612), T4d, VMUL(LDK(KP555570233), T4e));
Chris@10 1588 T4i = VFNMS(LDK(KP555570233), T4h, VMUL(LDK(KP831469612), T4g));
Chris@10 1589 T4j = VADD(T4f, T4i);
Chris@10 1590 T58 = VSUB(T4i, T4f);
Chris@10 1591 T4k = VADD(T4c, T4j);
Chris@10 1592 T5h = VSUB(T58, T57);
Chris@10 1593 T4R = VSUB(T4c, T4j);
Chris@10 1594 T59 = VADD(T57, T58);
Chris@10 1595 }
Chris@10 1596 {
Chris@10 1597 V T4D, T4W, T4G, T4X, T4E, T4F;
Chris@10 1598 T4D = VADD(T4B, T4C);
Chris@10 1599 T4W = VSUB(T4a, T4b);
Chris@10 1600 T4E = VFNMS(LDK(KP555570233), T4d, VMUL(LDK(KP831469612), T4e));
Chris@10 1601 T4F = VFMA(LDK(KP555570233), T4g, VMUL(LDK(KP831469612), T4h));
Chris@10 1602 T4G = VADD(T4E, T4F);
Chris@10 1603 T4X = VSUB(T4F, T4E);
Chris@10 1604 T4H = VADD(T4D, T4G);
Chris@10 1605 T5j = VSUB(T4W, T4X);
Chris@10 1606 T4P = VSUB(T4G, T4D);
Chris@10 1607 T4Y = VADD(T4W, T4X);
Chris@10 1608 }
Chris@10 1609 {
Chris@10 1610 V T4r, T4I, T4y, T4J;
Chris@10 1611 {
Chris@10 1612 V T4n, T4q, T4u, T4x;
Chris@10 1613 T4n = VADD(T4l, T4m);
Chris@10 1614 T4q = VADD(T4o, T4p);
Chris@10 1615 T4r = VFMA(LDK(KP956940335), T4n, VMUL(LDK(KP290284677), T4q));
Chris@10 1616 T4I = VFNMS(LDK(KP290284677), T4n, VMUL(LDK(KP956940335), T4q));
Chris@10 1617 T4u = VADD(T4s, T4t);
Chris@10 1618 T4x = VADD(T4v, T4w);
Chris@10 1619 T4y = VFNMS(LDK(KP290284677), T4x, VMUL(LDK(KP956940335), T4u));
Chris@10 1620 T4J = VFMA(LDK(KP290284677), T4u, VMUL(LDK(KP956940335), T4x));
Chris@10 1621 }
Chris@10 1622 T4z = VADD(T4r, T4y);
Chris@10 1623 T4S = VSUB(T4J, T4I);
Chris@10 1624 T4K = VADD(T4I, T4J);
Chris@10 1625 T4O = VSUB(T4y, T4r);
Chris@10 1626 }
Chris@10 1627 {
Chris@10 1628 V T51, T5a, T54, T5b;
Chris@10 1629 {
Chris@10 1630 V T4Z, T50, T52, T53;
Chris@10 1631 T4Z = VSUB(T4l, T4m);
Chris@10 1632 T50 = VSUB(T4p, T4o);
Chris@10 1633 T51 = VFMA(LDK(KP881921264), T4Z, VMUL(LDK(KP471396736), T50));
Chris@10 1634 T5a = VFNMS(LDK(KP471396736), T4Z, VMUL(LDK(KP881921264), T50));
Chris@10 1635 T52 = VSUB(T4s, T4t);
Chris@10 1636 T53 = VSUB(T4w, T4v);
Chris@10 1637 T54 = VFNMS(LDK(KP471396736), T53, VMUL(LDK(KP881921264), T52));
Chris@10 1638 T5b = VFMA(LDK(KP471396736), T52, VMUL(LDK(KP881921264), T53));
Chris@10 1639 }
Chris@10 1640 T55 = VADD(T51, T54);
Chris@10 1641 T5k = VSUB(T5b, T5a);
Chris@10 1642 T5c = VADD(T5a, T5b);
Chris@10 1643 T5g = VSUB(T54, T51);
Chris@10 1644 }
Chris@10 1645 {
Chris@10 1646 V T4A, T4L, T5i, T5l;
Chris@10 1647 T4A = VADD(T4k, T4z);
Chris@10 1648 T4L = VBYI(VADD(T4H, T4K));
Chris@10 1649 ST(&(x[WS(rs, 61)]), VSUB(T4A, T4L), ms, &(x[WS(rs, 1)]));
Chris@10 1650 ST(&(x[WS(rs, 3)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
Chris@10 1651 T5i = VBYI(VSUB(T5g, T5h));
Chris@10 1652 T5l = VSUB(T5j, T5k);
Chris@10 1653 ST(&(x[WS(rs, 21)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
Chris@10 1654 ST(&(x[WS(rs, 43)]), VSUB(T5l, T5i), ms, &(x[WS(rs, 1)]));
Chris@10 1655 }
Chris@10 1656 {
Chris@10 1657 V T5m, T5n, T4M, T4N;
Chris@10 1658 T5m = VBYI(VADD(T5h, T5g));
Chris@10 1659 T5n = VADD(T5j, T5k);
Chris@10 1660 ST(&(x[WS(rs, 11)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
Chris@10 1661 ST(&(x[WS(rs, 53)]), VSUB(T5n, T5m), ms, &(x[WS(rs, 1)]));
Chris@10 1662 T4M = VSUB(T4k, T4z);
Chris@10 1663 T4N = VBYI(VSUB(T4K, T4H));
Chris@10 1664 ST(&(x[WS(rs, 35)]), VSUB(T4M, T4N), ms, &(x[WS(rs, 1)]));
Chris@10 1665 ST(&(x[WS(rs, 29)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
Chris@10 1666 }
Chris@10 1667 {
Chris@10 1668 V T4Q, T4T, T56, T5d;
Chris@10 1669 T4Q = VBYI(VSUB(T4O, T4P));
Chris@10 1670 T4T = VSUB(T4R, T4S);
Chris@10 1671 ST(&(x[WS(rs, 19)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
Chris@10 1672 ST(&(x[WS(rs, 45)]), VSUB(T4T, T4Q), ms, &(x[WS(rs, 1)]));
Chris@10 1673 T56 = VADD(T4Y, T55);
Chris@10 1674 T5d = VBYI(VADD(T59, T5c));
Chris@10 1675 ST(&(x[WS(rs, 59)]), VSUB(T56, T5d), ms, &(x[WS(rs, 1)]));
Chris@10 1676 ST(&(x[WS(rs, 5)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
Chris@10 1677 }
Chris@10 1678 {
Chris@10 1679 V T5e, T5f, T4U, T4V;
Chris@10 1680 T5e = VSUB(T4Y, T55);
Chris@10 1681 T5f = VBYI(VSUB(T5c, T59));
Chris@10 1682 ST(&(x[WS(rs, 37)]), VSUB(T5e, T5f), ms, &(x[WS(rs, 1)]));
Chris@10 1683 ST(&(x[WS(rs, 27)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
Chris@10 1684 T4U = VBYI(VADD(T4P, T4O));
Chris@10 1685 T4V = VADD(T4R, T4S);
Chris@10 1686 ST(&(x[WS(rs, 13)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
Chris@10 1687 ST(&(x[WS(rs, 51)]), VSUB(T4V, T4U), ms, &(x[WS(rs, 1)]));
Chris@10 1688 }
Chris@10 1689 }
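	 /* Final store block, covering the remaining odd outputs congruent
	    to +/-1 mod 8: x[1], x[7], x[9], x[15], x[17], x[23], x[25],
	    x[31], x[33], x[39], x[41], x[47], x[49], x[55], x[57], x[63]. */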
Chris@10 1690 {
Chris@10 1691 V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
Chris@10 1692 V T42;
Chris@10 1693 {
Chris@10 1694 V TE, T3T, T1t, T3U, T13, T1s;
Chris@10 1695 TE = VSUB(Tg, TD);
Chris@10 1696 T3T = VADD(T3o, T3h);
Chris@10 1697 T13 = VFMA(LDK(KP195090322), TV, VMUL(LDK(KP980785280), T12));
Chris@10 1698 T1s = VFNMS(LDK(KP195090322), T1r, VMUL(LDK(KP980785280), T1k));
Chris@10 1699 T1t = VSUB(T13, T1s);
Chris@10 1700 T3U = VADD(T1s, T13);
Chris@10 1701 T1u = VADD(TE, T1t);
Chris@10 1702 T43 = VSUB(T3U, T3T);
Chris@10 1703 T3D = VSUB(TE, T1t);
Chris@10 1704 T3V = VADD(T3T, T3U);
Chris@10 1705 }
Chris@10 1706 {
Chris@10 1707 V T3p, T3I, T3s, T3J, T3q, T3r;
Chris@10 1708 T3p = VSUB(T3h, T3o);
Chris@10 1709 T3I = VADD(Tg, TD);
Chris@10 1710 T3q = VFNMS(LDK(KP195090322), T12, VMUL(LDK(KP980785280), TV));
Chris@10 1711 T3r = VFMA(LDK(KP980785280), T1r, VMUL(LDK(KP195090322), T1k));
Chris@10 1712 T3s = VSUB(T3q, T3r);
Chris@10 1713 T3J = VADD(T3r, T3q);
Chris@10 1714 T3t = VADD(T3p, T3s);
Chris@10 1715 T45 = VSUB(T3I, T3J);
Chris@10 1716 T3B = VSUB(T3s, T3p);
Chris@10 1717 T3K = VADD(T3I, T3J);
Chris@10 1718 }
Chris@10 1719 {
Chris@10 1720 V T2l, T3u, T3c, T3v;
Chris@10 1721 {
Chris@10 1722 V T29, T2k, T30, T3b;
Chris@10 1723 T29 = VSUB(T1R, T28);
Chris@10 1724 T2k = VSUB(T2g, T2j);
Chris@10 1725 T2l = VFMA(LDK(KP634393284), T29, VMUL(LDK(KP773010453), T2k));
Chris@10 1726 T3u = VFNMS(LDK(KP634393284), T2k, VMUL(LDK(KP773010453), T29));
Chris@10 1727 T30 = VSUB(T2C, T2Z);
Chris@10 1728 T3b = VSUB(T33, T3a);
Chris@10 1729 T3c = VFNMS(LDK(KP634393284), T3b, VMUL(LDK(KP773010453), T30));
Chris@10 1730 T3v = VFMA(LDK(KP773010453), T3b, VMUL(LDK(KP634393284), T30));
Chris@10 1731 }
Chris@10 1732 T3d = VADD(T2l, T3c);
Chris@10 1733 T3E = VSUB(T3v, T3u);
Chris@10 1734 T3w = VADD(T3u, T3v);
Chris@10 1735 T3A = VSUB(T3c, T2l);
Chris@10 1736 }
Chris@10 1737 {
Chris@10 1738 V T3N, T3W, T3Q, T3X;
Chris@10 1739 {
Chris@10 1740 V T3L, T3M, T3O, T3P;
Chris@10 1741 T3L = VADD(T28, T1R);
Chris@10 1742 T3M = VADD(T2g, T2j);
Chris@10 1743 T3N = VFMA(LDK(KP098017140), T3L, VMUL(LDK(KP995184726), T3M));
Chris@10 1744 T3W = VFNMS(LDK(KP098017140), T3M, VMUL(LDK(KP995184726), T3L));
Chris@10 1745 T3O = VADD(T2C, T2Z);
Chris@10 1746 T3P = VADD(T3a, T33);
Chris@10 1747 T3Q = VFNMS(LDK(KP098017140), T3P, VMUL(LDK(KP995184726), T3O));
Chris@10 1748 T3X = VFMA(LDK(KP995184726), T3P, VMUL(LDK(KP098017140), T3O));
Chris@10 1749 }
Chris@10 1750 T3R = VADD(T3N, T3Q);
Chris@10 1751 T46 = VSUB(T3X, T3W);
Chris@10 1752 T3Y = VADD(T3W, T3X);
Chris@10 1753 T42 = VSUB(T3Q, T3N);
Chris@10 1754 }
Chris@10 1755 {
Chris@10 1756 V T3e, T3x, T44, T47;
Chris@10 1757 T3e = VADD(T1u, T3d);
Chris@10 1758 T3x = VBYI(VADD(T3t, T3w));
Chris@10 1759 ST(&(x[WS(rs, 57)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
Chris@10 1760 ST(&(x[WS(rs, 7)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
Chris@10 1761 T44 = VBYI(VSUB(T42, T43));
Chris@10 1762 T47 = VSUB(T45, T46);
Chris@10 1763 ST(&(x[WS(rs, 17)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
Chris@10 1764 ST(&(x[WS(rs, 47)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
Chris@10 1765 }
Chris@10 1766 {
Chris@10 1767 V T48, T49, T3y, T3z;
Chris@10 1768 T48 = VBYI(VADD(T43, T42));
Chris@10 1769 T49 = VADD(T45, T46);
Chris@10 1770 ST(&(x[WS(rs, 15)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
Chris@10 1771 ST(&(x[WS(rs, 49)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
Chris@10 1772 T3y = VSUB(T1u, T3d);
Chris@10 1773 T3z = VBYI(VSUB(T3w, T3t));
Chris@10 1774 ST(&(x[WS(rs, 39)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
Chris@10 1775 ST(&(x[WS(rs, 25)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
Chris@10 1776 }
Chris@10 1777 {
Chris@10 1778 V T3C, T3F, T3S, T3Z;
Chris@10 1779 T3C = VBYI(VSUB(T3A, T3B));
Chris@10 1780 T3F = VSUB(T3D, T3E);
Chris@10 1781 ST(&(x[WS(rs, 23)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
Chris@10 1782 ST(&(x[WS(rs, 41)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
Chris@10 1783 T3S = VADD(T3K, T3R);
Chris@10 1784 T3Z = VBYI(VADD(T3V, T3Y));
Chris@10 1785 ST(&(x[WS(rs, 63)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
Chris@10 1786 ST(&(x[WS(rs, 1)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
Chris@10 1787 }
Chris@10 1788 {
Chris@10 1789 V T40, T41, T3G, T3H;
Chris@10 1790 T40 = VSUB(T3K, T3R);
Chris@10 1791 T41 = VBYI(VSUB(T3Y, T3V));
Chris@10 1792 ST(&(x[WS(rs, 33)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
Chris@10 1793 ST(&(x[WS(rs, 31)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
Chris@10 1794 T3G = VBYI(VADD(T3B, T3A));
Chris@10 1795 T3H = VADD(T3D, T3E);
Chris@10 1796 ST(&(x[WS(rs, 9)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
Chris@10 1797 ST(&(x[WS(rs, 55)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
Chris@10 1798 }
Chris@10 1799 }
Chris@10 1800 }
Chris@10 1801 }
Chris@10 1802 VLEAVE();
Chris@10 1803 }
Chris@10 1804
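/* Twiddle schedule for this codelet: one VTW(0, k) entry per twiddle
   power W^k for k = 1..63, i.e. every non-trivial twiddle of a 64-point
   DIT step, terminated by the {TW_NEXT, VL, 0} sentinel (which, as I
   read the twiddle machinery, advances the schedule by the SIMD vector
   length VL between iterations). */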
Chris@10 1805 static const tw_instr twinstr[] = {
Chris@10 1806 VTW(0, 1),
Chris@10 1807 VTW(0, 2),
Chris@10 1808 VTW(0, 3),
Chris@10 1809 VTW(0, 4),
Chris@10 1810 VTW(0, 5),
Chris@10 1811 VTW(0, 6),
Chris@10 1812 VTW(0, 7),
Chris@10 1813 VTW(0, 8),
Chris@10 1814 VTW(0, 9),
Chris@10 1815 VTW(0, 10),
Chris@10 1816 VTW(0, 11),
Chris@10 1817 VTW(0, 12),
Chris@10 1818 VTW(0, 13),
Chris@10 1819 VTW(0, 14),
Chris@10 1820 VTW(0, 15),
Chris@10 1821 VTW(0, 16),
Chris@10 1822 VTW(0, 17),
Chris@10 1823 VTW(0, 18),
Chris@10 1824 VTW(0, 19),
Chris@10 1825 VTW(0, 20),
Chris@10 1826 VTW(0, 21),
Chris@10 1827 VTW(0, 22),
Chris@10 1828 VTW(0, 23),
Chris@10 1829 VTW(0, 24),
Chris@10 1830 VTW(0, 25),
Chris@10 1831 VTW(0, 26),
Chris@10 1832 VTW(0, 27),
Chris@10 1833 VTW(0, 28),
Chris@10 1834 VTW(0, 29),
Chris@10 1835 VTW(0, 30),
Chris@10 1836 VTW(0, 31),
Chris@10 1837 VTW(0, 32),
Chris@10 1838 VTW(0, 33),
Chris@10 1839 VTW(0, 34),
Chris@10 1840 VTW(0, 35),
Chris@10 1841 VTW(0, 36),
Chris@10 1842 VTW(0, 37),
Chris@10 1843 VTW(0, 38),
Chris@10 1844 VTW(0, 39),
Chris@10 1845 VTW(0, 40),
Chris@10 1846 VTW(0, 41),
Chris@10 1847 VTW(0, 42),
Chris@10 1848 VTW(0, 43),
Chris@10 1849 VTW(0, 44),
Chris@10 1850 VTW(0, 45),
Chris@10 1851 VTW(0, 46),
Chris@10 1852 VTW(0, 47),
Chris@10 1853 VTW(0, 48),
Chris@10 1854 VTW(0, 49),
Chris@10 1855 VTW(0, 50),
Chris@10 1856 VTW(0, 51),
Chris@10 1857 VTW(0, 52),
Chris@10 1858 VTW(0, 53),
Chris@10 1859 VTW(0, 54),
Chris@10 1860 VTW(0, 55),
Chris@10 1861 VTW(0, 56),
Chris@10 1862 VTW(0, 57),
Chris@10 1863 VTW(0, 58),
Chris@10 1864 VTW(0, 59),
Chris@10 1865 VTW(0, 60),
Chris@10 1866 VTW(0, 61),
Chris@10 1867 VTW(0, 62),
Chris@10 1868 VTW(0, 63),
Chris@10 1869 {TW_NEXT, VL, 0}
Chris@10 1870 };
Chris@10 1871
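/* Codelet descriptor: transform size 64, printable name, twiddle
   schedule, SIMD genus, and the genfft operation counts -- the
   {467, 198, 52, 0} initializer records {adds, muls, fmas, other},
   matching the counts quoted in this variant's generated header. */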
Chris@10 1872 static const ct_desc desc = { 64, XSIMD_STRING("t1fv_64"), twinstr, &GENUS, {467, 198, 52, 0}, 0, 0, 0 };
Chris@10 1873
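/* Registration entry point: X(kdft_dit_register) adds this codelet to
   the planner as a decimation-in-time (DIT) twiddle DFT of size 64. */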
Chris@10 1874 void XSIMD(codelet_t1fv_64) (planner *p) {
Chris@10 1875 X(kdft_dit_register) (p, t1fv_64, &desc);
Chris@10 1876 }
Chris@10 1877 #endif /* HAVE_FMA */