annotate src/fftw-3.3.3/dft/simd/common/t1sv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:25 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 434 FP additions, 260 FP multiplications,
Chris@10 32 * (or, 236 additions, 62 multiplications, 198 fused multiply/add),
Chris@10 33 * 158 stack variables, 7 constants, and 128 memory accesses
Chris@10 34 */
Chris@10 35 #include "ts.h"
Chris@10 36
Chris@10 37 static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 40 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 41 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@10 42 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@10 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 44 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 45 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 46 {
Chris@10 47 INT m;
Chris@10 48 for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 49 V T8Z, T90;
Chris@10 50 {
Chris@10 51 V T87, T8x, T3w, T8, T3B, T83, Tl, T8y, T6F, Tz, T3J, T5T, T6G, TM, T3Q;
Chris@10 52 V T5U, T46, T5Y, T7D, T6L, T5X, T3Z, T6M, T1f, T4l, T61, T7E, T6R, T60, T4e;
Chris@10 53 V T6O, T1G, T5r, T6c, T78, T7N, T54, T6f, T32, T7b, T4S, T65, T6X, T7I, T4v;
Chris@10 54 V T68, T29, T70, T4x, T2f, T5b, T5s, T7O, T7e, T5t, T5i, T79, T3t, T2h, T2k;
Chris@10 55 V T2j, T2o, T2r, T4H, T2y, T2n, T2q, T4y, T2i;
Chris@10 56 {
Chris@10 57 V T3U, TU, TW, TZ, TY, T13, T16, T12, T15, T3V, TX, T44, T1d;
Chris@10 58 {
Chris@10 59 V T1, T86, T3, T6, T5, Ta, Td, Tg, Tj, Tf, T84, T4, Tc, Ti, T3x;
Chris@10 60 V Tb, T2, T9;
Chris@10 61 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 62 T86 = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 63 T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@10 64 T6 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@10 65 T2 = LDW(&(W[TWVL * 30]));
Chris@10 66 T5 = LDW(&(W[TWVL * 31]));
Chris@10 67 Ta = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 68 Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 69 T9 = LDW(&(W[TWVL * 14]));
Chris@10 70 Tg = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@10 71 Tj = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@10 72 Tf = LDW(&(W[TWVL * 46]));
Chris@10 73 T84 = VMUL(T2, T6);
Chris@10 74 T4 = VMUL(T2, T3);
Chris@10 75 Tc = LDW(&(W[TWVL * 15]));
Chris@10 76 Ti = LDW(&(W[TWVL * 47]));
Chris@10 77 T3x = VMUL(T9, Td);
Chris@10 78 Tb = VMUL(T9, Ta);
Chris@10 79 {
Chris@10 80 V Tu, Tx, T3F, Ts, Tt, Tw;
Chris@10 81 {
Chris@10 82 V To, Tr, Tq, T3E, Tp;
Chris@10 83 {
Chris@10 84 V T3y, Te, Tn, T3A, Tk;
Chris@10 85 {
Chris@10 86 V T3z, Th, T85, T7;
Chris@10 87 To = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 88 T3z = VMUL(Tf, Tj);
Chris@10 89 Th = VMUL(Tf, Tg);
Chris@10 90 T85 = VFNMS(T5, T3, T84);
Chris@10 91 T7 = VFMA(T5, T6, T4);
Chris@10 92 Tr = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 93 T3y = VFNMS(Tc, Ta, T3x);
Chris@10 94 Te = VFMA(Tc, Td, Tb);
Chris@10 95 Tn = LDW(&(W[TWVL * 6]));
Chris@10 96 T3A = VFNMS(Ti, Tg, T3z);
Chris@10 97 Tk = VFMA(Ti, Tj, Th);
Chris@10 98 T87 = VADD(T85, T86);
Chris@10 99 T8x = VSUB(T86, T85);
Chris@10 100 T3w = VSUB(T1, T7);
Chris@10 101 T8 = VADD(T1, T7);
Chris@10 102 }
Chris@10 103 Tq = LDW(&(W[TWVL * 7]));
Chris@10 104 T3E = VMUL(Tn, Tr);
Chris@10 105 Tp = VMUL(Tn, To);
Chris@10 106 T3B = VSUB(T3y, T3A);
Chris@10 107 T83 = VADD(T3y, T3A);
Chris@10 108 Tl = VADD(Te, Tk);
Chris@10 109 T8y = VSUB(Te, Tk);
Chris@10 110 }
Chris@10 111 Tu = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@10 112 Tx = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@10 113 T3F = VFNMS(Tq, To, T3E);
Chris@10 114 Ts = VFMA(Tq, Tr, Tp);
Chris@10 115 Tt = LDW(&(W[TWVL * 38]));
Chris@10 116 Tw = LDW(&(W[TWVL * 39]));
Chris@10 117 }
Chris@10 118 {
Chris@10 119 V TB, TE, TD, TH, TK, T3G, Tv, TG, TJ, T3L, TC, TA;
Chris@10 120 TB = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@10 121 TE = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@10 122 TA = LDW(&(W[TWVL * 54]));
Chris@10 123 TD = LDW(&(W[TWVL * 55]));
Chris@10 124 TH = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 125 TK = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 126 T3G = VMUL(Tt, Tx);
Chris@10 127 Tv = VMUL(Tt, Tu);
Chris@10 128 TG = LDW(&(W[TWVL * 22]));
Chris@10 129 TJ = LDW(&(W[TWVL * 23]));
Chris@10 130 T3L = VMUL(TA, TE);
Chris@10 131 TC = VMUL(TA, TB);
Chris@10 132 {
Chris@10 133 V T19, T1c, T3P, T3K, T18, T1b, TV, T43, T1a;
Chris@10 134 {
Chris@10 135 V TQ, TT, T3M, TF, TS, T3I, T3D, T3O, TL, T3T, TR;
Chris@10 136 {
Chris@10 137 V T3H, Ty, T3N, TI, TP;
Chris@10 138 TQ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 139 TT = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 140 T3H = VFNMS(Tw, Tu, T3G);
Chris@10 141 Ty = VFMA(Tw, Tx, Tv);
Chris@10 142 T3N = VMUL(TG, TK);
Chris@10 143 TI = VMUL(TG, TH);
Chris@10 144 T3M = VFNMS(TD, TB, T3L);
Chris@10 145 TF = VFMA(TD, TE, TC);
Chris@10 146 TP = LDW(&(W[TWVL * 2]));
Chris@10 147 TS = LDW(&(W[TWVL * 3]));
Chris@10 148 T6F = VADD(T3F, T3H);
Chris@10 149 T3I = VSUB(T3F, T3H);
Chris@10 150 Tz = VADD(Ts, Ty);
Chris@10 151 T3D = VSUB(Ts, Ty);
Chris@10 152 T3O = VFNMS(TJ, TH, T3N);
Chris@10 153 TL = VFMA(TJ, TK, TI);
Chris@10 154 T3T = VMUL(TP, TT);
Chris@10 155 TR = VMUL(TP, TQ);
Chris@10 156 }
Chris@10 157 T19 = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@10 158 T1c = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@10 159 T3J = VADD(T3D, T3I);
Chris@10 160 T5T = VSUB(T3I, T3D);
Chris@10 161 T6G = VADD(T3M, T3O);
Chris@10 162 T3P = VSUB(T3M, T3O);
Chris@10 163 TM = VADD(TF, TL);
Chris@10 164 T3K = VSUB(TF, TL);
Chris@10 165 T3U = VFNMS(TS, TQ, T3T);
Chris@10 166 TU = VFMA(TS, TT, TR);
Chris@10 167 T18 = LDW(&(W[TWVL * 50]));
Chris@10 168 T1b = LDW(&(W[TWVL * 51]));
Chris@10 169 }
Chris@10 170 TW = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@10 171 TZ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@10 172 T3Q = VSUB(T3K, T3P);
Chris@10 173 T5U = VADD(T3K, T3P);
Chris@10 174 TV = LDW(&(W[TWVL * 34]));
Chris@10 175 TY = LDW(&(W[TWVL * 35]));
Chris@10 176 T43 = VMUL(T18, T1c);
Chris@10 177 T1a = VMUL(T18, T19);
Chris@10 178 T13 = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 179 T16 = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 180 T12 = LDW(&(W[TWVL * 18]));
Chris@10 181 T15 = LDW(&(W[TWVL * 19]));
Chris@10 182 T3V = VMUL(TV, TZ);
Chris@10 183 TX = VMUL(TV, TW);
Chris@10 184 T44 = VFNMS(T1b, T19, T43);
Chris@10 185 T1d = VFMA(T1b, T1c, T1a);
Chris@10 186 }
Chris@10 187 }
Chris@10 188 }
Chris@10 189 }
Chris@10 190 {
Chris@10 191 V T4Z, T2H, T2J, T2M, T2L, T2Q, T2T, T2P, T2S, T5p, T30, T50, T2K;
Chris@10 192 {
Chris@10 193 V T49, T1l, T1n, T1q, T1p, T1u, T1x, T4j, T1E, T1t, T1w, T4a, T1o;
Chris@10 194 {
Chris@10 195 V T1A, T1D, T1C, T4i, T1B, T1m;
Chris@10 196 {
Chris@10 197 V T1h, T1k, T41, T14, T3W, T10, T1g, T1j;
Chris@10 198 T1h = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@10 199 T1k = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@10 200 T41 = VMUL(T12, T16);
Chris@10 201 T14 = VMUL(T12, T13);
Chris@10 202 T3W = VFNMS(TY, TW, T3V);
Chris@10 203 T10 = VFMA(TY, TZ, TX);
Chris@10 204 T1g = LDW(&(W[TWVL * 58]));
Chris@10 205 T1j = LDW(&(W[TWVL * 59]));
Chris@10 206 {
Chris@10 207 V T6J, T3X, T11, T40, T48, T1i, T6K, T45, T1e, T3Y, T1z, T42, T17;
Chris@10 208 T1A = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@10 209 T1D = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@10 210 T42 = VFNMS(T15, T13, T41);
Chris@10 211 T17 = VFMA(T15, T16, T14);
Chris@10 212 T6J = VADD(T3U, T3W);
Chris@10 213 T3X = VSUB(T3U, T3W);
Chris@10 214 T11 = VADD(TU, T10);
Chris@10 215 T40 = VSUB(TU, T10);
Chris@10 216 T48 = VMUL(T1g, T1k);
Chris@10 217 T1i = VMUL(T1g, T1h);
Chris@10 218 T6K = VADD(T42, T44);
Chris@10 219 T45 = VSUB(T42, T44);
Chris@10 220 T1e = VADD(T17, T1d);
Chris@10 221 T3Y = VSUB(T17, T1d);
Chris@10 222 T1z = LDW(&(W[TWVL * 42]));
Chris@10 223 T1C = LDW(&(W[TWVL * 43]));
Chris@10 224 T49 = VFNMS(T1j, T1h, T48);
Chris@10 225 T1l = VFMA(T1j, T1k, T1i);
Chris@10 226 T46 = VADD(T40, T45);
Chris@10 227 T5Y = VSUB(T40, T45);
Chris@10 228 T7D = VADD(T6J, T6K);
Chris@10 229 T6L = VSUB(T6J, T6K);
Chris@10 230 T5X = VADD(T3X, T3Y);
Chris@10 231 T3Z = VSUB(T3X, T3Y);
Chris@10 232 T6M = VSUB(T11, T1e);
Chris@10 233 T1f = VADD(T11, T1e);
Chris@10 234 T4i = VMUL(T1z, T1D);
Chris@10 235 T1B = VMUL(T1z, T1A);
Chris@10 236 }
Chris@10 237 }
Chris@10 238 T1n = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 239 T1q = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 240 T1m = LDW(&(W[TWVL * 26]));
Chris@10 241 T1p = LDW(&(W[TWVL * 27]));
Chris@10 242 T1u = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 243 T1x = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 244 T4j = VFNMS(T1C, T1A, T4i);
Chris@10 245 T1E = VFMA(T1C, T1D, T1B);
Chris@10 246 T1t = LDW(&(W[TWVL * 10]));
Chris@10 247 T1w = LDW(&(W[TWVL * 11]));
Chris@10 248 T4a = VMUL(T1m, T1q);
Chris@10 249 T1o = VMUL(T1m, T1n);
Chris@10 250 }
Chris@10 251 {
Chris@10 252 V T2W, T2Z, T6P, T4c, T1s, T4f, T6Q, T4k, T1F, T4d, T2V, T2Y, T5o, T2X, T2I;
Chris@10 253 {
Chris@10 254 V T2D, T2G, T2C, T2F, T4g, T1v, T4b, T1r;
Chris@10 255 T2D = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@10 256 T2G = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@10 257 T2C = LDW(&(W[TWVL * 60]));
Chris@10 258 T2F = LDW(&(W[TWVL * 61]));
Chris@10 259 T4g = VMUL(T1t, T1x);
Chris@10 260 T1v = VMUL(T1t, T1u);
Chris@10 261 T4b = VFNMS(T1p, T1n, T4a);
Chris@10 262 T1r = VFMA(T1p, T1q, T1o);
Chris@10 263 T2W = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@10 264 T2Z = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@10 265 {
Chris@10 266 V T4Y, T2E, T4h, T1y;
Chris@10 267 T4Y = VMUL(T2C, T2G);
Chris@10 268 T2E = VMUL(T2C, T2D);
Chris@10 269 T4h = VFNMS(T1w, T1u, T4g);
Chris@10 270 T1y = VFMA(T1w, T1x, T1v);
Chris@10 271 T6P = VADD(T49, T4b);
Chris@10 272 T4c = VSUB(T49, T4b);
Chris@10 273 T1s = VADD(T1l, T1r);
Chris@10 274 T4f = VSUB(T1l, T1r);
Chris@10 275 T4Z = VFNMS(T2F, T2D, T4Y);
Chris@10 276 T2H = VFMA(T2F, T2G, T2E);
Chris@10 277 T6Q = VADD(T4h, T4j);
Chris@10 278 T4k = VSUB(T4h, T4j);
Chris@10 279 T1F = VADD(T1y, T1E);
Chris@10 280 T4d = VSUB(T1y, T1E);
Chris@10 281 T2V = LDW(&(W[TWVL * 44]));
Chris@10 282 }
Chris@10 283 T2Y = LDW(&(W[TWVL * 45]));
Chris@10 284 }
Chris@10 285 T2J = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 286 T2M = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 287 T4l = VADD(T4f, T4k);
Chris@10 288 T61 = VSUB(T4f, T4k);
Chris@10 289 T7E = VADD(T6P, T6Q);
Chris@10 290 T6R = VSUB(T6P, T6Q);
Chris@10 291 T60 = VADD(T4c, T4d);
Chris@10 292 T4e = VSUB(T4c, T4d);
Chris@10 293 T6O = VSUB(T1s, T1F);
Chris@10 294 T1G = VADD(T1s, T1F);
Chris@10 295 T5o = VMUL(T2V, T2Z);
Chris@10 296 T2X = VMUL(T2V, T2W);
Chris@10 297 T2I = LDW(&(W[TWVL * 28]));
Chris@10 298 T2L = LDW(&(W[TWVL * 29]));
Chris@10 299 T2Q = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 300 T2T = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 301 T2P = LDW(&(W[TWVL * 12]));
Chris@10 302 T2S = LDW(&(W[TWVL * 13]));
Chris@10 303 T5p = VFNMS(T2Y, T2W, T5o);
Chris@10 304 T30 = VFMA(T2Y, T2Z, T2X);
Chris@10 305 T50 = VMUL(T2I, T2M);
Chris@10 306 T2K = VMUL(T2I, T2J);
Chris@10 307 }
Chris@10 308 }
Chris@10 309 {
Chris@10 310 V T4q, T1O, T1Q, T1T, T1S, T1X, T20, T4Q, T27, T1W, T1Z, T4r, T1R;
Chris@10 311 {
Chris@10 312 V T23, T26, T25, T4P, T24, T1P;
Chris@10 313 {
Chris@10 314 V T1K, T1N, T5m, T2R, T1J, T1M, T51, T2N;
Chris@10 315 T1K = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 316 T1N = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 317 T5m = VMUL(T2P, T2T);
Chris@10 318 T2R = VMUL(T2P, T2Q);
Chris@10 319 T1J = LDW(&(W[0]));
Chris@10 320 T1M = LDW(&(W[TWVL * 1]));
Chris@10 321 T51 = VFNMS(T2L, T2J, T50);
Chris@10 322 T2N = VFMA(T2L, T2M, T2K);
Chris@10 323 {
Chris@10 324 V T76, T52, T2O, T5l, T77, T5q, T31, T53, T22;
Chris@10 325 T23 = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@10 326 T26 = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@10 327 {
Chris@10 328 V T5n, T2U, T4p, T1L;
Chris@10 329 T5n = VFNMS(T2S, T2Q, T5m);
Chris@10 330 T2U = VFMA(T2S, T2T, T2R);
Chris@10 331 T4p = VMUL(T1J, T1N);
Chris@10 332 T1L = VMUL(T1J, T1K);
Chris@10 333 T76 = VADD(T4Z, T51);
Chris@10 334 T52 = VSUB(T4Z, T51);
Chris@10 335 T2O = VADD(T2H, T2N);
Chris@10 336 T5l = VSUB(T2H, T2N);
Chris@10 337 T77 = VADD(T5n, T5p);
Chris@10 338 T5q = VSUB(T5n, T5p);
Chris@10 339 T31 = VADD(T2U, T30);
Chris@10 340 T53 = VSUB(T2U, T30);
Chris@10 341 T4q = VFNMS(T1M, T1K, T4p);
Chris@10 342 T1O = VFMA(T1M, T1N, T1L);
Chris@10 343 T22 = LDW(&(W[TWVL * 48]));
Chris@10 344 }
Chris@10 345 T25 = LDW(&(W[TWVL * 49]));
Chris@10 346 T5r = VADD(T5l, T5q);
Chris@10 347 T6c = VSUB(T5l, T5q);
Chris@10 348 T78 = VSUB(T76, T77);
Chris@10 349 T7N = VADD(T76, T77);
Chris@10 350 T54 = VSUB(T52, T53);
Chris@10 351 T6f = VADD(T52, T53);
Chris@10 352 T32 = VADD(T2O, T31);
Chris@10 353 T7b = VSUB(T2O, T31);
Chris@10 354 T4P = VMUL(T22, T26);
Chris@10 355 T24 = VMUL(T22, T23);
Chris@10 356 }
Chris@10 357 }
Chris@10 358 T1Q = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@10 359 T1T = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@10 360 T1P = LDW(&(W[TWVL * 32]));
Chris@10 361 T1S = LDW(&(W[TWVL * 33]));
Chris@10 362 T1X = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 363 T20 = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 364 T4Q = VFNMS(T25, T23, T4P);
Chris@10 365 T27 = VFMA(T25, T26, T24);
Chris@10 366 T1W = LDW(&(W[TWVL * 16]));
Chris@10 367 T1Z = LDW(&(W[TWVL * 17]));
Chris@10 368 T4r = VMUL(T1P, T1T);
Chris@10 369 T1R = VMUL(T1P, T1Q);
Chris@10 370 }
Chris@10 371 {
Chris@10 372 V T56, T38, T3a, T3d, T3c, T3h, T3k, T3g, T3j, T5g, T3r, T57, T3b;
Chris@10 373 {
Chris@10 374 V T3n, T3q, T6V, T4t, T1V, T4M, T6W, T4R, T28, T4u, T3m, T3p, T5f, T3o, T39;
Chris@10 375 {
Chris@10 376 V T34, T37, T33, T36, T4N, T1Y, T4s, T1U;
Chris@10 377 T34 = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 378 T37 = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 379 T33 = LDW(&(W[TWVL * 4]));
Chris@10 380 T36 = LDW(&(W[TWVL * 5]));
Chris@10 381 T4N = VMUL(T1W, T20);
Chris@10 382 T1Y = VMUL(T1W, T1X);
Chris@10 383 T4s = VFNMS(T1S, T1Q, T4r);
Chris@10 384 T1U = VFMA(T1S, T1T, T1R);
Chris@10 385 T3n = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 386 T3q = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 387 {
Chris@10 388 V T55, T35, T4O, T21;
Chris@10 389 T55 = VMUL(T33, T37);
Chris@10 390 T35 = VMUL(T33, T34);
Chris@10 391 T4O = VFNMS(T1Z, T1X, T4N);
Chris@10 392 T21 = VFMA(T1Z, T20, T1Y);
Chris@10 393 T6V = VADD(T4q, T4s);
Chris@10 394 T4t = VSUB(T4q, T4s);
Chris@10 395 T1V = VADD(T1O, T1U);
Chris@10 396 T4M = VSUB(T1O, T1U);
Chris@10 397 T56 = VFNMS(T36, T34, T55);
Chris@10 398 T38 = VFMA(T36, T37, T35);
Chris@10 399 T6W = VADD(T4O, T4Q);
Chris@10 400 T4R = VSUB(T4O, T4Q);
Chris@10 401 T28 = VADD(T21, T27);
Chris@10 402 T4u = VSUB(T21, T27);
Chris@10 403 T3m = LDW(&(W[TWVL * 20]));
Chris@10 404 }
Chris@10 405 T3p = LDW(&(W[TWVL * 21]));
Chris@10 406 }
Chris@10 407 T3a = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@10 408 T3d = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@10 409 T4S = VADD(T4M, T4R);
Chris@10 410 T65 = VSUB(T4M, T4R);
Chris@10 411 T6X = VSUB(T6V, T6W);
Chris@10 412 T7I = VADD(T6V, T6W);
Chris@10 413 T4v = VSUB(T4t, T4u);
Chris@10 414 T68 = VADD(T4t, T4u);
Chris@10 415 T29 = VADD(T1V, T28);
Chris@10 416 T70 = VSUB(T1V, T28);
Chris@10 417 T5f = VMUL(T3m, T3q);
Chris@10 418 T3o = VMUL(T3m, T3n);
Chris@10 419 T39 = LDW(&(W[TWVL * 36]));
Chris@10 420 T3c = LDW(&(W[TWVL * 37]));
Chris@10 421 T3h = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@10 422 T3k = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@10 423 T3g = LDW(&(W[TWVL * 52]));
Chris@10 424 T3j = LDW(&(W[TWVL * 53]));
Chris@10 425 T5g = VFNMS(T3p, T3n, T5f);
Chris@10 426 T3r = VFMA(T3p, T3q, T3o);
Chris@10 427 T57 = VMUL(T39, T3d);
Chris@10 428 T3b = VMUL(T39, T3a);
Chris@10 429 }
Chris@10 430 {
Chris@10 431 V T2u, T2x, T2w, T4G, T2v, T2g;
Chris@10 432 {
Chris@10 433 V T2b, T2e, T5d, T3i, T2a, T2d, T58, T3e, T2t;
Chris@10 434 T2b = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 435 T2e = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 436 T5d = VMUL(T3g, T3k);
Chris@10 437 T3i = VMUL(T3g, T3h);
Chris@10 438 T2a = LDW(&(W[TWVL * 8]));
Chris@10 439 T2d = LDW(&(W[TWVL * 9]));
Chris@10 440 T58 = VFNMS(T3c, T3a, T57);
Chris@10 441 T3e = VFMA(T3c, T3d, T3b);
Chris@10 442 T2u = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 443 T2x = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 444 {
Chris@10 445 V T5e, T3l, T4w, T2c;
Chris@10 446 T5e = VFNMS(T3j, T3h, T5d);
Chris@10 447 T3l = VFMA(T3j, T3k, T3i);
Chris@10 448 T4w = VMUL(T2a, T2e);
Chris@10 449 T2c = VMUL(T2a, T2b);
Chris@10 450 {
Chris@10 451 V T7c, T59, T3f, T5a;
Chris@10 452 T7c = VADD(T56, T58);
Chris@10 453 T59 = VSUB(T56, T58);
Chris@10 454 T3f = VADD(T38, T3e);
Chris@10 455 T5a = VSUB(T38, T3e);
Chris@10 456 {
Chris@10 457 V T7d, T5h, T3s, T5c;
Chris@10 458 T7d = VADD(T5e, T5g);
Chris@10 459 T5h = VSUB(T5e, T5g);
Chris@10 460 T3s = VADD(T3l, T3r);
Chris@10 461 T5c = VSUB(T3l, T3r);
Chris@10 462 T4x = VFNMS(T2d, T2b, T4w);
Chris@10 463 T2f = VFMA(T2d, T2e, T2c);
Chris@10 464 T5b = VSUB(T59, T5a);
Chris@10 465 T5s = VADD(T5a, T59);
Chris@10 466 T2t = LDW(&(W[TWVL * 24]));
Chris@10 467 T7O = VADD(T7c, T7d);
Chris@10 468 T7e = VSUB(T7c, T7d);
Chris@10 469 T5t = VSUB(T5c, T5h);
Chris@10 470 T5i = VADD(T5c, T5h);
Chris@10 471 T79 = VSUB(T3s, T3f);
Chris@10 472 T3t = VADD(T3f, T3s);
Chris@10 473 }
Chris@10 474 }
Chris@10 475 }
Chris@10 476 T2w = LDW(&(W[TWVL * 25]));
Chris@10 477 T4G = VMUL(T2t, T2x);
Chris@10 478 T2v = VMUL(T2t, T2u);
Chris@10 479 }
Chris@10 480 T2h = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@10 481 T2k = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@10 482 T2g = LDW(&(W[TWVL * 40]));
Chris@10 483 T2j = LDW(&(W[TWVL * 41]));
Chris@10 484 T2o = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@10 485 T2r = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@10 486 T4H = VFNMS(T2w, T2u, T4G);
Chris@10 487 T2y = VFMA(T2w, T2x, T2v);
Chris@10 488 T2n = LDW(&(W[TWVL * 56]));
Chris@10 489 T2q = LDW(&(W[TWVL * 57]));
Chris@10 490 T4y = VMUL(T2g, T2k);
Chris@10 491 T2i = VMUL(T2g, T2h);
Chris@10 492 }
Chris@10 493 }
Chris@10 494 }
Chris@10 495 }
Chris@10 496 }
Chris@10 497 {
Chris@10 498 V T4C, T4T, T4U, T4J, T7A, T7w, T7j, T75, T7i, T6U, T8p, T8n, T8v, T8t, T7q;
Chris@10 499 V T7y, T7t, T7z, T7g, T7k;
Chris@10 500 {
Chris@10 501 V T6E, T8j, T6H, T8k, T73, T6Y, T7S, T8i, T8h, T7V;
Chris@10 502 {
Chris@10 503 V T7P, T7Y, T7C, TO, T89, T8e, T3u, T7M, T8d, T1H, T7K, T7X, T2B, T7H;
Chris@10 504 {
Chris@10 505 V T71, T2m, T72, T4I, T2z, T4D, Tm, TN, T2A, T7J;
Chris@10 506 T6E = VSUB(T8, Tl);
Chris@10 507 Tm = VADD(T8, Tl);
Chris@10 508 TN = VADD(Tz, TM);
Chris@10 509 T8j = VSUB(TM, Tz);
Chris@10 510 T7P = VSUB(T7N, T7O);
Chris@10 511 T7Y = VADD(T7N, T7O);
Chris@10 512 {
Chris@10 513 V T82, T4E, T2p, T4z, T2l, T88;
Chris@10 514 T82 = VADD(T6F, T6G);
Chris@10 515 T6H = VSUB(T6F, T6G);
Chris@10 516 T4E = VMUL(T2n, T2r);
Chris@10 517 T2p = VMUL(T2n, T2o);
Chris@10 518 T4z = VFNMS(T2j, T2h, T4y);
Chris@10 519 T2l = VFMA(T2j, T2k, T2i);
Chris@10 520 T8k = VSUB(T87, T83);
Chris@10 521 T88 = VADD(T83, T87);
Chris@10 522 T7C = VSUB(Tm, TN);
Chris@10 523 TO = VADD(Tm, TN);
Chris@10 524 {
Chris@10 525 V T4F, T2s, T4A, T4B;
Chris@10 526 T4F = VFNMS(T2q, T2o, T4E);
Chris@10 527 T2s = VFMA(T2q, T2r, T2p);
Chris@10 528 T71 = VADD(T4x, T4z);
Chris@10 529 T4A = VSUB(T4x, T4z);
Chris@10 530 T2m = VADD(T2f, T2l);
Chris@10 531 T4B = VSUB(T2f, T2l);
Chris@10 532 T89 = VADD(T82, T88);
Chris@10 533 T8e = VSUB(T88, T82);
Chris@10 534 T72 = VADD(T4F, T4H);
Chris@10 535 T4I = VSUB(T4F, T4H);
Chris@10 536 T2z = VADD(T2s, T2y);
Chris@10 537 T4D = VSUB(T2s, T2y);
Chris@10 538 T4C = VSUB(T4A, T4B);
Chris@10 539 T4T = VADD(T4B, T4A);
Chris@10 540 }
Chris@10 541 }
Chris@10 542 T3u = VADD(T32, T3t);
Chris@10 543 T7M = VSUB(T32, T3t);
Chris@10 544 T7J = VADD(T71, T72);
Chris@10 545 T73 = VSUB(T71, T72);
Chris@10 546 T4U = VSUB(T4D, T4I);
Chris@10 547 T4J = VADD(T4D, T4I);
Chris@10 548 T6Y = VSUB(T2z, T2m);
Chris@10 549 T2A = VADD(T2m, T2z);
Chris@10 550 T8d = VSUB(T1G, T1f);
Chris@10 551 T1H = VADD(T1f, T1G);
Chris@10 552 T7K = VSUB(T7I, T7J);
Chris@10 553 T7X = VADD(T7I, T7J);
Chris@10 554 T2B = VADD(T29, T2A);
Chris@10 555 T7H = VSUB(T29, T2A);
Chris@10 556 }
Chris@10 557 {
Chris@10 558 V T1I, T80, T7Q, T7U, T7F, T7L, T7T, T3v, T8b, T8c, T8a, T7W, T81, T7Z;
Chris@10 559 T7W = VSUB(TO, T1H);
Chris@10 560 T1I = VADD(TO, T1H);
Chris@10 561 T7Z = VSUB(T7X, T7Y);
Chris@10 562 T80 = VADD(T7X, T7Y);
Chris@10 563 T7Q = VSUB(T7M, T7P);
Chris@10 564 T7U = VADD(T7M, T7P);
Chris@10 565 T7F = VSUB(T7D, T7E);
Chris@10 566 T81 = VADD(T7D, T7E);
Chris@10 567 T7L = VADD(T7H, T7K);
Chris@10 568 T7T = VSUB(T7K, T7H);
Chris@10 569 T3v = VADD(T2B, T3u);
Chris@10 570 T8b = VSUB(T3u, T2B);
Chris@10 571 ST(&(ri[WS(rs, 24)]), VSUB(T7W, T7Z), ms, &(ri[0]));
Chris@10 572 ST(&(ri[WS(rs, 8)]), VADD(T7W, T7Z), ms, &(ri[0]));
Chris@10 573 T8c = VSUB(T89, T81);
Chris@10 574 T8a = VADD(T81, T89);
Chris@10 575 {
Chris@10 576 V T8f, T8g, T7G, T7R;
Chris@10 577 T7S = VSUB(T7C, T7F);
Chris@10 578 T7G = VADD(T7C, T7F);
Chris@10 579 T7R = VADD(T7L, T7Q);
Chris@10 580 T8i = VSUB(T7Q, T7L);
Chris@10 581 T8h = VSUB(T8e, T8d);
Chris@10 582 T8f = VADD(T8d, T8e);
Chris@10 583 ST(&(ri[0]), VADD(T1I, T3v), ms, &(ri[0]));
Chris@10 584 ST(&(ri[WS(rs, 16)]), VSUB(T1I, T3v), ms, &(ri[0]));
Chris@10 585 T8g = VADD(T7T, T7U);
Chris@10 586 T7V = VSUB(T7T, T7U);
Chris@10 587 ST(&(ii[WS(rs, 16)]), VSUB(T8a, T80), ms, &(ii[0]));
Chris@10 588 ST(&(ii[0]), VADD(T80, T8a), ms, &(ii[0]));
Chris@10 589 ST(&(ii[WS(rs, 24)]), VSUB(T8c, T8b), ms, &(ii[0]));
Chris@10 590 ST(&(ii[WS(rs, 8)]), VADD(T8b, T8c), ms, &(ii[0]));
Chris@10 591 ST(&(ri[WS(rs, 4)]), VFMA(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
Chris@10 592 ST(&(ri[WS(rs, 20)]), VFNMS(LDK(KP707106781), T7R, T7G), ms, &(ri[0]));
Chris@10 593 ST(&(ii[WS(rs, 20)]), VFNMS(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
Chris@10 594 ST(&(ii[WS(rs, 4)]), VFMA(LDK(KP707106781), T8g, T8f), ms, &(ii[0]));
Chris@10 595 }
Chris@10 596 }
Chris@10 597 }
Chris@10 598 {
Chris@10 599 V T7f, T7a, T7m, T6I, T7s, T7r, T8r, T8l, T8m, T6T, T8s, T7p;
Chris@10 600 {
Chris@10 601 V T7n, T6N, T6S, T7o, T7u, T7v, T6Z, T74;
Chris@10 602 T7f = VSUB(T7b, T7e);
Chris@10 603 T7u = VADD(T7b, T7e);
Chris@10 604 T7v = VADD(T78, T79);
Chris@10 605 T7a = VSUB(T78, T79);
Chris@10 606 ST(&(ri[WS(rs, 12)]), VFMA(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
Chris@10 607 ST(&(ri[WS(rs, 28)]), VFNMS(LDK(KP707106781), T7V, T7S), ms, &(ri[0]));
Chris@10 608 ST(&(ii[WS(rs, 28)]), VFNMS(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
Chris@10 609 ST(&(ii[WS(rs, 12)]), VFMA(LDK(KP707106781), T8i, T8h), ms, &(ii[0]));
Chris@10 610 T7m = VADD(T6E, T6H);
Chris@10 611 T6I = VSUB(T6E, T6H);
Chris@10 612 T7A = VFMA(LDK(KP414213562), T7u, T7v);
Chris@10 613 T7w = VFNMS(LDK(KP414213562), T7v, T7u);
Chris@10 614 T7n = VADD(T6M, T6L);
Chris@10 615 T6N = VSUB(T6L, T6M);
Chris@10 616 T6S = VADD(T6O, T6R);
Chris@10 617 T7o = VSUB(T6O, T6R);
Chris@10 618 T7s = VADD(T6X, T6Y);
Chris@10 619 T6Z = VSUB(T6X, T6Y);
Chris@10 620 T74 = VSUB(T70, T73);
Chris@10 621 T7r = VADD(T70, T73);
Chris@10 622 T8r = VSUB(T8k, T8j);
Chris@10 623 T8l = VADD(T8j, T8k);
Chris@10 624 T8m = VADD(T6N, T6S);
Chris@10 625 T6T = VSUB(T6N, T6S);
Chris@10 626 T7j = VFNMS(LDK(KP414213562), T6Z, T74);
Chris@10 627 T75 = VFMA(LDK(KP414213562), T74, T6Z);
Chris@10 628 T8s = VSUB(T7o, T7n);
Chris@10 629 T7p = VADD(T7n, T7o);
Chris@10 630 }
Chris@10 631 T7i = VFNMS(LDK(KP707106781), T6T, T6I);
Chris@10 632 T6U = VFMA(LDK(KP707106781), T6T, T6I);
Chris@10 633 T8p = VFNMS(LDK(KP707106781), T8m, T8l);
Chris@10 634 T8n = VFMA(LDK(KP707106781), T8m, T8l);
Chris@10 635 T8v = VFNMS(LDK(KP707106781), T8s, T8r);
Chris@10 636 T8t = VFMA(LDK(KP707106781), T8s, T8r);
Chris@10 637 T7q = VFMA(LDK(KP707106781), T7p, T7m);
Chris@10 638 T7y = VFNMS(LDK(KP707106781), T7p, T7m);
Chris@10 639 T7t = VFMA(LDK(KP414213562), T7s, T7r);
Chris@10 640 T7z = VFNMS(LDK(KP414213562), T7r, T7s);
Chris@10 641 T7g = VFNMS(LDK(KP414213562), T7f, T7a);
Chris@10 642 T7k = VFMA(LDK(KP414213562), T7a, T7f);
Chris@10 643 }
Chris@10 644 }
Chris@10 645 {
Chris@10 646 V T5S, T8O, T8N, T5V, T6d, T6g, T66, T4L, T5I, T69, T5y, T4o, T8J, T8L, T5M;
Chris@10 647 V T5Q, T5A, T5w, T5H, T4W, T5O, T5G, T8D, T8F;
Chris@10 648 {
Chris@10 649 V T5C, T3S, T8C, T4n, T8H, T8B, T8I, T5F, T5L, T5k, T5K, T5v, T4V;
Chris@10 650 {
Chris@10 651 V T5D, T47, T4m, T5E, T8z, T8A, T3C, T3R, T5j, T5u, T4K;
Chris@10 652 T5S = VSUB(T3w, T3B);
Chris@10 653 T3C = VADD(T3w, T3B);
Chris@10 654 T3R = VADD(T3J, T3Q);
Chris@10 655 T8O = VSUB(T3Q, T3J);
Chris@10 656 {
Chris@10 657 V T8o, T7B, T7x, T8q;
Chris@10 658 T8o = VADD(T7z, T7A);
Chris@10 659 T7B = VSUB(T7z, T7A);
Chris@10 660 T7x = VADD(T7t, T7w);
Chris@10 661 T8q = VSUB(T7w, T7t);
Chris@10 662 {
Chris@10 663 V T8u, T7l, T7h, T8w;
Chris@10 664 T8u = VSUB(T7k, T7j);
Chris@10 665 T7l = VADD(T7j, T7k);
Chris@10 666 T7h = VSUB(T75, T7g);
Chris@10 667 T8w = VADD(T75, T7g);
Chris@10 668 ST(&(ri[WS(rs, 10)]), VFMA(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
Chris@10 669 ST(&(ri[WS(rs, 26)]), VFNMS(LDK(KP923879532), T7B, T7y), ms, &(ri[0]));
Chris@10 670 ST(&(ii[WS(rs, 18)]), VFNMS(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
Chris@10 671 ST(&(ii[WS(rs, 2)]), VFMA(LDK(KP923879532), T8o, T8n), ms, &(ii[0]));
Chris@10 672 ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
Chris@10 673 ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T8q, T8p), ms, &(ii[0]));
Chris@10 674 ST(&(ri[WS(rs, 2)]), VFMA(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
Chris@10 675 ST(&(ri[WS(rs, 18)]), VFNMS(LDK(KP923879532), T7x, T7q), ms, &(ri[0]));
Chris@10 676 ST(&(ri[WS(rs, 30)]), VFMA(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
Chris@10 677 ST(&(ri[WS(rs, 14)]), VFNMS(LDK(KP923879532), T7l, T7i), ms, &(ri[0]));
Chris@10 678 ST(&(ii[WS(rs, 22)]), VFNMS(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
Chris@10 679 ST(&(ii[WS(rs, 6)]), VFMA(LDK(KP923879532), T8u, T8t), ms, &(ii[0]));
Chris@10 680 ST(&(ii[WS(rs, 30)]), VFMA(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
Chris@10 681 ST(&(ii[WS(rs, 14)]), VFNMS(LDK(KP923879532), T8w, T8v), ms, &(ii[0]));
Chris@10 682 ST(&(ri[WS(rs, 6)]), VFMA(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
Chris@10 683 ST(&(ri[WS(rs, 22)]), VFNMS(LDK(KP923879532), T7h, T6U), ms, &(ri[0]));
Chris@10 684 T5C = VFMA(LDK(KP707106781), T3R, T3C);
Chris@10 685 T3S = VFNMS(LDK(KP707106781), T3R, T3C);
Chris@10 686 }
Chris@10 687 }
Chris@10 688 T5D = VFMA(LDK(KP414213562), T3Z, T46);
Chris@10 689 T47 = VFNMS(LDK(KP414213562), T46, T3Z);
Chris@10 690 T4m = VFMA(LDK(KP414213562), T4l, T4e);
Chris@10 691 T5E = VFNMS(LDK(KP414213562), T4e, T4l);
Chris@10 692 T8N = VADD(T8y, T8x);
Chris@10 693 T8z = VSUB(T8x, T8y);
Chris@10 694 T8A = VADD(T5T, T5U);
Chris@10 695 T5V = VSUB(T5T, T5U);
Chris@10 696 T6d = VSUB(T5i, T5b);
Chris@10 697 T5j = VADD(T5b, T5i);
Chris@10 698 T5u = VADD(T5s, T5t);
Chris@10 699 T6g = VSUB(T5s, T5t);
Chris@10 700 T66 = VSUB(T4J, T4C);
Chris@10 701 T4K = VADD(T4C, T4J);
Chris@10 702 T8C = VADD(T47, T4m);
Chris@10 703 T4n = VSUB(T47, T4m);
Chris@10 704 T8H = VFNMS(LDK(KP707106781), T8A, T8z);
Chris@10 705 T8B = VFMA(LDK(KP707106781), T8A, T8z);
Chris@10 706 T8I = VSUB(T5E, T5D);
Chris@10 707 T5F = VADD(T5D, T5E);
Chris@10 708 T5L = VFMA(LDK(KP707106781), T5j, T54);
Chris@10 709 T5k = VFNMS(LDK(KP707106781), T5j, T54);
Chris@10 710 T5K = VFMA(LDK(KP707106781), T5u, T5r);
Chris@10 711 T5v = VFNMS(LDK(KP707106781), T5u, T5r);
Chris@10 712 T4L = VFNMS(LDK(KP707106781), T4K, T4v);
Chris@10 713 T5I = VFMA(LDK(KP707106781), T4K, T4v);
Chris@10 714 T4V = VADD(T4T, T4U);
Chris@10 715 T69 = VSUB(T4T, T4U);
Chris@10 716 }
Chris@10 717 T5y = VFNMS(LDK(KP923879532), T4n, T3S);
Chris@10 718 T4o = VFMA(LDK(KP923879532), T4n, T3S);
Chris@10 719 T8J = VFMA(LDK(KP923879532), T8I, T8H);
Chris@10 720 T8L = VFNMS(LDK(KP923879532), T8I, T8H);
Chris@10 721 T5M = VFNMS(LDK(KP198912367), T5L, T5K);
Chris@10 722 T5Q = VFMA(LDK(KP198912367), T5K, T5L);
Chris@10 723 T5A = VFMA(LDK(KP668178637), T5k, T5v);
Chris@10 724 T5w = VFNMS(LDK(KP668178637), T5v, T5k);
Chris@10 725 T5H = VFMA(LDK(KP707106781), T4V, T4S);
Chris@10 726 T4W = VFNMS(LDK(KP707106781), T4V, T4S);
Chris@10 727 T5O = VFNMS(LDK(KP923879532), T5F, T5C);
Chris@10 728 T5G = VFMA(LDK(KP923879532), T5F, T5C);
Chris@10 729 T8D = VFMA(LDK(KP923879532), T8C, T8B);
Chris@10 730 T8F = VFNMS(LDK(KP923879532), T8C, T8B);
Chris@10 731 }
Chris@10 732 {
Chris@10 733 V T6p, T6q, T6o, T5W, T8W, T63;
Chris@10 734 {
Chris@10 735 V T5J, T5P, T5z, T4X, T5Z, T62;
Chris@10 736 T5J = VFMA(LDK(KP198912367), T5I, T5H);
Chris@10 737 T5P = VFNMS(LDK(KP198912367), T5H, T5I);
Chris@10 738 T5z = VFNMS(LDK(KP668178637), T4L, T4W);
Chris@10 739 T4X = VFMA(LDK(KP668178637), T4W, T4L);
Chris@10 740 T6p = VFNMS(LDK(KP414213562), T5X, T5Y);
Chris@10 741 T5Z = VFMA(LDK(KP414213562), T5Y, T5X);
Chris@10 742 T62 = VFNMS(LDK(KP414213562), T61, T60);
Chris@10 743 T6q = VFMA(LDK(KP414213562), T60, T61);
Chris@10 744 {
Chris@10 745 V T8G, T5N, T5R, T8E;
Chris@10 746 T8G = VSUB(T5M, T5J);
Chris@10 747 T5N = VADD(T5J, T5M);
Chris@10 748 T5R = VSUB(T5P, T5Q);
Chris@10 749 T8E = VADD(T5P, T5Q);
Chris@10 750 {
Chris@10 751 V T5B, T8K, T8M, T5x;
Chris@10 752 T5B = VADD(T5z, T5A);
Chris@10 753 T8K = VSUB(T5A, T5z);
Chris@10 754 T8M = VADD(T4X, T5w);
Chris@10 755 T5x = VSUB(T4X, T5w);
Chris@10 756 T6o = VFNMS(LDK(KP707106781), T5V, T5S);
Chris@10 757 T5W = VFMA(LDK(KP707106781), T5V, T5S);
Chris@10 758 T8W = VADD(T5Z, T62);
Chris@10 759 T63 = VSUB(T5Z, T62);
Chris@10 760 ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
Chris@10 761 ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)]));
Chris@10 762 ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
Chris@10 763 ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)]));
Chris@10 764 ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
Chris@10 765 ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)]));
Chris@10 766 ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
Chris@10 767 ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)]));
Chris@10 768 ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
Chris@10 769 ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)]));
Chris@10 770 ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
Chris@10 771 ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)]));
Chris@10 772 ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
Chris@10 773 ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)]));
Chris@10 774 ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
Chris@10 775 ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)]));
Chris@10 776 }
Chris@10 777 }
Chris@10 778 }
Chris@10 779 {
Chris@10 780 V T6k, T64, T8V, T6r, T8R, T8T, T6y, T6C, T6m, T6i, T6v, T6B, T6l, T6b, T6A;
Chris@10 781 V T6s, T8X;
Chris@10 782 {
Chris@10 783 V T6x, T6e, T6w, T6h, T6u, T67, T6t, T6a, T8P, T8Q;
Chris@10 784 T6k = VFNMS(LDK(KP923879532), T63, T5W);
Chris@10 785 T64 = VFMA(LDK(KP923879532), T63, T5W);
Chris@10 786 T8V = VFNMS(LDK(KP707106781), T8O, T8N);
Chris@10 787 T8P = VFMA(LDK(KP707106781), T8O, T8N);
Chris@10 788 T8Q = VSUB(T6q, T6p);
Chris@10 789 T6r = VADD(T6p, T6q);
Chris@10 790 T6x = VFMA(LDK(KP707106781), T6d, T6c);
Chris@10 791 T6e = VFNMS(LDK(KP707106781), T6d, T6c);
Chris@10 792 T6w = VFMA(LDK(KP707106781), T6g, T6f);
Chris@10 793 T6h = VFNMS(LDK(KP707106781), T6g, T6f);
Chris@10 794 T6u = VFMA(LDK(KP707106781), T66, T65);
Chris@10 795 T67 = VFNMS(LDK(KP707106781), T66, T65);
Chris@10 796 T6t = VFMA(LDK(KP707106781), T69, T68);
Chris@10 797 T6a = VFNMS(LDK(KP707106781), T69, T68);
Chris@10 798 T8R = VFMA(LDK(KP923879532), T8Q, T8P);
Chris@10 799 T8T = VFNMS(LDK(KP923879532), T8Q, T8P);
Chris@10 800 T6y = VFNMS(LDK(KP198912367), T6x, T6w);
Chris@10 801 T6C = VFMA(LDK(KP198912367), T6w, T6x);
Chris@10 802 T6m = VFMA(LDK(KP668178637), T6e, T6h);
Chris@10 803 T6i = VFNMS(LDK(KP668178637), T6h, T6e);
Chris@10 804 T6v = VFMA(LDK(KP198912367), T6u, T6t);
Chris@10 805 T6B = VFNMS(LDK(KP198912367), T6t, T6u);
Chris@10 806 T6l = VFNMS(LDK(KP668178637), T67, T6a);
Chris@10 807 T6b = VFMA(LDK(KP668178637), T6a, T67);
Chris@10 808 }
Chris@10 809 T6A = VFMA(LDK(KP923879532), T6r, T6o);
Chris@10 810 T6s = VFNMS(LDK(KP923879532), T6r, T6o);
Chris@10 811 T8X = VFNMS(LDK(KP923879532), T8W, T8V);
Chris@10 812 T8Z = VFMA(LDK(KP923879532), T8W, T8V);
Chris@10 813 {
Chris@10 814 V T6z, T6D, T8Y, T6n, T8S, T8U, T6j;
Chris@10 815 T6z = VSUB(T6v, T6y);
Chris@10 816 T90 = VADD(T6v, T6y);
Chris@10 817 T6D = VADD(T6B, T6C);
Chris@10 818 T8Y = VSUB(T6C, T6B);
Chris@10 819 T6n = VSUB(T6l, T6m);
Chris@10 820 T8S = VADD(T6l, T6m);
Chris@10 821 T8U = VSUB(T6i, T6b);
Chris@10 822 T6j = VADD(T6b, T6i);
Chris@10 823 ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
Chris@10 824 ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)]));
Chris@10 825 ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
Chris@10 826 ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)]));
Chris@10 827 ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
Chris@10 828 ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)]));
Chris@10 829 ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
Chris@10 830 ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)]));
Chris@10 831 ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
Chris@10 832 ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)]));
Chris@10 833 ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
Chris@10 834 ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)]));
Chris@10 835 ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
Chris@10 836 ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)]));
Chris@10 837 }
Chris@10 838 }
Chris@10 839 }
Chris@10 840 }
Chris@10 841 }
Chris@10 842 }
Chris@10 843 ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
Chris@10 844 ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)]));
Chris@10 845 }
Chris@10 846 }
Chris@10 847 VLEAVE();
Chris@10 848 }
Chris@10 849
Chris@10 850 static const tw_instr twinstr[] = {
Chris@10 851 VTW(0, 1),
Chris@10 852 VTW(0, 2),
Chris@10 853 VTW(0, 3),
Chris@10 854 VTW(0, 4),
Chris@10 855 VTW(0, 5),
Chris@10 856 VTW(0, 6),
Chris@10 857 VTW(0, 7),
Chris@10 858 VTW(0, 8),
Chris@10 859 VTW(0, 9),
Chris@10 860 VTW(0, 10),
Chris@10 861 VTW(0, 11),
Chris@10 862 VTW(0, 12),
Chris@10 863 VTW(0, 13),
Chris@10 864 VTW(0, 14),
Chris@10 865 VTW(0, 15),
Chris@10 866 VTW(0, 16),
Chris@10 867 VTW(0, 17),
Chris@10 868 VTW(0, 18),
Chris@10 869 VTW(0, 19),
Chris@10 870 VTW(0, 20),
Chris@10 871 VTW(0, 21),
Chris@10 872 VTW(0, 22),
Chris@10 873 VTW(0, 23),
Chris@10 874 VTW(0, 24),
Chris@10 875 VTW(0, 25),
Chris@10 876 VTW(0, 26),
Chris@10 877 VTW(0, 27),
Chris@10 878 VTW(0, 28),
Chris@10 879 VTW(0, 29),
Chris@10 880 VTW(0, 30),
Chris@10 881 VTW(0, 31),
Chris@10 882 {TW_NEXT, (2 * VL), 0}
Chris@10 883 };
Chris@10 884
Chris@10 885 static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
Chris@10 886
Chris@10 887 void XSIMD(codelet_t1sv_32) (planner *p) {
Chris@10 888 X(kdft_dit_register) (p, t1sv_32, &desc);
Chris@10 889 }
Chris@10 890 #else /* HAVE_FMA */
Chris@10 891
Chris@10 892 /* Generated by: ../../../genfft/gen_twiddle.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h */
Chris@10 893
Chris@10 894 /*
Chris@10 895 * This function contains 434 FP additions, 208 FP multiplications,
Chris@10 896 * (or, 340 additions, 114 multiplications, 94 fused multiply/add),
Chris@10 897 * 96 stack variables, 7 constants, and 128 memory accesses
Chris@10 898 */
Chris@10 899 #include "ts.h"
Chris@10 900
Chris@10 901 static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 902 {
Chris@10 903 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@10 904 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@10 905 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@10 906 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@10 907 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 908 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 909 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 910 {
Chris@10 911 INT m;
Chris@10 912 for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@10 913 V Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41;
Chris@10 914 V T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U;
Chris@10 915 V T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x;
Chris@10 916 V T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P;
Chris@10 917 V T4m, T5h, T4v, T5e;
Chris@10 918 {
Chris@10 919 V T1, T76, T6, T75, Tc, T32, Th, T33;
Chris@10 920 T1 = LD(&(ri[0]), ms, &(ri[0]));
Chris@10 921 T76 = LD(&(ii[0]), ms, &(ii[0]));
Chris@10 922 {
Chris@10 923 V T3, T5, T2, T4;
Chris@10 924 T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0]));
Chris@10 925 T5 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0]));
Chris@10 926 T2 = LDW(&(W[TWVL * 30]));
Chris@10 927 T4 = LDW(&(W[TWVL * 31]));
Chris@10 928 T6 = VFMA(T2, T3, VMUL(T4, T5));
Chris@10 929 T75 = VFNMS(T4, T3, VMUL(T2, T5));
Chris@10 930 }
Chris@10 931 {
Chris@10 932 V T9, Tb, T8, Ta;
Chris@10 933 T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0]));
Chris@10 934 Tb = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
Chris@10 935 T8 = LDW(&(W[TWVL * 14]));
Chris@10 936 Ta = LDW(&(W[TWVL * 15]));
Chris@10 937 Tc = VFMA(T8, T9, VMUL(Ta, Tb));
Chris@10 938 T32 = VFNMS(Ta, T9, VMUL(T8, Tb));
Chris@10 939 }
Chris@10 940 {
Chris@10 941 V Te, Tg, Td, Tf;
Chris@10 942 Te = LD(&(ri[WS(rs, 24)]), ms, &(ri[0]));
Chris@10 943 Tg = LD(&(ii[WS(rs, 24)]), ms, &(ii[0]));
Chris@10 944 Td = LDW(&(W[TWVL * 46]));
Chris@10 945 Tf = LDW(&(W[TWVL * 47]));
Chris@10 946 Th = VFMA(Td, Te, VMUL(Tf, Tg));
Chris@10 947 T33 = VFNMS(Tf, Te, VMUL(Td, Tg));
Chris@10 948 }
Chris@10 949 {
Chris@10 950 V T7, Ti, T7A, T7B;
Chris@10 951 T7 = VADD(T1, T6);
Chris@10 952 Ti = VADD(Tc, Th);
Chris@10 953 Tj = VADD(T7, Ti);
Chris@10 954 T5F = VSUB(T7, Ti);
Chris@10 955 T7A = VSUB(T76, T75);
Chris@10 956 T7B = VSUB(Tc, Th);
Chris@10 957 T7C = VSUB(T7A, T7B);
Chris@10 958 T7Q = VADD(T7B, T7A);
Chris@10 959 }
Chris@10 960 {
Chris@10 961 V T31, T34, T74, T77;
Chris@10 962 T31 = VSUB(T1, T6);
Chris@10 963 T34 = VSUB(T32, T33);
Chris@10 964 T35 = VSUB(T31, T34);
Chris@10 965 T4T = VADD(T31, T34);
Chris@10 966 T74 = VADD(T32, T33);
Chris@10 967 T77 = VADD(T75, T76);
Chris@10 968 T78 = VADD(T74, T77);
Chris@10 969 T7m = VSUB(T77, T74);
Chris@10 970 }
Chris@10 971 }
Chris@10 972 {
Chris@10 973 V T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y;
Chris@10 974 {
Chris@10 975 V T1v, T1x, T1u, T1w;
Chris@10 976 T1v = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)]));
Chris@10 977 T1x = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)]));
Chris@10 978 T1u = LDW(&(W[0]));
Chris@10 979 T1w = LDW(&(W[TWVL * 1]));
Chris@10 980 T1y = VFMA(T1u, T1v, VMUL(T1w, T1x));
Chris@10 981 T3G = VFNMS(T1w, T1v, VMUL(T1u, T1x));
Chris@10 982 }
Chris@10 983 {
Chris@10 984 V T1L, T1N, T1K, T1M;
Chris@10 985 T1L = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)]));
Chris@10 986 T1N = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)]));
Chris@10 987 T1K = LDW(&(W[TWVL * 48]));
Chris@10 988 T1M = LDW(&(W[TWVL * 49]));
Chris@10 989 T1O = VFMA(T1K, T1L, VMUL(T1M, T1N));
Chris@10 990 T3Z = VFNMS(T1M, T1L, VMUL(T1K, T1N));
Chris@10 991 }
Chris@10 992 {
Chris@10 993 V T1A, T1C, T1z, T1B;
Chris@10 994 T1A = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)]));
Chris@10 995 T1C = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)]));
Chris@10 996 T1z = LDW(&(W[TWVL * 32]));
Chris@10 997 T1B = LDW(&(W[TWVL * 33]));
Chris@10 998 T1D = VFMA(T1z, T1A, VMUL(T1B, T1C));
Chris@10 999 T3H = VFNMS(T1B, T1A, VMUL(T1z, T1C));
Chris@10 1000 }
Chris@10 1001 {
Chris@10 1002 V T1G, T1I, T1F, T1H;
Chris@10 1003 T1G = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1004 T1I = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1005 T1F = LDW(&(W[TWVL * 16]));
Chris@10 1006 T1H = LDW(&(W[TWVL * 17]));
Chris@10 1007 T1J = VFMA(T1F, T1G, VMUL(T1H, T1I));
Chris@10 1008 T3Y = VFNMS(T1H, T1G, VMUL(T1F, T1I));
Chris@10 1009 }
Chris@10 1010 {
Chris@10 1011 V T1E, T1P, T5W, T5X;
Chris@10 1012 T1E = VADD(T1y, T1D);
Chris@10 1013 T1P = VADD(T1J, T1O);
Chris@10 1014 T1Q = VADD(T1E, T1P);
Chris@10 1015 T61 = VSUB(T1E, T1P);
Chris@10 1016 T5W = VADD(T3G, T3H);
Chris@10 1017 T5X = VADD(T3Y, T3Z);
Chris@10 1018 T5Y = VSUB(T5W, T5X);
Chris@10 1019 T6J = VADD(T5W, T5X);
Chris@10 1020 }
Chris@10 1021 {
Chris@10 1022 V T3I, T3J, T3X, T40;
Chris@10 1023 T3I = VSUB(T3G, T3H);
Chris@10 1024 T3J = VSUB(T1J, T1O);
Chris@10 1025 T3K = VADD(T3I, T3J);
Chris@10 1026 T59 = VSUB(T3I, T3J);
Chris@10 1027 T3X = VSUB(T1y, T1D);
Chris@10 1028 T40 = VSUB(T3Y, T3Z);
Chris@10 1029 T41 = VSUB(T3X, T40);
Chris@10 1030 T56 = VADD(T3X, T40);
Chris@10 1031 }
Chris@10 1032 }
Chris@10 1033 {
Chris@10 1034 V T2j, T4o, T2z, T49, T2o, T4p, T2u, T48;
Chris@10 1035 {
Chris@10 1036 V T2g, T2i, T2f, T2h;
Chris@10 1037 T2g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1038 T2i = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1039 T2f = LDW(&(W[TWVL * 60]));
Chris@10 1040 T2h = LDW(&(W[TWVL * 61]));
Chris@10 1041 T2j = VFMA(T2f, T2g, VMUL(T2h, T2i));
Chris@10 1042 T4o = VFNMS(T2h, T2g, VMUL(T2f, T2i));
Chris@10 1043 }
Chris@10 1044 {
Chris@10 1045 V T2w, T2y, T2v, T2x;
Chris@10 1046 T2w = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1047 T2y = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1048 T2v = LDW(&(W[TWVL * 44]));
Chris@10 1049 T2x = LDW(&(W[TWVL * 45]));
Chris@10 1050 T2z = VFMA(T2v, T2w, VMUL(T2x, T2y));
Chris@10 1051 T49 = VFNMS(T2x, T2w, VMUL(T2v, T2y));
Chris@10 1052 }
Chris@10 1053 {
Chris@10 1054 V T2l, T2n, T2k, T2m;
Chris@10 1055 T2l = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1056 T2n = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1057 T2k = LDW(&(W[TWVL * 28]));
Chris@10 1058 T2m = LDW(&(W[TWVL * 29]));
Chris@10 1059 T2o = VFMA(T2k, T2l, VMUL(T2m, T2n));
Chris@10 1060 T4p = VFNMS(T2m, T2l, VMUL(T2k, T2n));
Chris@10 1061 }
Chris@10 1062 {
Chris@10 1063 V T2r, T2t, T2q, T2s;
Chris@10 1064 T2r = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1065 T2t = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1066 T2q = LDW(&(W[TWVL * 12]));
Chris@10 1067 T2s = LDW(&(W[TWVL * 13]));
Chris@10 1068 T2u = VFMA(T2q, T2r, VMUL(T2s, T2t));
Chris@10 1069 T48 = VFNMS(T2s, T2r, VMUL(T2q, T2t));
Chris@10 1070 }
Chris@10 1071 {
Chris@10 1072 V T2p, T2A, T6c, T6d;
Chris@10 1073 T2p = VADD(T2j, T2o);
Chris@10 1074 T2A = VADD(T2u, T2z);
Chris@10 1075 T2B = VADD(T2p, T2A);
Chris@10 1076 T67 = VSUB(T2p, T2A);
Chris@10 1077 T6c = VADD(T4o, T4p);
Chris@10 1078 T6d = VADD(T48, T49);
Chris@10 1079 T6e = VSUB(T6c, T6d);
Chris@10 1080 T6O = VADD(T6c, T6d);
Chris@10 1081 }
Chris@10 1082 {
Chris@10 1083 V T47, T4a, T4q, T4r;
Chris@10 1084 T47 = VSUB(T2j, T2o);
Chris@10 1085 T4a = VSUB(T48, T49);
Chris@10 1086 T4b = VSUB(T47, T4a);
Chris@10 1087 T5d = VADD(T47, T4a);
Chris@10 1088 T4q = VSUB(T4o, T4p);
Chris@10 1089 T4r = VSUB(T2u, T2z);
Chris@10 1090 T4s = VADD(T4q, T4r);
Chris@10 1091 T5g = VSUB(T4q, T4r);
Chris@10 1092 }
Chris@10 1093 }
Chris@10 1094 {
Chris@10 1095 V To, T36, TE, T3d, Tt, T37, Tz, T3c;
Chris@10 1096 {
Chris@10 1097 V Tl, Tn, Tk, Tm;
Chris@10 1098 Tl = LD(&(ri[WS(rs, 4)]), ms, &(ri[0]));
Chris@10 1099 Tn = LD(&(ii[WS(rs, 4)]), ms, &(ii[0]));
Chris@10 1100 Tk = LDW(&(W[TWVL * 6]));
Chris@10 1101 Tm = LDW(&(W[TWVL * 7]));
Chris@10 1102 To = VFMA(Tk, Tl, VMUL(Tm, Tn));
Chris@10 1103 T36 = VFNMS(Tm, Tl, VMUL(Tk, Tn));
Chris@10 1104 }
Chris@10 1105 {
Chris@10 1106 V TB, TD, TA, TC;
Chris@10 1107 TB = LD(&(ri[WS(rs, 12)]), ms, &(ri[0]));
Chris@10 1108 TD = LD(&(ii[WS(rs, 12)]), ms, &(ii[0]));
Chris@10 1109 TA = LDW(&(W[TWVL * 22]));
Chris@10 1110 TC = LDW(&(W[TWVL * 23]));
Chris@10 1111 TE = VFMA(TA, TB, VMUL(TC, TD));
Chris@10 1112 T3d = VFNMS(TC, TB, VMUL(TA, TD));
Chris@10 1113 }
Chris@10 1114 {
Chris@10 1115 V Tq, Ts, Tp, Tr;
Chris@10 1116 Tq = LD(&(ri[WS(rs, 20)]), ms, &(ri[0]));
Chris@10 1117 Ts = LD(&(ii[WS(rs, 20)]), ms, &(ii[0]));
Chris@10 1118 Tp = LDW(&(W[TWVL * 38]));
Chris@10 1119 Tr = LDW(&(W[TWVL * 39]));
Chris@10 1120 Tt = VFMA(Tp, Tq, VMUL(Tr, Ts));
Chris@10 1121 T37 = VFNMS(Tr, Tq, VMUL(Tp, Ts));
Chris@10 1122 }
Chris@10 1123 {
Chris@10 1124 V Tw, Ty, Tv, Tx;
Chris@10 1125 Tw = LD(&(ri[WS(rs, 28)]), ms, &(ri[0]));
Chris@10 1126 Ty = LD(&(ii[WS(rs, 28)]), ms, &(ii[0]));
Chris@10 1127 Tv = LDW(&(W[TWVL * 54]));
Chris@10 1128 Tx = LDW(&(W[TWVL * 55]));
Chris@10 1129 Tz = VFMA(Tv, Tw, VMUL(Tx, Ty));
Chris@10 1130 T3c = VFNMS(Tx, Tw, VMUL(Tv, Ty));
Chris@10 1131 }
Chris@10 1132 {
Chris@10 1133 V Tu, TF, T5G, T5H;
Chris@10 1134 Tu = VADD(To, Tt);
Chris@10 1135 TF = VADD(Tz, TE);
Chris@10 1136 TG = VADD(Tu, TF);
Chris@10 1137 T7l = VSUB(TF, Tu);
Chris@10 1138 T5G = VADD(T36, T37);
Chris@10 1139 T5H = VADD(T3c, T3d);
Chris@10 1140 T5I = VSUB(T5G, T5H);
Chris@10 1141 T73 = VADD(T5G, T5H);
Chris@10 1142 }
Chris@10 1143 {
Chris@10 1144 V T38, T39, T3b, T3e;
Chris@10 1145 T38 = VSUB(T36, T37);
Chris@10 1146 T39 = VSUB(To, Tt);
Chris@10 1147 T3a = VSUB(T38, T39);
Chris@10 1148 T4U = VADD(T39, T38);
Chris@10 1149 T3b = VSUB(Tz, TE);
Chris@10 1150 T3e = VSUB(T3c, T3d);
Chris@10 1151 T3f = VADD(T3b, T3e);
Chris@10 1152 T4V = VSUB(T3b, T3e);
Chris@10 1153 }
Chris@10 1154 }
Chris@10 1155 {
Chris@10 1156 V TM, T3i, T12, T3p, TR, T3j, TX, T3o;
Chris@10 1157 {
Chris@10 1158 V TJ, TL, TI, TK;
Chris@10 1159 TJ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0]));
Chris@10 1160 TL = LD(&(ii[WS(rs, 2)]), ms, &(ii[0]));
Chris@10 1161 TI = LDW(&(W[TWVL * 2]));
Chris@10 1162 TK = LDW(&(W[TWVL * 3]));
Chris@10 1163 TM = VFMA(TI, TJ, VMUL(TK, TL));
Chris@10 1164 T3i = VFNMS(TK, TJ, VMUL(TI, TL));
Chris@10 1165 }
Chris@10 1166 {
Chris@10 1167 V TZ, T11, TY, T10;
Chris@10 1168 TZ = LD(&(ri[WS(rs, 26)]), ms, &(ri[0]));
Chris@10 1169 T11 = LD(&(ii[WS(rs, 26)]), ms, &(ii[0]));
Chris@10 1170 TY = LDW(&(W[TWVL * 50]));
Chris@10 1171 T10 = LDW(&(W[TWVL * 51]));
Chris@10 1172 T12 = VFMA(TY, TZ, VMUL(T10, T11));
Chris@10 1173 T3p = VFNMS(T10, TZ, VMUL(TY, T11));
Chris@10 1174 }
Chris@10 1175 {
Chris@10 1176 V TO, TQ, TN, TP;
Chris@10 1177 TO = LD(&(ri[WS(rs, 18)]), ms, &(ri[0]));
Chris@10 1178 TQ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0]));
Chris@10 1179 TN = LDW(&(W[TWVL * 34]));
Chris@10 1180 TP = LDW(&(W[TWVL * 35]));
Chris@10 1181 TR = VFMA(TN, TO, VMUL(TP, TQ));
Chris@10 1182 T3j = VFNMS(TP, TO, VMUL(TN, TQ));
Chris@10 1183 }
Chris@10 1184 {
Chris@10 1185 V TU, TW, TT, TV;
Chris@10 1186 TU = LD(&(ri[WS(rs, 10)]), ms, &(ri[0]));
Chris@10 1187 TW = LD(&(ii[WS(rs, 10)]), ms, &(ii[0]));
Chris@10 1188 TT = LDW(&(W[TWVL * 18]));
Chris@10 1189 TV = LDW(&(W[TWVL * 19]));
Chris@10 1190 TX = VFMA(TT, TU, VMUL(TV, TW));
Chris@10 1191 T3o = VFNMS(TV, TU, VMUL(TT, TW));
Chris@10 1192 }
Chris@10 1193 {
Chris@10 1194 V TS, T13, T5K, T5L;
Chris@10 1195 TS = VADD(TM, TR);
Chris@10 1196 T13 = VADD(TX, T12);
Chris@10 1197 T14 = VADD(TS, T13);
Chris@10 1198 T5N = VSUB(TS, T13);
Chris@10 1199 T5K = VADD(T3i, T3j);
Chris@10 1200 T5L = VADD(T3o, T3p);
Chris@10 1201 T5M = VSUB(T5K, T5L);
Chris@10 1202 T6E = VADD(T5K, T5L);
Chris@10 1203 }
Chris@10 1204 {
Chris@10 1205 V T3k, T3l, T3n, T3q;
Chris@10 1206 T3k = VSUB(T3i, T3j);
Chris@10 1207 T3l = VSUB(TX, T12);
Chris@10 1208 T3m = VADD(T3k, T3l);
Chris@10 1209 T4Y = VSUB(T3k, T3l);
Chris@10 1210 T3n = VSUB(TM, TR);
Chris@10 1211 T3q = VSUB(T3o, T3p);
Chris@10 1212 T3r = VSUB(T3n, T3q);
Chris@10 1213 T4Z = VADD(T3n, T3q);
Chris@10 1214 }
Chris@10 1215 }
Chris@10 1216 {
Chris@10 1217 V T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z;
Chris@10 1218 {
Chris@10 1219 V T16, T18, T15, T17;
Chris@10 1220 T16 = LD(&(ri[WS(rs, 30)]), ms, &(ri[0]));
Chris@10 1221 T18 = LD(&(ii[WS(rs, 30)]), ms, &(ii[0]));
Chris@10 1222 T15 = LDW(&(W[TWVL * 58]));
Chris@10 1223 T17 = LDW(&(W[TWVL * 59]));
Chris@10 1224 T19 = VFMA(T15, T16, VMUL(T17, T18));
Chris@10 1225 T3t = VFNMS(T17, T16, VMUL(T15, T18));
Chris@10 1226 }
Chris@10 1227 {
Chris@10 1228 V T1m, T1o, T1l, T1n;
Chris@10 1229 T1m = LD(&(ri[WS(rs, 22)]), ms, &(ri[0]));
Chris@10 1230 T1o = LD(&(ii[WS(rs, 22)]), ms, &(ii[0]));
Chris@10 1231 T1l = LDW(&(W[TWVL * 42]));
Chris@10 1232 T1n = LDW(&(W[TWVL * 43]));
Chris@10 1233 T1p = VFMA(T1l, T1m, VMUL(T1n, T1o));
Chris@10 1234 T3A = VFNMS(T1n, T1m, VMUL(T1l, T1o));
Chris@10 1235 }
Chris@10 1236 {
Chris@10 1237 V T1b, T1d, T1a, T1c;
Chris@10 1238 T1b = LD(&(ri[WS(rs, 14)]), ms, &(ri[0]));
Chris@10 1239 T1d = LD(&(ii[WS(rs, 14)]), ms, &(ii[0]));
Chris@10 1240 T1a = LDW(&(W[TWVL * 26]));
Chris@10 1241 T1c = LDW(&(W[TWVL * 27]));
Chris@10 1242 T1e = VFMA(T1a, T1b, VMUL(T1c, T1d));
Chris@10 1243 T3u = VFNMS(T1c, T1b, VMUL(T1a, T1d));
Chris@10 1244 }
Chris@10 1245 {
Chris@10 1246 V T1h, T1j, T1g, T1i;
Chris@10 1247 T1h = LD(&(ri[WS(rs, 6)]), ms, &(ri[0]));
Chris@10 1248 T1j = LD(&(ii[WS(rs, 6)]), ms, &(ii[0]));
Chris@10 1249 T1g = LDW(&(W[TWVL * 10]));
Chris@10 1250 T1i = LDW(&(W[TWVL * 11]));
Chris@10 1251 T1k = VFMA(T1g, T1h, VMUL(T1i, T1j));
Chris@10 1252 T3z = VFNMS(T1i, T1h, VMUL(T1g, T1j));
Chris@10 1253 }
Chris@10 1254 {
Chris@10 1255 V T1f, T1q, T5Q, T5R;
Chris@10 1256 T1f = VADD(T19, T1e);
Chris@10 1257 T1q = VADD(T1k, T1p);
Chris@10 1258 T1r = VADD(T1f, T1q);
Chris@10 1259 T5P = VSUB(T1f, T1q);
Chris@10 1260 T5Q = VADD(T3t, T3u);
Chris@10 1261 T5R = VADD(T3z, T3A);
Chris@10 1262 T5S = VSUB(T5Q, T5R);
Chris@10 1263 T6F = VADD(T5Q, T5R);
Chris@10 1264 }
Chris@10 1265 {
Chris@10 1266 V T3v, T3w, T3y, T3B;
Chris@10 1267 T3v = VSUB(T3t, T3u);
Chris@10 1268 T3w = VSUB(T1k, T1p);
Chris@10 1269 T3x = VADD(T3v, T3w);
Chris@10 1270 T51 = VSUB(T3v, T3w);
Chris@10 1271 T3y = VSUB(T19, T1e);
Chris@10 1272 T3B = VSUB(T3z, T3A);
Chris@10 1273 T3C = VSUB(T3y, T3B);
Chris@10 1274 T52 = VADD(T3y, T3B);
Chris@10 1275 }
Chris@10 1276 }
Chris@10 1277 {
Chris@10 1278 V T1V, T3R, T20, T3S, T3Q, T3T, T26, T3M, T2b, T3N, T3L, T3O;
Chris@10 1279 {
Chris@10 1280 V T1S, T1U, T1R, T1T;
Chris@10 1281 T1S = LD(&(ri[WS(rs, 5)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1282 T1U = LD(&(ii[WS(rs, 5)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1283 T1R = LDW(&(W[TWVL * 8]));
Chris@10 1284 T1T = LDW(&(W[TWVL * 9]));
Chris@10 1285 T1V = VFMA(T1R, T1S, VMUL(T1T, T1U));
Chris@10 1286 T3R = VFNMS(T1T, T1S, VMUL(T1R, T1U));
Chris@10 1287 }
Chris@10 1288 {
Chris@10 1289 V T1X, T1Z, T1W, T1Y;
Chris@10 1290 T1X = LD(&(ri[WS(rs, 21)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1291 T1Z = LD(&(ii[WS(rs, 21)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1292 T1W = LDW(&(W[TWVL * 40]));
Chris@10 1293 T1Y = LDW(&(W[TWVL * 41]));
Chris@10 1294 T20 = VFMA(T1W, T1X, VMUL(T1Y, T1Z));
Chris@10 1295 T3S = VFNMS(T1Y, T1X, VMUL(T1W, T1Z));
Chris@10 1296 }
Chris@10 1297 T3Q = VSUB(T1V, T20);
Chris@10 1298 T3T = VSUB(T3R, T3S);
Chris@10 1299 {
Chris@10 1300 V T23, T25, T22, T24;
Chris@10 1301 T23 = LD(&(ri[WS(rs, 29)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1302 T25 = LD(&(ii[WS(rs, 29)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1303 T22 = LDW(&(W[TWVL * 56]));
Chris@10 1304 T24 = LDW(&(W[TWVL * 57]));
Chris@10 1305 T26 = VFMA(T22, T23, VMUL(T24, T25));
Chris@10 1306 T3M = VFNMS(T24, T23, VMUL(T22, T25));
Chris@10 1307 }
Chris@10 1308 {
Chris@10 1309 V T28, T2a, T27, T29;
Chris@10 1310 T28 = LD(&(ri[WS(rs, 13)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1311 T2a = LD(&(ii[WS(rs, 13)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1312 T27 = LDW(&(W[TWVL * 24]));
Chris@10 1313 T29 = LDW(&(W[TWVL * 25]));
Chris@10 1314 T2b = VFMA(T27, T28, VMUL(T29, T2a));
Chris@10 1315 T3N = VFNMS(T29, T28, VMUL(T27, T2a));
Chris@10 1316 }
Chris@10 1317 T3L = VSUB(T26, T2b);
Chris@10 1318 T3O = VSUB(T3M, T3N);
Chris@10 1319 {
Chris@10 1320 V T21, T2c, T62, T63;
Chris@10 1321 T21 = VADD(T1V, T20);
Chris@10 1322 T2c = VADD(T26, T2b);
Chris@10 1323 T2d = VADD(T21, T2c);
Chris@10 1324 T5Z = VSUB(T2c, T21);
Chris@10 1325 T62 = VADD(T3R, T3S);
Chris@10 1326 T63 = VADD(T3M, T3N);
Chris@10 1327 T64 = VSUB(T62, T63);
Chris@10 1328 T6K = VADD(T62, T63);
Chris@10 1329 }
Chris@10 1330 {
Chris@10 1331 V T3P, T3U, T42, T43;
Chris@10 1332 T3P = VSUB(T3L, T3O);
Chris@10 1333 T3U = VADD(T3Q, T3T);
Chris@10 1334 T3V = VMUL(LDK(KP707106781), VSUB(T3P, T3U));
Chris@10 1335 T57 = VMUL(LDK(KP707106781), VADD(T3U, T3P));
Chris@10 1336 T42 = VSUB(T3T, T3Q);
Chris@10 1337 T43 = VADD(T3L, T3O);
Chris@10 1338 T44 = VMUL(LDK(KP707106781), VSUB(T42, T43));
Chris@10 1339 T5a = VMUL(LDK(KP707106781), VADD(T42, T43));
Chris@10 1340 }
Chris@10 1341 }
Chris@10 1342 {
Chris@10 1343 V T2G, T4c, T2L, T4d, T4e, T4f, T2R, T4i, T2W, T4j, T4h, T4k;
Chris@10 1344 {
Chris@10 1345 V T2D, T2F, T2C, T2E;
Chris@10 1346 T2D = LD(&(ri[WS(rs, 3)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1347 T2F = LD(&(ii[WS(rs, 3)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1348 T2C = LDW(&(W[TWVL * 4]));
Chris@10 1349 T2E = LDW(&(W[TWVL * 5]));
Chris@10 1350 T2G = VFMA(T2C, T2D, VMUL(T2E, T2F));
Chris@10 1351 T4c = VFNMS(T2E, T2D, VMUL(T2C, T2F));
Chris@10 1352 }
Chris@10 1353 {
Chris@10 1354 V T2I, T2K, T2H, T2J;
Chris@10 1355 T2I = LD(&(ri[WS(rs, 19)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1356 T2K = LD(&(ii[WS(rs, 19)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1357 T2H = LDW(&(W[TWVL * 36]));
Chris@10 1358 T2J = LDW(&(W[TWVL * 37]));
Chris@10 1359 T2L = VFMA(T2H, T2I, VMUL(T2J, T2K));
Chris@10 1360 T4d = VFNMS(T2J, T2I, VMUL(T2H, T2K));
Chris@10 1361 }
Chris@10 1362 T4e = VSUB(T4c, T4d);
Chris@10 1363 T4f = VSUB(T2G, T2L);
Chris@10 1364 {
Chris@10 1365 V T2O, T2Q, T2N, T2P;
Chris@10 1366 T2O = LD(&(ri[WS(rs, 27)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1367 T2Q = LD(&(ii[WS(rs, 27)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1368 T2N = LDW(&(W[TWVL * 52]));
Chris@10 1369 T2P = LDW(&(W[TWVL * 53]));
Chris@10 1370 T2R = VFMA(T2N, T2O, VMUL(T2P, T2Q));
Chris@10 1371 T4i = VFNMS(T2P, T2O, VMUL(T2N, T2Q));
Chris@10 1372 }
Chris@10 1373 {
Chris@10 1374 V T2T, T2V, T2S, T2U;
Chris@10 1375 T2T = LD(&(ri[WS(rs, 11)]), ms, &(ri[WS(rs, 1)]));
Chris@10 1376 T2V = LD(&(ii[WS(rs, 11)]), ms, &(ii[WS(rs, 1)]));
Chris@10 1377 T2S = LDW(&(W[TWVL * 20]));
Chris@10 1378 T2U = LDW(&(W[TWVL * 21]));
Chris@10 1379 T2W = VFMA(T2S, T2T, VMUL(T2U, T2V));
Chris@10 1380 T4j = VFNMS(T2U, T2T, VMUL(T2S, T2V));
Chris@10 1381 }
Chris@10 1382 T4h = VSUB(T2R, T2W);
Chris@10 1383 T4k = VSUB(T4i, T4j);
Chris@10 1384 {
Chris@10 1385 V T2M, T2X, T68, T69;
Chris@10 1386 T2M = VADD(T2G, T2L);
Chris@10 1387 T2X = VADD(T2R, T2W);
Chris@10 1388 T2Y = VADD(T2M, T2X);
Chris@10 1389 T6f = VSUB(T2X, T2M);
Chris@10 1390 T68 = VADD(T4c, T4d);
Chris@10 1391 T69 = VADD(T4i, T4j);
Chris@10 1392 T6a = VSUB(T68, T69);
Chris@10 1393 T6P = VADD(T68, T69);
Chris@10 1394 }
Chris@10 1395 {
Chris@10 1396 V T4g, T4l, T4t, T4u;
Chris@10 1397 T4g = VSUB(T4e, T4f);
Chris@10 1398 T4l = VADD(T4h, T4k);
Chris@10 1399 T4m = VMUL(LDK(KP707106781), VSUB(T4g, T4l));
Chris@10 1400 T5h = VMUL(LDK(KP707106781), VADD(T4g, T4l));
Chris@10 1401 T4t = VSUB(T4h, T4k);
Chris@10 1402 T4u = VADD(T4f, T4e);
Chris@10 1403 T4v = VMUL(LDK(KP707106781), VSUB(T4t, T4u));
Chris@10 1404 T5e = VMUL(LDK(KP707106781), VADD(T4u, T4t));
Chris@10 1405 }
Chris@10 1406 }
Chris@10 1407 {
Chris@10 1408 V T1t, T6X, T7a, T7c, T30, T7b, T70, T71;
Chris@10 1409 {
Chris@10 1410 V TH, T1s, T72, T79;
Chris@10 1411 TH = VADD(Tj, TG);
Chris@10 1412 T1s = VADD(T14, T1r);
Chris@10 1413 T1t = VADD(TH, T1s);
Chris@10 1414 T6X = VSUB(TH, T1s);
Chris@10 1415 T72 = VADD(T6E, T6F);
Chris@10 1416 T79 = VADD(T73, T78);
Chris@10 1417 T7a = VADD(T72, T79);
Chris@10 1418 T7c = VSUB(T79, T72);
Chris@10 1419 }
Chris@10 1420 {
Chris@10 1421 V T2e, T2Z, T6Y, T6Z;
Chris@10 1422 T2e = VADD(T1Q, T2d);
Chris@10 1423 T2Z = VADD(T2B, T2Y);
Chris@10 1424 T30 = VADD(T2e, T2Z);
Chris@10 1425 T7b = VSUB(T2Z, T2e);
Chris@10 1426 T6Y = VADD(T6J, T6K);
Chris@10 1427 T6Z = VADD(T6O, T6P);
Chris@10 1428 T70 = VSUB(T6Y, T6Z);
Chris@10 1429 T71 = VADD(T6Y, T6Z);
Chris@10 1430 }
Chris@10 1431 ST(&(ri[WS(rs, 16)]), VSUB(T1t, T30), ms, &(ri[0]));
Chris@10 1432 ST(&(ii[WS(rs, 16)]), VSUB(T7a, T71), ms, &(ii[0]));
Chris@10 1433 ST(&(ri[0]), VADD(T1t, T30), ms, &(ri[0]));
Chris@10 1434 ST(&(ii[0]), VADD(T71, T7a), ms, &(ii[0]));
Chris@10 1435 ST(&(ri[WS(rs, 24)]), VSUB(T6X, T70), ms, &(ri[0]));
Chris@10 1436 ST(&(ii[WS(rs, 24)]), VSUB(T7c, T7b), ms, &(ii[0]));
Chris@10 1437 ST(&(ri[WS(rs, 8)]), VADD(T6X, T70), ms, &(ri[0]));
Chris@10 1438 ST(&(ii[WS(rs, 8)]), VADD(T7b, T7c), ms, &(ii[0]));
Chris@10 1439 }
Chris@10 1440 {
Chris@10 1441 V T6H, T6T, T7g, T7i, T6M, T6U, T6R, T6V;
Chris@10 1442 {
Chris@10 1443 V T6D, T6G, T7e, T7f;
Chris@10 1444 T6D = VSUB(Tj, TG);
Chris@10 1445 T6G = VSUB(T6E, T6F);
Chris@10 1446 T6H = VADD(T6D, T6G);
Chris@10 1447 T6T = VSUB(T6D, T6G);
Chris@10 1448 T7e = VSUB(T1r, T14);
Chris@10 1449 T7f = VSUB(T78, T73);
Chris@10 1450 T7g = VADD(T7e, T7f);
Chris@10 1451 T7i = VSUB(T7f, T7e);
Chris@10 1452 }
Chris@10 1453 {
Chris@10 1454 V T6I, T6L, T6N, T6Q;
Chris@10 1455 T6I = VSUB(T1Q, T2d);
Chris@10 1456 T6L = VSUB(T6J, T6K);
Chris@10 1457 T6M = VADD(T6I, T6L);
Chris@10 1458 T6U = VSUB(T6L, T6I);
Chris@10 1459 T6N = VSUB(T2B, T2Y);
Chris@10 1460 T6Q = VSUB(T6O, T6P);
Chris@10 1461 T6R = VSUB(T6N, T6Q);
Chris@10 1462 T6V = VADD(T6N, T6Q);
Chris@10 1463 }
Chris@10 1464 {
Chris@10 1465 V T6S, T7d, T6W, T7h;
Chris@10 1466 T6S = VMUL(LDK(KP707106781), VADD(T6M, T6R));
Chris@10 1467 ST(&(ri[WS(rs, 20)]), VSUB(T6H, T6S), ms, &(ri[0]));
Chris@10 1468 ST(&(ri[WS(rs, 4)]), VADD(T6H, T6S), ms, &(ri[0]));
Chris@10 1469 T7d = VMUL(LDK(KP707106781), VADD(T6U, T6V));
Chris@10 1470 ST(&(ii[WS(rs, 4)]), VADD(T7d, T7g), ms, &(ii[0]));
Chris@10 1471 ST(&(ii[WS(rs, 20)]), VSUB(T7g, T7d), ms, &(ii[0]));
Chris@10 1472 T6W = VMUL(LDK(KP707106781), VSUB(T6U, T6V));
Chris@10 1473 ST(&(ri[WS(rs, 28)]), VSUB(T6T, T6W), ms, &(ri[0]));
Chris@10 1474 ST(&(ri[WS(rs, 12)]), VADD(T6T, T6W), ms, &(ri[0]));
Chris@10 1475 T7h = VMUL(LDK(KP707106781), VSUB(T6R, T6M));
Chris@10 1476 ST(&(ii[WS(rs, 12)]), VADD(T7h, T7i), ms, &(ii[0]));
Chris@10 1477 ST(&(ii[WS(rs, 28)]), VSUB(T7i, T7h), ms, &(ii[0]));
Chris@10 1478 }
Chris@10 1479 }
Chris@10 1480 {
Chris@10 1481 V T5J, T7n, T7t, T6n, T5U, T7k, T6x, T6B, T6q, T7s, T66, T6k, T6u, T6A, T6h;
Chris@10 1482 V T6l;
Chris@10 1483 {
Chris@10 1484 V T5O, T5T, T60, T65;
Chris@10 1485 T5J = VSUB(T5F, T5I);
Chris@10 1486 T7n = VADD(T7l, T7m);
Chris@10 1487 T7t = VSUB(T7m, T7l);
Chris@10 1488 T6n = VADD(T5F, T5I);
Chris@10 1489 T5O = VSUB(T5M, T5N);
Chris@10 1490 T5T = VADD(T5P, T5S);
Chris@10 1491 T5U = VMUL(LDK(KP707106781), VSUB(T5O, T5T));
Chris@10 1492 T7k = VMUL(LDK(KP707106781), VADD(T5O, T5T));
Chris@10 1493 {
Chris@10 1494 V T6v, T6w, T6o, T6p;
Chris@10 1495 T6v = VADD(T67, T6a);
Chris@10 1496 T6w = VADD(T6e, T6f);
Chris@10 1497 T6x = VFNMS(LDK(KP382683432), T6w, VMUL(LDK(KP923879532), T6v));
Chris@10 1498 T6B = VFMA(LDK(KP923879532), T6w, VMUL(LDK(KP382683432), T6v));
Chris@10 1499 T6o = VADD(T5N, T5M);
Chris@10 1500 T6p = VSUB(T5P, T5S);
Chris@10 1501 T6q = VMUL(LDK(KP707106781), VADD(T6o, T6p));
Chris@10 1502 T7s = VMUL(LDK(KP707106781), VSUB(T6p, T6o));
Chris@10 1503 }
Chris@10 1504 T60 = VSUB(T5Y, T5Z);
Chris@10 1505 T65 = VSUB(T61, T64);
Chris@10 1506 T66 = VFMA(LDK(KP923879532), T60, VMUL(LDK(KP382683432), T65));
Chris@10 1507 T6k = VFNMS(LDK(KP923879532), T65, VMUL(LDK(KP382683432), T60));
Chris@10 1508 {
Chris@10 1509 V T6s, T6t, T6b, T6g;
Chris@10 1510 T6s = VADD(T5Y, T5Z);
Chris@10 1511 T6t = VADD(T61, T64);
Chris@10 1512 T6u = VFMA(LDK(KP382683432), T6s, VMUL(LDK(KP923879532), T6t));
Chris@10 1513 T6A = VFNMS(LDK(KP382683432), T6t, VMUL(LDK(KP923879532), T6s));
Chris@10 1514 T6b = VSUB(T67, T6a);
Chris@10 1515 T6g = VSUB(T6e, T6f);
Chris@10 1516 T6h = VFNMS(LDK(KP923879532), T6g, VMUL(LDK(KP382683432), T6b));
Chris@10 1517 T6l = VFMA(LDK(KP382683432), T6g, VMUL(LDK(KP923879532), T6b));
Chris@10 1518 }
Chris@10 1519 }
Chris@10 1520 {
Chris@10 1521 V T5V, T6i, T7r, T7u;
Chris@10 1522 T5V = VADD(T5J, T5U);
Chris@10 1523 T6i = VADD(T66, T6h);
Chris@10 1524 ST(&(ri[WS(rs, 22)]), VSUB(T5V, T6i), ms, &(ri[0]));
Chris@10 1525 ST(&(ri[WS(rs, 6)]), VADD(T5V, T6i), ms, &(ri[0]));
Chris@10 1526 T7r = VADD(T6k, T6l);
Chris@10 1527 T7u = VADD(T7s, T7t);
Chris@10 1528 ST(&(ii[WS(rs, 6)]), VADD(T7r, T7u), ms, &(ii[0]));
Chris@10 1529 ST(&(ii[WS(rs, 22)]), VSUB(T7u, T7r), ms, &(ii[0]));
Chris@10 1530 }
Chris@10 1531 {
Chris@10 1532 V T6j, T6m, T7v, T7w;
Chris@10 1533 T6j = VSUB(T5J, T5U);
Chris@10 1534 T6m = VSUB(T6k, T6l);
Chris@10 1535 ST(&(ri[WS(rs, 30)]), VSUB(T6j, T6m), ms, &(ri[0]));
Chris@10 1536 ST(&(ri[WS(rs, 14)]), VADD(T6j, T6m), ms, &(ri[0]));
Chris@10 1537 T7v = VSUB(T6h, T66);
Chris@10 1538 T7w = VSUB(T7t, T7s);
Chris@10 1539 ST(&(ii[WS(rs, 14)]), VADD(T7v, T7w), ms, &(ii[0]));
Chris@10 1540 ST(&(ii[WS(rs, 30)]), VSUB(T7w, T7v), ms, &(ii[0]));
Chris@10 1541 }
Chris@10 1542 {
Chris@10 1543 V T6r, T6y, T7j, T7o;
Chris@10 1544 T6r = VADD(T6n, T6q);
Chris@10 1545 T6y = VADD(T6u, T6x);
Chris@10 1546 ST(&(ri[WS(rs, 18)]), VSUB(T6r, T6y), ms, &(ri[0]));
Chris@10 1547 ST(&(ri[WS(rs, 2)]), VADD(T6r, T6y), ms, &(ri[0]));
Chris@10 1548 T7j = VADD(T6A, T6B);
Chris@10 1549 T7o = VADD(T7k, T7n);
Chris@10 1550 ST(&(ii[WS(rs, 2)]), VADD(T7j, T7o), ms, &(ii[0]));
Chris@10 1551 ST(&(ii[WS(rs, 18)]), VSUB(T7o, T7j), ms, &(ii[0]));
Chris@10 1552 }
Chris@10 1553 {
Chris@10 1554 V T6z, T6C, T7p, T7q;
Chris@10 1555 T6z = VSUB(T6n, T6q);
Chris@10 1556 T6C = VSUB(T6A, T6B);
Chris@10 1557 ST(&(ri[WS(rs, 26)]), VSUB(T6z, T6C), ms, &(ri[0]));
Chris@10 1558 ST(&(ri[WS(rs, 10)]), VADD(T6z, T6C), ms, &(ri[0]));
Chris@10 1559 T7p = VSUB(T6x, T6u);
Chris@10 1560 T7q = VSUB(T7n, T7k);
Chris@10 1561 ST(&(ii[WS(rs, 10)]), VADD(T7p, T7q), ms, &(ii[0]));
Chris@10 1562 ST(&(ii[WS(rs, 26)]), VSUB(T7q, T7p), ms, &(ii[0]));
Chris@10 1563 }
Chris@10 1564 }
Chris@10 1565 {
Chris@10 1566 V T3h, T4D, T7R, T7X, T3E, T7O, T4N, T4R, T46, T4A, T4G, T7W, T4K, T4Q, T4x;
Chris@10 1567 V T4B, T3g, T7P;
Chris@10 1568 T3g = VMUL(LDK(KP707106781), VSUB(T3a, T3f));
Chris@10 1569 T3h = VSUB(T35, T3g);
Chris@10 1570 T4D = VADD(T35, T3g);
Chris@10 1571 T7P = VMUL(LDK(KP707106781), VSUB(T4V, T4U));
Chris@10 1572 T7R = VADD(T7P, T7Q);
Chris@10 1573 T7X = VSUB(T7Q, T7P);
Chris@10 1574 {
Chris@10 1575 V T3s, T3D, T4L, T4M;
Chris@10 1576 T3s = VFNMS(LDK(KP923879532), T3r, VMUL(LDK(KP382683432), T3m));
Chris@10 1577 T3D = VFMA(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3C));
Chris@10 1578 T3E = VSUB(T3s, T3D);
Chris@10 1579 T7O = VADD(T3s, T3D);
Chris@10 1580 T4L = VADD(T4b, T4m);
Chris@10 1581 T4M = VADD(T4s, T4v);
Chris@10 1582 T4N = VFNMS(LDK(KP555570233), T4M, VMUL(LDK(KP831469612), T4L));
Chris@10 1583 T4R = VFMA(LDK(KP831469612), T4M, VMUL(LDK(KP555570233), T4L));
Chris@10 1584 }
Chris@10 1585 {
Chris@10 1586 V T3W, T45, T4E, T4F;
Chris@10 1587 T3W = VSUB(T3K, T3V);
Chris@10 1588 T45 = VSUB(T41, T44);
Chris@10 1589 T46 = VFMA(LDK(KP980785280), T3W, VMUL(LDK(KP195090322), T45));
Chris@10 1590 T4A = VFNMS(LDK(KP980785280), T45, VMUL(LDK(KP195090322), T3W));
Chris@10 1591 T4E = VFMA(LDK(KP923879532), T3m, VMUL(LDK(KP382683432), T3r));
Chris@10 1592 T4F = VFNMS(LDK(KP923879532), T3x, VMUL(LDK(KP382683432), T3C));
Chris@10 1593 T4G = VADD(T4E, T4F);
Chris@10 1594 T7W = VSUB(T4F, T4E);
Chris@10 1595 }
Chris@10 1596 {
Chris@10 1597 V T4I, T4J, T4n, T4w;
Chris@10 1598 T4I = VADD(T3K, T3V);
Chris@10 1599 T4J = VADD(T41, T44);
Chris@10 1600 T4K = VFMA(LDK(KP555570233), T4I, VMUL(LDK(KP831469612), T4J));
Chris@10 1601 T4Q = VFNMS(LDK(KP555570233), T4J, VMUL(LDK(KP831469612), T4I));
Chris@10 1602 T4n = VSUB(T4b, T4m);
Chris@10 1603 T4w = VSUB(T4s, T4v);
Chris@10 1604 T4x = VFNMS(LDK(KP980785280), T4w, VMUL(LDK(KP195090322), T4n));
Chris@10 1605 T4B = VFMA(LDK(KP195090322), T4w, VMUL(LDK(KP980785280), T4n));
Chris@10 1606 }
Chris@10 1607 {
Chris@10 1608 V T3F, T4y, T7V, T7Y;
Chris@10 1609 T3F = VADD(T3h, T3E);
Chris@10 1610 T4y = VADD(T46, T4x);
Chris@10 1611 ST(&(ri[WS(rs, 23)]), VSUB(T3F, T4y), ms, &(ri[WS(rs, 1)]));
Chris@10 1612 ST(&(ri[WS(rs, 7)]), VADD(T3F, T4y), ms, &(ri[WS(rs, 1)]));
Chris@10 1613 T7V = VADD(T4A, T4B);
Chris@10 1614 T7Y = VADD(T7W, T7X);
Chris@10 1615 ST(&(ii[WS(rs, 7)]), VADD(T7V, T7Y), ms, &(ii[WS(rs, 1)]));
Chris@10 1616 ST(&(ii[WS(rs, 23)]), VSUB(T7Y, T7V), ms, &(ii[WS(rs, 1)]));
Chris@10 1617 }
Chris@10 1618 {
Chris@10 1619 V T4z, T4C, T7Z, T80;
Chris@10 1620 T4z = VSUB(T3h, T3E);
Chris@10 1621 T4C = VSUB(T4A, T4B);
Chris@10 1622 ST(&(ri[WS(rs, 31)]), VSUB(T4z, T4C), ms, &(ri[WS(rs, 1)]));
Chris@10 1623 ST(&(ri[WS(rs, 15)]), VADD(T4z, T4C), ms, &(ri[WS(rs, 1)]));
Chris@10 1624 T7Z = VSUB(T4x, T46);
Chris@10 1625 T80 = VSUB(T7X, T7W);
Chris@10 1626 ST(&(ii[WS(rs, 15)]), VADD(T7Z, T80), ms, &(ii[WS(rs, 1)]));
Chris@10 1627 ST(&(ii[WS(rs, 31)]), VSUB(T80, T7Z), ms, &(ii[WS(rs, 1)]));
Chris@10 1628 }
Chris@10 1629 {
Chris@10 1630 V T4H, T4O, T7N, T7S;
Chris@10 1631 T4H = VADD(T4D, T4G);
Chris@10 1632 T4O = VADD(T4K, T4N);
Chris@10 1633 ST(&(ri[WS(rs, 19)]), VSUB(T4H, T4O), ms, &(ri[WS(rs, 1)]));
Chris@10 1634 ST(&(ri[WS(rs, 3)]), VADD(T4H, T4O), ms, &(ri[WS(rs, 1)]));
Chris@10 1635 T7N = VADD(T4Q, T4R);
Chris@10 1636 T7S = VADD(T7O, T7R);
Chris@10 1637 ST(&(ii[WS(rs, 3)]), VADD(T7N, T7S), ms, &(ii[WS(rs, 1)]));
Chris@10 1638 ST(&(ii[WS(rs, 19)]), VSUB(T7S, T7N), ms, &(ii[WS(rs, 1)]));
Chris@10 1639 }
Chris@10 1640 {
Chris@10 1641 V T4P, T4S, T7T, T7U;
Chris@10 1642 T4P = VSUB(T4D, T4G);
Chris@10 1643 T4S = VSUB(T4Q, T4R);
Chris@10 1644 ST(&(ri[WS(rs, 27)]), VSUB(T4P, T4S), ms, &(ri[WS(rs, 1)]));
Chris@10 1645 ST(&(ri[WS(rs, 11)]), VADD(T4P, T4S), ms, &(ri[WS(rs, 1)]));
Chris@10 1646 T7T = VSUB(T4N, T4K);
Chris@10 1647 T7U = VSUB(T7R, T7O);
Chris@10 1648 ST(&(ii[WS(rs, 11)]), VADD(T7T, T7U), ms, &(ii[WS(rs, 1)]));
Chris@10 1649 ST(&(ii[WS(rs, 27)]), VSUB(T7U, T7T), ms, &(ii[WS(rs, 1)]));
Chris@10 1650 }
Chris@10 1651 }
Chris@10 1652 {
Chris@10 1653 V T4X, T5p, T7D, T7J, T54, T7y, T5z, T5D, T5c, T5m, T5s, T7I, T5w, T5C, T5j;
Chris@10 1654 V T5n, T4W, T7z;
Chris@10 1655 T4W = VMUL(LDK(KP707106781), VADD(T4U, T4V));
Chris@10 1656 T4X = VSUB(T4T, T4W);
Chris@10 1657 T5p = VADD(T4T, T4W);
Chris@10 1658 T7z = VMUL(LDK(KP707106781), VADD(T3a, T3f));
Chris@10 1659 T7D = VADD(T7z, T7C);
Chris@10 1660 T7J = VSUB(T7C, T7z);
Chris@10 1661 {
Chris@10 1662 V T50, T53, T5x, T5y;
Chris@10 1663 T50 = VFNMS(LDK(KP382683432), T4Z, VMUL(LDK(KP923879532), T4Y));
Chris@10 1664 T53 = VFMA(LDK(KP923879532), T51, VMUL(LDK(KP382683432), T52));
Chris@10 1665 T54 = VSUB(T50, T53);
Chris@10 1666 T7y = VADD(T50, T53);
Chris@10 1667 T5x = VADD(T5d, T5e);
Chris@10 1668 T5y = VADD(T5g, T5h);
Chris@10 1669 T5z = VFNMS(LDK(KP195090322), T5y, VMUL(LDK(KP980785280), T5x));
Chris@10 1670 T5D = VFMA(LDK(KP195090322), T5x, VMUL(LDK(KP980785280), T5y));
Chris@10 1671 }
Chris@10 1672 {
Chris@10 1673 V T58, T5b, T5q, T5r;
Chris@10 1674 T58 = VSUB(T56, T57);
Chris@10 1675 T5b = VSUB(T59, T5a);
Chris@10 1676 T5c = VFMA(LDK(KP555570233), T58, VMUL(LDK(KP831469612), T5b));
Chris@10 1677 T5m = VFNMS(LDK(KP831469612), T58, VMUL(LDK(KP555570233), T5b));
Chris@10 1678 T5q = VFMA(LDK(KP382683432), T4Y, VMUL(LDK(KP923879532), T4Z));
Chris@10 1679 T5r = VFNMS(LDK(KP382683432), T51, VMUL(LDK(KP923879532), T52));
Chris@10 1680 T5s = VADD(T5q, T5r);
Chris@10 1681 T7I = VSUB(T5r, T5q);
Chris@10 1682 }
Chris@10 1683 {
Chris@10 1684 V T5u, T5v, T5f, T5i;
Chris@10 1685 T5u = VADD(T56, T57);
Chris@10 1686 T5v = VADD(T59, T5a);
Chris@10 1687 T5w = VFMA(LDK(KP980785280), T5u, VMUL(LDK(KP195090322), T5v));
Chris@10 1688 T5C = VFNMS(LDK(KP195090322), T5u, VMUL(LDK(KP980785280), T5v));
Chris@10 1689 T5f = VSUB(T5d, T5e);
Chris@10 1690 T5i = VSUB(T5g, T5h);
Chris@10 1691 T5j = VFNMS(LDK(KP831469612), T5i, VMUL(LDK(KP555570233), T5f));
Chris@10 1692 T5n = VFMA(LDK(KP831469612), T5f, VMUL(LDK(KP555570233), T5i));
Chris@10 1693 }
Chris@10 1694 {
Chris@10 1695 V T55, T5k, T7H, T7K;
Chris@10 1696 T55 = VADD(T4X, T54);
Chris@10 1697 T5k = VADD(T5c, T5j);
Chris@10 1698 ST(&(ri[WS(rs, 21)]), VSUB(T55, T5k), ms, &(ri[WS(rs, 1)]));
Chris@10 1699 ST(&(ri[WS(rs, 5)]), VADD(T55, T5k), ms, &(ri[WS(rs, 1)]));
Chris@10 1700 T7H = VADD(T5m, T5n);
Chris@10 1701 T7K = VADD(T7I, T7J);
Chris@10 1702 ST(&(ii[WS(rs, 5)]), VADD(T7H, T7K), ms, &(ii[WS(rs, 1)]));
Chris@10 1703 ST(&(ii[WS(rs, 21)]), VSUB(T7K, T7H), ms, &(ii[WS(rs, 1)]));
Chris@10 1704 }
Chris@10 1705 {
Chris@10 1706 V T5l, T5o, T7L, T7M;
Chris@10 1707 T5l = VSUB(T4X, T54);
Chris@10 1708 T5o = VSUB(T5m, T5n);
Chris@10 1709 ST(&(ri[WS(rs, 29)]), VSUB(T5l, T5o), ms, &(ri[WS(rs, 1)]));
Chris@10 1710 ST(&(ri[WS(rs, 13)]), VADD(T5l, T5o), ms, &(ri[WS(rs, 1)]));
Chris@10 1711 T7L = VSUB(T5j, T5c);
Chris@10 1712 T7M = VSUB(T7J, T7I);
Chris@10 1713 ST(&(ii[WS(rs, 13)]), VADD(T7L, T7M), ms, &(ii[WS(rs, 1)]));
Chris@10 1714 ST(&(ii[WS(rs, 29)]), VSUB(T7M, T7L), ms, &(ii[WS(rs, 1)]));
Chris@10 1715 }
Chris@10 1716 {
Chris@10 1717 V T5t, T5A, T7x, T7E;
Chris@10 1718 T5t = VADD(T5p, T5s);
Chris@10 1719 T5A = VADD(T5w, T5z);
Chris@10 1720 ST(&(ri[WS(rs, 17)]), VSUB(T5t, T5A), ms, &(ri[WS(rs, 1)]));
Chris@10 1721 ST(&(ri[WS(rs, 1)]), VADD(T5t, T5A), ms, &(ri[WS(rs, 1)]));
Chris@10 1722 T7x = VADD(T5C, T5D);
Chris@10 1723 T7E = VADD(T7y, T7D);
Chris@10 1724 ST(&(ii[WS(rs, 1)]), VADD(T7x, T7E), ms, &(ii[WS(rs, 1)]));
Chris@10 1725 ST(&(ii[WS(rs, 17)]), VSUB(T7E, T7x), ms, &(ii[WS(rs, 1)]));
Chris@10 1726 }
Chris@10 1727 {
Chris@10 1728 V T5B, T5E, T7F, T7G;
Chris@10 1729 T5B = VSUB(T5p, T5s);
Chris@10 1730 T5E = VSUB(T5C, T5D);
Chris@10 1731 ST(&(ri[WS(rs, 25)]), VSUB(T5B, T5E), ms, &(ri[WS(rs, 1)]));
Chris@10 1732 ST(&(ri[WS(rs, 9)]), VADD(T5B, T5E), ms, &(ri[WS(rs, 1)]));
Chris@10 1733 T7F = VSUB(T5z, T5w);
Chris@10 1734 T7G = VSUB(T7D, T7y);
Chris@10 1735 ST(&(ii[WS(rs, 9)]), VADD(T7F, T7G), ms, &(ii[WS(rs, 1)]));
Chris@10 1736 ST(&(ii[WS(rs, 25)]), VSUB(T7G, T7F), ms, &(ii[WS(rs, 1)]));
Chris@10 1737 }
Chris@10 1738 }
Chris@10 1739 }
Chris@10 1740 }
Chris@10 1741 VLEAVE();
Chris@10 1742 }
Chris@10 1743
Chris@10 1744 static const tw_instr twinstr[] = {
Chris@10 1745 VTW(0, 1),
Chris@10 1746 VTW(0, 2),
Chris@10 1747 VTW(0, 3),
Chris@10 1748 VTW(0, 4),
Chris@10 1749 VTW(0, 5),
Chris@10 1750 VTW(0, 6),
Chris@10 1751 VTW(0, 7),
Chris@10 1752 VTW(0, 8),
Chris@10 1753 VTW(0, 9),
Chris@10 1754 VTW(0, 10),
Chris@10 1755 VTW(0, 11),
Chris@10 1756 VTW(0, 12),
Chris@10 1757 VTW(0, 13),
Chris@10 1758 VTW(0, 14),
Chris@10 1759 VTW(0, 15),
Chris@10 1760 VTW(0, 16),
Chris@10 1761 VTW(0, 17),
Chris@10 1762 VTW(0, 18),
Chris@10 1763 VTW(0, 19),
Chris@10 1764 VTW(0, 20),
Chris@10 1765 VTW(0, 21),
Chris@10 1766 VTW(0, 22),
Chris@10 1767 VTW(0, 23),
Chris@10 1768 VTW(0, 24),
Chris@10 1769 VTW(0, 25),
Chris@10 1770 VTW(0, 26),
Chris@10 1771 VTW(0, 27),
Chris@10 1772 VTW(0, 28),
Chris@10 1773 VTW(0, 29),
Chris@10 1774 VTW(0, 30),
Chris@10 1775 VTW(0, 31),
Chris@10 1776 {TW_NEXT, (2 * VL), 0}
Chris@10 1777 };
Chris@10 1778
Chris@10 1779 static const ct_desc desc = { 32, XSIMD_STRING("t1sv_32"), twinstr, &GENUS, {340, 114, 94, 0}, 0, 0, 0 };
Chris@10 1780
Chris@10 1781 void XSIMD(codelet_t1sv_32) (planner *p) {
Chris@10 1782 X(kdft_dit_register) (p, t1sv_32, &desc);
Chris@10 1783 }
Chris@10 1784 #endif /* HAVE_FMA */