annotate src/fftw-3.3.8/dft/scalar/codelets/q1_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:31 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include dft/scalar/q.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 276 FP additions, 192 FP multiplications,
Chris@82 32 * (or, 144 additions, 60 multiplications, 132 fused multiply/add),
Chris@82 33 * 109 stack variables, 2 constants, and 144 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/q.h"
Chris@82 36
/*
 * q1_6 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters:
 *   rio, iio - arrays that are read and then overwritten in place
 *              (NOTE(review): by their names these hold the real and
 *              imaginary parts; confirm against the codelet headers).
 *   W        - table of multipliers; each iteration uses W[0]..W[9] and the
 *              pointer starts at W + mb * 10 and advances by 10.
 *   rs, vs   - strides, used only through WS(), that address a 6 x 6 grid
 *              of elements at WS(vs, v) + WS(rs, r) with v, r in 0..5.
 *   mb, me   - the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms       - added to both rio and iio after every iteration.
 *
 * Each iteration loads all 36 rio and all 36 iio grid elements into
 * temporaries before the first store, then performs 72 stores; no array
 * element is read after the stores begin.  Results derived from the inputs
 * at vs index v are stored at rs index v, so the grid is written back
 * transposed.
 *
 * KP866025403 equals sqrt(3)/2 and KP500000000 equals 1/2 to the precision
 * given in the literals.  FMA, FNMS, WS, DK and MAKE_VOLATILE_STRIDE come
 * from the included FFTW headers and are not defined in this file.
 */
Chris@82 37 static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT m;
Chris@82 43 for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@82 44 E T3, Tc, Tw, TW, Ta, TM, Tf, Tg, Tt, TT, Tn, TP, Tu, Tv, TU;
Chris@82 45 E TV, T17, T1g, T1A, T20, T1e, T1Q, T1j, T1k, T1x, T1X, T1r, T1T, T1y, T1z;
Chris@82 46 E T1Y, T1Z, T2B, T31, T2v, T2X, T2C, T2D, T32, T33, T2b, T2k, T2E, T34, T2i;
Chris@82 47 E T2U, T2n, T2o, T3f, T3o, T3I, T48, T3m, T3Y, T3r, T3s, T3F, T45, T3z, T41;
Chris@82 48 E T3G, T3H, T46, T47, T4j, T4s, T4M, T5c, T4q, T52, T4v, T4w, T4J, T59, T4D;
Chris@82 49 E T55, T4K, T4L, T5a, T5b, T5N, T6d, T5H, T69, T5O, T5P, T6e, T6f, T5n, T5w;
Chris@82 50 E T5Q, T6g, T5u, T66, T5z, T5A;
/* Load and combine the six rio inputs at vs index 0. */
Chris@82 51 {
Chris@82 52 E T9, Te, T6, Td, T1, T2;
Chris@82 53 T1 = rio[0];
Chris@82 54 T2 = rio[WS(rs, 3)];
Chris@82 55 T3 = T1 + T2;
Chris@82 56 Tc = T1 - T2;
Chris@82 57 {
Chris@82 58 E T7, T8, T4, T5;
Chris@82 59 T7 = rio[WS(rs, 4)];
Chris@82 60 T8 = rio[WS(rs, 1)];
Chris@82 61 T9 = T7 + T8;
Chris@82 62 Te = T7 - T8;
Chris@82 63 T4 = rio[WS(rs, 2)];
Chris@82 64 T5 = rio[WS(rs, 5)];
Chris@82 65 T6 = T4 + T5;
Chris@82 66 Td = T4 - T5;
Chris@82 67 }
Chris@82 68 Tw = Te - Td;
Chris@82 69 TW = T9 - T6;
Chris@82 70 Ta = T6 + T9;
Chris@82 71 TM = FNMS(KP500000000, Ta, T3);
Chris@82 72 Tf = Td + Te;
Chris@82 73 Tg = FNMS(KP500000000, Tf, Tc);
Chris@82 74 }
/* Load and combine the six iio inputs at vs index 0. */
Chris@82 75 {
Chris@82 76 E Tj, TN, Tm, TO, Th, Ti;
Chris@82 77 Th = iio[WS(rs, 2)];
Chris@82 78 Ti = iio[WS(rs, 5)];
Chris@82 79 Tj = Th - Ti;
Chris@82 80 TN = Th + Ti;
Chris@82 81 {
Chris@82 82 E Tr, Ts, Tk, Tl;
Chris@82 83 Tr = iio[0];
Chris@82 84 Ts = iio[WS(rs, 3)];
Chris@82 85 Tt = Tr - Ts;
Chris@82 86 TT = Tr + Ts;
Chris@82 87 Tk = iio[WS(rs, 4)];
Chris@82 88 Tl = iio[WS(rs, 1)];
Chris@82 89 Tm = Tk - Tl;
Chris@82 90 TO = Tk + Tl;
Chris@82 91 }
Chris@82 92 Tn = Tj - Tm;
Chris@82 93 TP = TN - TO;
Chris@82 94 Tu = Tj + Tm;
Chris@82 95 Tv = FNMS(KP500000000, Tu, Tt);
Chris@82 96 TU = TN + TO;
Chris@82 97 TV = FNMS(KP500000000, TU, TT);
Chris@82 98 }
/* Load and combine the six rio inputs at vs index 1. */
Chris@82 99 {
Chris@82 100 E T1d, T1i, T1a, T1h, T15, T16;
Chris@82 101 T15 = rio[WS(vs, 1)];
Chris@82 102 T16 = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 103 T17 = T15 + T16;
Chris@82 104 T1g = T15 - T16;
Chris@82 105 {
Chris@82 106 E T1b, T1c, T18, T19;
Chris@82 107 T1b = rio[WS(vs, 1) + WS(rs, 4)];
Chris@82 108 T1c = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 109 T1d = T1b + T1c;
Chris@82 110 T1i = T1b - T1c;
Chris@82 111 T18 = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 112 T19 = rio[WS(vs, 1) + WS(rs, 5)];
Chris@82 113 T1a = T18 + T19;
Chris@82 114 T1h = T18 - T19;
Chris@82 115 }
Chris@82 116 T1A = T1i - T1h;
Chris@82 117 T20 = T1d - T1a;
Chris@82 118 T1e = T1a + T1d;
Chris@82 119 T1Q = FNMS(KP500000000, T1e, T17);
Chris@82 120 T1j = T1h + T1i;
Chris@82 121 T1k = FNMS(KP500000000, T1j, T1g);
Chris@82 122 }
/* Load and combine the six iio inputs at vs index 1. */
Chris@82 123 {
Chris@82 124 E T1n, T1R, T1q, T1S, T1l, T1m;
Chris@82 125 T1l = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 126 T1m = iio[WS(vs, 1) + WS(rs, 5)];
Chris@82 127 T1n = T1l - T1m;
Chris@82 128 T1R = T1l + T1m;
Chris@82 129 {
Chris@82 130 E T1v, T1w, T1o, T1p;
Chris@82 131 T1v = iio[WS(vs, 1)];
Chris@82 132 T1w = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 133 T1x = T1v - T1w;
Chris@82 134 T1X = T1v + T1w;
Chris@82 135 T1o = iio[WS(vs, 1) + WS(rs, 4)];
Chris@82 136 T1p = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 137 T1q = T1o - T1p;
Chris@82 138 T1S = T1o + T1p;
Chris@82 139 }
Chris@82 140 T1r = T1n - T1q;
Chris@82 141 T1T = T1R - T1S;
Chris@82 142 T1y = T1n + T1q;
Chris@82 143 T1z = FNMS(KP500000000, T1y, T1x);
Chris@82 144 T1Y = T1R + T1S;
Chris@82 145 T1Z = FNMS(KP500000000, T1Y, T1X);
Chris@82 146 }
/* Load and combine the six iio inputs at vs index 2. */
Chris@82 147 {
Chris@82 148 E T2r, T2V, T2u, T2W, T2p, T2q;
Chris@82 149 T2p = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 150 T2q = iio[WS(vs, 2) + WS(rs, 5)];
Chris@82 151 T2r = T2p - T2q;
Chris@82 152 T2V = T2p + T2q;
Chris@82 153 {
Chris@82 154 E T2z, T2A, T2s, T2t;
Chris@82 155 T2z = iio[WS(vs, 2)];
Chris@82 156 T2A = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 157 T2B = T2z - T2A;
Chris@82 158 T31 = T2z + T2A;
Chris@82 159 T2s = iio[WS(vs, 2) + WS(rs, 4)];
Chris@82 160 T2t = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 161 T2u = T2s - T2t;
Chris@82 162 T2W = T2s + T2t;
Chris@82 163 }
Chris@82 164 T2v = T2r - T2u;
Chris@82 165 T2X = T2V - T2W;
Chris@82 166 T2C = T2r + T2u;
Chris@82 167 T2D = FNMS(KP500000000, T2C, T2B);
Chris@82 168 T32 = T2V + T2W;
Chris@82 169 T33 = FNMS(KP500000000, T32, T31);
Chris@82 170 }
/* Load and combine the six rio inputs at vs index 2. */
Chris@82 171 {
Chris@82 172 E T2h, T2m, T2e, T2l, T29, T2a;
Chris@82 173 T29 = rio[WS(vs, 2)];
Chris@82 174 T2a = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 175 T2b = T29 + T2a;
Chris@82 176 T2k = T29 - T2a;
Chris@82 177 {
Chris@82 178 E T2f, T2g, T2c, T2d;
Chris@82 179 T2f = rio[WS(vs, 2) + WS(rs, 4)];
Chris@82 180 T2g = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 181 T2h = T2f + T2g;
Chris@82 182 T2m = T2f - T2g;
Chris@82 183 T2c = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 184 T2d = rio[WS(vs, 2) + WS(rs, 5)];
Chris@82 185 T2e = T2c + T2d;
Chris@82 186 T2l = T2c - T2d;
Chris@82 187 }
Chris@82 188 T2E = T2m - T2l;
Chris@82 189 T34 = T2h - T2e;
Chris@82 190 T2i = T2e + T2h;
Chris@82 191 T2U = FNMS(KP500000000, T2i, T2b);
Chris@82 192 T2n = T2l + T2m;
Chris@82 193 T2o = FNMS(KP500000000, T2n, T2k);
Chris@82 194 }
/* Load and combine the six rio inputs at vs index 3. */
Chris@82 195 {
Chris@82 196 E T3l, T3q, T3i, T3p, T3d, T3e;
Chris@82 197 T3d = rio[WS(vs, 3)];
Chris@82 198 T3e = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 199 T3f = T3d + T3e;
Chris@82 200 T3o = T3d - T3e;
Chris@82 201 {
Chris@82 202 E T3j, T3k, T3g, T3h;
Chris@82 203 T3j = rio[WS(vs, 3) + WS(rs, 4)];
Chris@82 204 T3k = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 205 T3l = T3j + T3k;
Chris@82 206 T3q = T3j - T3k;
Chris@82 207 T3g = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 208 T3h = rio[WS(vs, 3) + WS(rs, 5)];
Chris@82 209 T3i = T3g + T3h;
Chris@82 210 T3p = T3g - T3h;
Chris@82 211 }
Chris@82 212 T3I = T3q - T3p;
Chris@82 213 T48 = T3l - T3i;
Chris@82 214 T3m = T3i + T3l;
Chris@82 215 T3Y = FNMS(KP500000000, T3m, T3f);
Chris@82 216 T3r = T3p + T3q;
Chris@82 217 T3s = FNMS(KP500000000, T3r, T3o);
Chris@82 218 }
/* Load and combine the six iio inputs at vs index 3. */
Chris@82 219 {
Chris@82 220 E T3v, T3Z, T3y, T40, T3t, T3u;
Chris@82 221 T3t = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 222 T3u = iio[WS(vs, 3) + WS(rs, 5)];
Chris@82 223 T3v = T3t - T3u;
Chris@82 224 T3Z = T3t + T3u;
Chris@82 225 {
Chris@82 226 E T3D, T3E, T3w, T3x;
Chris@82 227 T3D = iio[WS(vs, 3)];
Chris@82 228 T3E = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 229 T3F = T3D - T3E;
Chris@82 230 T45 = T3D + T3E;
Chris@82 231 T3w = iio[WS(vs, 3) + WS(rs, 4)];
Chris@82 232 T3x = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 233 T3y = T3w - T3x;
Chris@82 234 T40 = T3w + T3x;
Chris@82 235 }
Chris@82 236 T3z = T3v - T3y;
Chris@82 237 T41 = T3Z - T40;
Chris@82 238 T3G = T3v + T3y;
Chris@82 239 T3H = FNMS(KP500000000, T3G, T3F);
Chris@82 240 T46 = T3Z + T40;
Chris@82 241 T47 = FNMS(KP500000000, T46, T45);
Chris@82 242 }
/* Load and combine the six rio inputs at vs index 4. */
Chris@82 243 {
Chris@82 244 E T4p, T4u, T4m, T4t, T4h, T4i;
Chris@82 245 T4h = rio[WS(vs, 4)];
Chris@82 246 T4i = rio[WS(vs, 4) + WS(rs, 3)];
Chris@82 247 T4j = T4h + T4i;
Chris@82 248 T4s = T4h - T4i;
Chris@82 249 {
Chris@82 250 E T4n, T4o, T4k, T4l;
Chris@82 251 T4n = rio[WS(vs, 4) + WS(rs, 4)];
Chris@82 252 T4o = rio[WS(vs, 4) + WS(rs, 1)];
Chris@82 253 T4p = T4n + T4o;
Chris@82 254 T4u = T4n - T4o;
Chris@82 255 T4k = rio[WS(vs, 4) + WS(rs, 2)];
Chris@82 256 T4l = rio[WS(vs, 4) + WS(rs, 5)];
Chris@82 257 T4m = T4k + T4l;
Chris@82 258 T4t = T4k - T4l;
Chris@82 259 }
Chris@82 260 T4M = T4u - T4t;
Chris@82 261 T5c = T4p - T4m;
Chris@82 262 T4q = T4m + T4p;
Chris@82 263 T52 = FNMS(KP500000000, T4q, T4j);
Chris@82 264 T4v = T4t + T4u;
Chris@82 265 T4w = FNMS(KP500000000, T4v, T4s);
Chris@82 266 }
/* Load and combine the six iio inputs at vs index 4. */
Chris@82 267 {
Chris@82 268 E T4z, T53, T4C, T54, T4x, T4y;
Chris@82 269 T4x = iio[WS(vs, 4) + WS(rs, 2)];
Chris@82 270 T4y = iio[WS(vs, 4) + WS(rs, 5)];
Chris@82 271 T4z = T4x - T4y;
Chris@82 272 T53 = T4x + T4y;
Chris@82 273 {
Chris@82 274 E T4H, T4I, T4A, T4B;
Chris@82 275 T4H = iio[WS(vs, 4)];
Chris@82 276 T4I = iio[WS(vs, 4) + WS(rs, 3)];
Chris@82 277 T4J = T4H - T4I;
Chris@82 278 T59 = T4H + T4I;
Chris@82 279 T4A = iio[WS(vs, 4) + WS(rs, 4)];
Chris@82 280 T4B = iio[WS(vs, 4) + WS(rs, 1)];
Chris@82 281 T4C = T4A - T4B;
Chris@82 282 T54 = T4A + T4B;
Chris@82 283 }
Chris@82 284 T4D = T4z - T4C;
Chris@82 285 T55 = T53 - T54;
Chris@82 286 T4K = T4z + T4C;
Chris@82 287 T4L = FNMS(KP500000000, T4K, T4J);
Chris@82 288 T5a = T53 + T54;
Chris@82 289 T5b = FNMS(KP500000000, T5a, T59);
Chris@82 290 }
/* Load and combine the six iio inputs at vs index 5. */
Chris@82 291 {
Chris@82 292 E T5D, T67, T5G, T68, T5B, T5C;
Chris@82 293 T5B = iio[WS(vs, 5) + WS(rs, 2)];
Chris@82 294 T5C = iio[WS(vs, 5) + WS(rs, 5)];
Chris@82 295 T5D = T5B - T5C;
Chris@82 296 T67 = T5B + T5C;
Chris@82 297 {
Chris@82 298 E T5L, T5M, T5E, T5F;
Chris@82 299 T5L = iio[WS(vs, 5)];
Chris@82 300 T5M = iio[WS(vs, 5) + WS(rs, 3)];
Chris@82 301 T5N = T5L - T5M;
Chris@82 302 T6d = T5L + T5M;
Chris@82 303 T5E = iio[WS(vs, 5) + WS(rs, 4)];
Chris@82 304 T5F = iio[WS(vs, 5) + WS(rs, 1)];
Chris@82 305 T5G = T5E - T5F;
Chris@82 306 T68 = T5E + T5F;
Chris@82 307 }
Chris@82 308 T5H = T5D - T5G;
Chris@82 309 T69 = T67 - T68;
Chris@82 310 T5O = T5D + T5G;
Chris@82 311 T5P = FNMS(KP500000000, T5O, T5N);
Chris@82 312 T6e = T67 + T68;
Chris@82 313 T6f = FNMS(KP500000000, T6e, T6d);
Chris@82 314 }
/* Load and combine the six rio inputs at vs index 5. */
Chris@82 315 {
Chris@82 316 E T5t, T5y, T5q, T5x, T5l, T5m;
Chris@82 317 T5l = rio[WS(vs, 5)];
Chris@82 318 T5m = rio[WS(vs, 5) + WS(rs, 3)];
Chris@82 319 T5n = T5l + T5m;
Chris@82 320 T5w = T5l - T5m;
Chris@82 321 {
Chris@82 322 E T5r, T5s, T5o, T5p;
Chris@82 323 T5r = rio[WS(vs, 5) + WS(rs, 4)];
Chris@82 324 T5s = rio[WS(vs, 5) + WS(rs, 1)];
Chris@82 325 T5t = T5r + T5s;
Chris@82 326 T5y = T5r - T5s;
Chris@82 327 T5o = rio[WS(vs, 5) + WS(rs, 2)];
Chris@82 328 T5p = rio[WS(vs, 5) + WS(rs, 5)];
Chris@82 329 T5q = T5o + T5p;
Chris@82 330 T5x = T5o - T5p;
Chris@82 331 }
Chris@82 332 T5Q = T5y - T5x;
Chris@82 333 T6g = T5t - T5q;
Chris@82 334 T5u = T5q + T5t;
Chris@82 335 T66 = FNMS(KP500000000, T5u, T5n);
Chris@82 336 T5z = T5x + T5y;
Chris@82 337 T5A = FNMS(KP500000000, T5z, T5w);
Chris@82 338 }
/* All inputs are now held in temporaries; stores start here.
 * Outputs at vs index 0: the sum of the six inputs read at vs index v is
 * stored at rs index v, with no multiplication by W. */
Chris@82 339 rio[0] = T3 + Ta;
Chris@82 340 iio[0] = TT + TU;
Chris@82 341 rio[WS(rs, 1)] = T17 + T1e;
Chris@82 342 iio[WS(rs, 1)] = T1X + T1Y;
Chris@82 343 rio[WS(rs, 2)] = T2b + T2i;
Chris@82 344 iio[WS(rs, 2)] = T31 + T32;
Chris@82 345 iio[WS(rs, 4)] = T59 + T5a;
Chris@82 346 rio[WS(rs, 4)] = T4j + T4q;
Chris@82 347 rio[WS(rs, 3)] = T3f + T3m;
Chris@82 348 iio[WS(rs, 3)] = T45 + T46;
Chris@82 349 rio[WS(rs, 5)] = T5n + T5u;
Chris@82 350 iio[WS(rs, 5)] = T6d + T6e;
/* Outputs at vs index j = 1..5.  Every block below forms a pair (a, b),
 * loads w0 = W[2j - 2] and w1 = W[2j - 1], and stores
 *     rio[...] = FMA(w1, b, w0 * a)   and   iio[...] = FNMS(w1, a, w0 * b).
 * NOTE(review): assuming FMA(x, y, z) = x * y + z and FNMS(x, y, z) =
 * z - x * y, these are w0 * a + w1 * b and w0 * b - w1 * a; confirm the
 * macro definitions in the FFTW headers. */
Chris@82 351 {
Chris@82 352 E To, Tx, Tp, Ty, Tb, Tq;
Chris@82 353 To = FMA(KP866025403, Tn, Tg);
Chris@82 354 Tx = FMA(KP866025403, Tw, Tv);
Chris@82 355 Tb = W[0];
Chris@82 356 Tp = Tb * To;
Chris@82 357 Ty = Tb * Tx;
Chris@82 358 Tq = W[1];
Chris@82 359 rio[WS(vs, 1)] = FMA(Tq, Tx, Tp);
Chris@82 360 iio[WS(vs, 1)] = FNMS(Tq, To, Ty);
Chris@82 361 }
Chris@82 362 {
Chris@82 363 E TG, TJ, TH, TK, TF, TI;
Chris@82 364 TG = Tc + Tf;
Chris@82 365 TJ = Tt + Tu;
Chris@82 366 TF = W[4];
Chris@82 367 TH = TF * TG;
Chris@82 368 TK = TF * TJ;
Chris@82 369 TI = W[5];
Chris@82 370 rio[WS(vs, 3)] = FMA(TI, TJ, TH);
Chris@82 371 iio[WS(vs, 3)] = FNMS(TI, TG, TK);
Chris@82 372 }
Chris@82 373 {
Chris@82 374 E T10, T13, T11, T14, TZ, T12;
Chris@82 375 T10 = FMA(KP866025403, TP, TM);
Chris@82 376 T13 = FMA(KP866025403, TW, TV);
Chris@82 377 TZ = W[6];
Chris@82 378 T11 = TZ * T10;
Chris@82 379 T14 = TZ * T13;
Chris@82 380 T12 = W[7];
Chris@82 381 rio[WS(vs, 4)] = FMA(T12, T13, T11);
Chris@82 382 iio[WS(vs, 4)] = FNMS(T12, T10, T14);
Chris@82 383 }
Chris@82 384 {
Chris@82 385 E T60, T63, T61, T64, T5Z, T62;
Chris@82 386 T60 = T5w + T5z;
Chris@82 387 T63 = T5N + T5O;
Chris@82 388 T5Z = W[4];
Chris@82 389 T61 = T5Z * T60;
Chris@82 390 T64 = T5Z * T63;
Chris@82 391 T62 = W[5];
Chris@82 392 rio[WS(vs, 3) + WS(rs, 5)] = FMA(T62, T63, T61);
Chris@82 393 iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T62, T60, T64);
Chris@82 394 }
Chris@82 395 {
Chris@82 396 E T6k, T6n, T6l, T6o, T6j, T6m;
Chris@82 397 T6k = FMA(KP866025403, T69, T66);
Chris@82 398 T6n = FMA(KP866025403, T6g, T6f);
Chris@82 399 T6j = W[6];
Chris@82 400 T6l = T6j * T6k;
Chris@82 401 T6o = T6j * T6n;
Chris@82 402 T6m = W[7];
Chris@82 403 rio[WS(vs, 4) + WS(rs, 5)] = FMA(T6m, T6n, T6l);
Chris@82 404 iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T6m, T6k, T6o);
Chris@82 405 }
Chris@82 406 {
Chris@82 407 E TA, TD, TB, TE, Tz, TC;
Chris@82 408 TA = FNMS(KP866025403, Tn, Tg);
Chris@82 409 TD = FNMS(KP866025403, Tw, Tv);
Chris@82 410 Tz = W[8];
Chris@82 411 TB = Tz * TA;
Chris@82 412 TE = Tz * TD;
Chris@82 413 TC = W[9];
Chris@82 414 rio[WS(vs, 5)] = FMA(TC, TD, TB);
Chris@82 415 iio[WS(vs, 5)] = FNMS(TC, TA, TE);
Chris@82 416 }
Chris@82 417 {
Chris@82 418 E TQ, TX, TR, TY, TL, TS;
Chris@82 419 TQ = FNMS(KP866025403, TP, TM);
Chris@82 420 TX = FNMS(KP866025403, TW, TV);
Chris@82 421 TL = W[2];
Chris@82 422 TR = TL * TQ;
Chris@82 423 TY = TL * TX;
Chris@82 424 TS = W[3];
Chris@82 425 rio[WS(vs, 2)] = FMA(TS, TX, TR);
Chris@82 426 iio[WS(vs, 2)] = FNMS(TS, TQ, TY);
Chris@82 427 }
Chris@82 428 {
Chris@82 429 E T5U, T5X, T5V, T5Y, T5T, T5W;
Chris@82 430 T5U = FNMS(KP866025403, T5H, T5A);
Chris@82 431 T5X = FNMS(KP866025403, T5Q, T5P);
Chris@82 432 T5T = W[8];
Chris@82 433 T5V = T5T * T5U;
Chris@82 434 T5Y = T5T * T5X;
Chris@82 435 T5W = W[9];
Chris@82 436 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T5W, T5X, T5V);
Chris@82 437 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T5W, T5U, T5Y);
Chris@82 438 }
Chris@82 439 {
Chris@82 440 E T6a, T6h, T6b, T6i, T65, T6c;
Chris@82 441 T6a = FNMS(KP866025403, T69, T66);
Chris@82 442 T6h = FNMS(KP866025403, T6g, T6f);
Chris@82 443 T65 = W[2];
Chris@82 444 T6b = T65 * T6a;
Chris@82 445 T6i = T65 * T6h;
Chris@82 446 T6c = W[3];
Chris@82 447 rio[WS(vs, 2) + WS(rs, 5)] = FMA(T6c, T6h, T6b);
Chris@82 448 iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T6c, T6a, T6i);
Chris@82 449 }
Chris@82 450 {
Chris@82 451 E T5I, T5R, T5J, T5S, T5v, T5K;
Chris@82 452 T5I = FMA(KP866025403, T5H, T5A);
Chris@82 453 T5R = FMA(KP866025403, T5Q, T5P);
Chris@82 454 T5v = W[0];
Chris@82 455 T5J = T5v * T5I;
Chris@82 456 T5S = T5v * T5R;
Chris@82 457 T5K = W[1];
Chris@82 458 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T5K, T5R, T5J);
Chris@82 459 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T5K, T5I, T5S);
Chris@82 460 }
Chris@82 461 {
Chris@82 462 E T1s, T1B, T1t, T1C, T1f, T1u;
Chris@82 463 T1s = FMA(KP866025403, T1r, T1k);
Chris@82 464 T1B = FMA(KP866025403, T1A, T1z);
Chris@82 465 T1f = W[0];
Chris@82 466 T1t = T1f * T1s;
Chris@82 467 T1C = T1f * T1B;
Chris@82 468 T1u = W[1];
Chris@82 469 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1u, T1B, T1t);
Chris@82 470 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1u, T1s, T1C);
Chris@82 471 }
Chris@82 472 {
Chris@82 473 E T3S, T3V, T3T, T3W, T3R, T3U;
Chris@82 474 T3S = T3o + T3r;
Chris@82 475 T3V = T3F + T3G;
Chris@82 476 T3R = W[4];
Chris@82 477 T3T = T3R * T3S;
Chris@82 478 T3W = T3R * T3V;
Chris@82 479 T3U = W[5];
Chris@82 480 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3U, T3V, T3T);
Chris@82 481 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3U, T3S, T3W);
Chris@82 482 }
Chris@82 483 {
Chris@82 484 E T3A, T3J, T3B, T3K, T3n, T3C;
Chris@82 485 T3A = FMA(KP866025403, T3z, T3s);
Chris@82 486 T3J = FMA(KP866025403, T3I, T3H);
Chris@82 487 T3n = W[0];
Chris@82 488 T3B = T3n * T3A;
Chris@82 489 T3K = T3n * T3J;
Chris@82 490 T3C = W[1];
Chris@82 491 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T3C, T3J, T3B);
Chris@82 492 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T3C, T3A, T3K);
Chris@82 493 }
Chris@82 494 {
Chris@82 495 E T56, T5d, T57, T5e, T51, T58;
Chris@82 496 T56 = FNMS(KP866025403, T55, T52);
Chris@82 497 T5d = FNMS(KP866025403, T5c, T5b);
Chris@82 498 T51 = W[2];
Chris@82 499 T57 = T51 * T56;
Chris@82 500 T5e = T51 * T5d;
Chris@82 501 T58 = W[3];
Chris@82 502 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T58, T5d, T57);
Chris@82 503 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T58, T56, T5e);
Chris@82 504 }
Chris@82 505 {
Chris@82 506 E T2Y, T35, T2Z, T36, T2T, T30;
Chris@82 507 T2Y = FNMS(KP866025403, T2X, T2U);
Chris@82 508 T35 = FNMS(KP866025403, T34, T33);
Chris@82 509 T2T = W[2];
Chris@82 510 T2Z = T2T * T2Y;
Chris@82 511 T36 = T2T * T35;
Chris@82 512 T30 = W[3];
Chris@82 513 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T30, T35, T2Z);
Chris@82 514 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T30, T2Y, T36);
Chris@82 515 }
Chris@82 516 {
Chris@82 517 E T3M, T3P, T3N, T3Q, T3L, T3O;
Chris@82 518 T3M = FNMS(KP866025403, T3z, T3s);
Chris@82 519 T3P = FNMS(KP866025403, T3I, T3H);
Chris@82 520 T3L = W[8];
Chris@82 521 T3N = T3L * T3M;
Chris@82 522 T3Q = T3L * T3P;
Chris@82 523 T3O = W[9];
Chris@82 524 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3O, T3P, T3N);
Chris@82 525 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3O, T3M, T3Q);
Chris@82 526 }
Chris@82 527 {
Chris@82 528 E T38, T3b, T39, T3c, T37, T3a;
Chris@82 529 T38 = FMA(KP866025403, T2X, T2U);
Chris@82 530 T3b = FMA(KP866025403, T34, T33);
Chris@82 531 T37 = W[6];
Chris@82 532 T39 = T37 * T38;
Chris@82 533 T3c = T37 * T3b;
Chris@82 534 T3a = W[7];
Chris@82 535 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T3a, T3b, T39);
Chris@82 536 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T3a, T38, T3c);
Chris@82 537 }
Chris@82 538 {
Chris@82 539 E T1E, T1H, T1F, T1I, T1D, T1G;
Chris@82 540 T1E = FNMS(KP866025403, T1r, T1k);
Chris@82 541 T1H = FNMS(KP866025403, T1A, T1z);
Chris@82 542 T1D = W[8];
Chris@82 543 T1F = T1D * T1E;
Chris@82 544 T1I = T1D * T1H;
Chris@82 545 T1G = W[9];
Chris@82 546 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1G, T1H, T1F);
Chris@82 547 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1G, T1E, T1I);
Chris@82 548 }
Chris@82 549 {
Chris@82 550 E T5g, T5j, T5h, T5k, T5f, T5i;
Chris@82 551 T5g = FMA(KP866025403, T55, T52);
Chris@82 552 T5j = FMA(KP866025403, T5c, T5b);
Chris@82 553 T5f = W[6];
Chris@82 554 T5h = T5f * T5g;
Chris@82 555 T5k = T5f * T5j;
Chris@82 556 T5i = W[7];
Chris@82 557 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T5i, T5j, T5h);
Chris@82 558 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T5i, T5g, T5k);
Chris@82 559 }
Chris@82 560 {
Chris@82 561 E T1K, T1N, T1L, T1O, T1J, T1M;
Chris@82 562 T1K = T1g + T1j;
Chris@82 563 T1N = T1x + T1y;
Chris@82 564 T1J = W[4];
Chris@82 565 T1L = T1J * T1K;
Chris@82 566 T1O = T1J * T1N;
Chris@82 567 T1M = W[5];
Chris@82 568 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
Chris@82 569 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
Chris@82 570 }
Chris@82 571 {
Chris@82 572 E T4W, T4Z, T4X, T50, T4V, T4Y;
Chris@82 573 T4W = T4s + T4v;
Chris@82 574 T4Z = T4J + T4K;
Chris@82 575 T4V = W[4];
Chris@82 576 T4X = T4V * T4W;
Chris@82 577 T50 = T4V * T4Z;
Chris@82 578 T4Y = W[5];
Chris@82 579 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4Y, T4Z, T4X);
Chris@82 580 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4Y, T4W, T50);
Chris@82 581 }
Chris@82 582 {
Chris@82 583 E T4E, T4N, T4F, T4O, T4r, T4G;
Chris@82 584 T4E = FMA(KP866025403, T4D, T4w);
Chris@82 585 T4N = FMA(KP866025403, T4M, T4L);
Chris@82 586 T4r = W[0];
Chris@82 587 T4F = T4r * T4E;
Chris@82 588 T4O = T4r * T4N;
Chris@82 589 T4G = W[1];
Chris@82 590 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T4G, T4N, T4F);
Chris@82 591 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T4G, T4E, T4O);
Chris@82 592 }
Chris@82 593 {
Chris@82 594 E T2O, T2R, T2P, T2S, T2N, T2Q;
Chris@82 595 T2O = T2k + T2n;
Chris@82 596 T2R = T2B + T2C;
Chris@82 597 T2N = W[4];
Chris@82 598 T2P = T2N * T2O;
Chris@82 599 T2S = T2N * T2R;
Chris@82 600 T2Q = W[5];
Chris@82 601 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2Q, T2R, T2P);
Chris@82 602 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2Q, T2O, T2S);
Chris@82 603 }
Chris@82 604 {
Chris@82 605 E T2w, T2F, T2x, T2G, T2j, T2y;
Chris@82 606 T2w = FMA(KP866025403, T2v, T2o);
Chris@82 607 T2F = FMA(KP866025403, T2E, T2D);
Chris@82 608 T2j = W[0];
Chris@82 609 T2x = T2j * T2w;
Chris@82 610 T2G = T2j * T2F;
Chris@82 611 T2y = W[1];
Chris@82 612 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2y, T2F, T2x);
Chris@82 613 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2y, T2w, T2G);
Chris@82 614 }
Chris@82 615 {
Chris@82 616 E T24, T27, T25, T28, T23, T26;
Chris@82 617 T24 = FMA(KP866025403, T1T, T1Q);
Chris@82 618 T27 = FMA(KP866025403, T20, T1Z);
Chris@82 619 T23 = W[6];
Chris@82 620 T25 = T23 * T24;
Chris@82 621 T28 = T23 * T27;
Chris@82 622 T26 = W[7];
Chris@82 623 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T26, T27, T25);
Chris@82 624 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T26, T24, T28);
Chris@82 625 }
Chris@82 626 {
Chris@82 627 E T42, T49, T43, T4a, T3X, T44;
Chris@82 628 T42 = FNMS(KP866025403, T41, T3Y);
Chris@82 629 T49 = FNMS(KP866025403, T48, T47);
Chris@82 630 T3X = W[2];
Chris@82 631 T43 = T3X * T42;
Chris@82 632 T4a = T3X * T49;
Chris@82 633 T44 = W[3];
Chris@82 634 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T44, T49, T43);
Chris@82 635 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T44, T42, T4a);
Chris@82 636 }
Chris@82 637 {
Chris@82 638 E T2I, T2L, T2J, T2M, T2H, T2K;
Chris@82 639 T2I = FNMS(KP866025403, T2v, T2o);
Chris@82 640 T2L = FNMS(KP866025403, T2E, T2D);
Chris@82 641 T2H = W[8];
Chris@82 642 T2J = T2H * T2I;
Chris@82 643 T2M = T2H * T2L;
Chris@82 644 T2K = W[9];
Chris@82 645 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2K, T2L, T2J);
Chris@82 646 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2K, T2I, T2M);
Chris@82 647 }
Chris@82 648 {
Chris@82 649 E T4Q, T4T, T4R, T4U, T4P, T4S;
Chris@82 650 T4Q = FNMS(KP866025403, T4D, T4w);
Chris@82 651 T4T = FNMS(KP866025403, T4M, T4L);
Chris@82 652 T4P = W[8];
Chris@82 653 T4R = T4P * T4Q;
Chris@82 654 T4U = T4P * T4T;
Chris@82 655 T4S = W[9];
Chris@82 656 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T4S, T4T, T4R);
Chris@82 657 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4S, T4Q, T4U);
Chris@82 658 }
Chris@82 659 {
Chris@82 660 E T1U, T21, T1V, T22, T1P, T1W;
Chris@82 661 T1U = FNMS(KP866025403, T1T, T1Q);
Chris@82 662 T21 = FNMS(KP866025403, T20, T1Z);
Chris@82 663 T1P = W[2];
Chris@82 664 T1V = T1P * T1U;
Chris@82 665 T22 = T1P * T21;
Chris@82 666 T1W = W[3];
Chris@82 667 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1W, T21, T1V);
Chris@82 668 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1W, T1U, T22);
Chris@82 669 }
Chris@82 670 {
Chris@82 671 E T4c, T4f, T4d, T4g, T4b, T4e;
Chris@82 672 T4c = FMA(KP866025403, T41, T3Y);
Chris@82 673 T4f = FMA(KP866025403, T48, T47);
Chris@82 674 T4b = W[6];
Chris@82 675 T4d = T4b * T4c;
Chris@82 676 T4g = T4b * T4f;
Chris@82 677 T4e = W[7];
Chris@82 678 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T4e, T4f, T4d);
Chris@82 679 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T4e, T4c, T4g);
Chris@82 680 }
Chris@82 681 }
Chris@82 682 }
Chris@82 683 }
Chris@82 684
/*
 * Twiddle-instruction table referenced by the ct_desc initializer for this
 * codelet.  It has two entries: {TW_FULL, 0, 6} followed by {TW_NEXT, 1, 0}.
 * NOTE(review): the meaning of TW_FULL / TW_NEXT and of the numeric fields
 * is defined by tw_instr in the FFTW kernel headers, not in this file; the
 * 6 presumably corresponds to the codelet size -- confirm there.
 */
Chris@82 685 static const tw_instr twinstr[] = {
Chris@82 686 {TW_FULL, 0, 6},
Chris@82 687 {TW_NEXT, 1, 0}
Chris@82 688 };
Chris@82 689
/*
 * Descriptor passed to X(kdft_difsq_register) together with q1_6.
 * Fields, in order: 6, the name "q1_6", the twinstr table, &GENUS, and the
 * count tuple {144, 60, 132, 0}, whose first three values match the
 * "144 additions, 60 multiplications, 132 fused multiply/add" figures in
 * the generated comment for this variant.  The three trailing fields are 0.
 * NOTE(review): field names and meanings come from ct_desc in the FFTW
 * headers (the leading 6 is presumably the radix) -- confirm there.
 */
Chris@82 690 static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {144, 60, 132, 0}, 0, 0, 0 };
Chris@82 691
/*
 * Registration entry point for this codelet: hands the q1_6 function and
 * its descriptor to planner p through X(kdft_difsq_register).
 */
Chris@82 692 void X(codelet_q1_6) (planner *p) {
Chris@82 693 X(kdft_difsq_register) (p, q1_6, &desc);
Chris@82 694 }
Chris@82 695 #else
Chris@82 696
Chris@82 697 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include dft/scalar/q.h */
Chris@82 698
Chris@82 699 /*
Chris@82 700 * This function contains 276 FP additions, 168 FP multiplications,
Chris@82 701 * (or, 192 additions, 84 multiplications, 84 fused multiply/add),
Chris@82 702 * 85 stack variables, 2 constants, and 144 memory accesses
Chris@82 703 */
Chris@82 704 #include "dft/scalar/q.h"
Chris@82 705
Chris@82 706 static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 707 {
Chris@82 708 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 709 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 710 {
Chris@82 711 INT m;
Chris@82 712 for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@82 713 E T3, Tc, Tt, TM, TX, T16, T1n, T1G, T2h, T2A, T1R, T20, T2L, T2U, T3b;
Chris@82 714 E T3u, T3F, T3O, T45, T4o, T4Z, T5i, T4z, T4I, Ta, TP, Tf, Tq, Tn, TN;
Chris@82 715 E Tu, TJ, T14, T1J, T19, T1k, T1h, T1H, T1o, T1D, T2b, T2B, T2i, T2x, T1Y;
Chris@82 716 E T2D, T23, T2e, T2S, T3x, T2X, T38, T35, T3v, T3c, T3r, T3M, T4r, T3R, T42;
Chris@82 717 E T3Z, T4p, T46, T4l, T4T, T5j, T50, T5f, T4G, T5l, T4L, T4W;
Chris@82 718 {
Chris@82 719 E T1, T2, T1l, T1m;
Chris@82 720 T1 = rio[0];
Chris@82 721 T2 = rio[WS(rs, 3)];
Chris@82 722 T3 = T1 + T2;
Chris@82 723 Tc = T1 - T2;
Chris@82 724 {
Chris@82 725 E Tr, Ts, TV, TW;
Chris@82 726 Tr = iio[0];
Chris@82 727 Ts = iio[WS(rs, 3)];
Chris@82 728 Tt = Tr - Ts;
Chris@82 729 TM = Tr + Ts;
Chris@82 730 TV = rio[WS(vs, 1)];
Chris@82 731 TW = rio[WS(vs, 1) + WS(rs, 3)];
Chris@82 732 TX = TV + TW;
Chris@82 733 T16 = TV - TW;
Chris@82 734 }
Chris@82 735 T1l = iio[WS(vs, 1)];
Chris@82 736 T1m = iio[WS(vs, 1) + WS(rs, 3)];
Chris@82 737 T1n = T1l - T1m;
Chris@82 738 T1G = T1l + T1m;
Chris@82 739 {
Chris@82 740 E T2f, T2g, T1P, T1Q;
Chris@82 741 T2f = iio[WS(vs, 2)];
Chris@82 742 T2g = iio[WS(vs, 2) + WS(rs, 3)];
Chris@82 743 T2h = T2f - T2g;
Chris@82 744 T2A = T2f + T2g;
Chris@82 745 T1P = rio[WS(vs, 2)];
Chris@82 746 T1Q = rio[WS(vs, 2) + WS(rs, 3)];
Chris@82 747 T1R = T1P + T1Q;
Chris@82 748 T20 = T1P - T1Q;
Chris@82 749 }
Chris@82 750 }
Chris@82 751 {
Chris@82 752 E T2J, T2K, T43, T44;
Chris@82 753 T2J = rio[WS(vs, 3)];
Chris@82 754 T2K = rio[WS(vs, 3) + WS(rs, 3)];
Chris@82 755 T2L = T2J + T2K;
Chris@82 756 T2U = T2J - T2K;
Chris@82 757 {
Chris@82 758 E T39, T3a, T3D, T3E;
Chris@82 759 T39 = iio[WS(vs, 3)];
Chris@82 760 T3a = iio[WS(vs, 3) + WS(rs, 3)];
Chris@82 761 T3b = T39 - T3a;
Chris@82 762 T3u = T39 + T3a;
Chris@82 763 T3D = rio[WS(vs, 4)];
Chris@82 764 T3E = rio[WS(vs, 4) + WS(rs, 3)];
Chris@82 765 T3F = T3D + T3E;
Chris@82 766 T3O = T3D - T3E;
Chris@82 767 }
Chris@82 768 T43 = iio[WS(vs, 4)];
Chris@82 769 T44 = iio[WS(vs, 4) + WS(rs, 3)];
Chris@82 770 T45 = T43 - T44;
Chris@82 771 T4o = T43 + T44;
Chris@82 772 {
Chris@82 773 E T4X, T4Y, T4x, T4y;
Chris@82 774 T4X = iio[WS(vs, 5)];
Chris@82 775 T4Y = iio[WS(vs, 5) + WS(rs, 3)];
Chris@82 776 T4Z = T4X - T4Y;
Chris@82 777 T5i = T4X + T4Y;
Chris@82 778 T4x = rio[WS(vs, 5)];
Chris@82 779 T4y = rio[WS(vs, 5) + WS(rs, 3)];
Chris@82 780 T4z = T4x + T4y;
Chris@82 781 T4I = T4x - T4y;
Chris@82 782 }
Chris@82 783 }
Chris@82 784 {
Chris@82 785 E T6, Td, T9, Te;
Chris@82 786 {
Chris@82 787 E T4, T5, T7, T8;
Chris@82 788 T4 = rio[WS(rs, 2)];
Chris@82 789 T5 = rio[WS(rs, 5)];
Chris@82 790 T6 = T4 + T5;
Chris@82 791 Td = T4 - T5;
Chris@82 792 T7 = rio[WS(rs, 4)];
Chris@82 793 T8 = rio[WS(rs, 1)];
Chris@82 794 T9 = T7 + T8;
Chris@82 795 Te = T7 - T8;
Chris@82 796 }
Chris@82 797 Ta = T6 + T9;
Chris@82 798 TP = KP866025403 * (T9 - T6);
Chris@82 799 Tf = Td + Te;
Chris@82 800 Tq = KP866025403 * (Te - Td);
Chris@82 801 }
Chris@82 802 {
Chris@82 803 E Tj, TH, Tm, TI;
Chris@82 804 {
Chris@82 805 E Th, Ti, Tk, Tl;
Chris@82 806 Th = iio[WS(rs, 2)];
Chris@82 807 Ti = iio[WS(rs, 5)];
Chris@82 808 Tj = Th - Ti;
Chris@82 809 TH = Th + Ti;
Chris@82 810 Tk = iio[WS(rs, 4)];
Chris@82 811 Tl = iio[WS(rs, 1)];
Chris@82 812 Tm = Tk - Tl;
Chris@82 813 TI = Tk + Tl;
Chris@82 814 }
Chris@82 815 Tn = KP866025403 * (Tj - Tm);
Chris@82 816 TN = TH + TI;
Chris@82 817 Tu = Tj + Tm;
Chris@82 818 TJ = KP866025403 * (TH - TI);
Chris@82 819 }
Chris@82 820 {
Chris@82 821 E T10, T17, T13, T18;
Chris@82 822 {
Chris@82 823 E TY, TZ, T11, T12;
Chris@82 824 TY = rio[WS(vs, 1) + WS(rs, 2)];
Chris@82 825 TZ = rio[WS(vs, 1) + WS(rs, 5)];
Chris@82 826 T10 = TY + TZ;
Chris@82 827 T17 = TY - TZ;
Chris@82 828 T11 = rio[WS(vs, 1) + WS(rs, 4)];
Chris@82 829 T12 = rio[WS(vs, 1) + WS(rs, 1)];
Chris@82 830 T13 = T11 + T12;
Chris@82 831 T18 = T11 - T12;
Chris@82 832 }
Chris@82 833 T14 = T10 + T13;
Chris@82 834 T1J = KP866025403 * (T13 - T10);
Chris@82 835 T19 = T17 + T18;
Chris@82 836 T1k = KP866025403 * (T18 - T17);
Chris@82 837 }
Chris@82 838 {
Chris@82 839 E T1d, T1B, T1g, T1C;
Chris@82 840 {
Chris@82 841 E T1b, T1c, T1e, T1f;
Chris@82 842 T1b = iio[WS(vs, 1) + WS(rs, 2)];
Chris@82 843 T1c = iio[WS(vs, 1) + WS(rs, 5)];
Chris@82 844 T1d = T1b - T1c;
Chris@82 845 T1B = T1b + T1c;
Chris@82 846 T1e = iio[WS(vs, 1) + WS(rs, 4)];
Chris@82 847 T1f = iio[WS(vs, 1) + WS(rs, 1)];
Chris@82 848 T1g = T1e - T1f;
Chris@82 849 T1C = T1e + T1f;
Chris@82 850 }
Chris@82 851 T1h = KP866025403 * (T1d - T1g);
Chris@82 852 T1H = T1B + T1C;
Chris@82 853 T1o = T1d + T1g;
Chris@82 854 T1D = KP866025403 * (T1B - T1C);
Chris@82 855 }
Chris@82 856 {
Chris@82 857 E T27, T2v, T2a, T2w;
Chris@82 858 {
Chris@82 859 E T25, T26, T28, T29;
Chris@82 860 T25 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@82 861 T26 = iio[WS(vs, 2) + WS(rs, 5)];
Chris@82 862 T27 = T25 - T26;
Chris@82 863 T2v = T25 + T26;
Chris@82 864 T28 = iio[WS(vs, 2) + WS(rs, 4)];
Chris@82 865 T29 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@82 866 T2a = T28 - T29;
Chris@82 867 T2w = T28 + T29;
Chris@82 868 }
Chris@82 869 T2b = KP866025403 * (T27 - T2a);
Chris@82 870 T2B = T2v + T2w;
Chris@82 871 T2i = T27 + T2a;
Chris@82 872 T2x = KP866025403 * (T2v - T2w);
Chris@82 873 }
Chris@82 874 {
Chris@82 875 E T1U, T21, T1X, T22;
Chris@82 876 {
Chris@82 877 E T1S, T1T, T1V, T1W;
Chris@82 878 T1S = rio[WS(vs, 2) + WS(rs, 2)];
Chris@82 879 T1T = rio[WS(vs, 2) + WS(rs, 5)];
Chris@82 880 T1U = T1S + T1T;
Chris@82 881 T21 = T1S - T1T;
Chris@82 882 T1V = rio[WS(vs, 2) + WS(rs, 4)];
Chris@82 883 T1W = rio[WS(vs, 2) + WS(rs, 1)];
Chris@82 884 T1X = T1V + T1W;
Chris@82 885 T22 = T1V - T1W;
Chris@82 886 }
Chris@82 887 T1Y = T1U + T1X;
Chris@82 888 T2D = KP866025403 * (T1X - T1U);
Chris@82 889 T23 = T21 + T22;
Chris@82 890 T2e = KP866025403 * (T22 - T21);
Chris@82 891 }
Chris@82 892 {
Chris@82 893 E T2O, T2V, T2R, T2W;
Chris@82 894 {
Chris@82 895 E T2M, T2N, T2P, T2Q;
Chris@82 896 T2M = rio[WS(vs, 3) + WS(rs, 2)];
Chris@82 897 T2N = rio[WS(vs, 3) + WS(rs, 5)];
Chris@82 898 T2O = T2M + T2N;
Chris@82 899 T2V = T2M - T2N;
Chris@82 900 T2P = rio[WS(vs, 3) + WS(rs, 4)];
Chris@82 901 T2Q = rio[WS(vs, 3) + WS(rs, 1)];
Chris@82 902 T2R = T2P + T2Q;
Chris@82 903 T2W = T2P - T2Q;
Chris@82 904 }
Chris@82 905 T2S = T2O + T2R;
Chris@82 906 T3x = KP866025403 * (T2R - T2O);
Chris@82 907 T2X = T2V + T2W;
Chris@82 908 T38 = KP866025403 * (T2W - T2V);
Chris@82 909 }
Chris@82 910 {
Chris@82 911 E T31, T3p, T34, T3q;
Chris@82 912 {
Chris@82 913 E T2Z, T30, T32, T33;
Chris@82 914 T2Z = iio[WS(vs, 3) + WS(rs, 2)];
Chris@82 915 T30 = iio[WS(vs, 3) + WS(rs, 5)];
Chris@82 916 T31 = T2Z - T30;
Chris@82 917 T3p = T2Z + T30;
Chris@82 918 T32 = iio[WS(vs, 3) + WS(rs, 4)];
Chris@82 919 T33 = iio[WS(vs, 3) + WS(rs, 1)];
Chris@82 920 T34 = T32 - T33;
Chris@82 921 T3q = T32 + T33;
Chris@82 922 }
Chris@82 923 T35 = KP866025403 * (T31 - T34);
Chris@82 924 T3v = T3p + T3q;
Chris@82 925 T3c = T31 + T34;
Chris@82 926 T3r = KP866025403 * (T3p - T3q);
Chris@82 927 }
Chris@82 928 {
Chris@82 929 E T3I, T3P, T3L, T3Q;
Chris@82 930 {
Chris@82 931 E T3G, T3H, T3J, T3K;
Chris@82 932 T3G = rio[WS(vs, 4) + WS(rs, 2)];
Chris@82 933 T3H = rio[WS(vs, 4) + WS(rs, 5)];
Chris@82 934 T3I = T3G + T3H;
Chris@82 935 T3P = T3G - T3H;
Chris@82 936 T3J = rio[WS(vs, 4) + WS(rs, 4)];
Chris@82 937 T3K = rio[WS(vs, 4) + WS(rs, 1)];
Chris@82 938 T3L = T3J + T3K;
Chris@82 939 T3Q = T3J - T3K;
Chris@82 940 }
Chris@82 941 T3M = T3I + T3L;
Chris@82 942 T4r = KP866025403 * (T3L - T3I);
Chris@82 943 T3R = T3P + T3Q;
Chris@82 944 T42 = KP866025403 * (T3Q - T3P);
Chris@82 945 }
Chris@82 946 {
Chris@82 947 E T3V, T4j, T3Y, T4k;
Chris@82 948 {
Chris@82 949 E T3T, T3U, T3W, T3X;
Chris@82 950 T3T = iio[WS(vs, 4) + WS(rs, 2)];
Chris@82 951 T3U = iio[WS(vs, 4) + WS(rs, 5)];
Chris@82 952 T3V = T3T - T3U;
Chris@82 953 T4j = T3T + T3U;
Chris@82 954 T3W = iio[WS(vs, 4) + WS(rs, 4)];
Chris@82 955 T3X = iio[WS(vs, 4) + WS(rs, 1)];
Chris@82 956 T3Y = T3W - T3X;
Chris@82 957 T4k = T3W + T3X;
Chris@82 958 }
Chris@82 959 T3Z = KP866025403 * (T3V - T3Y);
Chris@82 960 T4p = T4j + T4k;
Chris@82 961 T46 = T3V + T3Y;
Chris@82 962 T4l = KP866025403 * (T4j - T4k);
Chris@82 963 }
Chris@82 964 {
Chris@82 965 E T4P, T5d, T4S, T5e;
Chris@82 966 {
Chris@82 967 E T4N, T4O, T4Q, T4R;
Chris@82 968 T4N = iio[WS(vs, 5) + WS(rs, 2)];
Chris@82 969 T4O = iio[WS(vs, 5) + WS(rs, 5)];
Chris@82 970 T4P = T4N - T4O;
Chris@82 971 T5d = T4N + T4O;
Chris@82 972 T4Q = iio[WS(vs, 5) + WS(rs, 4)];
Chris@82 973 T4R = iio[WS(vs, 5) + WS(rs, 1)];
Chris@82 974 T4S = T4Q - T4R;
Chris@82 975 T5e = T4Q + T4R;
Chris@82 976 }
Chris@82 977 T4T = KP866025403 * (T4P - T4S);
Chris@82 978 T5j = T5d + T5e;
Chris@82 979 T50 = T4P + T4S;
Chris@82 980 T5f = KP866025403 * (T5d - T5e);
Chris@82 981 }
Chris@82 982 {
Chris@82 983 E T4C, T4J, T4F, T4K;
Chris@82 984 {
Chris@82 985 E T4A, T4B, T4D, T4E;
Chris@82 986 T4A = rio[WS(vs, 5) + WS(rs, 2)];
Chris@82 987 T4B = rio[WS(vs, 5) + WS(rs, 5)];
Chris@82 988 T4C = T4A + T4B;
Chris@82 989 T4J = T4A - T4B;
Chris@82 990 T4D = rio[WS(vs, 5) + WS(rs, 4)];
Chris@82 991 T4E = rio[WS(vs, 5) + WS(rs, 1)];
Chris@82 992 T4F = T4D + T4E;
Chris@82 993 T4K = T4D - T4E;
Chris@82 994 }
Chris@82 995 T4G = T4C + T4F;
Chris@82 996 T5l = KP866025403 * (T4F - T4C);
Chris@82 997 T4L = T4J + T4K;
Chris@82 998 T4W = KP866025403 * (T4K - T4J);
Chris@82 999 }
Chris@82 1000 rio[0] = T3 + Ta;
Chris@82 1001 iio[0] = TM + TN;
Chris@82 1002 rio[WS(rs, 1)] = TX + T14;
Chris@82 1003 iio[WS(rs, 1)] = T1G + T1H;
Chris@82 1004 rio[WS(rs, 3)] = T2L + T2S;
Chris@82 1005 rio[WS(rs, 2)] = T1R + T1Y;
Chris@82 1006 iio[WS(rs, 2)] = T2A + T2B;
Chris@82 1007 iio[WS(rs, 3)] = T3u + T3v;
Chris@82 1008 iio[WS(rs, 4)] = T4o + T4p;
Chris@82 1009 iio[WS(rs, 5)] = T5i + T5j;
Chris@82 1010 rio[WS(rs, 5)] = T4z + T4G;
Chris@82 1011 rio[WS(rs, 4)] = T3F + T3M;
Chris@82 1012 {
Chris@82 1013 E T1w, T1y, T1v, T1x;
Chris@82 1014 T1w = T16 + T19;
Chris@82 1015 T1y = T1n + T1o;
Chris@82 1016 T1v = W[4];
Chris@82 1017 T1x = W[5];
Chris@82 1018 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
Chris@82 1019 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
Chris@82 1020 }
Chris@82 1021 {
Chris@82 1022 E T58, T5a, T57, T59;
Chris@82 1023 T58 = T4I + T4L;
Chris@82 1024 T5a = T4Z + T50;
Chris@82 1025 T57 = W[4];
Chris@82 1026 T59 = W[5];
Chris@82 1027 rio[WS(vs, 3) + WS(rs, 5)] = FMA(T57, T58, T59 * T5a);
Chris@82 1028 iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T59, T58, T57 * T5a);
Chris@82 1029 }
Chris@82 1030 {
Chris@82 1031 E TC, TE, TB, TD;
Chris@82 1032 TC = Tc + Tf;
Chris@82 1033 TE = Tt + Tu;
Chris@82 1034 TB = W[4];
Chris@82 1035 TD = W[5];
Chris@82 1036 rio[WS(vs, 3)] = FMA(TB, TC, TD * TE);
Chris@82 1037 iio[WS(vs, 3)] = FNMS(TD, TC, TB * TE);
Chris@82 1038 }
Chris@82 1039 {
Chris@82 1040 E T4e, T4g, T4d, T4f;
Chris@82 1041 T4e = T3O + T3R;
Chris@82 1042 T4g = T45 + T46;
Chris@82 1043 T4d = W[4];
Chris@82 1044 T4f = W[5];
Chris@82 1045 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4d, T4e, T4f * T4g);
Chris@82 1046 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4f, T4e, T4d * T4g);
Chris@82 1047 }
Chris@82 1048 {
Chris@82 1049 E T3k, T3m, T3j, T3l;
Chris@82 1050 T3k = T2U + T2X;
Chris@82 1051 T3m = T3b + T3c;
Chris@82 1052 T3j = W[4];
Chris@82 1053 T3l = W[5];
Chris@82 1054 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3j, T3k, T3l * T3m);
Chris@82 1055 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3l, T3k, T3j * T3m);
Chris@82 1056 }
Chris@82 1057 {
Chris@82 1058 E T2q, T2s, T2p, T2r;
Chris@82 1059 T2q = T20 + T23;
Chris@82 1060 T2s = T2h + T2i;
Chris@82 1061 T2p = W[4];
Chris@82 1062 T2r = W[5];
Chris@82 1063 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2p, T2q, T2r * T2s);
Chris@82 1064 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2r, T2q, T2p * T2s);
Chris@82 1065 }
Chris@82 1066 {
Chris@82 1067 E T5g, T5o, T5m, T5q, T5c, T5k;
Chris@82 1068 T5c = FNMS(KP500000000, T4G, T4z);
Chris@82 1069 T5g = T5c - T5f;
Chris@82 1070 T5o = T5c + T5f;
Chris@82 1071 T5k = FNMS(KP500000000, T5j, T5i);
Chris@82 1072 T5m = T5k - T5l;
Chris@82 1073 T5q = T5l + T5k;
Chris@82 1074 {
Chris@82 1075 E T5b, T5h, T5n, T5p;
Chris@82 1076 T5b = W[2];
Chris@82 1077 T5h = W[3];
Chris@82 1078 rio[WS(vs, 2) + WS(rs, 5)] = FMA(T5b, T5g, T5h * T5m);
Chris@82 1079 iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T5h, T5g, T5b * T5m);
Chris@82 1080 T5n = W[6];
Chris@82 1081 T5p = W[7];
Chris@82 1082 rio[WS(vs, 4) + WS(rs, 5)] = FMA(T5n, T5o, T5p * T5q);
Chris@82 1083 iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T5p, T5o, T5n * T5q);
Chris@82 1084 }
Chris@82 1085 }
Chris@82 1086 {
Chris@82 1087 E To, Ty, Tw, TA, Tg, Tv;
Chris@82 1088 Tg = FNMS(KP500000000, Tf, Tc);
Chris@82 1089 To = Tg + Tn;
Chris@82 1090 Ty = Tg - Tn;
Chris@82 1091 Tv = FNMS(KP500000000, Tu, Tt);
Chris@82 1092 Tw = Tq + Tv;
Chris@82 1093 TA = Tv - Tq;
Chris@82 1094 {
Chris@82 1095 E Tb, Tp, Tx, Tz;
Chris@82 1096 Tb = W[0];
Chris@82 1097 Tp = W[1];
Chris@82 1098 rio[WS(vs, 1)] = FMA(Tb, To, Tp * Tw);
Chris@82 1099 iio[WS(vs, 1)] = FNMS(Tp, To, Tb * Tw);
Chris@82 1100 Tx = W[8];
Chris@82 1101 Tz = W[9];
Chris@82 1102 rio[WS(vs, 5)] = FMA(Tx, Ty, Tz * TA);
Chris@82 1103 iio[WS(vs, 5)] = FNMS(Tz, Ty, Tx * TA);
Chris@82 1104 }
Chris@82 1105 }
Chris@82 1106 {
Chris@82 1107 E T36, T3g, T3e, T3i, T2Y, T3d;
Chris@82 1108 T2Y = FNMS(KP500000000, T2X, T2U);
Chris@82 1109 T36 = T2Y + T35;
Chris@82 1110 T3g = T2Y - T35;
Chris@82 1111 T3d = FNMS(KP500000000, T3c, T3b);
Chris@82 1112 T3e = T38 + T3d;
Chris@82 1113 T3i = T3d - T38;
Chris@82 1114 {
Chris@82 1115 E T2T, T37, T3f, T3h;
Chris@82 1116 T2T = W[0];
Chris@82 1117 T37 = W[1];
Chris@82 1118 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2T, T36, T37 * T3e);
Chris@82 1119 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T37, T36, T2T * T3e);
Chris@82 1120 T3f = W[8];
Chris@82 1121 T3h = W[9];
Chris@82 1122 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3f, T3g, T3h * T3i);
Chris@82 1123 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3h, T3g, T3f * T3i);
Chris@82 1124 }
Chris@82 1125 }
Chris@82 1126 {
Chris@82 1127 E T2y, T2G, T2E, T2I, T2u, T2C;
Chris@82 1128 T2u = FNMS(KP500000000, T1Y, T1R);
Chris@82 1129 T2y = T2u - T2x;
Chris@82 1130 T2G = T2u + T2x;
Chris@82 1131 T2C = FNMS(KP500000000, T2B, T2A);
Chris@82 1132 T2E = T2C - T2D;
Chris@82 1133 T2I = T2D + T2C;
Chris@82 1134 {
Chris@82 1135 E T2t, T2z, T2F, T2H;
Chris@82 1136 T2t = W[2];
Chris@82 1137 T2z = W[3];
Chris@82 1138 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2t, T2y, T2z * T2E);
Chris@82 1139 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2z, T2y, T2t * T2E);
Chris@82 1140 T2F = W[6];
Chris@82 1141 T2H = W[7];
Chris@82 1142 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2F, T2G, T2H * T2I);
Chris@82 1143 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2H, T2G, T2F * T2I);
Chris@82 1144 }
Chris@82 1145 }
Chris@82 1146 {
Chris@82 1147 E T3s, T3A, T3y, T3C, T3o, T3w;
Chris@82 1148 T3o = FNMS(KP500000000, T2S, T2L);
Chris@82 1149 T3s = T3o - T3r;
Chris@82 1150 T3A = T3o + T3r;
Chris@82 1151 T3w = FNMS(KP500000000, T3v, T3u);
Chris@82 1152 T3y = T3w - T3x;
Chris@82 1153 T3C = T3x + T3w;
Chris@82 1154 {
Chris@82 1155 E T3n, T3t, T3z, T3B;
Chris@82 1156 T3n = W[2];
Chris@82 1157 T3t = W[3];
Chris@82 1158 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3n, T3s, T3t * T3y);
Chris@82 1159 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3t, T3s, T3n * T3y);
Chris@82 1160 T3z = W[6];
Chris@82 1161 T3B = W[7];
Chris@82 1162 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3z, T3A, T3B * T3C);
Chris@82 1163 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3B, T3A, T3z * T3C);
Chris@82 1164 }
Chris@82 1165 }
Chris@82 1166 {
Chris@82 1167 E T1E, T1M, T1K, T1O, T1A, T1I;
Chris@82 1168 T1A = FNMS(KP500000000, T14, TX);
Chris@82 1169 T1E = T1A - T1D;
Chris@82 1170 T1M = T1A + T1D;
Chris@82 1171 T1I = FNMS(KP500000000, T1H, T1G);
Chris@82 1172 T1K = T1I - T1J;
Chris@82 1173 T1O = T1J + T1I;
Chris@82 1174 {
Chris@82 1175 E T1z, T1F, T1L, T1N;
Chris@82 1176 T1z = W[2];
Chris@82 1177 T1F = W[3];
Chris@82 1178 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1z, T1E, T1F * T1K);
Chris@82 1179 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1F, T1E, T1z * T1K);
Chris@82 1180 T1L = W[6];
Chris@82 1181 T1N = W[7];
Chris@82 1182 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1L, T1M, T1N * T1O);
Chris@82 1183 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1N, T1M, T1L * T1O);
Chris@82 1184 }
Chris@82 1185 }
Chris@82 1186 {
Chris@82 1187 E T4m, T4u, T4s, T4w, T4i, T4q;
Chris@82 1188 T4i = FNMS(KP500000000, T3M, T3F);
Chris@82 1189 T4m = T4i - T4l;
Chris@82 1190 T4u = T4i + T4l;
Chris@82 1191 T4q = FNMS(KP500000000, T4p, T4o);
Chris@82 1192 T4s = T4q - T4r;
Chris@82 1193 T4w = T4r + T4q;
Chris@82 1194 {
Chris@82 1195 E T4h, T4n, T4t, T4v;
Chris@82 1196 T4h = W[2];
Chris@82 1197 T4n = W[3];
Chris@82 1198 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4h, T4m, T4n * T4s);
Chris@82 1199 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4n, T4m, T4h * T4s);
Chris@82 1200 T4t = W[6];
Chris@82 1201 T4v = W[7];
Chris@82 1202 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4t, T4u, T4v * T4w);
Chris@82 1203 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4v, T4u, T4t * T4w);
Chris@82 1204 }
Chris@82 1205 }
Chris@82 1206 {
Chris@82 1207 E TK, TS, TQ, TU, TG, TO;
Chris@82 1208 TG = FNMS(KP500000000, Ta, T3);
Chris@82 1209 TK = TG - TJ;
Chris@82 1210 TS = TG + TJ;
Chris@82 1211 TO = FNMS(KP500000000, TN, TM);
Chris@82 1212 TQ = TO - TP;
Chris@82 1213 TU = TP + TO;
Chris@82 1214 {
Chris@82 1215 E TF, TL, TR, TT;
Chris@82 1216 TF = W[2];
Chris@82 1217 TL = W[3];
Chris@82 1218 rio[WS(vs, 2)] = FMA(TF, TK, TL * TQ);
Chris@82 1219 iio[WS(vs, 2)] = FNMS(TL, TK, TF * TQ);
Chris@82 1220 TR = W[6];
Chris@82 1221 TT = W[7];
Chris@82 1222 rio[WS(vs, 4)] = FMA(TR, TS, TT * TU);
Chris@82 1223 iio[WS(vs, 4)] = FNMS(TT, TS, TR * TU);
Chris@82 1224 }
Chris@82 1225 }
Chris@82 1226 {
Chris@82 1227 E T2c, T2m, T2k, T2o, T24, T2j;
Chris@82 1228 T24 = FNMS(KP500000000, T23, T20);
Chris@82 1229 T2c = T24 + T2b;
Chris@82 1230 T2m = T24 - T2b;
Chris@82 1231 T2j = FNMS(KP500000000, T2i, T2h);
Chris@82 1232 T2k = T2e + T2j;
Chris@82 1233 T2o = T2j - T2e;
Chris@82 1234 {
Chris@82 1235 E T1Z, T2d, T2l, T2n;
Chris@82 1236 T1Z = W[0];
Chris@82 1237 T2d = W[1];
Chris@82 1238 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1Z, T2c, T2d * T2k);
Chris@82 1239 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2d, T2c, T1Z * T2k);
Chris@82 1240 T2l = W[8];
Chris@82 1241 T2n = W[9];
Chris@82 1242 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2l, T2m, T2n * T2o);
Chris@82 1243 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2n, T2m, T2l * T2o);
Chris@82 1244 }
Chris@82 1245 }
Chris@82 1246 {
Chris@82 1247 E T40, T4a, T48, T4c, T3S, T47;
Chris@82 1248 T3S = FNMS(KP500000000, T3R, T3O);
Chris@82 1249 T40 = T3S + T3Z;
Chris@82 1250 T4a = T3S - T3Z;
Chris@82 1251 T47 = FNMS(KP500000000, T46, T45);
Chris@82 1252 T48 = T42 + T47;
Chris@82 1253 T4c = T47 - T42;
Chris@82 1254 {
Chris@82 1255 E T3N, T41, T49, T4b;
Chris@82 1256 T3N = W[0];
Chris@82 1257 T41 = W[1];
Chris@82 1258 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3N, T40, T41 * T48);
Chris@82 1259 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T41, T40, T3N * T48);
Chris@82 1260 T49 = W[8];
Chris@82 1261 T4b = W[9];
Chris@82 1262 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T49, T4a, T4b * T4c);
Chris@82 1263 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4b, T4a, T49 * T4c);
Chris@82 1264 }
Chris@82 1265 }
Chris@82 1266 {
Chris@82 1267 E T1i, T1s, T1q, T1u, T1a, T1p;
Chris@82 1268 T1a = FNMS(KP500000000, T19, T16);
Chris@82 1269 T1i = T1a + T1h;
Chris@82 1270 T1s = T1a - T1h;
Chris@82 1271 T1p = FNMS(KP500000000, T1o, T1n);
Chris@82 1272 T1q = T1k + T1p;
Chris@82 1273 T1u = T1p - T1k;
Chris@82 1274 {
Chris@82 1275 E T15, T1j, T1r, T1t;
Chris@82 1276 T15 = W[0];
Chris@82 1277 T1j = W[1];
Chris@82 1278 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T15, T1i, T1j * T1q);
Chris@82 1279 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1j, T1i, T15 * T1q);
Chris@82 1280 T1r = W[8];
Chris@82 1281 T1t = W[9];
Chris@82 1282 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1r, T1s, T1t * T1u);
Chris@82 1283 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1t, T1s, T1r * T1u);
Chris@82 1284 }
Chris@82 1285 }
Chris@82 1286 {
Chris@82 1287 E T4U, T54, T52, T56, T4M, T51;
Chris@82 1288 T4M = FNMS(KP500000000, T4L, T4I);
Chris@82 1289 T4U = T4M + T4T;
Chris@82 1290 T54 = T4M - T4T;
Chris@82 1291 T51 = FNMS(KP500000000, T50, T4Z);
Chris@82 1292 T52 = T4W + T51;
Chris@82 1293 T56 = T51 - T4W;
Chris@82 1294 {
Chris@82 1295 E T4H, T4V, T53, T55;
Chris@82 1296 T4H = W[0];
Chris@82 1297 T4V = W[1];
Chris@82 1298 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T4H, T4U, T4V * T52);
Chris@82 1299 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T4V, T4U, T4H * T52);
Chris@82 1300 T53 = W[8];
Chris@82 1301 T55 = W[9];
Chris@82 1302 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T53, T54, T55 * T56);
Chris@82 1303 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T55, T54, T53 * T56);
Chris@82 1304 }
Chris@82 1305 }
Chris@82 1306 }
Chris@82 1307 }
Chris@82 1308 }
Chris@82 1309
/*
 * Twiddle-factor instruction table for this codelet; it is referenced by
 * the `desc` descriptor that follows.  The kernel body above reads the
 * twiddle array at indices W[0] through W[9].
 * NOTE(review): presumably {TW_FULL, 0, 6} requests the complete twiddle
 * set for radix 6 and {TW_NEXT, 1, 0} terminates the list; the opcode and
 * field semantics of tw_instr are declared outside this file -- confirm
 * there before changing any entry by hand.
 */
Chris@82 1310 static const tw_instr twinstr[] = {
Chris@82 1311 {TW_FULL, 0, 6},
Chris@82 1312 {TW_NEXT, 1, 0}
Chris@82 1313 };
Chris@82 1314
/*
 * Codelet descriptor passed to the registration call in X(codelet_q1_6).
 * The leading 6 and "q1_6" agree with the `-n 6 -name q1_6` options in the
 * generator command recorded near the top of the file; `twinstr` is the
 * table defined just above; GENUS is not defined in this file.
 * NOTE(review): {192, 84, 84, 0} looks like an operation-count summary
 * (additions, multiplications, fused multiply-adds, other) for this variant
 * of the kernel, and the three trailing fields are left at 0.  The ct_desc
 * field meanings are declared elsewhere -- confirm before hand-editing.
 */
Chris@82 1315 static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {192, 84, 84, 0}, 0, 0, 0 };
Chris@82 1316
/*
 * Registration entry point for this codelet: forwards the planner `p`,
 * the q1_6 kernel function and its descriptor `desc` to
 * X(kdft_difsq_register).  Returns nothing.
 * NOTE(review): X(...) is a project macro not defined in this file
 * (presumably a symbol-name prefixing macro) -- confirm in the headers.
 */
Chris@82 1317 void X(codelet_q1_6) (planner *p) {
Chris@82 1318 X(kdft_difsq_register) (p, q1_6, &desc);
Chris@82 1319 }
Chris@82 1320 #endif