annotate src/fftw-3.3.3/dft/scalar/codelets/q1_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:36:24 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include q.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 276 FP additions, 192 FP multiplications,
Chris@10 32 * (or, 144 additions, 60 multiplications, 132 fused multiply/add),
Chris@10 33 * 129 stack variables, 2 constants, and 144 memory accesses
Chris@10 34 */
Chris@10 35 #include "q.h"
Chris@10 36
/*
 * q1_6, HAVE_FMA variant: generated kernel (see the gen_twidsq command line
 * above: -dif -n 6) that processes a 6 x 6 square of complex values in place.
 *
 * For every m in [mb, me) one pass of the loop body:
 *   - reads rio[] and iio[] at WS(vs, v) + WS(rs, r) for v = 0..5, r = 0..5;
 *     every read happens before the first write of that pass;
 *   - for each fixed v, combines the six r-values using additions,
 *     subtractions and the two constants KP500000000 / KP866025403;
 *   - writes result k of row v to index WS(vs, k) + WS(rs, v), i.e. the
 *     roles of vs and rs are swapped between the reads and the writes;
 *   - k = 0 results are plain sums stored as they are; k = 1..5 results are
 *     combined with the pair W[2k-2], W[2k-1] through FMA/FNMS before the
 *     store (NOTE(review): exact FMA/FNMS operand convention is defined in
 *     the FFTW headers, not in this file).
 *
 * Parameters:
 *   rio, iio  arrays read and overwritten in place (named for real/imaginary
 *             parts); both are advanced by ms after each pass.
 *   W         table of twiddle values; positioned at W + mb * 10 before the
 *             first pass and advanced by 10 after each pass.
 *   rs, vs    strides, only ever used through WS().
 *   mb, me    half-open range of the loop counter m.
 *   ms        element increment applied to rio and iio per pass.
 */
Chris@10 37 static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 41 {
Chris@10 42 INT m;
/* Each pass consumes W[0]..W[9]; the init clause skips the first mb groups of 10. */
Chris@10 43 for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Declared at loop-body scope because the final pair of stores (after the
 * nested blocks close) still needs them. */
Chris@10 44 E T4c, T4f, T4e, T4g, T4d;
Chris@10 45 {
Chris@10 46 E T3, Tw, Ta, TW, Tg, TG, TM, TT, TU, TP, Tn, T17, TV, TJ, Tv;
Chris@10 47 E T1A, T1e, T20, T1k, T1K, T1Q, T1X, T1Y, T1T, T1r, T1Z, T1N, T1z, T31, T32;
Chris@10 48 E T2X, T2v, T2b, T33, T2R, T2D, T2E, T2i, T34, T3f, T2o, T2O, T2U, T3I, T3m;
Chris@10 49 E T48, T3s, T3S, T3Y, T45, T46, T41, T3z, T4j, T47, T3V, T3H, T4M, T4q, T5c;
Chris@10 50 E T4w, T4W, T52, T59, T5a, T55, T4D, T5b, T4Z, T4L, T6d, T5r, T6e, T69, T5H;
Chris@10 51 E T5w, T5n, T6f, T63, T5P, T5s, T5o, T5p;
/* Load phase: this block performs every rio[]/iio[] read of the pass and
 * forms the first-stage sums and differences; it writes nothing back. */
Chris@10 52 {
Chris@10 53 E T2f, T2k, T2g, T2c, T2d;
Chris@10 54 {
Chris@10 55 E T1b, T1g, T1c, T18, T19;
Chris@10 56 {
Chris@10 57 E T4, Tc, Te, T9, T5;
Chris@10 58 {
Chris@10 59 E T1, T2, T7, T8;
Chris@10 60 T1 = rio[0];
Chris@10 61 T2 = rio[WS(rs, 3)];
Chris@10 62 T7 = rio[WS(rs, 4)];
Chris@10 63 T8 = rio[WS(rs, 1)];
Chris@10 64 T4 = rio[WS(rs, 2)];
Chris@10 65 Tc = T1 - T2;
Chris@10 66 T3 = T1 + T2;
Chris@10 67 Te = T7 - T8;
Chris@10 68 T9 = T7 + T8;
Chris@10 69 T5 = rio[WS(rs, 5)];
Chris@10 70 }
Chris@10 71 {
Chris@10 72 E TN, Tj, Tk, Tl, Tt, Th, Ti;
Chris@10 73 Th = iio[WS(rs, 2)];
Chris@10 74 Ti = iio[WS(rs, 5)];
Chris@10 75 {
Chris@10 76 E Tr, Ts, Td, T6, Tf;
Chris@10 77 Tr = iio[0];
Chris@10 78 Td = T4 - T5;
Chris@10 79 T6 = T4 + T5;
Chris@10 80 TN = Th + Ti;
Chris@10 81 Tj = Th - Ti;
Chris@10 82 Tf = Td + Te;
Chris@10 83 Tw = Te - Td;
Chris@10 84 Ta = T6 + T9;
Chris@10 85 TW = T9 - T6;
Chris@10 86 Tg = FNMS(KP500000000, Tf, Tc);
Chris@10 87 TG = Tc + Tf;
Chris@10 88 Ts = iio[WS(rs, 3)];
Chris@10 89 TM = FNMS(KP500000000, Ta, T3);
Chris@10 90 Tk = iio[WS(rs, 4)];
Chris@10 91 Tl = iio[WS(rs, 1)];
Chris@10 92 Tt = Tr - Ts;
Chris@10 93 TT = Tr + Ts;
Chris@10 94 }
Chris@10 95 {
Chris@10 96 E T15, TO, Tm, T16, Tu;
Chris@10 97 T15 = rio[WS(vs, 1)];
Chris@10 98 TO = Tk + Tl;
Chris@10 99 Tm = Tk - Tl;
Chris@10 100 T16 = rio[WS(vs, 1) + WS(rs, 3)];
Chris@10 101 T1b = rio[WS(vs, 1) + WS(rs, 4)];
Chris@10 102 TU = TN + TO;
Chris@10 103 TP = TN - TO;
Chris@10 104 Tu = Tj + Tm;
Chris@10 105 Tn = Tj - Tm;
Chris@10 106 T1g = T15 - T16;
Chris@10 107 T17 = T15 + T16;
Chris@10 108 TV = FNMS(KP500000000, TU, TT);
Chris@10 109 TJ = Tt + Tu;
Chris@10 110 Tv = FNMS(KP500000000, Tu, Tt);
Chris@10 111 T1c = rio[WS(vs, 1) + WS(rs, 1)];
Chris@10 112 T18 = rio[WS(vs, 1) + WS(rs, 2)];
Chris@10 113 T19 = rio[WS(vs, 1) + WS(rs, 5)];
Chris@10 114 }
Chris@10 115 }
Chris@10 116 }
Chris@10 117 {
Chris@10 118 E T1v, T1R, T1n, T1w, T1o, T1p;
Chris@10 119 {
Chris@10 120 E T1l, T1i, T1d, T1h, T1a, T1m, T1j;
Chris@10 121 T1l = iio[WS(vs, 1) + WS(rs, 2)];
Chris@10 122 T1i = T1b - T1c;
Chris@10 123 T1d = T1b + T1c;
Chris@10 124 T1h = T18 - T19;
Chris@10 125 T1a = T18 + T19;
Chris@10 126 T1m = iio[WS(vs, 1) + WS(rs, 5)];
Chris@10 127 T1v = iio[WS(vs, 1)];
Chris@10 128 T1j = T1h + T1i;
Chris@10 129 T1A = T1i - T1h;
Chris@10 130 T1e = T1a + T1d;
Chris@10 131 T20 = T1d - T1a;
Chris@10 132 T1R = T1l + T1m;
Chris@10 133 T1n = T1l - T1m;
Chris@10 134 T1k = FNMS(KP500000000, T1j, T1g);
Chris@10 135 T1K = T1g + T1j;
Chris@10 136 T1Q = FNMS(KP500000000, T1e, T17);
Chris@10 137 T1w = iio[WS(vs, 1) + WS(rs, 3)];
Chris@10 138 T1o = iio[WS(vs, 1) + WS(rs, 4)];
Chris@10 139 T1p = iio[WS(vs, 1) + WS(rs, 1)];
Chris@10 140 }
Chris@10 141 {
Chris@10 142 E T2z, T2V, T2r, T2A, T2s, T2t;
Chris@10 143 {
Chris@10 144 E T2p, T1x, T1S, T1q, T2q, T1y;
Chris@10 145 T2p = iio[WS(vs, 2) + WS(rs, 2)];
Chris@10 146 T1X = T1v + T1w;
Chris@10 147 T1x = T1v - T1w;
Chris@10 148 T1S = T1o + T1p;
Chris@10 149 T1q = T1o - T1p;
Chris@10 150 T2q = iio[WS(vs, 2) + WS(rs, 5)];
Chris@10 151 T2z = iio[WS(vs, 2)];
Chris@10 152 T1Y = T1R + T1S;
Chris@10 153 T1T = T1R - T1S;
Chris@10 154 T1y = T1n + T1q;
Chris@10 155 T1r = T1n - T1q;
Chris@10 156 T2V = T2p + T2q;
Chris@10 157 T2r = T2p - T2q;
Chris@10 158 T1Z = FNMS(KP500000000, T1Y, T1X);
Chris@10 159 T1N = T1x + T1y;
Chris@10 160 T1z = FNMS(KP500000000, T1y, T1x);
Chris@10 161 T2A = iio[WS(vs, 2) + WS(rs, 3)];
Chris@10 162 T2s = iio[WS(vs, 2) + WS(rs, 4)];
Chris@10 163 T2t = iio[WS(vs, 2) + WS(rs, 1)];
Chris@10 164 }
Chris@10 165 {
Chris@10 166 E T29, T2B, T2W, T2u, T2a, T2C;
Chris@10 167 T29 = rio[WS(vs, 2)];
Chris@10 168 T31 = T2z + T2A;
Chris@10 169 T2B = T2z - T2A;
Chris@10 170 T2W = T2s + T2t;
Chris@10 171 T2u = T2s - T2t;
Chris@10 172 T2a = rio[WS(vs, 2) + WS(rs, 3)];
Chris@10 173 T2f = rio[WS(vs, 2) + WS(rs, 4)];
Chris@10 174 T32 = T2V + T2W;
Chris@10 175 T2X = T2V - T2W;
Chris@10 176 T2C = T2r + T2u;
Chris@10 177 T2v = T2r - T2u;
Chris@10 178 T2k = T29 - T2a;
Chris@10 179 T2b = T29 + T2a;
Chris@10 180 T33 = FNMS(KP500000000, T32, T31);
Chris@10 181 T2R = T2B + T2C;
Chris@10 182 T2D = FNMS(KP500000000, T2C, T2B);
Chris@10 183 T2g = rio[WS(vs, 2) + WS(rs, 1)];
Chris@10 184 T2c = rio[WS(vs, 2) + WS(rs, 2)];
Chris@10 185 T2d = rio[WS(vs, 2) + WS(rs, 5)];
Chris@10 186 }
Chris@10 187 }
Chris@10 188 }
Chris@10 189 }
Chris@10 190 {
Chris@10 191 E T4n, T4s, T4o, T4k, T4l;
Chris@10 192 {
Chris@10 193 E T3j, T3o, T3k, T3g, T3h;
Chris@10 194 {
Chris@10 195 E T3d, T2m, T2h, T2l, T2e, T3e, T2n;
Chris@10 196 T3d = rio[WS(vs, 3)];
Chris@10 197 T2m = T2f - T2g;
Chris@10 198 T2h = T2f + T2g;
Chris@10 199 T2l = T2c - T2d;
Chris@10 200 T2e = T2c + T2d;
Chris@10 201 T3e = rio[WS(vs, 3) + WS(rs, 3)];
Chris@10 202 T3j = rio[WS(vs, 3) + WS(rs, 4)];
Chris@10 203 T2n = T2l + T2m;
Chris@10 204 T2E = T2m - T2l;
Chris@10 205 T2i = T2e + T2h;
Chris@10 206 T34 = T2h - T2e;
Chris@10 207 T3o = T3d - T3e;
Chris@10 208 T3f = T3d + T3e;
Chris@10 209 T2o = FNMS(KP500000000, T2n, T2k);
Chris@10 210 T2O = T2k + T2n;
Chris@10 211 T2U = FNMS(KP500000000, T2i, T2b);
Chris@10 212 T3k = rio[WS(vs, 3) + WS(rs, 1)];
Chris@10 213 T3g = rio[WS(vs, 3) + WS(rs, 2)];
Chris@10 214 T3h = rio[WS(vs, 3) + WS(rs, 5)];
Chris@10 215 }
Chris@10 216 {
Chris@10 217 E T3D, T3Z, T3v, T3E, T3w, T3x;
Chris@10 218 {
Chris@10 219 E T3t, T3q, T3l, T3p, T3i, T3u, T3r;
Chris@10 220 T3t = iio[WS(vs, 3) + WS(rs, 2)];
Chris@10 221 T3q = T3j - T3k;
Chris@10 222 T3l = T3j + T3k;
Chris@10 223 T3p = T3g - T3h;
Chris@10 224 T3i = T3g + T3h;
Chris@10 225 T3u = iio[WS(vs, 3) + WS(rs, 5)];
Chris@10 226 T3D = iio[WS(vs, 3)];
Chris@10 227 T3r = T3p + T3q;
Chris@10 228 T3I = T3q - T3p;
Chris@10 229 T3m = T3i + T3l;
Chris@10 230 T48 = T3l - T3i;
Chris@10 231 T3Z = T3t + T3u;
Chris@10 232 T3v = T3t - T3u;
Chris@10 233 T3s = FNMS(KP500000000, T3r, T3o);
Chris@10 234 T3S = T3o + T3r;
Chris@10 235 T3Y = FNMS(KP500000000, T3m, T3f);
Chris@10 236 T3E = iio[WS(vs, 3) + WS(rs, 3)];
Chris@10 237 T3w = iio[WS(vs, 3) + WS(rs, 4)];
Chris@10 238 T3x = iio[WS(vs, 3) + WS(rs, 1)];
Chris@10 239 }
Chris@10 240 {
Chris@10 241 E T4h, T3F, T40, T3y, T4i, T3G;
Chris@10 242 T4h = rio[WS(vs, 4)];
Chris@10 243 T45 = T3D + T3E;
Chris@10 244 T3F = T3D - T3E;
Chris@10 245 T40 = T3w + T3x;
Chris@10 246 T3y = T3w - T3x;
Chris@10 247 T4i = rio[WS(vs, 4) + WS(rs, 3)];
Chris@10 248 T4n = rio[WS(vs, 4) + WS(rs, 4)];
Chris@10 249 T46 = T3Z + T40;
Chris@10 250 T41 = T3Z - T40;
Chris@10 251 T3G = T3v + T3y;
Chris@10 252 T3z = T3v - T3y;
Chris@10 253 T4s = T4h - T4i;
Chris@10 254 T4j = T4h + T4i;
Chris@10 255 T47 = FNMS(KP500000000, T46, T45);
Chris@10 256 T3V = T3F + T3G;
Chris@10 257 T3H = FNMS(KP500000000, T3G, T3F);
Chris@10 258 T4o = rio[WS(vs, 4) + WS(rs, 1)];
Chris@10 259 T4k = rio[WS(vs, 4) + WS(rs, 2)];
Chris@10 260 T4l = rio[WS(vs, 4) + WS(rs, 5)];
Chris@10 261 }
Chris@10 262 }
Chris@10 263 }
Chris@10 264 {
Chris@10 265 E T4H, T53, T4z, T4I, T4A, T4B;
Chris@10 266 {
Chris@10 267 E T4x, T4u, T4p, T4t, T4m, T4y, T4v;
Chris@10 268 T4x = iio[WS(vs, 4) + WS(rs, 2)];
Chris@10 269 T4u = T4n - T4o;
Chris@10 270 T4p = T4n + T4o;
Chris@10 271 T4t = T4k - T4l;
Chris@10 272 T4m = T4k + T4l;
Chris@10 273 T4y = iio[WS(vs, 4) + WS(rs, 5)];
Chris@10 274 T4H = iio[WS(vs, 4)];
Chris@10 275 T4v = T4t + T4u;
Chris@10 276 T4M = T4u - T4t;
Chris@10 277 T4q = T4m + T4p;
Chris@10 278 T5c = T4p - T4m;
Chris@10 279 T53 = T4x + T4y;
Chris@10 280 T4z = T4x - T4y;
Chris@10 281 T4w = FNMS(KP500000000, T4v, T4s);
Chris@10 282 T4W = T4s + T4v;
Chris@10 283 T52 = FNMS(KP500000000, T4q, T4j);
Chris@10 284 T4I = iio[WS(vs, 4) + WS(rs, 3)];
Chris@10 285 T4A = iio[WS(vs, 4) + WS(rs, 4)];
Chris@10 286 T4B = iio[WS(vs, 4) + WS(rs, 1)];
Chris@10 287 }
Chris@10 288 {
Chris@10 289 E T5L, T67, T5D, T5M, T5E, T5F;
Chris@10 290 {
Chris@10 291 E T5B, T4J, T54, T4C, T5C, T4K;
Chris@10 292 T5B = iio[WS(vs, 5) + WS(rs, 2)];
Chris@10 293 T59 = T4H + T4I;
Chris@10 294 T4J = T4H - T4I;
Chris@10 295 T54 = T4A + T4B;
Chris@10 296 T4C = T4A - T4B;
Chris@10 297 T5C = iio[WS(vs, 5) + WS(rs, 5)];
Chris@10 298 T5L = iio[WS(vs, 5)];
Chris@10 299 T5a = T53 + T54;
Chris@10 300 T55 = T53 - T54;
Chris@10 301 T4K = T4z + T4C;
Chris@10 302 T4D = T4z - T4C;
Chris@10 303 T67 = T5B + T5C;
Chris@10 304 T5D = T5B - T5C;
Chris@10 305 T5b = FNMS(KP500000000, T5a, T59);
Chris@10 306 T4Z = T4J + T4K;
Chris@10 307 T4L = FNMS(KP500000000, T4K, T4J);
Chris@10 308 T5M = iio[WS(vs, 5) + WS(rs, 3)];
Chris@10 309 T5E = iio[WS(vs, 5) + WS(rs, 4)];
Chris@10 310 T5F = iio[WS(vs, 5) + WS(rs, 1)];
Chris@10 311 }
Chris@10 312 {
Chris@10 313 E T5l, T5N, T68, T5G, T5m, T5O;
Chris@10 314 T5l = rio[WS(vs, 5)];
Chris@10 315 T6d = T5L + T5M;
Chris@10 316 T5N = T5L - T5M;
Chris@10 317 T68 = T5E + T5F;
Chris@10 318 T5G = T5E - T5F;
Chris@10 319 T5m = rio[WS(vs, 5) + WS(rs, 3)];
Chris@10 320 T5r = rio[WS(vs, 5) + WS(rs, 4)];
Chris@10 321 T6e = T67 + T68;
Chris@10 322 T69 = T67 - T68;
Chris@10 323 T5O = T5D + T5G;
Chris@10 324 T5H = T5D - T5G;
Chris@10 325 T5w = T5l - T5m;
Chris@10 326 T5n = T5l + T5m;
Chris@10 327 T6f = FNMS(KP500000000, T6e, T6d);
Chris@10 328 T63 = T5N + T5O;
Chris@10 329 T5P = FNMS(KP500000000, T5O, T5N);
Chris@10 330 T5s = rio[WS(vs, 5) + WS(rs, 1)];
Chris@10 331 T5o = rio[WS(vs, 5) + WS(rs, 2)];
Chris@10 332 T5p = rio[WS(vs, 5) + WS(rs, 5)];
Chris@10 333 }
Chris@10 334 }
Chris@10 335 }
Chris@10 336 }
Chris@10 337 }
/* Store phase: from here on rio[]/iio[] are only written. The six k = 0
 * sums go to rio/iio[WS(rs, v)] untouched; every other result is first
 * combined with a pair of values reloaded from W (W[2k-2], W[2k-1]). */
Chris@10 338 {
Chris@10 339 E T6a, T6h, T5I, T5R, T65, T6c;
Chris@10 340 {
Chris@10 341 E T5Q, T5u, T6g, T5A, T60, T66;
Chris@10 342 {
Chris@10 343 E T5y, T5t, T5x, T5q, T5z;
Chris@10 344 rio[0] = T3 + Ta;
Chris@10 345 T5y = T5r - T5s;
Chris@10 346 T5t = T5r + T5s;
Chris@10 347 T5x = T5o - T5p;
Chris@10 348 T5q = T5o + T5p;
Chris@10 349 iio[0] = TT + TU;
Chris@10 350 rio[WS(rs, 1)] = T17 + T1e;
Chris@10 351 T5z = T5x + T5y;
Chris@10 352 T5Q = T5y - T5x;
Chris@10 353 T5u = T5q + T5t;
Chris@10 354 T6g = T5t - T5q;
Chris@10 355 T5A = FNMS(KP500000000, T5z, T5w);
Chris@10 356 T60 = T5w + T5z;
Chris@10 357 iio[WS(rs, 1)] = T1X + T1Y;
Chris@10 358 T66 = FNMS(KP500000000, T5u, T5n);
Chris@10 359 rio[WS(rs, 2)] = T2b + T2i;
Chris@10 360 }
Chris@10 361 iio[WS(rs, 2)] = T31 + T32;
Chris@10 362 iio[WS(rs, 4)] = T59 + T5a;
Chris@10 363 rio[WS(rs, 4)] = T4j + T4q;
Chris@10 364 rio[WS(rs, 3)] = T3f + T3m;
Chris@10 365 iio[WS(rs, 3)] = T45 + T46;
Chris@10 366 {
Chris@10 367 E TA, TD, TQ, T10, T13, TX, TZ, T12;
Chris@10 368 rio[WS(rs, 5)] = T5n + T5u;
Chris@10 369 iio[WS(rs, 5)] = T6d + T6e;
Chris@10 370 {
Chris@10 371 E To, Tx, Tb, Tq;
Chris@10 372 TA = FNMS(KP866025403, Tn, Tg);
Chris@10 373 To = FMA(KP866025403, Tn, Tg);
Chris@10 374 Tx = FMA(KP866025403, Tw, Tv);
Chris@10 375 TD = FNMS(KP866025403, Tw, Tv);
Chris@10 376 Tb = W[0];
Chris@10 377 Tq = W[1];
Chris@10 378 {
Chris@10 379 E TI, TK, TH, Ty, Tp, TF;
Chris@10 380 Ty = Tb * Tx;
Chris@10 381 Tp = Tb * To;
Chris@10 382 TF = W[4];
Chris@10 383 TI = W[5];
Chris@10 384 iio[WS(vs, 1)] = FNMS(Tq, To, Ty);
Chris@10 385 rio[WS(vs, 1)] = FMA(Tq, Tx, Tp);
Chris@10 386 TK = TF * TJ;
Chris@10 387 TH = TF * TG;
Chris@10 388 TQ = FNMS(KP866025403, TP, TM);
Chris@10 389 T10 = FMA(KP866025403, TP, TM);
Chris@10 390 T13 = FMA(KP866025403, TW, TV);
Chris@10 391 TX = FNMS(KP866025403, TW, TV);
Chris@10 392 iio[WS(vs, 3)] = FNMS(TI, TG, TK);
Chris@10 393 rio[WS(vs, 3)] = FMA(TI, TJ, TH);
Chris@10 394 TZ = W[6];
Chris@10 395 T12 = W[7];
Chris@10 396 }
Chris@10 397 }
Chris@10 398 {
Chris@10 399 E TC, TE, TB, TL, TS;
Chris@10 400 {
Chris@10 401 E T62, T64, T61, T14, T11, T5Z;
Chris@10 402 T14 = TZ * T13;
Chris@10 403 T11 = TZ * T10;
Chris@10 404 T5Z = W[4];
Chris@10 405 T62 = W[5];
Chris@10 406 iio[WS(vs, 4)] = FNMS(T12, T10, T14);
Chris@10 407 rio[WS(vs, 4)] = FMA(T12, T13, T11);
Chris@10 408 T64 = T5Z * T63;
Chris@10 409 T61 = T5Z * T60;
Chris@10 410 {
Chris@10 411 E T6k, T6n, T6j, T6m, T6o, T6l, Tz;
Chris@10 412 T6a = FNMS(KP866025403, T69, T66);
Chris@10 413 T6k = FMA(KP866025403, T69, T66);
Chris@10 414 T6n = FMA(KP866025403, T6g, T6f);
Chris@10 415 T6h = FNMS(KP866025403, T6g, T6f);
Chris@10 416 iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T62, T60, T64);
Chris@10 417 rio[WS(vs, 3) + WS(rs, 5)] = FMA(T62, T63, T61);
Chris@10 418 T6j = W[6];
Chris@10 419 T6m = W[7];
Chris@10 420 T6o = T6j * T6n;
Chris@10 421 T6l = T6j * T6k;
Chris@10 422 Tz = W[8];
Chris@10 423 TC = W[9];
Chris@10 424 iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T6m, T6k, T6o);
Chris@10 425 rio[WS(vs, 4) + WS(rs, 5)] = FMA(T6m, T6n, T6l);
Chris@10 426 TE = Tz * TD;
Chris@10 427 TB = Tz * TA;
Chris@10 428 }
Chris@10 429 }
Chris@10 430 iio[WS(vs, 5)] = FNMS(TC, TA, TE);
Chris@10 431 rio[WS(vs, 5)] = FMA(TC, TD, TB);
Chris@10 432 TL = W[2];
Chris@10 433 TS = W[3];
Chris@10 434 {
Chris@10 435 E T5U, T5X, T5W, T5Y, T5V, TY, TR, T5T;
Chris@10 436 T5I = FMA(KP866025403, T5H, T5A);
Chris@10 437 T5U = FNMS(KP866025403, T5H, T5A);
Chris@10 438 T5X = FNMS(KP866025403, T5Q, T5P);
Chris@10 439 T5R = FMA(KP866025403, T5Q, T5P);
Chris@10 440 TY = TL * TX;
Chris@10 441 TR = TL * TQ;
Chris@10 442 T5T = W[8];
Chris@10 443 T5W = W[9];
Chris@10 444 iio[WS(vs, 2)] = FNMS(TS, TQ, TY);
Chris@10 445 rio[WS(vs, 2)] = FMA(TS, TX, TR);
Chris@10 446 T5Y = T5T * T5X;
Chris@10 447 T5V = T5T * T5U;
Chris@10 448 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T5W, T5U, T5Y);
Chris@10 449 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T5W, T5X, T5V);
Chris@10 450 T65 = W[2];
Chris@10 451 T6c = W[3];
Chris@10 452 }
Chris@10 453 }
Chris@10 454 }
Chris@10 455 }
Chris@10 456 {
Chris@10 457 E T5g, T5j, T5f, T5i;
Chris@10 458 {
Chris@10 459 E T1E, T1H, T3M, T3P, T56, T5d, T58, T5e, T57;
Chris@10 460 {
Chris@10 461 E T1s, T1B, T1f, T1u;
Chris@10 462 {
Chris@10 463 E T5K, T5S, T5J, T6i, T6b, T5v;
Chris@10 464 T6i = T65 * T6h;
Chris@10 465 T6b = T65 * T6a;
Chris@10 466 T5v = W[0];
Chris@10 467 T5K = W[1];
Chris@10 468 iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T6c, T6a, T6i);
Chris@10 469 rio[WS(vs, 2) + WS(rs, 5)] = FMA(T6c, T6h, T6b);
Chris@10 470 T5S = T5v * T5R;
Chris@10 471 T5J = T5v * T5I;
Chris@10 472 T1E = FNMS(KP866025403, T1r, T1k);
Chris@10 473 T1s = FMA(KP866025403, T1r, T1k);
Chris@10 474 T1B = FMA(KP866025403, T1A, T1z);
Chris@10 475 T1H = FNMS(KP866025403, T1A, T1z);
Chris@10 476 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T5K, T5I, T5S);
Chris@10 477 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T5K, T5R, T5J);
Chris@10 478 T1f = W[0];
Chris@10 479 T1u = W[1];
Chris@10 480 }
Chris@10 481 {
Chris@10 482 E T3U, T3W, T3T, T1C, T1t, T3R;
Chris@10 483 T1C = T1f * T1B;
Chris@10 484 T1t = T1f * T1s;
Chris@10 485 T3R = W[4];
Chris@10 486 T3U = W[5];
Chris@10 487 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1u, T1s, T1C);
Chris@10 488 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1u, T1B, T1t);
Chris@10 489 T3W = T3R * T3V;
Chris@10 490 T3T = T3R * T3S;
Chris@10 491 {
Chris@10 492 E T3A, T3J, T3n, T3C, T3K, T3B, T51;
Chris@10 493 T3M = FNMS(KP866025403, T3z, T3s);
Chris@10 494 T3A = FMA(KP866025403, T3z, T3s);
Chris@10 495 T3J = FMA(KP866025403, T3I, T3H);
Chris@10 496 T3P = FNMS(KP866025403, T3I, T3H);
Chris@10 497 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3U, T3S, T3W);
Chris@10 498 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3U, T3V, T3T);
Chris@10 499 T3n = W[0];
Chris@10 500 T3C = W[1];
Chris@10 501 T5g = FMA(KP866025403, T55, T52);
Chris@10 502 T56 = FNMS(KP866025403, T55, T52);
Chris@10 503 T5d = FNMS(KP866025403, T5c, T5b);
Chris@10 504 T5j = FMA(KP866025403, T5c, T5b);
Chris@10 505 T3K = T3n * T3J;
Chris@10 506 T3B = T3n * T3A;
Chris@10 507 T51 = W[2];
Chris@10 508 T58 = W[3];
Chris@10 509 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T3C, T3A, T3K);
Chris@10 510 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T3C, T3J, T3B);
Chris@10 511 T5e = T51 * T5d;
Chris@10 512 T57 = T51 * T56;
Chris@10 513 }
Chris@10 514 }
Chris@10 515 }
Chris@10 516 {
Chris@10 517 E T38, T3b, T3O, T3Q, T3N, T37, T3a;
Chris@10 518 {
Chris@10 519 E T2Y, T35, T2T, T30, T36, T2Z, T3L;
Chris@10 520 T38 = FMA(KP866025403, T2X, T2U);
Chris@10 521 T2Y = FNMS(KP866025403, T2X, T2U);
Chris@10 522 T35 = FNMS(KP866025403, T34, T33);
Chris@10 523 T3b = FMA(KP866025403, T34, T33);
Chris@10 524 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T58, T56, T5e);
Chris@10 525 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T58, T5d, T57);
Chris@10 526 T2T = W[2];
Chris@10 527 T30 = W[3];
Chris@10 528 T36 = T2T * T35;
Chris@10 529 T2Z = T2T * T2Y;
Chris@10 530 T3L = W[8];
Chris@10 531 T3O = W[9];
Chris@10 532 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T30, T2Y, T36);
Chris@10 533 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T30, T35, T2Z);
Chris@10 534 T3Q = T3L * T3P;
Chris@10 535 T3N = T3L * T3M;
Chris@10 536 }
Chris@10 537 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3O, T3M, T3Q);
Chris@10 538 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3O, T3P, T3N);
Chris@10 539 T37 = W[6];
Chris@10 540 T3a = W[7];
Chris@10 541 {
Chris@10 542 E T1G, T1I, T1F, T3c, T39, T1D;
Chris@10 543 T3c = T37 * T3b;
Chris@10 544 T39 = T37 * T38;
Chris@10 545 T1D = W[8];
Chris@10 546 T1G = W[9];
Chris@10 547 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T3a, T38, T3c);
Chris@10 548 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T3a, T3b, T39);
Chris@10 549 T1I = T1D * T1H;
Chris@10 550 T1F = T1D * T1E;
Chris@10 551 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1G, T1E, T1I);
Chris@10 552 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1G, T1H, T1F);
Chris@10 553 T5f = W[6];
Chris@10 554 T5i = W[7];
Chris@10 555 }
Chris@10 556 }
Chris@10 557 }
Chris@10 558 {
Chris@10 559 E T4Q, T4T, T2I, T2w, T2F, T2L, T2y, T2G, T2x, T4V, T4Y;
Chris@10 560 {
Chris@10 561 E T1M, T1O, T1L, T5k, T5h, T1J;
Chris@10 562 T5k = T5f * T5j;
Chris@10 563 T5h = T5f * T5g;
Chris@10 564 T1J = W[4];
Chris@10 565 T1M = W[5];
Chris@10 566 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T5i, T5g, T5k);
Chris@10 567 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T5i, T5j, T5h);
Chris@10 568 T1O = T1J * T1N;
Chris@10 569 T1L = T1J * T1K;
Chris@10 570 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
Chris@10 571 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
Chris@10 572 T4V = W[4];
Chris@10 573 T4Y = W[5];
Chris@10 574 }
Chris@10 575 {
Chris@10 576 E T4E, T4N, T4G, T4O, T4F, T50, T4X, T4r;
Chris@10 577 T4Q = FNMS(KP866025403, T4D, T4w);
Chris@10 578 T4E = FMA(KP866025403, T4D, T4w);
Chris@10 579 T4N = FMA(KP866025403, T4M, T4L);
Chris@10 580 T4T = FNMS(KP866025403, T4M, T4L);
Chris@10 581 T50 = T4V * T4Z;
Chris@10 582 T4X = T4V * T4W;
Chris@10 583 T4r = W[0];
Chris@10 584 T4G = W[1];
Chris@10 585 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4Y, T4W, T50);
Chris@10 586 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4Y, T4Z, T4X);
Chris@10 587 T4O = T4r * T4N;
Chris@10 588 T4F = T4r * T4E;
Chris@10 589 {
Chris@10 590 E T2N, T2Q, T2S, T2P, T2j;
Chris@10 591 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T4G, T4E, T4O);
Chris@10 592 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T4G, T4N, T4F);
Chris@10 593 T2N = W[4];
Chris@10 594 T2Q = W[5];
Chris@10 595 T2I = FNMS(KP866025403, T2v, T2o);
Chris@10 596 T2w = FMA(KP866025403, T2v, T2o);
Chris@10 597 T2F = FMA(KP866025403, T2E, T2D);
Chris@10 598 T2L = FNMS(KP866025403, T2E, T2D);
Chris@10 599 T2S = T2N * T2R;
Chris@10 600 T2P = T2N * T2O;
Chris@10 601 T2j = W[0];
Chris@10 602 T2y = W[1];
Chris@10 603 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2Q, T2O, T2S);
Chris@10 604 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2Q, T2R, T2P);
Chris@10 605 T2G = T2j * T2F;
Chris@10 606 T2x = T2j * T2w;
Chris@10 607 }
Chris@10 608 }
Chris@10 609 {
Chris@10 610 E T1U, T21, T2H, T2K;
Chris@10 611 {
Chris@10 612 E T24, T27, T23, T26;
Chris@10 613 T1U = FNMS(KP866025403, T1T, T1Q);
Chris@10 614 T24 = FMA(KP866025403, T1T, T1Q);
Chris@10 615 T27 = FMA(KP866025403, T20, T1Z);
Chris@10 616 T21 = FNMS(KP866025403, T20, T1Z);
Chris@10 617 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2y, T2w, T2G);
Chris@10 618 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2y, T2F, T2x);
Chris@10 619 T23 = W[6];
Chris@10 620 T26 = W[7];
Chris@10 621 {
Chris@10 622 E T42, T49, T44, T4a, T43, T28, T25, T3X;
Chris@10 623 T4c = FMA(KP866025403, T41, T3Y);
Chris@10 624 T42 = FNMS(KP866025403, T41, T3Y);
Chris@10 625 T49 = FNMS(KP866025403, T48, T47);
Chris@10 626 T4f = FMA(KP866025403, T48, T47);
Chris@10 627 T28 = T23 * T27;
Chris@10 628 T25 = T23 * T24;
Chris@10 629 T3X = W[2];
Chris@10 630 T44 = W[3];
Chris@10 631 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T26, T24, T28);
Chris@10 632 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T26, T27, T25);
Chris@10 633 T4a = T3X * T49;
Chris@10 634 T43 = T3X * T42;
Chris@10 635 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T44, T42, T4a);
Chris@10 636 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T44, T49, T43);
Chris@10 637 T2H = W[8];
Chris@10 638 T2K = W[9];
Chris@10 639 }
Chris@10 640 }
Chris@10 641 {
Chris@10 642 E T4S, T4U, T4R, T2M, T2J, T4P;
Chris@10 643 T2M = T2H * T2L;
Chris@10 644 T2J = T2H * T2I;
Chris@10 645 T4P = W[8];
Chris@10 646 T4S = W[9];
Chris@10 647 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2K, T2I, T2M);
Chris@10 648 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2K, T2L, T2J);
Chris@10 649 T4U = T4P * T4T;
Chris@10 650 T4R = T4P * T4Q;
Chris@10 651 {
Chris@10 652 E T1P, T1W, T22, T1V, T4b;
Chris@10 653 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4S, T4Q, T4U);
Chris@10 654 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T4S, T4T, T4R);
Chris@10 655 T1P = W[2];
Chris@10 656 T1W = W[3];
Chris@10 657 T22 = T1P * T21;
Chris@10 658 T1V = T1P * T1U;
Chris@10 659 T4b = W[6];
Chris@10 660 T4e = W[7];
Chris@10 661 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1W, T1U, T22);
Chris@10 662 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1W, T21, T1V);
Chris@10 663 T4g = T4b * T4f;
Chris@10 664 T4d = T4b * T4c;
Chris@10 665 }
Chris@10 666 }
Chris@10 667 }
Chris@10 668 }
Chris@10 669 }
Chris@10 670 }
Chris@10 671 }
/* Final output pair (row 3, k = 4), built from the loop-body-scope
 * temporaries that were filled in inside the nested blocks above. */
Chris@10 672 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T4e, T4c, T4g);
Chris@10 673 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T4e, T4f, T4d);
Chris@10 674 }
Chris@10 675 }
Chris@10 676 }
Chris@10 677
/* Twiddle instruction list that `desc` below points at. The q1_6 body in
 * this file reads W[0]..W[9] on every loop pass and then advances W by 10.
 * NOTE(review): the meaning of the TW_FULL / TW_NEXT opcodes and of their
 * two numeric fields is defined with tw_instr elsewhere in FFTW; presumably
 * the 6 matches the transform size (-n 6) -- confirm against that header. */
Chris@10 678 static const tw_instr twinstr[] = {
Chris@10 679 {TW_FULL, 0, 6},
Chris@10 680 {TW_NEXT, 1, 0}
Chris@10 681 };
Chris@10 682
/* Descriptor passed to the planner together with q1_6. The 6 and "q1_6"
 * match the generator options -n 6 and -name q1_6; {144, 60, 132, 0} repeats
 * the add / multiply / fused-multiply-add counts quoted in the header
 * comment of this HAVE_FMA variant. NOTE(review): the fourth count and the
 * three trailing zero fields are defined by ct_desc, which is not visible
 * in this file -- confirm their meaning there. */
Chris@10 683 static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {144, 60, 132, 0}, 0, 0, 0 };
Chris@10 684
/* Registration entry point for this codelet: hands the q1_6 kernel and its
 * descriptor `desc` to planner p through X(kdft_difsq_register). It does
 * nothing else and returns no value. */
Chris@10 685 void X(codelet_q1_6) (planner *p) {
Chris@10 686 X(kdft_difsq_register) (p, q1_6, &desc);
Chris@10 687 }
Chris@10 688 #else /* HAVE_FMA */
Chris@10 689
Chris@10 690 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 6 -name q1_6 -include q.h */
Chris@10 691
Chris@10 692 /*
Chris@10 693 * This function contains 276 FP additions, 168 FP multiplications,
Chris@10 694 * (or, 192 additions, 84 multiplications, 84 fused multiply/add),
Chris@10 695 * 85 stack variables, 2 constants, and 144 memory accesses
Chris@10 696 */
Chris@10 697 #include "q.h"
Chris@10 698
Chris@10 699 static void q1_6(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 700 {
Chris@10 701 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@10 702 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@10 703 {
Chris@10 704 INT m;
Chris@10 705 for (m = mb, W = W + (mb * 10); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@10 706 E T3, Tc, Tt, TM, TX, T16, T1n, T1G, T2h, T2A, T1R, T20, T2L, T2U, T3b;
Chris@10 707 E T3u, T3F, T3O, T45, T4o, T4Z, T5i, T4z, T4I, Ta, TP, Tf, Tq, Tn, TN;
Chris@10 708 E Tu, TJ, T14, T1J, T19, T1k, T1h, T1H, T1o, T1D, T2b, T2B, T2i, T2x, T1Y;
Chris@10 709 E T2D, T23, T2e, T2S, T3x, T2X, T38, T35, T3v, T3c, T3r, T3M, T4r, T3R, T42;
Chris@10 710 E T3Z, T4p, T46, T4l, T4T, T5j, T50, T5f, T4G, T5l, T4L, T4W;
Chris@10 711 {
Chris@10 712 E T1, T2, T1l, T1m;
Chris@10 713 T1 = rio[0];
Chris@10 714 T2 = rio[WS(rs, 3)];
Chris@10 715 T3 = T1 + T2;
Chris@10 716 Tc = T1 - T2;
Chris@10 717 {
Chris@10 718 E Tr, Ts, TV, TW;
Chris@10 719 Tr = iio[0];
Chris@10 720 Ts = iio[WS(rs, 3)];
Chris@10 721 Tt = Tr - Ts;
Chris@10 722 TM = Tr + Ts;
Chris@10 723 TV = rio[WS(vs, 1)];
Chris@10 724 TW = rio[WS(vs, 1) + WS(rs, 3)];
Chris@10 725 TX = TV + TW;
Chris@10 726 T16 = TV - TW;
Chris@10 727 }
Chris@10 728 T1l = iio[WS(vs, 1)];
Chris@10 729 T1m = iio[WS(vs, 1) + WS(rs, 3)];
Chris@10 730 T1n = T1l - T1m;
Chris@10 731 T1G = T1l + T1m;
Chris@10 732 {
Chris@10 733 E T2f, T2g, T1P, T1Q;
Chris@10 734 T2f = iio[WS(vs, 2)];
Chris@10 735 T2g = iio[WS(vs, 2) + WS(rs, 3)];
Chris@10 736 T2h = T2f - T2g;
Chris@10 737 T2A = T2f + T2g;
Chris@10 738 T1P = rio[WS(vs, 2)];
Chris@10 739 T1Q = rio[WS(vs, 2) + WS(rs, 3)];
Chris@10 740 T1R = T1P + T1Q;
Chris@10 741 T20 = T1P - T1Q;
Chris@10 742 }
Chris@10 743 }
Chris@10 744 {
Chris@10 745 E T2J, T2K, T43, T44;
Chris@10 746 T2J = rio[WS(vs, 3)];
Chris@10 747 T2K = rio[WS(vs, 3) + WS(rs, 3)];
Chris@10 748 T2L = T2J + T2K;
Chris@10 749 T2U = T2J - T2K;
Chris@10 750 {
Chris@10 751 E T39, T3a, T3D, T3E;
Chris@10 752 T39 = iio[WS(vs, 3)];
Chris@10 753 T3a = iio[WS(vs, 3) + WS(rs, 3)];
Chris@10 754 T3b = T39 - T3a;
Chris@10 755 T3u = T39 + T3a;
Chris@10 756 T3D = rio[WS(vs, 4)];
Chris@10 757 T3E = rio[WS(vs, 4) + WS(rs, 3)];
Chris@10 758 T3F = T3D + T3E;
Chris@10 759 T3O = T3D - T3E;
Chris@10 760 }
Chris@10 761 T43 = iio[WS(vs, 4)];
Chris@10 762 T44 = iio[WS(vs, 4) + WS(rs, 3)];
Chris@10 763 T45 = T43 - T44;
Chris@10 764 T4o = T43 + T44;
Chris@10 765 {
Chris@10 766 E T4X, T4Y, T4x, T4y;
Chris@10 767 T4X = iio[WS(vs, 5)];
Chris@10 768 T4Y = iio[WS(vs, 5) + WS(rs, 3)];
Chris@10 769 T4Z = T4X - T4Y;
Chris@10 770 T5i = T4X + T4Y;
Chris@10 771 T4x = rio[WS(vs, 5)];
Chris@10 772 T4y = rio[WS(vs, 5) + WS(rs, 3)];
Chris@10 773 T4z = T4x + T4y;
Chris@10 774 T4I = T4x - T4y;
Chris@10 775 }
Chris@10 776 }
Chris@10 777 {
Chris@10 778 E T6, Td, T9, Te;
Chris@10 779 {
Chris@10 780 E T4, T5, T7, T8;
Chris@10 781 T4 = rio[WS(rs, 2)];
Chris@10 782 T5 = rio[WS(rs, 5)];
Chris@10 783 T6 = T4 + T5;
Chris@10 784 Td = T4 - T5;
Chris@10 785 T7 = rio[WS(rs, 4)];
Chris@10 786 T8 = rio[WS(rs, 1)];
Chris@10 787 T9 = T7 + T8;
Chris@10 788 Te = T7 - T8;
Chris@10 789 }
Chris@10 790 Ta = T6 + T9;
Chris@10 791 TP = KP866025403 * (T9 - T6);
Chris@10 792 Tf = Td + Te;
Chris@10 793 Tq = KP866025403 * (Te - Td);
Chris@10 794 }
Chris@10 795 {
Chris@10 796 E Tj, TH, Tm, TI;
Chris@10 797 {
Chris@10 798 E Th, Ti, Tk, Tl;
Chris@10 799 Th = iio[WS(rs, 2)];
Chris@10 800 Ti = iio[WS(rs, 5)];
Chris@10 801 Tj = Th - Ti;
Chris@10 802 TH = Th + Ti;
Chris@10 803 Tk = iio[WS(rs, 4)];
Chris@10 804 Tl = iio[WS(rs, 1)];
Chris@10 805 Tm = Tk - Tl;
Chris@10 806 TI = Tk + Tl;
Chris@10 807 }
Chris@10 808 Tn = KP866025403 * (Tj - Tm);
Chris@10 809 TN = TH + TI;
Chris@10 810 Tu = Tj + Tm;
Chris@10 811 TJ = KP866025403 * (TH - TI);
Chris@10 812 }
Chris@10 813 {
Chris@10 814 E T10, T17, T13, T18;
Chris@10 815 {
Chris@10 816 E TY, TZ, T11, T12;
Chris@10 817 TY = rio[WS(vs, 1) + WS(rs, 2)];
Chris@10 818 TZ = rio[WS(vs, 1) + WS(rs, 5)];
Chris@10 819 T10 = TY + TZ;
Chris@10 820 T17 = TY - TZ;
Chris@10 821 T11 = rio[WS(vs, 1) + WS(rs, 4)];
Chris@10 822 T12 = rio[WS(vs, 1) + WS(rs, 1)];
Chris@10 823 T13 = T11 + T12;
Chris@10 824 T18 = T11 - T12;
Chris@10 825 }
Chris@10 826 T14 = T10 + T13;
Chris@10 827 T1J = KP866025403 * (T13 - T10);
Chris@10 828 T19 = T17 + T18;
Chris@10 829 T1k = KP866025403 * (T18 - T17);
Chris@10 830 }
Chris@10 831 {
Chris@10 832 E T1d, T1B, T1g, T1C;
Chris@10 833 {
Chris@10 834 E T1b, T1c, T1e, T1f;
Chris@10 835 T1b = iio[WS(vs, 1) + WS(rs, 2)];
Chris@10 836 T1c = iio[WS(vs, 1) + WS(rs, 5)];
Chris@10 837 T1d = T1b - T1c;
Chris@10 838 T1B = T1b + T1c;
Chris@10 839 T1e = iio[WS(vs, 1) + WS(rs, 4)];
Chris@10 840 T1f = iio[WS(vs, 1) + WS(rs, 1)];
Chris@10 841 T1g = T1e - T1f;
Chris@10 842 T1C = T1e + T1f;
Chris@10 843 }
Chris@10 844 T1h = KP866025403 * (T1d - T1g);
Chris@10 845 T1H = T1B + T1C;
Chris@10 846 T1o = T1d + T1g;
Chris@10 847 T1D = KP866025403 * (T1B - T1C);
Chris@10 848 }
Chris@10 849 {
Chris@10 850 E T27, T2v, T2a, T2w;
Chris@10 851 {
Chris@10 852 E T25, T26, T28, T29;
Chris@10 853 T25 = iio[WS(vs, 2) + WS(rs, 2)];
Chris@10 854 T26 = iio[WS(vs, 2) + WS(rs, 5)];
Chris@10 855 T27 = T25 - T26;
Chris@10 856 T2v = T25 + T26;
Chris@10 857 T28 = iio[WS(vs, 2) + WS(rs, 4)];
Chris@10 858 T29 = iio[WS(vs, 2) + WS(rs, 1)];
Chris@10 859 T2a = T28 - T29;
Chris@10 860 T2w = T28 + T29;
Chris@10 861 }
Chris@10 862 T2b = KP866025403 * (T27 - T2a);
Chris@10 863 T2B = T2v + T2w;
Chris@10 864 T2i = T27 + T2a;
Chris@10 865 T2x = KP866025403 * (T2v - T2w);
Chris@10 866 }
Chris@10 867 {
Chris@10 868 E T1U, T21, T1X, T22;
Chris@10 869 {
Chris@10 870 E T1S, T1T, T1V, T1W;
Chris@10 871 T1S = rio[WS(vs, 2) + WS(rs, 2)];
Chris@10 872 T1T = rio[WS(vs, 2) + WS(rs, 5)];
Chris@10 873 T1U = T1S + T1T;
Chris@10 874 T21 = T1S - T1T;
Chris@10 875 T1V = rio[WS(vs, 2) + WS(rs, 4)];
Chris@10 876 T1W = rio[WS(vs, 2) + WS(rs, 1)];
Chris@10 877 T1X = T1V + T1W;
Chris@10 878 T22 = T1V - T1W;
Chris@10 879 }
Chris@10 880 T1Y = T1U + T1X;
Chris@10 881 T2D = KP866025403 * (T1X - T1U);
Chris@10 882 T23 = T21 + T22;
Chris@10 883 T2e = KP866025403 * (T22 - T21);
Chris@10 884 }
Chris@10 885 {
Chris@10 886 E T2O, T2V, T2R, T2W;
Chris@10 887 {
Chris@10 888 E T2M, T2N, T2P, T2Q;
Chris@10 889 T2M = rio[WS(vs, 3) + WS(rs, 2)];
Chris@10 890 T2N = rio[WS(vs, 3) + WS(rs, 5)];
Chris@10 891 T2O = T2M + T2N;
Chris@10 892 T2V = T2M - T2N;
Chris@10 893 T2P = rio[WS(vs, 3) + WS(rs, 4)];
Chris@10 894 T2Q = rio[WS(vs, 3) + WS(rs, 1)];
Chris@10 895 T2R = T2P + T2Q;
Chris@10 896 T2W = T2P - T2Q;
Chris@10 897 }
Chris@10 898 T2S = T2O + T2R;
Chris@10 899 T3x = KP866025403 * (T2R - T2O);
Chris@10 900 T2X = T2V + T2W;
Chris@10 901 T38 = KP866025403 * (T2W - T2V);
Chris@10 902 }
Chris@10 903 {
Chris@10 904 E T31, T3p, T34, T3q;
Chris@10 905 {
Chris@10 906 E T2Z, T30, T32, T33;
Chris@10 907 T2Z = iio[WS(vs, 3) + WS(rs, 2)];
Chris@10 908 T30 = iio[WS(vs, 3) + WS(rs, 5)];
Chris@10 909 T31 = T2Z - T30;
Chris@10 910 T3p = T2Z + T30;
Chris@10 911 T32 = iio[WS(vs, 3) + WS(rs, 4)];
Chris@10 912 T33 = iio[WS(vs, 3) + WS(rs, 1)];
Chris@10 913 T34 = T32 - T33;
Chris@10 914 T3q = T32 + T33;
Chris@10 915 }
Chris@10 916 T35 = KP866025403 * (T31 - T34);
Chris@10 917 T3v = T3p + T3q;
Chris@10 918 T3c = T31 + T34;
Chris@10 919 T3r = KP866025403 * (T3p - T3q);
Chris@10 920 }
Chris@10 921 {
Chris@10 922 E T3I, T3P, T3L, T3Q;
Chris@10 923 {
Chris@10 924 E T3G, T3H, T3J, T3K;
Chris@10 925 T3G = rio[WS(vs, 4) + WS(rs, 2)];
Chris@10 926 T3H = rio[WS(vs, 4) + WS(rs, 5)];
Chris@10 927 T3I = T3G + T3H;
Chris@10 928 T3P = T3G - T3H;
Chris@10 929 T3J = rio[WS(vs, 4) + WS(rs, 4)];
Chris@10 930 T3K = rio[WS(vs, 4) + WS(rs, 1)];
Chris@10 931 T3L = T3J + T3K;
Chris@10 932 T3Q = T3J - T3K;
Chris@10 933 }
Chris@10 934 T3M = T3I + T3L;
Chris@10 935 T4r = KP866025403 * (T3L - T3I);
Chris@10 936 T3R = T3P + T3Q;
Chris@10 937 T42 = KP866025403 * (T3Q - T3P);
Chris@10 938 }
Chris@10 939 {
Chris@10 940 E T3V, T4j, T3Y, T4k;
Chris@10 941 {
Chris@10 942 E T3T, T3U, T3W, T3X;
Chris@10 943 T3T = iio[WS(vs, 4) + WS(rs, 2)];
Chris@10 944 T3U = iio[WS(vs, 4) + WS(rs, 5)];
Chris@10 945 T3V = T3T - T3U;
Chris@10 946 T4j = T3T + T3U;
Chris@10 947 T3W = iio[WS(vs, 4) + WS(rs, 4)];
Chris@10 948 T3X = iio[WS(vs, 4) + WS(rs, 1)];
Chris@10 949 T3Y = T3W - T3X;
Chris@10 950 T4k = T3W + T3X;
Chris@10 951 }
Chris@10 952 T3Z = KP866025403 * (T3V - T3Y);
Chris@10 953 T4p = T4j + T4k;
Chris@10 954 T46 = T3V + T3Y;
Chris@10 955 T4l = KP866025403 * (T4j - T4k);
Chris@10 956 }
Chris@10 957 {
Chris@10 958 E T4P, T5d, T4S, T5e;
Chris@10 959 {
Chris@10 960 E T4N, T4O, T4Q, T4R;
Chris@10 961 T4N = iio[WS(vs, 5) + WS(rs, 2)];
Chris@10 962 T4O = iio[WS(vs, 5) + WS(rs, 5)];
Chris@10 963 T4P = T4N - T4O;
Chris@10 964 T5d = T4N + T4O;
Chris@10 965 T4Q = iio[WS(vs, 5) + WS(rs, 4)];
Chris@10 966 T4R = iio[WS(vs, 5) + WS(rs, 1)];
Chris@10 967 T4S = T4Q - T4R;
Chris@10 968 T5e = T4Q + T4R;
Chris@10 969 }
Chris@10 970 T4T = KP866025403 * (T4P - T4S);
Chris@10 971 T5j = T5d + T5e;
Chris@10 972 T50 = T4P + T4S;
Chris@10 973 T5f = KP866025403 * (T5d - T5e);
Chris@10 974 }
Chris@10 975 {
Chris@10 976 E T4C, T4J, T4F, T4K;
Chris@10 977 {
Chris@10 978 E T4A, T4B, T4D, T4E;
Chris@10 979 T4A = rio[WS(vs, 5) + WS(rs, 2)];
Chris@10 980 T4B = rio[WS(vs, 5) + WS(rs, 5)];
Chris@10 981 T4C = T4A + T4B;
Chris@10 982 T4J = T4A - T4B;
Chris@10 983 T4D = rio[WS(vs, 5) + WS(rs, 4)];
Chris@10 984 T4E = rio[WS(vs, 5) + WS(rs, 1)];
Chris@10 985 T4F = T4D + T4E;
Chris@10 986 T4K = T4D - T4E;
Chris@10 987 }
Chris@10 988 T4G = T4C + T4F;
Chris@10 989 T5l = KP866025403 * (T4F - T4C);
Chris@10 990 T4L = T4J + T4K;
Chris@10 991 T4W = KP866025403 * (T4K - T4J);
Chris@10 992 }
Chris@10 993 rio[0] = T3 + Ta;
Chris@10 994 iio[0] = TM + TN;
Chris@10 995 rio[WS(rs, 1)] = TX + T14;
Chris@10 996 iio[WS(rs, 1)] = T1G + T1H;
Chris@10 997 rio[WS(rs, 3)] = T2L + T2S;
Chris@10 998 rio[WS(rs, 2)] = T1R + T1Y;
Chris@10 999 iio[WS(rs, 2)] = T2A + T2B;
Chris@10 1000 iio[WS(rs, 3)] = T3u + T3v;
Chris@10 1001 iio[WS(rs, 4)] = T4o + T4p;
Chris@10 1002 iio[WS(rs, 5)] = T5i + T5j;
Chris@10 1003 rio[WS(rs, 5)] = T4z + T4G;
Chris@10 1004 rio[WS(rs, 4)] = T3F + T3M;
Chris@10 1005 {
Chris@10 1006 E T1w, T1y, T1v, T1x;
Chris@10 1007 T1w = T16 + T19;
Chris@10 1008 T1y = T1n + T1o;
Chris@10 1009 T1v = W[4];
Chris@10 1010 T1x = W[5];
Chris@10 1011 rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
Chris@10 1012 iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
Chris@10 1013 }
Chris@10 1014 {
Chris@10 1015 E T58, T5a, T57, T59;
Chris@10 1016 T58 = T4I + T4L;
Chris@10 1017 T5a = T4Z + T50;
Chris@10 1018 T57 = W[4];
Chris@10 1019 T59 = W[5];
Chris@10 1020 rio[WS(vs, 3) + WS(rs, 5)] = FMA(T57, T58, T59 * T5a);
Chris@10 1021 iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T59, T58, T57 * T5a);
Chris@10 1022 }
Chris@10 1023 {
Chris@10 1024 E TC, TE, TB, TD;
Chris@10 1025 TC = Tc + Tf;
Chris@10 1026 TE = Tt + Tu;
Chris@10 1027 TB = W[4];
Chris@10 1028 TD = W[5];
Chris@10 1029 rio[WS(vs, 3)] = FMA(TB, TC, TD * TE);
Chris@10 1030 iio[WS(vs, 3)] = FNMS(TD, TC, TB * TE);
Chris@10 1031 }
Chris@10 1032 {
Chris@10 1033 E T4e, T4g, T4d, T4f;
Chris@10 1034 T4e = T3O + T3R;
Chris@10 1035 T4g = T45 + T46;
Chris@10 1036 T4d = W[4];
Chris@10 1037 T4f = W[5];
Chris@10 1038 rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4d, T4e, T4f * T4g);
Chris@10 1039 iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4f, T4e, T4d * T4g);
Chris@10 1040 }
Chris@10 1041 {
Chris@10 1042 E T3k, T3m, T3j, T3l;
Chris@10 1043 T3k = T2U + T2X;
Chris@10 1044 T3m = T3b + T3c;
Chris@10 1045 T3j = W[4];
Chris@10 1046 T3l = W[5];
Chris@10 1047 rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3j, T3k, T3l * T3m);
Chris@10 1048 iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3l, T3k, T3j * T3m);
Chris@10 1049 }
Chris@10 1050 {
Chris@10 1051 E T2q, T2s, T2p, T2r;
Chris@10 1052 T2q = T20 + T23;
Chris@10 1053 T2s = T2h + T2i;
Chris@10 1054 T2p = W[4];
Chris@10 1055 T2r = W[5];
Chris@10 1056 rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2p, T2q, T2r * T2s);
Chris@10 1057 iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2r, T2q, T2p * T2s);
Chris@10 1058 }
Chris@10 1059 {
Chris@10 1060 E T5g, T5o, T5m, T5q, T5c, T5k;
Chris@10 1061 T5c = FNMS(KP500000000, T4G, T4z);
Chris@10 1062 T5g = T5c - T5f;
Chris@10 1063 T5o = T5c + T5f;
Chris@10 1064 T5k = FNMS(KP500000000, T5j, T5i);
Chris@10 1065 T5m = T5k - T5l;
Chris@10 1066 T5q = T5l + T5k;
Chris@10 1067 {
Chris@10 1068 E T5b, T5h, T5n, T5p;
Chris@10 1069 T5b = W[2];
Chris@10 1070 T5h = W[3];
Chris@10 1071 rio[WS(vs, 2) + WS(rs, 5)] = FMA(T5b, T5g, T5h * T5m);
Chris@10 1072 iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T5h, T5g, T5b * T5m);
Chris@10 1073 T5n = W[6];
Chris@10 1074 T5p = W[7];
Chris@10 1075 rio[WS(vs, 4) + WS(rs, 5)] = FMA(T5n, T5o, T5p * T5q);
Chris@10 1076 iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T5p, T5o, T5n * T5q);
Chris@10 1077 }
Chris@10 1078 }
Chris@10 1079 {
Chris@10 1080 E To, Ty, Tw, TA, Tg, Tv;
Chris@10 1081 Tg = FNMS(KP500000000, Tf, Tc);
Chris@10 1082 To = Tg + Tn;
Chris@10 1083 Ty = Tg - Tn;
Chris@10 1084 Tv = FNMS(KP500000000, Tu, Tt);
Chris@10 1085 Tw = Tq + Tv;
Chris@10 1086 TA = Tv - Tq;
Chris@10 1087 {
Chris@10 1088 E Tb, Tp, Tx, Tz;
Chris@10 1089 Tb = W[0];
Chris@10 1090 Tp = W[1];
Chris@10 1091 rio[WS(vs, 1)] = FMA(Tb, To, Tp * Tw);
Chris@10 1092 iio[WS(vs, 1)] = FNMS(Tp, To, Tb * Tw);
Chris@10 1093 Tx = W[8];
Chris@10 1094 Tz = W[9];
Chris@10 1095 rio[WS(vs, 5)] = FMA(Tx, Ty, Tz * TA);
Chris@10 1096 iio[WS(vs, 5)] = FNMS(Tz, Ty, Tx * TA);
Chris@10 1097 }
Chris@10 1098 }
Chris@10 1099 {
Chris@10 1100 E T36, T3g, T3e, T3i, T2Y, T3d;
Chris@10 1101 T2Y = FNMS(KP500000000, T2X, T2U);
Chris@10 1102 T36 = T2Y + T35;
Chris@10 1103 T3g = T2Y - T35;
Chris@10 1104 T3d = FNMS(KP500000000, T3c, T3b);
Chris@10 1105 T3e = T38 + T3d;
Chris@10 1106 T3i = T3d - T38;
Chris@10 1107 {
Chris@10 1108 E T2T, T37, T3f, T3h;
Chris@10 1109 T2T = W[0];
Chris@10 1110 T37 = W[1];
Chris@10 1111 rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2T, T36, T37 * T3e);
Chris@10 1112 iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T37, T36, T2T * T3e);
Chris@10 1113 T3f = W[8];
Chris@10 1114 T3h = W[9];
Chris@10 1115 rio[WS(vs, 5) + WS(rs, 3)] = FMA(T3f, T3g, T3h * T3i);
Chris@10 1116 iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T3h, T3g, T3f * T3i);
Chris@10 1117 }
Chris@10 1118 }
Chris@10 1119 {
Chris@10 1120 E T2y, T2G, T2E, T2I, T2u, T2C;
Chris@10 1121 T2u = FNMS(KP500000000, T1Y, T1R);
Chris@10 1122 T2y = T2u - T2x;
Chris@10 1123 T2G = T2u + T2x;
Chris@10 1124 T2C = FNMS(KP500000000, T2B, T2A);
Chris@10 1125 T2E = T2C - T2D;
Chris@10 1126 T2I = T2D + T2C;
Chris@10 1127 {
Chris@10 1128 E T2t, T2z, T2F, T2H;
Chris@10 1129 T2t = W[2];
Chris@10 1130 T2z = W[3];
Chris@10 1131 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2t, T2y, T2z * T2E);
Chris@10 1132 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2z, T2y, T2t * T2E);
Chris@10 1133 T2F = W[6];
Chris@10 1134 T2H = W[7];
Chris@10 1135 rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2F, T2G, T2H * T2I);
Chris@10 1136 iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2H, T2G, T2F * T2I);
Chris@10 1137 }
Chris@10 1138 }
Chris@10 1139 {
Chris@10 1140 E T3s, T3A, T3y, T3C, T3o, T3w;
Chris@10 1141 T3o = FNMS(KP500000000, T2S, T2L);
Chris@10 1142 T3s = T3o - T3r;
Chris@10 1143 T3A = T3o + T3r;
Chris@10 1144 T3w = FNMS(KP500000000, T3v, T3u);
Chris@10 1145 T3y = T3w - T3x;
Chris@10 1146 T3C = T3x + T3w;
Chris@10 1147 {
Chris@10 1148 E T3n, T3t, T3z, T3B;
Chris@10 1149 T3n = W[2];
Chris@10 1150 T3t = W[3];
Chris@10 1151 rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3n, T3s, T3t * T3y);
Chris@10 1152 iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3t, T3s, T3n * T3y);
Chris@10 1153 T3z = W[6];
Chris@10 1154 T3B = W[7];
Chris@10 1155 rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3z, T3A, T3B * T3C);
Chris@10 1156 iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3B, T3A, T3z * T3C);
Chris@10 1157 }
Chris@10 1158 }
Chris@10 1159 {
Chris@10 1160 E T1E, T1M, T1K, T1O, T1A, T1I;
Chris@10 1161 T1A = FNMS(KP500000000, T14, TX);
Chris@10 1162 T1E = T1A - T1D;
Chris@10 1163 T1M = T1A + T1D;
Chris@10 1164 T1I = FNMS(KP500000000, T1H, T1G);
Chris@10 1165 T1K = T1I - T1J;
Chris@10 1166 T1O = T1J + T1I;
Chris@10 1167 {
Chris@10 1168 E T1z, T1F, T1L, T1N;
Chris@10 1169 T1z = W[2];
Chris@10 1170 T1F = W[3];
Chris@10 1171 rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1z, T1E, T1F * T1K);
Chris@10 1172 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1F, T1E, T1z * T1K);
Chris@10 1173 T1L = W[6];
Chris@10 1174 T1N = W[7];
Chris@10 1175 rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1L, T1M, T1N * T1O);
Chris@10 1176 iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1N, T1M, T1L * T1O);
Chris@10 1177 }
Chris@10 1178 }
Chris@10 1179 {
Chris@10 1180 E T4m, T4u, T4s, T4w, T4i, T4q;
Chris@10 1181 T4i = FNMS(KP500000000, T3M, T3F);
Chris@10 1182 T4m = T4i - T4l;
Chris@10 1183 T4u = T4i + T4l;
Chris@10 1184 T4q = FNMS(KP500000000, T4p, T4o);
Chris@10 1185 T4s = T4q - T4r;
Chris@10 1186 T4w = T4r + T4q;
Chris@10 1187 {
Chris@10 1188 E T4h, T4n, T4t, T4v;
Chris@10 1189 T4h = W[2];
Chris@10 1190 T4n = W[3];
Chris@10 1191 rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4h, T4m, T4n * T4s);
Chris@10 1192 iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4n, T4m, T4h * T4s);
Chris@10 1193 T4t = W[6];
Chris@10 1194 T4v = W[7];
Chris@10 1195 rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4t, T4u, T4v * T4w);
Chris@10 1196 iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4v, T4u, T4t * T4w);
Chris@10 1197 }
Chris@10 1198 }
Chris@10 1199 {
Chris@10 1200 E TK, TS, TQ, TU, TG, TO;
Chris@10 1201 TG = FNMS(KP500000000, Ta, T3);
Chris@10 1202 TK = TG - TJ;
Chris@10 1203 TS = TG + TJ;
Chris@10 1204 TO = FNMS(KP500000000, TN, TM);
Chris@10 1205 TQ = TO - TP;
Chris@10 1206 TU = TP + TO;
Chris@10 1207 {
Chris@10 1208 E TF, TL, TR, TT;
Chris@10 1209 TF = W[2];
Chris@10 1210 TL = W[3];
Chris@10 1211 rio[WS(vs, 2)] = FMA(TF, TK, TL * TQ);
Chris@10 1212 iio[WS(vs, 2)] = FNMS(TL, TK, TF * TQ);
Chris@10 1213 TR = W[6];
Chris@10 1214 TT = W[7];
Chris@10 1215 rio[WS(vs, 4)] = FMA(TR, TS, TT * TU);
Chris@10 1216 iio[WS(vs, 4)] = FNMS(TT, TS, TR * TU);
Chris@10 1217 }
Chris@10 1218 }
Chris@10 1219 {
Chris@10 1220 E T2c, T2m, T2k, T2o, T24, T2j;
Chris@10 1221 T24 = FNMS(KP500000000, T23, T20);
Chris@10 1222 T2c = T24 + T2b;
Chris@10 1223 T2m = T24 - T2b;
Chris@10 1224 T2j = FNMS(KP500000000, T2i, T2h);
Chris@10 1225 T2k = T2e + T2j;
Chris@10 1226 T2o = T2j - T2e;
Chris@10 1227 {
Chris@10 1228 E T1Z, T2d, T2l, T2n;
Chris@10 1229 T1Z = W[0];
Chris@10 1230 T2d = W[1];
Chris@10 1231 rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1Z, T2c, T2d * T2k);
Chris@10 1232 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2d, T2c, T1Z * T2k);
Chris@10 1233 T2l = W[8];
Chris@10 1234 T2n = W[9];
Chris@10 1235 rio[WS(vs, 5) + WS(rs, 2)] = FMA(T2l, T2m, T2n * T2o);
Chris@10 1236 iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T2n, T2m, T2l * T2o);
Chris@10 1237 }
Chris@10 1238 }
Chris@10 1239 {
Chris@10 1240 E T40, T4a, T48, T4c, T3S, T47;
Chris@10 1241 T3S = FNMS(KP500000000, T3R, T3O);
Chris@10 1242 T40 = T3S + T3Z;
Chris@10 1243 T4a = T3S - T3Z;
Chris@10 1244 T47 = FNMS(KP500000000, T46, T45);
Chris@10 1245 T48 = T42 + T47;
Chris@10 1246 T4c = T47 - T42;
Chris@10 1247 {
Chris@10 1248 E T3N, T41, T49, T4b;
Chris@10 1249 T3N = W[0];
Chris@10 1250 T41 = W[1];
Chris@10 1251 rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3N, T40, T41 * T48);
Chris@10 1252 iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T41, T40, T3N * T48);
Chris@10 1253 T49 = W[8];
Chris@10 1254 T4b = W[9];
Chris@10 1255 rio[WS(vs, 5) + WS(rs, 4)] = FMA(T49, T4a, T4b * T4c);
Chris@10 1256 iio[WS(vs, 5) + WS(rs, 4)] = FNMS(T4b, T4a, T49 * T4c);
Chris@10 1257 }
Chris@10 1258 }
Chris@10 1259 {
Chris@10 1260 E T1i, T1s, T1q, T1u, T1a, T1p;
Chris@10 1261 T1a = FNMS(KP500000000, T19, T16);
Chris@10 1262 T1i = T1a + T1h;
Chris@10 1263 T1s = T1a - T1h;
Chris@10 1264 T1p = FNMS(KP500000000, T1o, T1n);
Chris@10 1265 T1q = T1k + T1p;
Chris@10 1266 T1u = T1p - T1k;
Chris@10 1267 {
Chris@10 1268 E T15, T1j, T1r, T1t;
Chris@10 1269 T15 = W[0];
Chris@10 1270 T1j = W[1];
Chris@10 1271 rio[WS(vs, 1) + WS(rs, 1)] = FMA(T15, T1i, T1j * T1q);
Chris@10 1272 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1j, T1i, T15 * T1q);
Chris@10 1273 T1r = W[8];
Chris@10 1274 T1t = W[9];
Chris@10 1275 rio[WS(vs, 5) + WS(rs, 1)] = FMA(T1r, T1s, T1t * T1u);
Chris@10 1276 iio[WS(vs, 5) + WS(rs, 1)] = FNMS(T1t, T1s, T1r * T1u);
Chris@10 1277 }
Chris@10 1278 }
Chris@10 1279 {
Chris@10 1280 E T4U, T54, T52, T56, T4M, T51;
Chris@10 1281 T4M = FNMS(KP500000000, T4L, T4I);
Chris@10 1282 T4U = T4M + T4T;
Chris@10 1283 T54 = T4M - T4T;
Chris@10 1284 T51 = FNMS(KP500000000, T50, T4Z);
Chris@10 1285 T52 = T4W + T51;
Chris@10 1286 T56 = T51 - T4W;
Chris@10 1287 {
Chris@10 1288 E T4H, T4V, T53, T55;
Chris@10 1289 T4H = W[0];
Chris@10 1290 T4V = W[1];
Chris@10 1291 rio[WS(vs, 1) + WS(rs, 5)] = FMA(T4H, T4U, T4V * T52);
Chris@10 1292 iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T4V, T4U, T4H * T52);
Chris@10 1293 T53 = W[8];
Chris@10 1294 T55 = W[9];
Chris@10 1295 rio[WS(vs, 5) + WS(rs, 5)] = FMA(T53, T54, T55 * T56);
Chris@10 1296 iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T55, T54, T53 * T56);
Chris@10 1297 }
Chris@10 1298 }
Chris@10 1299 }
Chris@10 1300 }
Chris@10 1301 }
Chris@10 1302
/*
 * Twiddle-factor instruction list for this codelet; it reaches the planner
 * through the `desc` descriptor defined below, which stores a pointer to it.
 * It holds two records: TW_FULL with arguments (0, 6), then TW_NEXT with
 * arguments (1, 0).
 * NOTE(review): the 6 presumably matches the codelet size and TW_NEXT
 * presumably terminates the list -- confirm against the tw_instr declaration,
 * which is not visible in this file.  The visible part of the q1_6 body reads
 * W[0] through W[9].
 */
Chris@10 1303 static const tw_instr twinstr[] = {
Chris@10 1304 {TW_FULL, 0, 6},
Chris@10 1305 {TW_NEXT, 1, 0}
Chris@10 1306 };
Chris@10 1307
/*
 * Descriptor passed to the registration call below.  In order it holds: the
 * integer 6, the name string "q1_6", the twiddle instruction list `twinstr`,
 * a pointer to GENUS, the four-number group {192, 84, 84, 0}, and three
 * trailing zeros.
 * NOTE(review): the leading 6 is presumably the radix and the brace group is
 * presumably the operation counts (adds, muls, fmas, other) -- confirm against
 * the ct_desc declaration, which is not visible in this file.
 */
Chris@10 1308 static const ct_desc desc = { 6, "q1_6", twinstr, &GENUS, {192, 84, 84, 0}, 0, 0, 0 };
Chris@10 1309
/*
 * Registration entry point for this codelet: passes the planner `p`, the
 * q1_6 codelet function, and the address of its descriptor `desc` to
 * X(kdft_difsq_register).  Returns nothing.
 */
Chris@10 1310 void X(codelet_q1_6) (planner *p) {
Chris@10 1311 X(kdft_difsq_register) (p, q1_6, &desc);
Chris@10 1312 }
Chris@10 1313 #endif /* HAVE_FMA */