annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:43 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 206 FP additions, 132 FP multiplications,
Chris@42 32 * (or, 136 additions, 62 multiplications, 70 fused multiply/add),
Chris@42 33 * 96 stack variables, 4 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft_16, FMA variant (this branch is compiled only when HAVE_FMA is
 * defined).
 *
 * For each m in [mb, me) the loop body reads eight values from each of Rp,
 * Ip, Rm and Im (element 0 and elements WS(rs, 1) .. WS(rs, 7)) together
 * with the 30 twiddle values W[0] .. W[29], and then overwrites those same
 * 32 array elements in place.  Every stored value carries a factor of
 * KP500000000 (0.5).
 *
 * Parameters:
 *   Rp, Ip  - arrays stepped forward by ms after each iteration
 *   Rm, Im  - arrays stepped backward by ms after each iteration
 *   W       - twiddle table; offset by (mb - 1) * 30 on entry to the loop
 *             and advanced by 30 per iteration
 *   rs      - stride handed to WS() to form element offsets
 *   mb, me  - half-open iteration range
 *   ms      - per-iteration pointer step
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm against the
 * FFTW headers.
 */
Chris@42 37 static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants: 0.92387... ~ cos(pi/8), 0.41421... ~ tan(pi/8),
 * 0.70710... ~ 1/sqrt(2), and 0.5. */
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* T4d and T4g are declared at loop scope because they are assigned deep
 * inside the nested blocks but consumed by the last two stores of the
 * iteration. */
Chris@42 46 E T4d, T4g;
Chris@42 47 {
Chris@42 48 E T1f, T2e, T3D, T1K, T2g, T1c, T3H, T2W, T2j, TR, T3E, T2R, T2l, T11, T3G;
Chris@42 49 E T1v, T3p, T2s, Tl, T3o, T3w, T2G, T3z, T1Y, T23, T20, T2H, T21, T29, Tz;
Chris@42 50 E T26, TE, TA, T2v, T2J, T27, Tv, T2u, TB, T22, T28;
Chris@42 51 {
Chris@42 52 E T1o, T1u, T2T, T2V;
/* Load the inputs at offsets 0 and 4 and combine them with twiddles
 * W[0], W[1] and W[14] .. W[17]. */
Chris@42 53 {
Chris@42 54 E T1I, T1A, T16, T1C, T1H, T1G, T2U, T1z, T1b, T1x, T1w;
Chris@42 55 {
Chris@42 56 E T1d, T1e, T14, T15;
Chris@42 57 T1d = Ip[0];
Chris@42 58 T1e = Im[0];
Chris@42 59 T14 = Ip[WS(rs, 4)];
Chris@42 60 T15 = Im[WS(rs, 4)];
Chris@42 61 {
Chris@42 62 E T1F, T1D, T1E, T19, T1a;
Chris@42 63 T1D = Rm[0];
Chris@42 64 T1I = T1d + T1e;
Chris@42 65 T1f = T1d - T1e;
Chris@42 66 T1E = Rp[0];
Chris@42 67 T1A = T14 + T15;
Chris@42 68 T16 = T14 - T15;
Chris@42 69 T1C = W[0];
Chris@42 70 T2e = T1E + T1D;
Chris@42 71 T1F = T1D - T1E;
Chris@42 72 T1H = W[1];
Chris@42 73 T19 = Rp[WS(rs, 4)];
Chris@42 74 T1a = Rm[WS(rs, 4)];
Chris@42 75 T1G = T1C * T1F;
Chris@42 76 T2U = T1H * T1F;
Chris@42 77 T1z = W[17];
Chris@42 78 T1b = T19 + T1a;
Chris@42 79 T1x = T1a - T19;
Chris@42 80 T1w = W[16];
Chris@42 81 }
Chris@42 82 }
Chris@42 83 {
Chris@42 84 E T2S, T1y, T13, T18;
Chris@42 85 T2S = T1z * T1x;
Chris@42 86 T1y = T1w * T1x;
Chris@42 87 T13 = W[14];
Chris@42 88 T18 = W[15];
Chris@42 89 {
Chris@42 90 E T1J, T1B, T2f, T17;
Chris@42 91 T1J = FNMS(T1H, T1I, T1G);
Chris@42 92 T1B = FNMS(T1z, T1A, T1y);
Chris@42 93 T2f = T13 * T1b;
Chris@42 94 T17 = T13 * T16;
Chris@42 95 T2T = FMA(T1w, T1A, T2S);
Chris@42 96 T3D = T1J - T1B;
Chris@42 97 T1K = T1B + T1J;
Chris@42 98 T2g = FMA(T18, T16, T2f);
Chris@42 99 T1c = FNMS(T18, T1b, T17);
Chris@42 100 T2V = FMA(T1C, T1I, T2U);
Chris@42 101 }
Chris@42 102 }
Chris@42 103 }
/* Load the inputs at offsets 2 and 6 and combine them with twiddles
 * W[6] .. W[9] and W[22] .. W[25]. */
Chris@42 104 {
Chris@42 105 E T1n, TL, T1m, T1j, TQ, T1l, T2N, TV, T1t, T10, T1q, T1s, T1p, T1r, T2O;
Chris@42 106 E T2Q;
Chris@42 107 {
Chris@42 108 E TO, TP, TJ, TK;
Chris@42 109 TJ = Ip[WS(rs, 2)];
Chris@42 110 TK = Im[WS(rs, 2)];
Chris@42 111 TO = Rp[WS(rs, 2)];
Chris@42 112 T3H = T2V - T2T;
Chris@42 113 T2W = T2T + T2V;
Chris@42 114 T1n = TJ + TK;
Chris@42 115 TL = TJ - TK;
Chris@42 116 TP = Rm[WS(rs, 2)];
Chris@42 117 T1m = W[9];
Chris@42 118 T1j = W[8];
Chris@42 119 {
Chris@42 120 E TT, T1k, TU, TY, TZ;
Chris@42 121 TT = Ip[WS(rs, 6)];
Chris@42 122 TQ = TO + TP;
Chris@42 123 T1k = TP - TO;
Chris@42 124 TU = Im[WS(rs, 6)];
Chris@42 125 TY = Rp[WS(rs, 6)];
Chris@42 126 TZ = Rm[WS(rs, 6)];
Chris@42 127 T1l = T1j * T1k;
Chris@42 128 T2N = T1m * T1k;
Chris@42 129 TV = TT - TU;
Chris@42 130 T1t = TT + TU;
Chris@42 131 T10 = TY + TZ;
Chris@42 132 T1q = TZ - TY;
Chris@42 133 T1s = W[25];
Chris@42 134 T1p = W[24];
Chris@42 135 }
Chris@42 136 }
Chris@42 137 {
Chris@42 138 E TN, T2P, T2i, TM, TI;
Chris@42 139 TI = W[6];
Chris@42 140 TN = W[7];
Chris@42 141 T2P = T1s * T1q;
Chris@42 142 T1r = T1p * T1q;
Chris@42 143 T2i = TI * TQ;
Chris@42 144 TM = TI * TL;
Chris@42 145 T2O = FMA(T1j, T1n, T2N);
Chris@42 146 T2Q = FMA(T1p, T1t, T2P);
Chris@42 147 T2j = FMA(TN, TL, T2i);
Chris@42 148 TR = FNMS(TN, TQ, TM);
Chris@42 149 }
Chris@42 150 {
Chris@42 151 E TX, T2k, TW, TS;
Chris@42 152 TS = W[22];
Chris@42 153 T3E = T2O - T2Q;
Chris@42 154 T2R = T2O + T2Q;
Chris@42 155 TX = W[23];
Chris@42 156 T2k = TS * T10;
Chris@42 157 TW = TS * TV;
Chris@42 158 T1o = FNMS(T1m, T1n, T1l);
Chris@42 159 T1u = FNMS(T1s, T1t, T1r);
Chris@42 160 T2l = FMA(TX, TV, T2k);
Chris@42 161 T11 = FNMS(TX, T10, TW);
Chris@42 162 }
Chris@42 163 }
/* Load the inputs at offsets 1, 5, 7 and 3 and combine them with twiddles
 * W[2] .. W[5], W[18] .. W[21], W[26] .. W[29] and W[10] .. W[13]. */
Chris@42 164 {
Chris@42 165 E T1Q, T1N, T2C, T1O, T1W, Te, T1T, Tj, Tf, T2q, T2E, T1U, Ta, T2p, Tg;
Chris@42 166 E T1P, T1V;
Chris@42 167 {
Chris@42 168 E T4, T9, T5, T2o, Tb, T1S, T1, T1M, T6;
Chris@42 169 {
Chris@42 170 E T2, T3, T7, T8;
Chris@42 171 T2 = Ip[WS(rs, 1)];
Chris@42 172 T3G = T1o - T1u;
Chris@42 173 T1v = T1o + T1u;
Chris@42 174 T3 = Im[WS(rs, 1)];
Chris@42 175 T7 = Rp[WS(rs, 1)];
Chris@42 176 T8 = Rm[WS(rs, 1)];
Chris@42 177 T1 = W[2];
Chris@42 178 T1Q = T2 + T3;
Chris@42 179 T4 = T2 - T3;
Chris@42 180 T1N = T7 - T8;
Chris@42 181 T9 = T7 + T8;
Chris@42 182 T1M = W[4];
Chris@42 183 T5 = T1 * T4;
Chris@42 184 }
Chris@42 185 {
Chris@42 186 E Tc, Td, Th, Ti;
Chris@42 187 Tc = Ip[WS(rs, 5)];
Chris@42 188 T2o = T1 * T9;
Chris@42 189 T2C = T1M * T1Q;
Chris@42 190 T1O = T1M * T1N;
Chris@42 191 Td = Im[WS(rs, 5)];
Chris@42 192 Th = Rp[WS(rs, 5)];
Chris@42 193 Ti = Rm[WS(rs, 5)];
Chris@42 194 Tb = W[18];
Chris@42 195 T1W = Tc + Td;
Chris@42 196 Te = Tc - Td;
Chris@42 197 T1T = Th - Ti;
Chris@42 198 Tj = Th + Ti;
Chris@42 199 T1S = W[20];
Chris@42 200 Tf = Tb * Te;
Chris@42 201 }
Chris@42 202 T6 = W[3];
Chris@42 203 T2q = Tb * Tj;
Chris@42 204 T2E = T1S * T1W;
Chris@42 205 T1U = T1S * T1T;
Chris@42 206 Ta = FNMS(T6, T9, T5);
Chris@42 207 T2p = FMA(T6, T4, T2o);
Chris@42 208 Tg = W[19];
Chris@42 209 T1P = W[5];
Chris@42 210 T1V = W[21];
Chris@42 211 }
Chris@42 212 {
Chris@42 213 E Tp, Tu, Tq, T2t, Tw, T25, Tm, T1Z, Tr;
Chris@42 214 {
Chris@42 215 E Tn, To, Ts, Tt, T2r, Tk;
Chris@42 216 Tn = Ip[WS(rs, 7)];
Chris@42 217 T2r = FMA(Tg, Te, T2q);
Chris@42 218 Tk = FNMS(Tg, Tj, Tf);
Chris@42 219 {
Chris@42 220 E T2D, T1R, T2F, T1X;
Chris@42 221 T2D = FNMS(T1P, T1N, T2C);
Chris@42 222 T1R = FMA(T1P, T1Q, T1O);
Chris@42 223 T2F = FNMS(T1V, T1T, T2E);
Chris@42 224 T1X = FMA(T1V, T1W, T1U);
Chris@42 225 T3p = T2p - T2r;
Chris@42 226 T2s = T2p + T2r;
Chris@42 227 Tl = Ta + Tk;
Chris@42 228 T3o = Ta - Tk;
Chris@42 229 T3w = T2F - T2D;
Chris@42 230 T2G = T2D + T2F;
Chris@42 231 T3z = T1X - T1R;
Chris@42 232 T1Y = T1R + T1X;
Chris@42 233 To = Im[WS(rs, 7)];
Chris@42 234 }
Chris@42 235 Ts = Rp[WS(rs, 7)];
Chris@42 236 Tt = Rm[WS(rs, 7)];
Chris@42 237 Tm = W[26];
Chris@42 238 T23 = Tn + To;
Chris@42 239 Tp = Tn - To;
Chris@42 240 T20 = Ts - Tt;
Chris@42 241 Tu = Ts + Tt;
Chris@42 242 T1Z = W[28];
Chris@42 243 Tq = Tm * Tp;
Chris@42 244 }
Chris@42 245 {
Chris@42 246 E Tx, Ty, TC, TD;
Chris@42 247 Tx = Ip[WS(rs, 3)];
Chris@42 248 T2t = Tm * Tu;
Chris@42 249 T2H = T1Z * T23;
Chris@42 250 T21 = T1Z * T20;
Chris@42 251 Ty = Im[WS(rs, 3)];
Chris@42 252 TC = Rp[WS(rs, 3)];
Chris@42 253 TD = Rm[WS(rs, 3)];
Chris@42 254 Tw = W[10];
Chris@42 255 T29 = Tx + Ty;
Chris@42 256 Tz = Tx - Ty;
Chris@42 257 T26 = TC - TD;
Chris@42 258 TE = TC + TD;
Chris@42 259 T25 = W[12];
Chris@42 260 TA = Tw * Tz;
Chris@42 261 }
Chris@42 262 Tr = W[27];
Chris@42 263 T2v = Tw * TE;
Chris@42 264 T2J = T25 * T29;
Chris@42 265 T27 = T25 * T26;
Chris@42 266 Tv = FNMS(Tr, Tu, Tq);
Chris@42 267 T2u = FMA(Tr, Tp, T2t);
Chris@42 268 TB = W[11];
Chris@42 269 T22 = W[29];
Chris@42 270 T28 = W[13];
Chris@42 271 }
Chris@42 272 }
Chris@42 273 }
/* All inputs have been read by this point; the remaining blocks combine
 * the twiddled terms with sums and differences and store the results. */
Chris@42 274 {
Chris@42 275 E T3r, T3s, T3A, T3x, T3M, T3l, T3L, T3m, T3f, T3i;
Chris@42 276 {
Chris@42 277 E T3c, TH, T36, T3g, T3h, T39, T32, T1h, T2A, T2d, T2h, T31, T2y, T30, T2Y;
Chris@42 278 E T2m, T2B, T1i;
Chris@42 279 {
Chris@42 280 E T2x, T2M, T1L, T2c, T2X, T12, T1g;
Chris@42 281 {
Chris@42 282 E TG, T2b, T34, T2L, T2w, TF, T37, T38, T35;
Chris@42 283 T2w = FMA(TB, Tz, T2v);
Chris@42 284 TF = FNMS(TB, TE, TA);
Chris@42 285 {
Chris@42 286 E T2I, T24, T2K, T2a;
Chris@42 287 T2I = FNMS(T22, T20, T2H);
Chris@42 288 T24 = FMA(T22, T23, T21);
Chris@42 289 T2K = FNMS(T28, T26, T2J);
Chris@42 290 T2a = FMA(T28, T29, T27);
Chris@42 291 T3r = T2u - T2w;
Chris@42 292 T2x = T2u + T2w;
Chris@42 293 TG = Tv + TF;
Chris@42 294 T3s = Tv - TF;
Chris@42 295 T2L = T2I + T2K;
Chris@42 296 T3A = T2I - T2K;
Chris@42 297 T3x = T2a - T24;
Chris@42 298 T2b = T24 + T2a;
Chris@42 299 }
Chris@42 300 T2M = T2G + T2L;
Chris@42 301 T34 = T2L - T2G;
Chris@42 302 T37 = T1K - T1v;
Chris@42 303 T1L = T1v + T1K;
Chris@42 304 T2c = T1Y + T2b;
Chris@42 305 T35 = T1Y - T2b;
Chris@42 306 T3c = Tl - TG;
Chris@42 307 TH = Tl + TG;
Chris@42 308 T38 = T2W - T2R;
Chris@42 309 T2X = T2R + T2W;
Chris@42 310 T36 = T34 + T35;
Chris@42 311 T3g = T34 - T35;
Chris@42 312 T3M = TR - T11;
Chris@42 313 T12 = TR + T11;
Chris@42 314 T3h = T37 + T38;
Chris@42 315 T39 = T37 - T38;
Chris@42 316 T1g = T1c + T1f;
Chris@42 317 T3l = T1f - T1c;
Chris@42 318 }
Chris@42 319 T32 = T1g - T12;
Chris@42 320 T1h = T12 + T1g;
Chris@42 321 T2A = T2c + T1L;
Chris@42 322 T2d = T1L - T2c;
Chris@42 323 T3L = T2e - T2g;
Chris@42 324 T2h = T2e + T2g;
Chris@42 325 T31 = T2x - T2s;
Chris@42 326 T2y = T2s + T2x;
Chris@42 327 T30 = T2M + T2X;
Chris@42 328 T2Y = T2M - T2X;
Chris@42 329 T2m = T2j + T2l;
Chris@42 330 T3m = T2j - T2l;
Chris@42 331 }
Chris@42 332 T2B = T1h - TH;
Chris@42 333 T1i = TH + T1h;
Chris@42 334 {
Chris@42 335 E T3e, T3d, T3j, T3k;
Chris@42 336 {
Chris@42 337 E T33, T3b, T2z, T2Z, T3a, T2n;
Chris@42 338 T3f = T32 - T31;
Chris@42 339 T33 = T31 + T32;
Chris@42 340 T3b = T2h - T2m;
Chris@42 341 T2n = T2h + T2m;
Chris@42 342 Im[WS(rs, 7)] = KP500000000 * (T2d - T1i);
Chris@42 343 Ip[0] = KP500000000 * (T1i + T2d);
Chris@42 344 Im[WS(rs, 3)] = KP500000000 * (T2Y - T2B);
Chris@42 345 Ip[WS(rs, 4)] = KP500000000 * (T2B + T2Y);
Chris@42 346 T2z = T2n - T2y;
Chris@42 347 T2Z = T2n + T2y;
Chris@42 348 T3a = T36 + T39;
Chris@42 349 T3e = T39 - T36;
Chris@42 350 T3d = T3b - T3c;
Chris@42 351 T3j = T3b + T3c;
Chris@42 352 Rp[WS(rs, 4)] = KP500000000 * (T2z + T2A);
Chris@42 353 Rm[WS(rs, 3)] = KP500000000 * (T2z - T2A);
Chris@42 354 Rp[0] = KP500000000 * (T2Z + T30);
Chris@42 355 Rm[WS(rs, 7)] = KP500000000 * (T2Z - T30);
Chris@42 356 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3a, T33)));
Chris@42 357 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3a, T33));
Chris@42 358 T3k = T3g + T3h;
Chris@42 359 T3i = T3g - T3h;
Chris@42 360 }
Chris@42 361 Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3k, T3j));
Chris@42 362 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3k, T3j));
Chris@42 363 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3e, T3d));
Chris@42 364 Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3e, T3d));
Chris@42 365 }
Chris@42 366 }
Chris@42 367 {
Chris@42 368 E T3Z, T3n, T3F, T3I, T4e, T44, T4f, T47, T4a, T3u, T3U, T3C, T49, T3N, T40;
Chris@42 369 E T3Q;
Chris@42 370 {
Chris@42 371 E T3y, T3B, T3O, T3q, T3t, T3P;
Chris@42 372 {
Chris@42 373 E T42, T43, T45, T46;
Chris@42 374 T3y = T3w + T3x;
Chris@42 375 T42 = T3w - T3x;
Chris@42 376 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3i, T3f)));
Chris@42 377 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3i, T3f));
Chris@42 378 T3Z = T3m + T3l;
Chris@42 379 T3n = T3l - T3m;
Chris@42 380 T43 = T3A - T3z;
Chris@42 381 T3B = T3z + T3A;
Chris@42 382 T3F = T3D - T3E;
Chris@42 383 T45 = T3E + T3D;
Chris@42 384 T46 = T3H - T3G;
Chris@42 385 T3I = T3G + T3H;
Chris@42 386 T3O = T3p + T3o;
Chris@42 387 T3q = T3o - T3p;
Chris@42 388 T4e = FNMS(KP414213562, T42, T43);
Chris@42 389 T44 = FMA(KP414213562, T43, T42);
Chris@42 390 T4f = FNMS(KP414213562, T45, T46);
Chris@42 391 T47 = FMA(KP414213562, T46, T45);
Chris@42 392 T3t = T3r + T3s;
Chris@42 393 T3P = T3r - T3s;
Chris@42 394 }
Chris@42 395 T4a = T3q - T3t;
Chris@42 396 T3u = T3q + T3t;
Chris@42 397 T3U = FNMS(KP414213562, T3y, T3B);
Chris@42 398 T3C = FMA(KP414213562, T3B, T3y);
Chris@42 399 T49 = T3L - T3M;
Chris@42 400 T3N = T3L + T3M;
Chris@42 401 T40 = T3P - T3O;
Chris@42 402 T3Q = T3O + T3P;
Chris@42 403 }
Chris@42 404 {
Chris@42 405 E T3T, T3v, T3X, T3R, T3J, T3V;
Chris@42 406 T3T = FNMS(KP707106781, T3u, T3n);
Chris@42 407 T3v = FMA(KP707106781, T3u, T3n);
Chris@42 408 T3X = FMA(KP707106781, T3Q, T3N);
Chris@42 409 T3R = FNMS(KP707106781, T3Q, T3N);
Chris@42 410 T3J = FNMS(KP414213562, T3I, T3F);
Chris@42 411 T3V = FMA(KP414213562, T3F, T3I);
Chris@42 412 {
Chris@42 413 E T4c, T4b, T4h, T4i, T41, T48;
Chris@42 414 T4d = FMA(KP707106781, T40, T3Z);
Chris@42 415 T41 = FNMS(KP707106781, T40, T3Z);
Chris@42 416 T48 = T44 - T47;
Chris@42 417 T4c = T44 + T47;
Chris@42 418 {
Chris@42 419 E T3Y, T3W, T3K, T3S;
Chris@42 420 T3Y = T3U + T3V;
Chris@42 421 T3W = T3U - T3V;
Chris@42 422 T3K = T3C + T3J;
Chris@42 423 T3S = T3J - T3C;
Chris@42 424 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3T)));
Chris@42 425 Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3W, T3T));
Chris@42 426 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3Y, T3X));
Chris@42 427 Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T3Y, T3X));
Chris@42 428 Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3S, T3R));
Chris@42 429 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T3S, T3R));
Chris@42 430 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3K, T3v)));
Chris@42 431 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3K, T3v));
Chris@42 432 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T48, T41));
Chris@42 433 Im[0] = -(KP500000000 * (FNMS(KP923879532, T48, T41)));
Chris@42 434 }
Chris@42 435 T4b = FMA(KP707106781, T4a, T49);
Chris@42 436 T4h = FNMS(KP707106781, T4a, T49);
Chris@42 437 T4i = T4e + T4f;
Chris@42 438 T4g = T4e - T4f;
Chris@42 439 Rm[0] = KP500000000 * (FMA(KP923879532, T4i, T4h));
Chris@42 440 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4i, T4h));
Chris@42 441 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4c, T4b));
Chris@42 442 Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4c, T4b));
Chris@42 443 }
Chris@42 444 }
Chris@42 445 }
Chris@42 446 }
Chris@42 447 }
/* Final two stores of the iteration, using the loop-scope T4d and T4g. */
Chris@42 448 Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4g, T4d)));
Chris@42 449 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4g, T4d));
Chris@42 450 }
Chris@42 451 }
Chris@42 452 }
Chris@42 453
/*
 * Twiddle instruction table referenced by desc (HAVE_FMA branch).
 * NOTE(review): presumably TW_FULL requests the full twiddle set for size 16
 * and TW_NEXT ends the list -- confirm against the tw_instr definition.
 */
Chris@42 454 static const tw_instr twinstr[] = {
Chris@42 455 {TW_FULL, 1, 16},
Chris@42 456 {TW_NEXT, 1, 0}
Chris@42 457 };
Chris@42 458
/*
 * Codelet descriptor for the HAVE_FMA build.  The trailing {136, 62, 70, 0}
 * repeats the addition / multiplication / fused multiply-add counts given in
 * the generated header comment of this branch.
 * NOTE(review): the meaning of the leading 16 and of the fourth counter is
 * defined by hc2c_desc, which is not visible here -- confirm.
 */
Chris@42 459 static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, {136, 62, 70, 0} };
Chris@42 460
/*
 * Registration entry point (HAVE_FMA branch): hands hc2cfdft_16 and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
Chris@42 461 void X(codelet_hc2cfdft_16) (planner *p) {
Chris@42 462 X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
Chris@42 463 }
Chris@42 464 #else /* HAVE_FMA */
Chris@42 465
Chris@42 466 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include hc2cf.h */
Chris@42 467
Chris@42 468 /*
Chris@42 469 * This function contains 206 FP additions, 100 FP multiplications,
Chris@42 470 * (or, 168 additions, 62 multiplications, 38 fused multiply/add),
Chris@42 471 * 61 stack variables, 4 constants, and 64 memory accesses
Chris@42 472 */
Chris@42 473 #include "hc2cf.h"
Chris@42 474
/*
 * hc2cfdft_16, non-FMA variant (this branch is compiled when HAVE_FMA is
 * not defined).
 *
 * For each m in [mb, me) the loop body reads eight values from each of Rp,
 * Ip, Rm and Im (element 0 and elements WS(rs, 1) .. WS(rs, 7)) together
 * with the 30 twiddle values W[0] .. W[29], and then overwrites those same
 * 32 array elements in place.
 *
 * Parameters:
 *   Rp, Ip  - arrays stepped forward by ms after each iteration
 *   Rm, Im  - arrays stepped backward by ms after each iteration
 *   W       - twiddle table; offset by (mb - 1) * 30 on entry to the loop
 *             and advanced by 30 per iteration
 *   rs      - stride handed to WS() to form element offsets
 *   mb, me  - half-open iteration range
 *   ms      - per-iteration pointer step
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this
 * file; presumably a*b + c and c - a*b respectively -- confirm against the
 * FFTW headers.
 */
Chris@42 475 static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 476 {
/* Numeric constants: 0.46193... ~ 0.5 * cos(pi/8), 0.19134... ~
 * 0.5 * sin(pi/8), 0.35355... ~ 0.5 / sqrt(2), and 0.5. */
Chris@42 477 DK(KP461939766, +0.461939766255643378064091594698394143411208313);
Chris@42 478 DK(KP191341716, +0.191341716182544885864229992015199433380672281);
Chris@42 479 DK(KP353553390, +0.353553390593273762200422181052424519642417969);
Chris@42 480 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 481 {
Chris@42 482 INT m;
Chris@42 483 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Values that survive from the four input stages into the output stages. */
Chris@42 484 E T19, T3h, T21, T2Y, T1o, T3d, T2s, T39, TW, T3i, T24, T2Z, T1z, T3c, T2p;
Chris@42 485 E T3a, Tj, T2S, T28, T2R, T1L, T36, T2i, T32, TC, T2V, T2b, T2U, T1W, T35;
Chris@42 486 E T2l, T33;
/* Input stage 1: offsets 4 and 0, twiddles W[14], W[15], W[0], W[1],
 * W[16], W[17]. */
Chris@42 487 {
Chris@42 488 E T10, T1m, T14, T1k, T18, T1h, T1f, T1Z;
Chris@42 489 {
Chris@42 490 E TY, TZ, T12, T13;
Chris@42 491 TY = Ip[WS(rs, 4)];
Chris@42 492 TZ = Im[WS(rs, 4)];
Chris@42 493 T10 = TY - TZ;
Chris@42 494 T1m = TY + TZ;
Chris@42 495 T12 = Rp[WS(rs, 4)];
Chris@42 496 T13 = Rm[WS(rs, 4)];
Chris@42 497 T14 = T12 + T13;
Chris@42 498 T1k = T12 - T13;
Chris@42 499 }
Chris@42 500 {
Chris@42 501 E T16, T17, T1d, T1e;
Chris@42 502 T16 = Ip[0];
Chris@42 503 T17 = Im[0];
Chris@42 504 T18 = T16 - T17;
Chris@42 505 T1h = T16 + T17;
Chris@42 506 T1d = Rm[0];
Chris@42 507 T1e = Rp[0];
Chris@42 508 T1f = T1d - T1e;
Chris@42 509 T1Z = T1e + T1d;
Chris@42 510 }
Chris@42 511 {
Chris@42 512 E T15, T20, TX, T11;
Chris@42 513 TX = W[14];
Chris@42 514 T11 = W[15];
Chris@42 515 T15 = FNMS(T11, T14, TX * T10);
Chris@42 516 T20 = FMA(TX, T14, T11 * T10);
Chris@42 517 T19 = T15 + T18;
Chris@42 518 T3h = T1Z - T20;
Chris@42 519 T21 = T1Z + T20;
Chris@42 520 T2Y = T18 - T15;
Chris@42 521 }
Chris@42 522 {
Chris@42 523 E T1i, T2r, T1n, T2q;
Chris@42 524 {
Chris@42 525 E T1c, T1g, T1j, T1l;
Chris@42 526 T1c = W[0];
Chris@42 527 T1g = W[1];
Chris@42 528 T1i = FNMS(T1g, T1h, T1c * T1f);
Chris@42 529 T2r = FMA(T1g, T1f, T1c * T1h);
Chris@42 530 T1j = W[16];
Chris@42 531 T1l = W[17];
Chris@42 532 T1n = FMA(T1j, T1k, T1l * T1m);
Chris@42 533 T2q = FNMS(T1l, T1k, T1j * T1m);
Chris@42 534 }
Chris@42 535 T1o = T1i - T1n;
Chris@42 536 T3d = T2r - T2q;
Chris@42 537 T2s = T2q + T2r;
Chris@42 538 T39 = T1n + T1i;
Chris@42 539 }
Chris@42 540 }
/* Input stage 2: offsets 2 and 6, twiddles W[6] .. W[9] and
 * W[22] .. W[25]. */
Chris@42 541 {
Chris@42 542 E TH, T1s, TL, T1q, TQ, T1x, TU, T1v;
Chris@42 543 {
Chris@42 544 E TF, TG, TJ, TK;
Chris@42 545 TF = Ip[WS(rs, 2)];
Chris@42 546 TG = Im[WS(rs, 2)];
Chris@42 547 TH = TF - TG;
Chris@42 548 T1s = TF + TG;
Chris@42 549 TJ = Rp[WS(rs, 2)];
Chris@42 550 TK = Rm[WS(rs, 2)];
Chris@42 551 TL = TJ + TK;
Chris@42 552 T1q = TJ - TK;
Chris@42 553 }
Chris@42 554 {
Chris@42 555 E TO, TP, TS, TT;
Chris@42 556 TO = Ip[WS(rs, 6)];
Chris@42 557 TP = Im[WS(rs, 6)];
Chris@42 558 TQ = TO - TP;
Chris@42 559 T1x = TO + TP;
Chris@42 560 TS = Rp[WS(rs, 6)];
Chris@42 561 TT = Rm[WS(rs, 6)];
Chris@42 562 TU = TS + TT;
Chris@42 563 T1v = TS - TT;
Chris@42 564 }
Chris@42 565 {
Chris@42 566 E TM, T22, TV, T23;
Chris@42 567 {
Chris@42 568 E TE, TI, TN, TR;
Chris@42 569 TE = W[6];
Chris@42 570 TI = W[7];
Chris@42 571 TM = FNMS(TI, TL, TE * TH);
Chris@42 572 T22 = FMA(TE, TL, TI * TH);
Chris@42 573 TN = W[22];
Chris@42 574 TR = W[23];
Chris@42 575 TV = FNMS(TR, TU, TN * TQ);
Chris@42 576 T23 = FMA(TN, TU, TR * TQ);
Chris@42 577 }
Chris@42 578 TW = TM + TV;
Chris@42 579 T3i = TM - TV;
Chris@42 580 T24 = T22 + T23;
Chris@42 581 T2Z = T22 - T23;
Chris@42 582 }
Chris@42 583 {
Chris@42 584 E T1t, T2n, T1y, T2o;
Chris@42 585 {
Chris@42 586 E T1p, T1r, T1u, T1w;
Chris@42 587 T1p = W[8];
Chris@42 588 T1r = W[9];
Chris@42 589 T1t = FMA(T1p, T1q, T1r * T1s);
Chris@42 590 T2n = FNMS(T1r, T1q, T1p * T1s);
Chris@42 591 T1u = W[24];
Chris@42 592 T1w = W[25];
Chris@42 593 T1y = FMA(T1u, T1v, T1w * T1x);
Chris@42 594 T2o = FNMS(T1w, T1v, T1u * T1x);
Chris@42 595 }
Chris@42 596 T1z = T1t + T1y;
Chris@42 597 T3c = T1y - T1t;
Chris@42 598 T2p = T2n + T2o;
Chris@42 599 T3a = T2n - T2o;
Chris@42 600 }
Chris@42 601 }
/* Input stage 3: offsets 1 and 5, twiddles W[2] .. W[5] and
 * W[18] .. W[21]. */
Chris@42 602 {
Chris@42 603 E T4, T1E, T8, T1C, Td, T1J, Th, T1H;
Chris@42 604 {
Chris@42 605 E T2, T3, T6, T7;
Chris@42 606 T2 = Ip[WS(rs, 1)];
Chris@42 607 T3 = Im[WS(rs, 1)];
Chris@42 608 T4 = T2 - T3;
Chris@42 609 T1E = T2 + T3;
Chris@42 610 T6 = Rp[WS(rs, 1)];
Chris@42 611 T7 = Rm[WS(rs, 1)];
Chris@42 612 T8 = T6 + T7;
Chris@42 613 T1C = T6 - T7;
Chris@42 614 }
Chris@42 615 {
Chris@42 616 E Tb, Tc, Tf, Tg;
Chris@42 617 Tb = Ip[WS(rs, 5)];
Chris@42 618 Tc = Im[WS(rs, 5)];
Chris@42 619 Td = Tb - Tc;
Chris@42 620 T1J = Tb + Tc;
Chris@42 621 Tf = Rp[WS(rs, 5)];
Chris@42 622 Tg = Rm[WS(rs, 5)];
Chris@42 623 Th = Tf + Tg;
Chris@42 624 T1H = Tf - Tg;
Chris@42 625 }
Chris@42 626 {
Chris@42 627 E T9, T26, Ti, T27;
Chris@42 628 {
Chris@42 629 E T1, T5, Ta, Te;
Chris@42 630 T1 = W[2];
Chris@42 631 T5 = W[3];
Chris@42 632 T9 = FNMS(T5, T8, T1 * T4);
Chris@42 633 T26 = FMA(T1, T8, T5 * T4);
Chris@42 634 Ta = W[18];
Chris@42 635 Te = W[19];
Chris@42 636 Ti = FNMS(Te, Th, Ta * Td);
Chris@42 637 T27 = FMA(Ta, Th, Te * Td);
Chris@42 638 }
Chris@42 639 Tj = T9 + Ti;
Chris@42 640 T2S = T26 - T27;
Chris@42 641 T28 = T26 + T27;
Chris@42 642 T2R = T9 - Ti;
Chris@42 643 }
Chris@42 644 {
Chris@42 645 E T1F, T2g, T1K, T2h;
Chris@42 646 {
Chris@42 647 E T1B, T1D, T1G, T1I;
Chris@42 648 T1B = W[4];
Chris@42 649 T1D = W[5];
Chris@42 650 T1F = FMA(T1B, T1C, T1D * T1E);
Chris@42 651 T2g = FNMS(T1D, T1C, T1B * T1E);
Chris@42 652 T1G = W[20];
Chris@42 653 T1I = W[21];
Chris@42 654 T1K = FMA(T1G, T1H, T1I * T1J);
Chris@42 655 T2h = FNMS(T1I, T1H, T1G * T1J);
Chris@42 656 }
Chris@42 657 T1L = T1F + T1K;
Chris@42 658 T36 = T2g - T2h;
Chris@42 659 T2i = T2g + T2h;
Chris@42 660 T32 = T1K - T1F;
Chris@42 661 }
Chris@42 662 }
/* Input stage 4: offsets 7 and 3, twiddles W[26] .. W[29] and
 * W[10] .. W[13]. */
Chris@42 663 {
Chris@42 664 E Tn, T1P, Tr, T1N, Tw, T1U, TA, T1S;
Chris@42 665 {
Chris@42 666 E Tl, Tm, Tp, Tq;
Chris@42 667 Tl = Ip[WS(rs, 7)];
Chris@42 668 Tm = Im[WS(rs, 7)];
Chris@42 669 Tn = Tl - Tm;
Chris@42 670 T1P = Tl + Tm;
Chris@42 671 Tp = Rp[WS(rs, 7)];
Chris@42 672 Tq = Rm[WS(rs, 7)];
Chris@42 673 Tr = Tp + Tq;
Chris@42 674 T1N = Tp - Tq;
Chris@42 675 }
Chris@42 676 {
Chris@42 677 E Tu, Tv, Ty, Tz;
Chris@42 678 Tu = Ip[WS(rs, 3)];
Chris@42 679 Tv = Im[WS(rs, 3)];
Chris@42 680 Tw = Tu - Tv;
Chris@42 681 T1U = Tu + Tv;
Chris@42 682 Ty = Rp[WS(rs, 3)];
Chris@42 683 Tz = Rm[WS(rs, 3)];
Chris@42 684 TA = Ty + Tz;
Chris@42 685 T1S = Ty - Tz;
Chris@42 686 }
Chris@42 687 {
Chris@42 688 E Ts, T29, TB, T2a;
Chris@42 689 {
Chris@42 690 E Tk, To, Tt, Tx;
Chris@42 691 Tk = W[26];
Chris@42 692 To = W[27];
Chris@42 693 Ts = FNMS(To, Tr, Tk * Tn);
Chris@42 694 T29 = FMA(Tk, Tr, To * Tn);
Chris@42 695 Tt = W[10];
Chris@42 696 Tx = W[11];
Chris@42 697 TB = FNMS(Tx, TA, Tt * Tw);
Chris@42 698 T2a = FMA(Tt, TA, Tx * Tw);
Chris@42 699 }
Chris@42 700 TC = Ts + TB;
Chris@42 701 T2V = Ts - TB;
Chris@42 702 T2b = T29 + T2a;
Chris@42 703 T2U = T29 - T2a;
Chris@42 704 }
Chris@42 705 {
Chris@42 706 E T1Q, T2j, T1V, T2k;
Chris@42 707 {
Chris@42 708 E T1M, T1O, T1R, T1T;
Chris@42 709 T1M = W[28];
Chris@42 710 T1O = W[29];
Chris@42 711 T1Q = FMA(T1M, T1N, T1O * T1P);
Chris@42 712 T2j = FNMS(T1O, T1N, T1M * T1P);
Chris@42 713 T1R = W[12];
Chris@42 714 T1T = W[13];
Chris@42 715 T1V = FMA(T1R, T1S, T1T * T1U);
Chris@42 716 T2k = FNMS(T1T, T1S, T1R * T1U);
Chris@42 717 }
Chris@42 718 T1W = T1Q + T1V;
Chris@42 719 T35 = T1V - T1Q;
Chris@42 720 T2l = T2j + T2k;
Chris@42 721 T33 = T2j - T2k;
Chris@42 722 }
Chris@42 723 }
/* Output stage 1: eight stores (Ip[0], Rp[0], and offsets 7, 3, 4), each
 * scaled explicitly by KP500000000. */
Chris@42 724 {
Chris@42 725 E T1b, T2f, T2u, T2w, T1Y, T2e, T2d, T2v;
Chris@42 726 {
Chris@42 727 E TD, T1a, T2m, T2t;
Chris@42 728 TD = Tj + TC;
Chris@42 729 T1a = TW + T19;
Chris@42 730 T1b = TD + T1a;
Chris@42 731 T2f = T1a - TD;
Chris@42 732 T2m = T2i + T2l;
Chris@42 733 T2t = T2p + T2s;
Chris@42 734 T2u = T2m - T2t;
Chris@42 735 T2w = T2m + T2t;
Chris@42 736 }
Chris@42 737 {
Chris@42 738 E T1A, T1X, T25, T2c;
Chris@42 739 T1A = T1o - T1z;
Chris@42 740 T1X = T1L + T1W;
Chris@42 741 T1Y = T1A - T1X;
Chris@42 742 T2e = T1X + T1A;
Chris@42 743 T25 = T21 + T24;
Chris@42 744 T2c = T28 + T2b;
Chris@42 745 T2d = T25 - T2c;
Chris@42 746 T2v = T25 + T2c;
Chris@42 747 }
Chris@42 748 Ip[0] = KP500000000 * (T1b + T1Y);
Chris@42 749 Rp[0] = KP500000000 * (T2v + T2w);
Chris@42 750 Im[WS(rs, 7)] = KP500000000 * (T1Y - T1b);
Chris@42 751 Rm[WS(rs, 7)] = KP500000000 * (T2v - T2w);
Chris@42 752 Rm[WS(rs, 3)] = KP500000000 * (T2d - T2e);
Chris@42 753 Im[WS(rs, 3)] = KP500000000 * (T2u - T2f);
Chris@42 754 Rp[WS(rs, 4)] = KP500000000 * (T2d + T2e);
Chris@42 755 Ip[WS(rs, 4)] = KP500000000 * (T2f + T2u);
Chris@42 756 }
/* Output stage 2: eight stores at offsets 2, 5, 1 and 6, built from terms
 * scaled by KP500000000 and KP353553390. */
Chris@42 757 {
Chris@42 758 E T2z, T2L, T2J, T2P, T2C, T2M, T2F, T2N;
Chris@42 759 {
Chris@42 760 E T2x, T2y, T2H, T2I;
Chris@42 761 T2x = T2b - T28;
Chris@42 762 T2y = T19 - TW;
Chris@42 763 T2z = KP500000000 * (T2x + T2y);
Chris@42 764 T2L = KP500000000 * (T2y - T2x);
Chris@42 765 T2H = T21 - T24;
Chris@42 766 T2I = Tj - TC;
Chris@42 767 T2J = KP500000000 * (T2H - T2I);
Chris@42 768 T2P = KP500000000 * (T2H + T2I);
Chris@42 769 }
Chris@42 770 {
Chris@42 771 E T2A, T2B, T2D, T2E;
Chris@42 772 T2A = T2l - T2i;
Chris@42 773 T2B = T1L - T1W;
Chris@42 774 T2C = T2A + T2B;
Chris@42 775 T2M = T2A - T2B;
Chris@42 776 T2D = T1z + T1o;
Chris@42 777 T2E = T2s - T2p;
Chris@42 778 T2F = T2D - T2E;
Chris@42 779 T2N = T2D + T2E;
Chris@42 780 }
Chris@42 781 {
Chris@42 782 E T2G, T2Q, T2K, T2O;
Chris@42 783 T2G = KP353553390 * (T2C + T2F);
Chris@42 784 Ip[WS(rs, 2)] = T2z + T2G;
Chris@42 785 Im[WS(rs, 5)] = T2G - T2z;
Chris@42 786 T2Q = KP353553390 * (T2M + T2N);
Chris@42 787 Rm[WS(rs, 5)] = T2P - T2Q;
Chris@42 788 Rp[WS(rs, 2)] = T2P + T2Q;
Chris@42 789 T2K = KP353553390 * (T2F - T2C);
Chris@42 790 Rm[WS(rs, 1)] = T2J - T2K;
Chris@42 791 Rp[WS(rs, 6)] = T2J + T2K;
Chris@42 792 T2O = KP353553390 * (T2M - T2N);
Chris@42 793 Ip[WS(rs, 6)] = T2L + T2O;
Chris@42 794 Im[WS(rs, 1)] = T2O - T2L;
Chris@42 795 }
Chris@42 796 }
/* Output stage 3: the remaining sixteen stores, built from terms scaled by
 * KP500000000, KP353553390, KP461939766 and KP191341716. */
Chris@42 797 {
Chris@42 798 E T30, T3w, T3F, T3j, T2X, T3G, T3D, T3L, T3m, T3v, T38, T3q, T3A, T3K, T3f;
Chris@42 799 E T3r;
Chris@42 800 {
Chris@42 801 E T2T, T2W, T34, T37;
Chris@42 802 T30 = KP500000000 * (T2Y - T2Z);
Chris@42 803 T3w = KP500000000 * (T2Z + T2Y);
Chris@42 804 T3F = KP500000000 * (T3h - T3i);
Chris@42 805 T3j = KP500000000 * (T3h + T3i);
Chris@42 806 T2T = T2R - T2S;
Chris@42 807 T2W = T2U + T2V;
Chris@42 808 T2X = KP353553390 * (T2T + T2W);
Chris@42 809 T3G = KP353553390 * (T2T - T2W);
Chris@42 810 {
Chris@42 811 E T3B, T3C, T3k, T3l;
Chris@42 812 T3B = T3a + T39;
Chris@42 813 T3C = T3d - T3c;
Chris@42 814 T3D = FNMS(KP461939766, T3C, KP191341716 * T3B);
Chris@42 815 T3L = FMA(KP461939766, T3B, KP191341716 * T3C);
Chris@42 816 T3k = T2S + T2R;
Chris@42 817 T3l = T2U - T2V;
Chris@42 818 T3m = KP353553390 * (T3k + T3l);
Chris@42 819 T3v = KP353553390 * (T3l - T3k);
Chris@42 820 }
Chris@42 821 T34 = T32 + T33;
Chris@42 822 T37 = T35 - T36;
Chris@42 823 T38 = FMA(KP191341716, T34, KP461939766 * T37);
Chris@42 824 T3q = FNMS(KP191341716, T37, KP461939766 * T34);
Chris@42 825 {
Chris@42 826 E T3y, T3z, T3b, T3e;
Chris@42 827 T3y = T33 - T32;
Chris@42 828 T3z = T36 + T35;
Chris@42 829 T3A = FMA(KP461939766, T3y, KP191341716 * T3z);
Chris@42 830 T3K = FNMS(KP461939766, T3z, KP191341716 * T3y);
Chris@42 831 T3b = T39 - T3a;
Chris@42 832 T3e = T3c + T3d;
Chris@42 833 T3f = FNMS(KP191341716, T3e, KP461939766 * T3b);
Chris@42 834 T3r = FMA(KP191341716, T3b, KP461939766 * T3e);
Chris@42 835 }
Chris@42 836 }
Chris@42 837 {
Chris@42 838 E T31, T3g, T3t, T3u;
Chris@42 839 T31 = T2X + T30;
Chris@42 840 T3g = T38 + T3f;
Chris@42 841 Ip[WS(rs, 1)] = T31 + T3g;
Chris@42 842 Im[WS(rs, 6)] = T3g - T31;
Chris@42 843 T3t = T3j + T3m;
Chris@42 844 T3u = T3q + T3r;
Chris@42 845 Rm[WS(rs, 6)] = T3t - T3u;
Chris@42 846 Rp[WS(rs, 1)] = T3t + T3u;
Chris@42 847 }
Chris@42 848 {
Chris@42 849 E T3n, T3o, T3p, T3s;
Chris@42 850 T3n = T3j - T3m;
Chris@42 851 T3o = T3f - T38;
Chris@42 852 Rm[WS(rs, 2)] = T3n - T3o;
Chris@42 853 Rp[WS(rs, 5)] = T3n + T3o;
Chris@42 854 T3p = T30 - T2X;
Chris@42 855 T3s = T3q - T3r;
Chris@42 856 Ip[WS(rs, 5)] = T3p + T3s;
Chris@42 857 Im[WS(rs, 2)] = T3s - T3p;
Chris@42 858 }
Chris@42 859 {
Chris@42 860 E T3x, T3E, T3N, T3O;
Chris@42 861 T3x = T3v + T3w;
Chris@42 862 T3E = T3A + T3D;
Chris@42 863 Ip[WS(rs, 3)] = T3x + T3E;
Chris@42 864 Im[WS(rs, 4)] = T3E - T3x;
Chris@42 865 T3N = T3F + T3G;
Chris@42 866 T3O = T3K + T3L;
Chris@42 867 Rm[WS(rs, 4)] = T3N - T3O;
Chris@42 868 Rp[WS(rs, 3)] = T3N + T3O;
Chris@42 869 }
Chris@42 870 {
Chris@42 871 E T3H, T3I, T3J, T3M;
Chris@42 872 T3H = T3F - T3G;
Chris@42 873 T3I = T3D - T3A;
Chris@42 874 Rm[0] = T3H - T3I;
Chris@42 875 Rp[WS(rs, 7)] = T3H + T3I;
Chris@42 876 T3J = T3w - T3v;
Chris@42 877 T3M = T3K - T3L;
Chris@42 878 Ip[WS(rs, 7)] = T3J + T3M;
Chris@42 879 Im[0] = T3M - T3J;
Chris@42 880 }
Chris@42 881 }
Chris@42 882 }
Chris@42 883 }
Chris@42 884 }
Chris@42 885
/*
 * Twiddle instruction table referenced by desc (non-FMA branch).
 * NOTE(review): presumably TW_FULL requests the full twiddle set for size 16
 * and TW_NEXT ends the list -- confirm against the tw_instr definition.
 */
Chris@42 886 static const tw_instr twinstr[] = {
Chris@42 887 {TW_FULL, 1, 16},
Chris@42 888 {TW_NEXT, 1, 0}
Chris@42 889 };
Chris@42 890
/*
 * Codelet descriptor for the non-FMA build.  The trailing {168, 62, 38, 0}
 * repeats the addition / multiplication / fused multiply-add counts given in
 * the generated header comment of this branch.
 * NOTE(review): the meaning of the leading 16 and of the fourth counter is
 * defined by hc2c_desc, which is not visible here -- confirm.
 */
Chris@42 891 static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, {168, 62, 38, 0} };
Chris@42 892
/*
 * Registration entry point (non-FMA branch): hands hc2cfdft_16 and its
 * descriptor to X(khc2c_register) for planner p, tagged HC2C_VIA_DFT.
 */
Chris@42 893 void X(codelet_hc2cfdft_16) (planner *p) {
Chris@42 894 X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
Chris@42 895 }
Chris@42 896 #endif /* HAVE_FMA */