annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:49 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@42 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@42 33 * 106 stack variables, 3 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf2_16, FMA variant (this copy is compiled when HAVE_FMA is defined).
 *
 * Machine-generated by genfft's gen_hc2hc with "-n 16 -dit -twiddle-log3"
 * (see the "Generated by" comment above); regenerate rather than hand-edit.
 *
 * Parameters, as used by the code below:
 *   cr, ci - data arrays, updated in place. Each loop iteration loads slots
 *            0..15 of both arrays (index 0 directly, WS(rs, k) for the
 *            others) and stores all 32 slots back; every load is issued
 *            before the first store. cr steps forward by ms per iteration,
 *            ci steps backward by ms.
 *   W      - twiddle table; W[0..7] are read per iteration, starting at
 *            W + (mb - 1) * 8 and advancing by 8 each iteration.
 *   rs     - stride handed to WS() to locate slot k.
 *   mb, me - half-open iteration range [mb, me).
 *   ms     - per-iteration pointer step applied to cr and ci.
 *
 * E, R, INT, stride, DK, WS, FMA/FMS/FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file; presumably they come from codelet-rdft.h / hf.h.
 */
Chris@42 37 static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(1/2). */
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 {
Chris@42 43 INT m;
/* One pass over the 16 cr/ci slots per value of m. */
Chris@42 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
/* T35 and T32 are declared at loop scope because the two stores that use
 * them sit outside the nested blocks, at the very end of the loop body. */
Chris@42 45 E T35, T32;
Chris@42 46 {
Chris@42 47 E T2, Tf, TM, TO, T3, Tg, TN, TS, T4, Tp, T6, T5, Th;
/* Load the eight stored twiddle values W[0..7]. The other multipliers used
 * below (T7, Tb, Ti, Tm, Tz, TC, TF, TI, ...) are built from products of
 * these values instead of being loaded, consistent with the generator's
 * -twiddle-log3 flag. */
Chris@42 48 T2 = W[0];
Chris@42 49 Tf = W[2];
Chris@42 50 TM = W[6];
Chris@42 51 TO = W[7];
Chris@42 52 T3 = W[4];
Chris@42 53 Tg = T2 * Tf;
Chris@42 54 TN = T2 * TM;
Chris@42 55 TS = T2 * TO;
Chris@42 56 T4 = T2 * T3;
Chris@42 57 Tp = Tf * T3;
Chris@42 58 T6 = W[5];
Chris@42 59 T5 = W[1];
Chris@42 60 Th = W[3];
Chris@42 61 {
Chris@42 62 E TZ, Te, T1U, T3A, T3M, T2w, T1G, T2I, T3h, T1R, T2D, T2B, T3i, Tx, T3L;
Chris@42 63 E T1Z, T3w, TL, T21, T26, T38, T1d, T2h, T2s, T3c, T1s, T2t, T2m, T3d, TX;
Chris@42 64 E T10, TV, T2a, TY, T2b;
Chris@42 65 {
Chris@42 66 E TF, TP, TT, Tq, TW, Tz, Tu, TI, TC, T1m, T1f, T1p, T1j, Tr, Ts;
Chris@42 67 E Tv, To, T1W;
Chris@42 68 {
Chris@42 69 E Ti, Tm, T1L, T1O, T1D, T1A, T1x, T2G, T1F, T2F;
Chris@42 70 {
Chris@42 71 E T1, T7, Tb, T3z, T8, T1z, T9, Tc;
Chris@42 72 {
Chris@42 73 E T1i, T1e, T1C, T1y, Tt, Ta, Tl;
Chris@42 74 T1 = cr[0];
Chris@42 75 Tt = Tf * T6;
Chris@42 76 Ta = T2 * T6;
Chris@42 77 T7 = FMA(T5, T6, T4);
Chris@42 78 TF = FNMS(T5, T6, T4);
Chris@42 79 TP = FMA(T5, TO, TN);
Chris@42 80 TT = FNMS(T5, TM, TS);
Chris@42 81 Tq = FNMS(Th, T6, Tp);
Chris@42 82 TW = FMA(Th, T6, Tp);
Chris@42 83 Tz = FMA(T5, Th, Tg);
Chris@42 84 Ti = FNMS(T5, Th, Tg);
Chris@42 85 Tl = T2 * Th;
Chris@42 86 Tu = FMA(Th, T3, Tt);
Chris@42 87 TZ = FNMS(Th, T3, Tt);
Chris@42 88 TI = FMA(T5, T3, Ta);
Chris@42 89 Tb = FNMS(T5, T3, Ta);
Chris@42 90 T1i = Ti * T6;
Chris@42 91 T1e = Ti * T3;
Chris@42 92 T1C = Tz * T6;
Chris@42 93 T1y = Tz * T3;
Chris@42 94 Tm = FMA(T5, Tf, Tl);
Chris@42 95 TC = FNMS(T5, Tf, Tl);
Chris@42 96 T3z = ci[0];
Chris@42 97 T8 = cr[WS(rs, 8)];
Chris@42 98 T1m = FNMS(Tm, T6, T1e);
Chris@42 99 T1f = FMA(Tm, T6, T1e);
Chris@42 100 T1p = FMA(Tm, T3, T1i);
Chris@42 101 T1j = FNMS(Tm, T3, T1i);
Chris@42 102 T1L = FNMS(TC, T6, T1y);
Chris@42 103 T1z = FMA(TC, T6, T1y);
Chris@42 104 T1O = FMA(TC, T3, T1C);
Chris@42 105 T1D = FNMS(TC, T3, T1C);
Chris@42 106 T9 = T7 * T8;
Chris@42 107 Tc = ci[WS(rs, 8)];
Chris@42 108 }
Chris@42 109 {
Chris@42 110 E T1u, T1w, T1v, T2E, T3y, T1B, T1E, Td, T3x;
Chris@42 111 T1u = cr[WS(rs, 15)];
Chris@42 112 T1w = ci[WS(rs, 15)];
Chris@42 113 T1A = cr[WS(rs, 7)];
Chris@42 114 Td = FMA(Tb, Tc, T9);
Chris@42 115 T3x = T7 * Tc;
Chris@42 116 T1v = TM * T1u;
Chris@42 117 T2E = TM * T1w;
Chris@42 118 Te = T1 + Td;
Chris@42 119 T1U = T1 - Td;
Chris@42 120 T3y = FNMS(Tb, T8, T3x);
Chris@42 121 T1B = T1z * T1A;
Chris@42 122 T1E = ci[WS(rs, 7)];
Chris@42 123 T1x = FMA(TO, T1w, T1v);
Chris@42 124 T3A = T3y + T3z;
Chris@42 125 T3M = T3z - T3y;
Chris@42 126 T2G = T1z * T1E;
Chris@42 127 T1F = FMA(T1D, T1E, T1B);
Chris@42 128 T2F = FNMS(TO, T1u, T2E);
Chris@42 129 }
Chris@42 130 }
Chris@42 131 {
Chris@42 132 E T1H, T1I, T1J, T1M, T1P, T2H;
Chris@42 133 T1H = cr[WS(rs, 3)];
Chris@42 134 T2H = FNMS(T1D, T1A, T2G);
Chris@42 135 T2w = T1x - T1F;
Chris@42 136 T1G = T1x + T1F;
Chris@42 137 T1I = Tf * T1H;
Chris@42 138 T2I = T2F - T2H;
Chris@42 139 T3h = T2F + T2H;
Chris@42 140 T1J = ci[WS(rs, 3)];
Chris@42 141 T1M = cr[WS(rs, 11)];
Chris@42 142 T1P = ci[WS(rs, 11)];
Chris@42 143 {
Chris@42 144 E Tj, Tk, Tn, T1V;
Chris@42 145 {
Chris@42 146 E T1K, T2y, T1Q, T2A, T2x, T1N, T2z;
Chris@42 147 Tj = cr[WS(rs, 4)];
Chris@42 148 T1K = FMA(Th, T1J, T1I);
Chris@42 149 T2x = Tf * T1J;
Chris@42 150 T1N = T1L * T1M;
Chris@42 151 T2z = T1L * T1P;
Chris@42 152 Tk = Ti * Tj;
Chris@42 153 T2y = FNMS(Th, T1H, T2x);
Chris@42 154 T1Q = FMA(T1O, T1P, T1N);
Chris@42 155 T2A = FNMS(T1O, T1M, T2z);
Chris@42 156 Tn = ci[WS(rs, 4)];
Chris@42 157 Tr = cr[WS(rs, 12)];
Chris@42 158 T1R = T1K + T1Q;
Chris@42 159 T2D = T1Q - T1K;
Chris@42 160 T2B = T2y - T2A;
Chris@42 161 T3i = T2y + T2A;
Chris@42 162 T1V = Ti * Tn;
Chris@42 163 Ts = Tq * Tr;
Chris@42 164 Tv = ci[WS(rs, 12)];
Chris@42 165 }
Chris@42 166 To = FMA(Tm, Tn, Tk);
Chris@42 167 T1W = FNMS(Tm, Tj, T1V);
Chris@42 168 }
Chris@42 169 }
Chris@42 170 }
Chris@42 171 {
Chris@42 172 E T19, T1b, T18, T2p, T1a, T2q;
Chris@42 173 {
Chris@42 174 E TE, T23, TK, T25;
Chris@42 175 {
Chris@42 176 E TA, TD, TB, T22, TG, TJ, TH, T24, T1Y, Tw, T1X;
Chris@42 177 TA = cr[WS(rs, 2)];
Chris@42 178 Tw = FMA(Tu, Tv, Ts);
Chris@42 179 T1X = Tq * Tv;
Chris@42 180 TD = ci[WS(rs, 2)];
Chris@42 181 TB = Tz * TA;
Chris@42 182 Tx = To + Tw;
Chris@42 183 T3L = To - Tw;
Chris@42 184 T1Y = FNMS(Tu, Tr, T1X);
Chris@42 185 T22 = Tz * TD;
Chris@42 186 TG = cr[WS(rs, 10)];
Chris@42 187 TJ = ci[WS(rs, 10)];
Chris@42 188 T1Z = T1W - T1Y;
Chris@42 189 T3w = T1W + T1Y;
Chris@42 190 TH = TF * TG;
Chris@42 191 T24 = TF * TJ;
Chris@42 192 TE = FMA(TC, TD, TB);
Chris@42 193 T23 = FNMS(TC, TA, T22);
Chris@42 194 TK = FMA(TI, TJ, TH);
Chris@42 195 T25 = FNMS(TI, TG, T24);
Chris@42 196 }
Chris@42 197 {
Chris@42 198 E T15, T17, T16, T2o;
Chris@42 199 T15 = cr[WS(rs, 1)];
Chris@42 200 T17 = ci[WS(rs, 1)];
Chris@42 201 TL = TE + TK;
Chris@42 202 T21 = TE - TK;
Chris@42 203 T26 = T23 - T25;
Chris@42 204 T38 = T23 + T25;
Chris@42 205 T16 = T2 * T15;
Chris@42 206 T2o = T2 * T17;
Chris@42 207 T19 = cr[WS(rs, 9)];
Chris@42 208 T1b = ci[WS(rs, 9)];
Chris@42 209 T18 = FMA(T5, T17, T16);
Chris@42 210 T2p = FNMS(T5, T15, T2o);
Chris@42 211 T1a = T3 * T19;
Chris@42 212 T2q = T3 * T1b;
Chris@42 213 }
Chris@42 214 }
Chris@42 215 {
Chris@42 216 E T1n, T1q, T1l, T2j, T1o, T2k;
Chris@42 217 {
Chris@42 218 E T1g, T1k, T1h, T2i, T1c, T2r;
Chris@42 219 T1g = cr[WS(rs, 5)];
Chris@42 220 T1k = ci[WS(rs, 5)];
Chris@42 221 T1c = FMA(T6, T1b, T1a);
Chris@42 222 T2r = FNMS(T6, T19, T2q);
Chris@42 223 T1h = T1f * T1g;
Chris@42 224 T2i = T1f * T1k;
Chris@42 225 T1d = T18 + T1c;
Chris@42 226 T2h = T18 - T1c;
Chris@42 227 T2s = T2p - T2r;
Chris@42 228 T3c = T2p + T2r;
Chris@42 229 T1n = cr[WS(rs, 13)];
Chris@42 230 T1q = ci[WS(rs, 13)];
Chris@42 231 T1l = FMA(T1j, T1k, T1h);
Chris@42 232 T2j = FNMS(T1j, T1g, T2i);
Chris@42 233 T1o = T1m * T1n;
Chris@42 234 T2k = T1m * T1q;
Chris@42 235 }
Chris@42 236 {
Chris@42 237 E TQ, TU, TR, T29, T1r, T2l;
Chris@42 238 TQ = cr[WS(rs, 14)];
Chris@42 239 TU = ci[WS(rs, 14)];
Chris@42 240 T1r = FMA(T1p, T1q, T1o);
Chris@42 241 T2l = FNMS(T1p, T1n, T2k);
Chris@42 242 TR = TP * TQ;
Chris@42 243 T29 = TP * TU;
Chris@42 244 T1s = T1l + T1r;
Chris@42 245 T2t = T1l - T1r;
Chris@42 246 T2m = T2j - T2l;
Chris@42 247 T3d = T2j + T2l;
Chris@42 248 TX = cr[WS(rs, 6)];
Chris@42 249 T10 = ci[WS(rs, 6)];
Chris@42 250 TV = FMA(TT, TU, TR);
Chris@42 251 T2a = FNMS(TT, TQ, T29);
Chris@42 252 TY = TW * TX;
Chris@42 253 T2b = TW * T10;
Chris@42 254 }
Chris@42 255 }
Chris@42 256 }
Chris@42 257 }
/* All 32 inputs have been loaded by this point; the stores below write the
 * results back into the same cr/ci slots. */
Chris@42 258 {
Chris@42 259 E T36, T3G, T3b, T3g, T28, T2d, T3F, T39, T3j, T3q, T3C, T3e, T3u, T3t;
Chris@42 260 {
Chris@42 261 E T3D, T1T, T3r, T14, T3E, T3s;
Chris@42 262 {
Chris@42 263 E Ty, T3B, T11, T2c, T13, T3v;
Chris@42 264 T36 = Te - Tx;
Chris@42 265 Ty = Te + Tx;
Chris@42 266 T3B = T3w + T3A;
Chris@42 267 T3G = T3A - T3w;
Chris@42 268 T11 = FMA(TZ, T10, TY);
Chris@42 269 T2c = FNMS(TZ, TX, T2b);
Chris@42 270 {
Chris@42 271 E T1t, T1S, T12, T37;
Chris@42 272 T3b = T1d - T1s;
Chris@42 273 T1t = T1d + T1s;
Chris@42 274 T1S = T1G + T1R;
Chris@42 275 T3g = T1G - T1R;
Chris@42 276 T12 = TV + T11;
Chris@42 277 T28 = TV - T11;
Chris@42 278 T2d = T2a - T2c;
Chris@42 279 T37 = T2a + T2c;
Chris@42 280 T3D = T1S - T1t;
Chris@42 281 T1T = T1t + T1S;
Chris@42 282 T13 = TL + T12;
Chris@42 283 T3F = TL - T12;
Chris@42 284 T39 = T37 - T38;
Chris@42 285 T3v = T38 + T37;
Chris@42 286 }
Chris@42 287 T3j = T3h - T3i;
Chris@42 288 T3r = T3h + T3i;
Chris@42 289 T3q = Ty - T13;
Chris@42 290 T14 = Ty + T13;
Chris@42 291 T3E = T3B - T3v;
Chris@42 292 T3C = T3v + T3B;
Chris@42 293 T3s = T3c + T3d;
Chris@42 294 T3e = T3c - T3d;
Chris@42 295 }
Chris@42 296 ci[WS(rs, 7)] = T14 - T1T;
Chris@42 297 cr[WS(rs, 12)] = T3D - T3E;
Chris@42 298 ci[WS(rs, 11)] = T3D + T3E;
Chris@42 299 T3u = T3s + T3r;
Chris@42 300 T3t = T3r - T3s;
Chris@42 301 cr[0] = T14 + T1T;
Chris@42 302 }
Chris@42 303 {
Chris@42 304 E T3m, T3a, T3J, T3H;
Chris@42 305 ci[WS(rs, 15)] = T3u + T3C;
Chris@42 306 cr[WS(rs, 8)] = T3u - T3C;
Chris@42 307 ci[WS(rs, 3)] = T3q + T3t;
Chris@42 308 cr[WS(rs, 4)] = T3q - T3t;
Chris@42 309 T3m = T36 + T39;
Chris@42 310 T3a = T36 - T39;
Chris@42 311 T3J = T3G - T3F;
Chris@42 312 T3H = T3F + T3G;
Chris@42 313 {
Chris@42 314 E T2Q, T20, T3N, T3T, T2C, T2J, T3U, T2f, T33, T30, T2V, T2W, T3O, T2T, T2N;
Chris@42 315 E T2v;
Chris@42 316 {
Chris@42 317 E T2R, T27, T2e, T2S;
Chris@42 318 {
Chris@42 319 E T3n, T3f, T3o, T3k;
Chris@42 320 T2Q = T1U + T1Z;
Chris@42 321 T20 = T1U - T1Z;
Chris@42 322 T3n = T3b - T3e;
Chris@42 323 T3f = T3b + T3e;
Chris@42 324 T3o = T3g + T3j;
Chris@42 325 T3k = T3g - T3j;
Chris@42 326 T3N = T3L + T3M;
Chris@42 327 T3T = T3M - T3L;
Chris@42 328 {
Chris@42 329 E T3p, T3K, T3I, T3l;
Chris@42 330 T3p = T3n + T3o;
Chris@42 331 T3K = T3o - T3n;
Chris@42 332 T3I = T3k - T3f;
Chris@42 333 T3l = T3f + T3k;
Chris@42 334 ci[WS(rs, 1)] = FMA(KP707106781, T3p, T3m);
Chris@42 335 cr[WS(rs, 6)] = FNMS(KP707106781, T3p, T3m);
Chris@42 336 ci[WS(rs, 13)] = FMA(KP707106781, T3K, T3J);
Chris@42 337 cr[WS(rs, 10)] = FMS(KP707106781, T3K, T3J);
Chris@42 338 ci[WS(rs, 9)] = FMA(KP707106781, T3I, T3H);
Chris@42 339 cr[WS(rs, 14)] = FMS(KP707106781, T3I, T3H);
Chris@42 340 cr[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
Chris@42 341 ci[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
Chris@42 342 T2R = T21 + T26;
Chris@42 343 T27 = T21 - T26;
Chris@42 344 T2e = T28 + T2d;
Chris@42 345 T2S = T28 - T2d;
Chris@42 346 }
Chris@42 347 }
Chris@42 348 {
Chris@42 349 E T2Y, T2Z, T2n, T2u;
Chris@42 350 T2C = T2w - T2B;
Chris@42 351 T2Y = T2w + T2B;
Chris@42 352 T2Z = T2I + T2D;
Chris@42 353 T2J = T2D - T2I;
Chris@42 354 T3U = T2e - T27;
Chris@42 355 T2f = T27 + T2e;
Chris@42 356 T33 = FMA(KP414213562, T2Y, T2Z);
Chris@42 357 T30 = FNMS(KP414213562, T2Z, T2Y);
Chris@42 358 T2V = T2h + T2m;
Chris@42 359 T2n = T2h - T2m;
Chris@42 360 T2u = T2s + T2t;
Chris@42 361 T2W = T2s - T2t;
Chris@42 362 T3O = T2R - T2S;
Chris@42 363 T2T = T2R + T2S;
Chris@42 364 T2N = FMA(KP414213562, T2n, T2u);
Chris@42 365 T2v = FNMS(KP414213562, T2u, T2n);
Chris@42 366 }
Chris@42 367 }
Chris@42 368 {
Chris@42 369 E T2M, T3S, T31, T2P, T3Q, T3R, T3P, T2U;
Chris@42 370 {
Chris@42 371 E T2g, T2X, T2O, T2K, T3V, T3X, T3W, T34, T2L, T3Y;
Chris@42 372 T2M = FNMS(KP707106781, T2f, T20);
Chris@42 373 T2g = FMA(KP707106781, T2f, T20);
Chris@42 374 T34 = FNMS(KP414213562, T2V, T2W);
Chris@42 375 T2X = FMA(KP414213562, T2W, T2V);
Chris@42 376 T2O = FMA(KP414213562, T2C, T2J);
Chris@42 377 T2K = FNMS(KP414213562, T2J, T2C);
Chris@42 378 T3V = FMA(KP707106781, T3U, T3T);
Chris@42 379 T3X = FNMS(KP707106781, T3U, T3T);
Chris@42 380 T35 = T33 - T34;
Chris@42 381 T3W = T34 + T33;
Chris@42 382 T3S = T2K - T2v;
Chris@42 383 T2L = T2v + T2K;
Chris@42 384 T3Y = T30 - T2X;
Chris@42 385 T31 = T2X + T30;
Chris@42 386 ci[WS(rs, 14)] = FMA(KP923879532, T3W, T3V);
Chris@42 387 cr[WS(rs, 9)] = FMS(KP923879532, T3W, T3V);
Chris@42 388 ci[0] = FMA(KP923879532, T2L, T2g);
Chris@42 389 cr[WS(rs, 7)] = FNMS(KP923879532, T2L, T2g);
Chris@42 390 cr[WS(rs, 13)] = FMS(KP923879532, T3Y, T3X);
Chris@42 391 ci[WS(rs, 10)] = FMA(KP923879532, T3Y, T3X);
Chris@42 392 T2P = T2N + T2O;
Chris@42 393 T3Q = T2O - T2N;
Chris@42 394 }
Chris@42 395 T32 = FNMS(KP707106781, T2T, T2Q);
Chris@42 396 T2U = FMA(KP707106781, T2T, T2Q);
Chris@42 397 T3R = FNMS(KP707106781, T3O, T3N);
Chris@42 398 T3P = FMA(KP707106781, T3O, T3N);
Chris@42 399 cr[WS(rs, 3)] = FMA(KP923879532, T2P, T2M);
Chris@42 400 ci[WS(rs, 4)] = FNMS(KP923879532, T2P, T2M);
Chris@42 401 cr[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
Chris@42 402 ci[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
Chris@42 403 ci[WS(rs, 8)] = FMA(KP923879532, T3Q, T3P);
Chris@42 404 cr[WS(rs, 15)] = FMS(KP923879532, T3Q, T3P);
Chris@42 405 ci[WS(rs, 12)] = FMA(KP923879532, T3S, T3R);
Chris@42 406 cr[WS(rs, 11)] = FMS(KP923879532, T3S, T3R);
Chris@42 407 }
Chris@42 408 }
Chris@42 409 }
Chris@42 410 }
Chris@42 411 }
Chris@42 412 }
/* Last two stores of the iteration, using the loop-scope T35 / T32. */
Chris@42 413 ci[WS(rs, 2)] = FMA(KP923879532, T35, T32);
Chris@42 414 cr[WS(rs, 5)] = FNMS(KP923879532, T35, T32);
Chris@42 415 }
Chris@42 416 }
Chris@42 417 }
Chris@42 418
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries (last
 * fields 1, 3, 9, 15) followed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP yields one (cos, sin) pair, which
 * would account for the eight W[] values hf2_16 reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@42 419 static const tw_instr twinstr[] = {
Chris@42 420 {TW_CEXP, 1, 1},
Chris@42 421 {TW_CEXP, 1, 3},
Chris@42 422 {TW_CEXP, 1, 9},
Chris@42 423 {TW_CEXP, 1, 15},
Chris@42 424 {TW_NEXT, 1, 0}
Chris@42 425 };
Chris@42 426
/*
 * Codelet descriptor passed to the planner at registration. 16 and "hf2_16"
 * mirror the generator's -n / -name flags, and {104, 42, 92, 0} appears to
 * repeat the addition / multiplication / fused multiply-add counts quoted in
 * the header comment of this HAVE_FMA branch. Field meanings are set by
 * hc2hc_desc, which is declared elsewhere.
 */
Chris@42 427 static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@42 428
/*
 * Registration entry point: hands the hf2_16 kernel and its descriptor to
 * planner p through X(khc2hc_register). X() is a macro defined elsewhere
 * (presumably a symbol-name wrapper -- confirm).
 */
Chris@42 429 void X(codelet_hf2_16) (planner *p) {
Chris@42 430 X(khc2hc_register) (p, hf2_16, &desc);
Chris@42 431 }
Chris@42 432 #else /* HAVE_FMA */
Chris@42 433
Chris@42 434 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include hf.h */
Chris@42 435
Chris@42 436 /*
Chris@42 437 * This function contains 196 FP additions, 108 FP multiplications,
Chris@42 438 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@42 439 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@42 440 */
Chris@42 441 #include "hf.h"
Chris@42 442
/*
 * hf2_16, non-FMA variant (this copy is compiled when HAVE_FMA is not
 * defined).
 *
 * Machine-generated by genfft's gen_hc2hc with "-n 16 -dit -twiddle-log3"
 * (see the "Generated by" comment above); regenerate rather than hand-edit.
 *
 * Parameters, as used by the code below:
 *   cr, ci - data arrays, updated in place. Each loop iteration loads slots
 *            0..15 of both arrays (index 0 directly, WS(rs, k) for the
 *            others) and stores all 32 slots back; every load is issued
 *            before the first store. cr steps forward by ms per iteration,
 *            ci steps backward by ms.
 *   W      - twiddle table; W[0..7] are read per iteration, starting at
 *            W + (mb - 1) * 8 and advancing by 8 each iteration.
 *   rs     - stride handed to WS() to locate slot k.
 *   mb, me - half-open iteration range [mb, me).
 *   ms     - per-iteration pointer step applied to cr and ci.
 *
 * E, R, INT, stride, DK, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file; presumably they come from codelet-rdft.h / hf.h.
 */
Chris@42 443 static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 444 {
/* Numeric constants: sin(pi/8), cos(pi/8), and sqrt(1/2). */
Chris@42 445 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 446 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 447 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 448 {
Chris@42 449 INT m;
/* One pass over the 16 cr/ci slots per value of m. */
Chris@42 450 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 451 E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@42 452 E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/* Twiddle setup: load W[0..7], then form the remaining multipliers (Tk, To,
 * TE, TC, T8, Tc, TP, TR, T1b, ...) as sums and differences of products of
 * the loaded values, consistent with the generator's -twiddle-log3 flag. */
Chris@42 453 {
Chris@42 454 E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@42 455 {
Chris@42 456 E Th, Tn, Tj, Tm;
Chris@42 457 T2 = W[0];
Chris@42 458 T5 = W[1];
Chris@42 459 Tg = W[2];
Chris@42 460 Ti = W[3];
Chris@42 461 Th = T2 * Tg;
Chris@42 462 Tn = T5 * Tg;
Chris@42 463 Tj = T5 * Ti;
Chris@42 464 Tm = T2 * Ti;
Chris@42 465 Tk = Th - Tj;
Chris@42 466 To = Tm + Tn;
Chris@42 467 TE = Tm - Tn;
Chris@42 468 TC = Th + Tj;
Chris@42 469 T6 = W[5];
Chris@42 470 T7 = T5 * T6;
Chris@42 471 Tv = Tg * T6;
Chris@42 472 Ta = T2 * T6;
Chris@42 473 Ts = Ti * T6;
Chris@42 474 T3 = W[4];
Chris@42 475 T4 = T2 * T3;
Chris@42 476 Tw = Ti * T3;
Chris@42 477 Tb = T5 * T3;
Chris@42 478 Tr = Tg * T3;
Chris@42 479 }
Chris@42 480 T8 = T4 + T7;
Chris@42 481 TW = Tv - Tw;
Chris@42 482 TJ = Ta + Tb;
Chris@42 483 Tt = Tr - Ts;
Chris@42 484 TU = Tr + Ts;
Chris@42 485 Tc = Ta - Tb;
Chris@42 486 Tx = Tv + Tw;
Chris@42 487 TH = T4 - T7;
Chris@42 488 TN = W[6];
Chris@42 489 TO = W[7];
Chris@42 490 TP = FMA(T2, TN, T5 * TO);
Chris@42 491 TR = FNMS(T5, TN, T2 * TO);
Chris@42 492 {
Chris@42 493 E T1d, T1e, T19, T1a;
Chris@42 494 T1d = Tk * T6;
Chris@42 495 T1e = To * T3;
Chris@42 496 T1f = T1d - T1e;
Chris@42 497 T1k = T1d + T1e;
Chris@42 498 T19 = Tk * T3;
Chris@42 499 T1a = To * T6;
Chris@42 500 T1b = T19 + T1a;
Chris@42 501 T1i = T19 - T1a;
Chris@42 502 }
Chris@42 503 {
Chris@42 504 E T1w, T1x, T1s, T1t;
Chris@42 505 T1w = TC * T6;
Chris@42 506 T1x = TE * T3;
Chris@42 507 T1y = T1w - T1x;
Chris@42 508 T1H = T1w + T1x;
Chris@42 509 T1s = TC * T3;
Chris@42 510 T1t = TE * T6;
Chris@42 511 T1u = T1s + T1t;
Chris@42 512 T1F = T1s - T1t;
Chris@42 513 }
Chris@42 514 }
/* Load phase: each nested block reads a group of cr/ci slots and applies the
 * matching multiplier pair before combining the results. */
Chris@42 515 {
Chris@42 516 E Tf, T3s, T1N, T3e, TA, T3r, T1Q, T3b, TM, T2N, T1W, T2w, TZ, T2M, T21;
Chris@42 517 E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2E, T2o, T2D, T18, T1n, T2Q, T2R;
Chris@42 518 E T2S, T2T, T28, T2B, T2d, T2A;
Chris@42 519 {
Chris@42 520 E T1, T3d, Te, T3c, T9, Td;
Chris@42 521 T1 = cr[0];
Chris@42 522 T3d = ci[0];
Chris@42 523 T9 = cr[WS(rs, 8)];
Chris@42 524 Td = ci[WS(rs, 8)];
Chris@42 525 Te = FMA(T8, T9, Tc * Td);
Chris@42 526 T3c = FNMS(Tc, T9, T8 * Td);
Chris@42 527 Tf = T1 + Te;
Chris@42 528 T3s = T3d - T3c;
Chris@42 529 T1N = T1 - Te;
Chris@42 530 T3e = T3c + T3d;
Chris@42 531 }
Chris@42 532 {
Chris@42 533 E Tq, T1O, Tz, T1P;
Chris@42 534 {
Chris@42 535 E Tl, Tp, Tu, Ty;
Chris@42 536 Tl = cr[WS(rs, 4)];
Chris@42 537 Tp = ci[WS(rs, 4)];
Chris@42 538 Tq = FMA(Tk, Tl, To * Tp);
Chris@42 539 T1O = FNMS(To, Tl, Tk * Tp);
Chris@42 540 Tu = cr[WS(rs, 12)];
Chris@42 541 Ty = ci[WS(rs, 12)];
Chris@42 542 Tz = FMA(Tt, Tu, Tx * Ty);
Chris@42 543 T1P = FNMS(Tx, Tu, Tt * Ty);
Chris@42 544 }
Chris@42 545 TA = Tq + Tz;
Chris@42 546 T3r = Tq - Tz;
Chris@42 547 T1Q = T1O - T1P;
Chris@42 548 T3b = T1O + T1P;
Chris@42 549 }
Chris@42 550 {
Chris@42 551 E TG, T1T, TL, T1U, T1S, T1V;
Chris@42 552 {
Chris@42 553 E TD, TF, TI, TK;
Chris@42 554 TD = cr[WS(rs, 2)];
Chris@42 555 TF = ci[WS(rs, 2)];
Chris@42 556 TG = FMA(TC, TD, TE * TF);
Chris@42 557 T1T = FNMS(TE, TD, TC * TF);
Chris@42 558 TI = cr[WS(rs, 10)];
Chris@42 559 TK = ci[WS(rs, 10)];
Chris@42 560 TL = FMA(TH, TI, TJ * TK);
Chris@42 561 T1U = FNMS(TJ, TI, TH * TK);
Chris@42 562 }
Chris@42 563 TM = TG + TL;
Chris@42 564 T2N = T1T + T1U;
Chris@42 565 T1S = TG - TL;
Chris@42 566 T1V = T1T - T1U;
Chris@42 567 T1W = T1S - T1V;
Chris@42 568 T2w = T1S + T1V;
Chris@42 569 }
Chris@42 570 {
Chris@42 571 E TT, T1Y, TY, T1Z, T1X, T20;
Chris@42 572 {
Chris@42 573 E TQ, TS, TV, TX;
Chris@42 574 TQ = cr[WS(rs, 14)];
Chris@42 575 TS = ci[WS(rs, 14)];
Chris@42 576 TT = FMA(TP, TQ, TR * TS);
Chris@42 577 T1Y = FNMS(TR, TQ, TP * TS);
Chris@42 578 TV = cr[WS(rs, 6)];
Chris@42 579 TX = ci[WS(rs, 6)];
Chris@42 580 TY = FMA(TU, TV, TW * TX);
Chris@42 581 T1Z = FNMS(TW, TV, TU * TX);
Chris@42 582 }
Chris@42 583 TZ = TT + TY;
Chris@42 584 T2M = T1Y + T1Z;
Chris@42 585 T1X = TT - TY;
Chris@42 586 T20 = T1Y - T1Z;
Chris@42 587 T21 = T1X + T20;
Chris@42 588 T2x = T1X - T20;
Chris@42 589 }
Chris@42 590 {
Chris@42 591 E T1r, T2f, T1J, T2m, T1A, T2g, T1E, T2l;
Chris@42 592 {
Chris@42 593 E T1p, T1q, T1G, T1I;
Chris@42 594 T1p = cr[WS(rs, 15)];
Chris@42 595 T1q = ci[WS(rs, 15)];
Chris@42 596 T1r = FMA(TN, T1p, TO * T1q);
Chris@42 597 T2f = FNMS(TO, T1p, TN * T1q);
Chris@42 598 T1G = cr[WS(rs, 11)];
Chris@42 599 T1I = ci[WS(rs, 11)];
Chris@42 600 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@42 601 T2m = FNMS(T1H, T1G, T1F * T1I);
Chris@42 602 }
Chris@42 603 {
Chris@42 604 E T1v, T1z, T1C, T1D;
Chris@42 605 T1v = cr[WS(rs, 7)];
Chris@42 606 T1z = ci[WS(rs, 7)];
Chris@42 607 T1A = FMA(T1u, T1v, T1y * T1z);
Chris@42 608 T2g = FNMS(T1y, T1v, T1u * T1z);
Chris@42 609 T1C = cr[WS(rs, 3)];
Chris@42 610 T1D = ci[WS(rs, 3)];
Chris@42 611 T1E = FMA(Tg, T1C, Ti * T1D);
Chris@42 612 T2l = FNMS(Ti, T1C, Tg * T1D);
Chris@42 613 }
Chris@42 614 T1B = T1r + T1A;
Chris@42 615 T1K = T1E + T1J;
Chris@42 616 T2V = T1B - T1K;
Chris@42 617 T2W = T2f + T2g;
Chris@42 618 T2X = T2l + T2m;
Chris@42 619 T2Y = T2W - T2X;
Chris@42 620 {
Chris@42 621 E T2h, T2i, T2k, T2n;
Chris@42 622 T2h = T2f - T2g;
Chris@42 623 T2i = T1E - T1J;
Chris@42 624 T2j = T2h + T2i;
Chris@42 625 T2E = T2h - T2i;
Chris@42 626 T2k = T1r - T1A;
Chris@42 627 T2n = T2l - T2m;
Chris@42 628 T2o = T2k - T2n;
Chris@42 629 T2D = T2k + T2n;
Chris@42 630 }
Chris@42 631 }
Chris@42 632 {
Chris@42 633 E T14, T29, T1m, T26, T17, T2a, T1h, T25;
Chris@42 634 {
Chris@42 635 E T12, T13, T1j, T1l;
Chris@42 636 T12 = cr[WS(rs, 1)];
Chris@42 637 T13 = ci[WS(rs, 1)];
Chris@42 638 T14 = FMA(T2, T12, T5 * T13);
Chris@42 639 T29 = FNMS(T5, T12, T2 * T13);
Chris@42 640 T1j = cr[WS(rs, 13)];
Chris@42 641 T1l = ci[WS(rs, 13)];
Chris@42 642 T1m = FMA(T1i, T1j, T1k * T1l);
Chris@42 643 T26 = FNMS(T1k, T1j, T1i * T1l);
Chris@42 644 }
Chris@42 645 {
Chris@42 646 E T15, T16, T1c, T1g;
Chris@42 647 T15 = cr[WS(rs, 9)];
Chris@42 648 T16 = ci[WS(rs, 9)];
Chris@42 649 T17 = FMA(T3, T15, T6 * T16);
Chris@42 650 T2a = FNMS(T6, T15, T3 * T16);
Chris@42 651 T1c = cr[WS(rs, 5)];
Chris@42 652 T1g = ci[WS(rs, 5)];
Chris@42 653 T1h = FMA(T1b, T1c, T1f * T1g);
Chris@42 654 T25 = FNMS(T1f, T1c, T1b * T1g);
Chris@42 655 }
Chris@42 656 T18 = T14 + T17;
Chris@42 657 T1n = T1h + T1m;
Chris@42 658 T2Q = T18 - T1n;
Chris@42 659 T2R = T29 + T2a;
Chris@42 660 T2S = T25 + T26;
Chris@42 661 T2T = T2R - T2S;
Chris@42 662 {
Chris@42 663 E T24, T27, T2b, T2c;
Chris@42 664 T24 = T14 - T17;
Chris@42 665 T27 = T25 - T26;
Chris@42 666 T28 = T24 - T27;
Chris@42 667 T2B = T24 + T27;
Chris@42 668 T2b = T29 - T2a;
Chris@42 669 T2c = T1h - T1m;
Chris@42 670 T2d = T2b + T2c;
Chris@42 671 T2A = T2b - T2c;
Chris@42 672 }
Chris@42 673 }
/* Store phase: all 32 inputs have been loaded; each of the four blocks below
 * writes eight results back into the same cr/ci slots. */
Chris@42 674 {
Chris@42 675 E T23, T2r, T3u, T3w, T2q, T3v, T2u, T3p;
Chris@42 676 {
Chris@42 677 E T1R, T22, T3q, T3t;
Chris@42 678 T1R = T1N - T1Q;
Chris@42 679 T22 = KP707106781 * (T1W + T21);
Chris@42 680 T23 = T1R + T22;
Chris@42 681 T2r = T1R - T22;
Chris@42 682 T3q = KP707106781 * (T2w - T2x);
Chris@42 683 T3t = T3r + T3s;
Chris@42 684 T3u = T3q + T3t;
Chris@42 685 T3w = T3t - T3q;
Chris@42 686 }
Chris@42 687 {
Chris@42 688 E T2e, T2p, T2s, T2t;
Chris@42 689 T2e = FNMS(KP382683432, T2d, KP923879532 * T28);
Chris@42 690 T2p = FMA(KP382683432, T2j, KP923879532 * T2o);
Chris@42 691 T2q = T2e + T2p;
Chris@42 692 T3v = T2p - T2e;
Chris@42 693 T2s = FMA(KP923879532, T2d, KP382683432 * T28);
Chris@42 694 T2t = FNMS(KP923879532, T2j, KP382683432 * T2o);
Chris@42 695 T2u = T2s + T2t;
Chris@42 696 T3p = T2t - T2s;
Chris@42 697 }
Chris@42 698 cr[WS(rs, 7)] = T23 - T2q;
Chris@42 699 cr[WS(rs, 11)] = T3v - T3w;
Chris@42 700 ci[WS(rs, 12)] = T3v + T3w;
Chris@42 701 ci[0] = T23 + T2q;
Chris@42 702 ci[WS(rs, 4)] = T2r - T2u;
Chris@42 703 cr[WS(rs, 15)] = T3p - T3u;
Chris@42 704 ci[WS(rs, 8)] = T3p + T3u;
Chris@42 705 cr[WS(rs, 3)] = T2r + T2u;
Chris@42 706 }
Chris@42 707 {
Chris@42 708 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@42 709 {
Chris@42 710 E TB, T10, T3a, T3f;
Chris@42 711 TB = Tf + TA;
Chris@42 712 T10 = TM + TZ;
Chris@42 713 T11 = TB + T10;
Chris@42 714 T35 = TB - T10;
Chris@42 715 T3a = T2N + T2M;
Chris@42 716 T3f = T3b + T3e;
Chris@42 717 T3g = T3a + T3f;
Chris@42 718 T3i = T3f - T3a;
Chris@42 719 }
Chris@42 720 {
Chris@42 721 E T1o, T1L, T36, T37;
Chris@42 722 T1o = T18 + T1n;
Chris@42 723 T1L = T1B + T1K;
Chris@42 724 T1M = T1o + T1L;
Chris@42 725 T3h = T1L - T1o;
Chris@42 726 T36 = T2W + T2X;
Chris@42 727 T37 = T2R + T2S;
Chris@42 728 T38 = T36 - T37;
Chris@42 729 T39 = T37 + T36;
Chris@42 730 }
Chris@42 731 ci[WS(rs, 7)] = T11 - T1M;
Chris@42 732 cr[WS(rs, 12)] = T3h - T3i;
Chris@42 733 ci[WS(rs, 11)] = T3h + T3i;
Chris@42 734 cr[0] = T11 + T1M;
Chris@42 735 cr[WS(rs, 4)] = T35 - T38;
Chris@42 736 cr[WS(rs, 8)] = T39 - T3g;
Chris@42 737 ci[WS(rs, 15)] = T39 + T3g;
Chris@42 738 ci[WS(rs, 3)] = T35 + T38;
Chris@42 739 }
Chris@42 740 {
Chris@42 741 E T2z, T2H, T3A, T3C, T2G, T3B, T2K, T3x;
Chris@42 742 {
Chris@42 743 E T2v, T2y, T3y, T3z;
Chris@42 744 T2v = T1N + T1Q;
Chris@42 745 T2y = KP707106781 * (T2w + T2x);
Chris@42 746 T2z = T2v + T2y;
Chris@42 747 T2H = T2v - T2y;
Chris@42 748 T3y = KP707106781 * (T21 - T1W);
Chris@42 749 T3z = T3s - T3r;
Chris@42 750 T3A = T3y + T3z;
Chris@42 751 T3C = T3z - T3y;
Chris@42 752 }
Chris@42 753 {
Chris@42 754 E T2C, T2F, T2I, T2J;
Chris@42 755 T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
Chris@42 756 T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
Chris@42 757 T2G = T2C + T2F;
Chris@42 758 T3B = T2F - T2C;
Chris@42 759 T2I = FNMS(KP923879532, T2A, KP382683432 * T2B);
Chris@42 760 T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
Chris@42 761 T2K = T2I + T2J;
Chris@42 762 T3x = T2J - T2I;
Chris@42 763 }
Chris@42 764 ci[WS(rs, 6)] = T2z - T2G;
Chris@42 765 cr[WS(rs, 13)] = T3B - T3C;
Chris@42 766 ci[WS(rs, 10)] = T3B + T3C;
Chris@42 767 cr[WS(rs, 1)] = T2z + T2G;
Chris@42 768 cr[WS(rs, 5)] = T2H - T2K;
Chris@42 769 cr[WS(rs, 9)] = T3x - T3A;
Chris@42 770 ci[WS(rs, 14)] = T3x + T3A;
Chris@42 771 ci[WS(rs, 2)] = T2H + T2K;
Chris@42 772 }
Chris@42 773 {
Chris@42 774 E T2P, T31, T3m, T3o, T30, T3j, T34, T3n;
Chris@42 775 {
Chris@42 776 E T2L, T2O, T3k, T3l;
Chris@42 777 T2L = Tf - TA;
Chris@42 778 T2O = T2M - T2N;
Chris@42 779 T2P = T2L - T2O;
Chris@42 780 T31 = T2L + T2O;
Chris@42 781 T3k = TM - TZ;
Chris@42 782 T3l = T3e - T3b;
Chris@42 783 T3m = T3k + T3l;
Chris@42 784 T3o = T3l - T3k;
Chris@42 785 }
Chris@42 786 {
Chris@42 787 E T2U, T2Z, T32, T33;
Chris@42 788 T2U = T2Q + T2T;
Chris@42 789 T2Z = T2V - T2Y;
Chris@42 790 T30 = KP707106781 * (T2U + T2Z);
Chris@42 791 T3j = KP707106781 * (T2Z - T2U);
Chris@42 792 T32 = T2Q - T2T;
Chris@42 793 T33 = T2V + T2Y;
Chris@42 794 T34 = KP707106781 * (T32 + T33);
Chris@42 795 T3n = KP707106781 * (T33 - T32);
Chris@42 796 }
Chris@42 797 ci[WS(rs, 5)] = T2P - T30;
Chris@42 798 cr[WS(rs, 10)] = T3n - T3o;
Chris@42 799 ci[WS(rs, 13)] = T3n + T3o;
Chris@42 800 cr[WS(rs, 2)] = T2P + T30;
Chris@42 801 cr[WS(rs, 6)] = T31 - T34;
Chris@42 802 cr[WS(rs, 14)] = T3j - T3m;
Chris@42 803 ci[WS(rs, 9)] = T3j + T3m;
Chris@42 804 ci[WS(rs, 1)] = T31 + T34;
Chris@42 805 }
Chris@42 806 }
Chris@42 807 }
Chris@42 808 }
Chris@42 809 }
Chris@42 810
/*
 * Twiddle instruction table referenced by desc: four TW_CEXP entries (last
 * fields 1, 3, 9, 15) followed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP yields one (cos, sin) pair, which
 * would account for the eight W[] values hf2_16 reads per iteration --
 * confirm against the tw_instr definition.
 */
Chris@42 811 static const tw_instr twinstr[] = {
Chris@42 812 {TW_CEXP, 1, 1},
Chris@42 813 {TW_CEXP, 1, 3},
Chris@42 814 {TW_CEXP, 1, 9},
Chris@42 815 {TW_CEXP, 1, 15},
Chris@42 816 {TW_NEXT, 1, 0}
Chris@42 817 };
Chris@42 818
/*
 * Codelet descriptor passed to the planner at registration. 16 and "hf2_16"
 * mirror the generator's -n / -name flags, and {156, 68, 40, 0} appears to
 * repeat the addition / multiplication / fused multiply-add counts quoted in
 * the header comment of this non-FMA branch. Field meanings are set by
 * hc2hc_desc, which is declared elsewhere.
 */
Chris@42 819 static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@42 820
/*
 * Registration entry point: hands the hf2_16 kernel and its descriptor to
 * planner p through X(khc2hc_register). X() is a macro defined elsewhere
 * (presumably a symbol-name wrapper -- confirm).
 */
Chris@42 821 void X(codelet_hf2_16) (planner *p) {
Chris@42 822 X(khc2hc_register) (p, hf2_16, &desc);
Chris@42 823 }
Chris@42 824 #endif /* HAVE_FMA */