annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:23 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@42 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@42 33 * 97 stack variables, 6 constants, and 60 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf_15 (HAVE_FMA branch): size-15 kernel emitted by genfft with
 * "-n 15 -dit" (see the generator command line above).
 *
 * For every m in [mb, me) the body reads cr[0], ci[0] and
 * cr[WS(rs, k)] / ci[WS(rs, k)] for k = 1..14, combines input k (k >= 1)
 * with the coefficient pair W[2k-2], W[2k-1], and then overwrites the same
 * 30 cr/ci slots with the results (the transform is done in place).
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 28 entries; before the first iteration W is offset by
 * (mb - 1) * 28 entries.
 *
 * NOTE(review): presumably cr/ci are the two halves of a halfcomplex array
 * and W holds twiddle factors -- confirm against hf.h / codelet-rdft.h.
 * NOTE(review): the comments here assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; those macros are defined outside this file.
 */
Chris@42 37 static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 45 {
Chris@42 46 INT m;
/* One pass per m; each pass consumes 28 entries of W (two per input 1..14). */
Chris@42 47 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* These temporaries are assigned inside the nested block below but consumed
 * by the final stores after it closes, hence the outer declaration. */
Chris@42 48 E T3v, T3E, T3G, T3A, T3y, T3z, T3F, T3B;
Chris@42 49 {
Chris@42 50 E T1G, T3l, T3H, T3k, T1B, Tf, T37, T1y, T2Y, T2M, T2a, T2i, T39, Tz, T2U;
Chris@42 51 E T2t, T1O, T2e, T3a, TT, T10, T2V, T2z, T1V, T2f, T2C, T12, T15, T14, T21;
Chris@42 52 E T1c, T1Y, T13;
/* Load stage: reads every input and W entry; loads and arithmetic are
 * interleaved by the generator's scheduler. */
Chris@42 53 {
Chris@42 54 E T2I, T1k, T1m, T1p, T1o, T28, T1w, T25, T1n;
Chris@42 55 {
Chris@42 56 E T1, T3i, T9, Tc, Tb, T1D, T7, T1E, Ta, T1j, T1i, T1h;
Chris@42 57 T1 = cr[0];
Chris@42 58 T3i = ci[0];
Chris@42 59 {
Chris@42 60 E T3, T6, T2, T5, T1C, T4, T8;
Chris@42 61 T3 = cr[WS(rs, 5)];
Chris@42 62 T6 = ci[WS(rs, 5)];
Chris@42 63 T2 = W[8];
Chris@42 64 T5 = W[9];
Chris@42 65 T9 = cr[WS(rs, 10)];
Chris@42 66 Tc = ci[WS(rs, 10)];
Chris@42 67 T1C = T2 * T6;
Chris@42 68 T4 = T2 * T3;
Chris@42 69 T8 = W[18];
Chris@42 70 Tb = W[19];
Chris@42 71 T1D = FNMS(T5, T3, T1C);
Chris@42 72 T7 = FMA(T5, T6, T4);
Chris@42 73 T1E = T8 * Tc;
Chris@42 74 Ta = T8 * T9;
Chris@42 75 }
Chris@42 76 {
Chris@42 77 E T1g, T1F, Td, T1f, T3j, Te, T2H;
Chris@42 78 T1g = cr[WS(rs, 9)];
Chris@42 79 T1j = ci[WS(rs, 9)];
Chris@42 80 T1F = FNMS(Tb, T9, T1E);
Chris@42 81 Td = FMA(Tb, Tc, Ta);
Chris@42 82 T1f = W[16];
Chris@42 83 T1i = W[17];
Chris@42 84 T1G = T1D - T1F;
Chris@42 85 T3j = T1D + T1F;
Chris@42 86 T3l = Td - T7;
Chris@42 87 Te = T7 + Td;
Chris@42 88 T2H = T1f * T1j;
Chris@42 89 T1h = T1f * T1g;
Chris@42 90 T3H = T3j + T3i;
Chris@42 91 T3k = FNMS(KP500000000, T3j, T3i);
Chris@42 92 T1B = FNMS(KP500000000, Te, T1);
Chris@42 93 Tf = T1 + Te;
Chris@42 94 T2I = FNMS(T1i, T1g, T2H);
Chris@42 95 }
Chris@42 96 T1k = FMA(T1i, T1j, T1h);
Chris@42 97 {
Chris@42 98 E T1s, T1v, T1r, T1u, T27, T1t, T1l;
Chris@42 99 T1s = cr[WS(rs, 4)];
Chris@42 100 T1v = ci[WS(rs, 4)];
Chris@42 101 T1r = W[6];
Chris@42 102 T1u = W[7];
Chris@42 103 T1m = cr[WS(rs, 14)];
Chris@42 104 T1p = ci[WS(rs, 14)];
Chris@42 105 T27 = T1r * T1v;
Chris@42 106 T1t = T1r * T1s;
Chris@42 107 T1l = W[26];
Chris@42 108 T1o = W[27];
Chris@42 109 T28 = FNMS(T1u, T1s, T27);
Chris@42 110 T1w = FMA(T1u, T1v, T1t);
Chris@42 111 T25 = T1l * T1p;
Chris@42 112 T1n = T1l * T1m;
Chris@42 113 }
Chris@42 114 }
Chris@42 115 {
Chris@42 116 E Tl, T2p, Tn, Tq, Tp, T1M, Tx, T1J, To;
Chris@42 117 {
Chris@42 118 E Th, Tk, T26, T1q, Tg, Tj;
Chris@42 119 Th = cr[WS(rs, 3)];
Chris@42 120 Tk = ci[WS(rs, 3)];
Chris@42 121 T26 = FNMS(T1o, T1m, T25);
Chris@42 122 T1q = FMA(T1o, T1p, T1n);
Chris@42 123 Tg = W[4];
Chris@42 124 Tj = W[5];
Chris@42 125 {
Chris@42 126 E T29, T2J, T1x, T2L;
Chris@42 127 T29 = T26 - T28;
Chris@42 128 T2J = T26 + T28;
Chris@42 129 T1x = T1q + T1w;
Chris@42 130 T2L = T1q - T1w;
Chris@42 131 {
Chris@42 132 E T2o, Ti, T2K, T24;
Chris@42 133 T2o = Tg * Tk;
Chris@42 134 Ti = Tg * Th;
Chris@42 135 T2K = FNMS(KP500000000, T2J, T2I);
Chris@42 136 T37 = T2I + T2J;
Chris@42 137 T24 = FNMS(KP500000000, T1x, T1k);
Chris@42 138 T1y = T1k + T1x;
Chris@42 139 Tl = FMA(Tj, Tk, Ti);
Chris@42 140 T2Y = FMA(KP866025403, T2L, T2K);
Chris@42 141 T2M = FNMS(KP866025403, T2L, T2K);
Chris@42 142 T2a = FNMS(KP866025403, T29, T24);
Chris@42 143 T2i = FMA(KP866025403, T29, T24);
Chris@42 144 T2p = FNMS(Tj, Th, T2o);
Chris@42 145 }
Chris@42 146 }
Chris@42 147 }
Chris@42 148 {
Chris@42 149 E Tt, Tw, Ts, Tv, T1L, Tu, Tm;
Chris@42 150 Tt = cr[WS(rs, 13)];
Chris@42 151 Tw = ci[WS(rs, 13)];
Chris@42 152 Ts = W[24];
Chris@42 153 Tv = W[25];
Chris@42 154 Tn = cr[WS(rs, 8)];
Chris@42 155 Tq = ci[WS(rs, 8)];
Chris@42 156 T1L = Ts * Tw;
Chris@42 157 Tu = Ts * Tt;
Chris@42 158 Tm = W[14];
Chris@42 159 Tp = W[15];
Chris@42 160 T1M = FNMS(Tv, Tt, T1L);
Chris@42 161 Tx = FMA(Tv, Tw, Tu);
Chris@42 162 T1J = Tm * Tq;
Chris@42 163 To = Tm * Tn;
Chris@42 164 }
Chris@42 165 {
Chris@42 166 E TF, T2v, TH, TK, TJ, T1T, TR, T1Q, TI;
Chris@42 167 {
Chris@42 168 E TB, TE, T1K, Tr, TA, TD;
Chris@42 169 TB = cr[WS(rs, 12)];
Chris@42 170 TE = ci[WS(rs, 12)];
Chris@42 171 T1K = FNMS(Tp, Tn, T1J);
Chris@42 172 Tr = FMA(Tp, Tq, To);
Chris@42 173 TA = W[22];
Chris@42 174 TD = W[23];
Chris@42 175 {
Chris@42 176 E T1N, T2q, Ty, T2s;
Chris@42 177 T1N = T1K - T1M;
Chris@42 178 T2q = T1K + T1M;
Chris@42 179 Ty = Tr + Tx;
Chris@42 180 T2s = Tr - Tx;
Chris@42 181 {
Chris@42 182 E T2u, TC, T2r, T1I;
Chris@42 183 T2u = TA * TE;
Chris@42 184 TC = TA * TB;
Chris@42 185 T2r = FNMS(KP500000000, T2q, T2p);
Chris@42 186 T39 = T2p + T2q;
Chris@42 187 T1I = FNMS(KP500000000, Ty, Tl);
Chris@42 188 Tz = Tl + Ty;
Chris@42 189 TF = FMA(TD, TE, TC);
Chris@42 190 T2U = FMA(KP866025403, T2s, T2r);
Chris@42 191 T2t = FNMS(KP866025403, T2s, T2r);
Chris@42 192 T1O = FNMS(KP866025403, T1N, T1I);
Chris@42 193 T2e = FMA(KP866025403, T1N, T1I);
Chris@42 194 T2v = FNMS(TD, TB, T2u);
Chris@42 195 }
Chris@42 196 }
Chris@42 197 }
Chris@42 198 {
Chris@42 199 E TN, TQ, TM, TP, T1S, TO, TG;
Chris@42 200 TN = cr[WS(rs, 7)];
Chris@42 201 TQ = ci[WS(rs, 7)];
Chris@42 202 TM = W[12];
Chris@42 203 TP = W[13];
Chris@42 204 TH = cr[WS(rs, 2)];
Chris@42 205 TK = ci[WS(rs, 2)];
Chris@42 206 T1S = TM * TQ;
Chris@42 207 TO = TM * TN;
Chris@42 208 TG = W[2];
Chris@42 209 TJ = W[3];
Chris@42 210 T1T = FNMS(TP, TN, T1S);
Chris@42 211 TR = FMA(TP, TQ, TO);
Chris@42 212 T1Q = TG * TK;
Chris@42 213 TI = TG * TH;
Chris@42 214 }
Chris@42 215 {
Chris@42 216 E TW, TZ, T1R, TL, TV, TY;
Chris@42 217 TW = cr[WS(rs, 6)];
Chris@42 218 TZ = ci[WS(rs, 6)];
Chris@42 219 T1R = FNMS(TJ, TH, T1Q);
Chris@42 220 TL = FMA(TJ, TK, TI);
Chris@42 221 TV = W[10];
Chris@42 222 TY = W[11];
Chris@42 223 {
Chris@42 224 E T1U, T2w, TS, T2y;
Chris@42 225 T1U = T1R - T1T;
Chris@42 226 T2w = T1R + T1T;
Chris@42 227 TS = TL + TR;
Chris@42 228 T2y = TL - TR;
Chris@42 229 {
Chris@42 230 E T2B, TX, T2x, T1P;
Chris@42 231 T2B = TV * TZ;
Chris@42 232 TX = TV * TW;
Chris@42 233 T2x = FNMS(KP500000000, T2w, T2v);
Chris@42 234 T3a = T2v + T2w;
Chris@42 235 T1P = FNMS(KP500000000, TS, TF);
Chris@42 236 TT = TF + TS;
Chris@42 237 T10 = FMA(TY, TZ, TX);
Chris@42 238 T2V = FMA(KP866025403, T2y, T2x);
Chris@42 239 T2z = FNMS(KP866025403, T2y, T2x);
Chris@42 240 T1V = FNMS(KP866025403, T1U, T1P);
Chris@42 241 T2f = FMA(KP866025403, T1U, T1P);
Chris@42 242 T2C = FNMS(TY, TW, T2B);
Chris@42 243 }
Chris@42 244 }
Chris@42 245 }
Chris@42 246 {
Chris@42 247 E T18, T1b, T17, T1a, T20, T19, T11;
Chris@42 248 T18 = cr[WS(rs, 1)];
Chris@42 249 T1b = ci[WS(rs, 1)];
Chris@42 250 T17 = W[0];
Chris@42 251 T1a = W[1];
Chris@42 252 T12 = cr[WS(rs, 11)];
Chris@42 253 T15 = ci[WS(rs, 11)];
Chris@42 254 T20 = T17 * T1b;
Chris@42 255 T19 = T17 * T18;
Chris@42 256 T11 = W[20];
Chris@42 257 T14 = W[21];
Chris@42 258 T21 = FNMS(T1a, T18, T20);
Chris@42 259 T1c = FMA(T1a, T1b, T19);
Chris@42 260 T1Y = T11 * T15;
Chris@42 261 T13 = T11 * T12;
Chris@42 262 }
Chris@42 263 }
Chris@42 264 }
Chris@42 265 }
/* Store stage: no further reads of cr/ci/W happen from here on; the
 * remaining arithmetic is interleaved with the stores. */
Chris@42 266 {
Chris@42 267 E T3I, T3O, T3w, T2d, T3J, T3P, T3x, T3C, T3D, T3f, T3g, T2Q, T2O, T3r, T3q;
Chris@42 268 E T2k, T2m;
Chris@42 269 {
Chris@42 270 E T3b, T1Z, T16, TU;
Chris@42 271 T3I = T39 + T3a;
Chris@42 272 T3b = T39 - T3a;
Chris@42 273 T1Z = FNMS(T14, T12, T1Y);
Chris@42 274 T16 = FMA(T14, T15, T13);
Chris@42 275 T3O = TT - Tz;
Chris@42 276 TU = Tz + TT;
Chris@42 277 {
Chris@42 278 E T1H, T2G, T2h, T3e, T3c, T34, T1W, T32, T30, T33, T2b, T2S, T2R;
Chris@42 279 {
Chris@42 280 E T2W, T22, T1d, T2F, T2E, T36, T2D;
Chris@42 281 T2W = T2U - T2V;
Chris@42 282 T3w = T2U + T2V;
Chris@42 283 T22 = T1Z - T21;
Chris@42 284 T2D = T1Z + T21;
Chris@42 285 T1d = T16 + T1c;
Chris@42 286 T2F = T16 - T1c;
Chris@42 287 T2E = FNMS(KP500000000, T2D, T2C);
Chris@42 288 T36 = T2C + T2D;
Chris@42 289 T2d = FMA(KP866025403, T1G, T1B);
Chris@42 290 T1H = FNMS(KP866025403, T1G, T1B);
Chris@42 291 {
Chris@42 292 E T1e, T1X, T38, T2X;
Chris@42 293 T1e = T10 + T1d;
Chris@42 294 T1X = FNMS(KP500000000, T1d, T10);
Chris@42 295 T38 = T36 - T37;
Chris@42 296 T3J = T36 + T37;
Chris@42 297 T2G = FNMS(KP866025403, T2F, T2E);
Chris@42 298 T2X = FMA(KP866025403, T2F, T2E);
Chris@42 299 {
Chris@42 300 E T1z, T23, T2Z, T1A;
Chris@42 301 T3P = T1y - T1e;
Chris@42 302 T1z = T1e + T1y;
Chris@42 303 T23 = FNMS(KP866025403, T22, T1X);
Chris@42 304 T2h = FMA(KP866025403, T22, T1X);
Chris@42 305 T3e = FMA(KP618033988, T38, T3b);
Chris@42 306 T3c = FNMS(KP618033988, T3b, T38);
Chris@42 307 T2Z = T2X - T2Y;
Chris@42 308 T3x = T2X + T2Y;
Chris@42 309 T1A = TU + T1z;
Chris@42 310 T34 = TU - T1z;
Chris@42 311 T3C = T1O - T1V;
Chris@42 312 T1W = T1O + T1V;
Chris@42 313 T32 = FNMS(KP618033988, T2W, T2Z);
Chris@42 314 T30 = FMA(KP618033988, T2Z, T2W);
Chris@42 315 cr[0] = Tf + T1A;
Chris@42 316 T33 = FNMS(KP250000000, T1A, Tf);
Chris@42 317 T2b = T23 + T2a;
Chris@42 318 T3D = T23 - T2a;
Chris@42 319 }
Chris@42 320 }
Chris@42 321 }
Chris@42 322 {
Chris@42 323 E T2A, T2N, T3d, T35, T2c;
Chris@42 324 T3f = T2t + T2z;
Chris@42 325 T2A = T2t - T2z;
Chris@42 326 T2N = T2G - T2M;
Chris@42 327 T3g = T2G + T2M;
Chris@42 328 T3d = FMA(KP559016994, T34, T33);
Chris@42 329 T35 = FNMS(KP559016994, T34, T33);
Chris@42 330 T2c = T1W + T2b;
Chris@42 331 T2S = T1W - T2b;
Chris@42 332 cr[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
Chris@42 333 ci[WS(rs, 2)] = FNMS(KP951056516, T3c, T35);
Chris@42 334 cr[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
Chris@42 335 ci[WS(rs, 5)] = FNMS(KP951056516, T3e, T3d);
Chris@42 336 cr[WS(rs, 5)] = T1H + T2c;
Chris@42 337 T2R = FNMS(KP250000000, T2c, T1H);
Chris@42 338 T2Q = FNMS(KP618033988, T2A, T2N);
Chris@42 339 T2O = FMA(KP618033988, T2N, T2A);
Chris@42 340 }
Chris@42 341 {
Chris@42 342 E T2T, T31, T2g, T2j;
Chris@42 343 T2T = FMA(KP559016994, T2S, T2R);
Chris@42 344 T31 = FNMS(KP559016994, T2S, T2R);
Chris@42 345 T2g = T2e + T2f;
Chris@42 346 T3r = T2e - T2f;
Chris@42 347 T3q = T2h - T2i;
Chris@42 348 T2j = T2h + T2i;
Chris@42 349 ci[WS(rs, 3)] = FMA(KP951056516, T30, T2T);
Chris@42 350 ci[0] = FNMS(KP951056516, T30, T2T);
Chris@42 351 ci[WS(rs, 6)] = FMA(KP951056516, T32, T31);
Chris@42 352 cr[WS(rs, 2)] = FNMS(KP951056516, T32, T31);
Chris@42 353 T2k = T2g + T2j;
Chris@42 354 T2m = T2g - T2j;
Chris@42 355 }
Chris@42 356 }
Chris@42 357 }
Chris@42 358 {
Chris@42 359 E T3m, T3s, T3u, T3o, T3h, T2l, T2n, T2P;
Chris@42 360 ci[WS(rs, 4)] = T2d + T2k;
Chris@42 361 T2l = FNMS(KP250000000, T2k, T2d);
Chris@42 362 T3m = FMA(KP866025403, T3l, T3k);
Chris@42 363 T3v = FNMS(KP866025403, T3l, T3k);
Chris@42 364 T3s = FNMS(KP618033988, T3r, T3q);
Chris@42 365 T3u = FMA(KP618033988, T3q, T3r);
Chris@42 366 T2n = FMA(KP559016994, T2m, T2l);
Chris@42 367 T2P = FNMS(KP559016994, T2m, T2l);
Chris@42 368 ci[WS(rs, 1)] = FMA(KP951056516, T2Q, T2P);
Chris@42 369 cr[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
Chris@42 370 cr[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
Chris@42 371 cr[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
Chris@42 372 T3o = T3f - T3g;
Chris@42 373 T3h = T3f + T3g;
Chris@42 374 {
Chris@42 375 E T3S, T3Q, T3K, T3M, T3n, T3p, T3t, T3L, T3R, T3N;
Chris@42 376 cr[WS(rs, 10)] = -(T3h + T3m);
Chris@42 377 T3n = FNMS(KP250000000, T3h, T3m);
Chris@42 378 T3S = FNMS(KP618033988, T3O, T3P);
Chris@42 379 T3Q = FMA(KP618033988, T3P, T3O);
Chris@42 380 T3p = FNMS(KP559016994, T3o, T3n);
Chris@42 381 T3t = FMA(KP559016994, T3o, T3n);
Chris@42 382 ci[WS(rs, 7)] = FMA(KP951056516, T3s, T3p);
Chris@42 383 cr[WS(rs, 13)] = FMS(KP951056516, T3s, T3p);
Chris@42 384 ci[WS(rs, 13)] = FNMS(KP951056516, T3u, T3t);
Chris@42 385 ci[WS(rs, 10)] = FMA(KP951056516, T3u, T3t);
Chris@42 386 T3K = T3I + T3J;
Chris@42 387 T3M = T3I - T3J;
Chris@42 388 ci[WS(rs, 14)] = T3K + T3H;
Chris@42 389 T3L = FNMS(KP250000000, T3K, T3H);
Chris@42 390 T3E = FMA(KP618033988, T3D, T3C);
Chris@42 391 T3G = FNMS(KP618033988, T3C, T3D);
Chris@42 392 T3R = FNMS(KP559016994, T3M, T3L);
Chris@42 393 T3N = FMA(KP559016994, T3M, T3L);
Chris@42 394 ci[WS(rs, 8)] = FMA(KP951056516, T3Q, T3N);
Chris@42 395 cr[WS(rs, 9)] = FMS(KP951056516, T3Q, T3N);
Chris@42 396 ci[WS(rs, 11)] = FMA(KP951056516, T3S, T3R);
Chris@42 397 cr[WS(rs, 12)] = FMS(KP951056516, T3S, T3R);
Chris@42 398 T3A = T3x - T3w;
Chris@42 399 T3y = T3w + T3x;
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
Chris@42 403 }
/* Last five stores (ci 9, cr 14, cr 11, ci 12, cr 8); their operands
 * T3v, T3E, T3G, T3A and T3y were produced inside the block above. */
Chris@42 404 ci[WS(rs, 9)] = T3y + T3v;
Chris@42 405 T3z = FNMS(KP250000000, T3y, T3v);
Chris@42 406 T3F = FMA(KP559016994, T3A, T3z);
Chris@42 407 T3B = FNMS(KP559016994, T3A, T3z);
Chris@42 408 cr[WS(rs, 14)] = -(FMA(KP951056516, T3E, T3B));
Chris@42 409 cr[WS(rs, 11)] = FMS(KP951056516, T3E, T3B);
Chris@42 410 ci[WS(rs, 12)] = FMA(KP951056516, T3G, T3F);
Chris@42 411 cr[WS(rs, 8)] = FMS(KP951056516, T3G, T3F);
Chris@42 412 }
Chris@42 413 }
Chris@42 414 }
Chris@42 415
/*
 * Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 15) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): the meaning of the numeric fields is fixed by the tw_instr
 * type, which is declared outside this file -- confirm there.
 */
Chris@42 416 static const tw_instr twinstr[] = {
Chris@42 417 {TW_FULL, 1, 15},
Chris@42 418 {TW_NEXT, 1, 0}
Chris@42 419 };
Chris@42 420
/*
 * Descriptor for this kernel: size 15, name "hf_15", the twinstr table, the
 * address of GENUS (not defined in this file; presumably supplied via hf.h --
 * confirm), and the counts {72, 28, 112, 0}, which match the "72 additions,
 * 28 multiplications, 112 fused multiply/add" figures in the header comment
 * of this HAVE_FMA branch.
 */
Chris@42 421 static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {72, 28, 112, 0} };
Chris@42 422
/*
 * Registration entry point (HAVE_FMA branch): hands the hf_15 kernel and its
 * descriptor to X(khc2hc_register) for planner p. Nothing else is done here.
 */
Chris@42 423 void X(codelet_hf_15) (planner *p) {
Chris@42 424 X(khc2hc_register) (p, hf_15, &desc);
Chris@42 425 }
Chris@42 426 #else /* HAVE_FMA */
Chris@42 427
Chris@42 428 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include hf.h */
Chris@42 429
Chris@42 430 /*
Chris@42 431 * This function contains 184 FP additions, 112 FP multiplications,
Chris@42 432 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@42 433 * 65 stack variables, 6 constants, and 60 memory accesses
Chris@42 434 */
Chris@42 435 #include "hf.h"
Chris@42 436
/*
 * hf_15 (non-FMA branch): size-15 kernel emitted by genfft with
 * "-n 15 -dit" (see the generator command line above).
 *
 * For every m in [mb, me) the body reads cr[0], ci[0] and
 * cr[WS(rs, k)] / ci[WS(rs, k)] for k = 1..14, combines input k (k >= 1)
 * with the coefficient pair W[2k-2], W[2k-1], and then overwrites the same
 * 30 cr/ci slots with the results (the transform is done in place).
 * Between iterations cr moves forward by ms, ci moves backward by ms and W
 * moves forward by 28 entries; before the first iteration W is offset by
 * (mb - 1) * 28 entries.
 *
 * In the comments below "slot k" is shorthand for index WS(rs, k).
 *
 * NOTE(review): presumably cr/ci are the two halves of a halfcomplex array
 * and W holds twiddle factors -- confirm against hf.h / codelet-rdft.h.
 * NOTE(review): the comments here assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; those macros are defined outside this file.
 */
Chris@42 437 static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 438 {
Chris@42 439 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 440 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 441 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 442 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 443 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 444 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 445 {
Chris@42 446 INT m;
/* One pass per m; each pass consumes 28 entries of W (two per input 1..14). */
Chris@42 447 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@42 448 E T1q, T2Q, Td, T1n, T2T, T3l, T13, T1k, T1l, T2E, T2F, T3j, T1H, T1T, T2k;
Chris@42 449 E T2w, T2f, T2v, T1M, T1U, Tu, TL, TM, T2H, T2I, T3i, T1w, T1Q, T29, T2t;
Chris@42 450 E T24, T2s, T1B, T1R;
/* Inputs 0, 5 and 10: input 0 is used as loaded; inputs 5 and 10 are first
 * combined with W[8..9] and W[18..19], then the three are mixed. */
Chris@42 451 {
Chris@42 452 E T1, T2R, T6, T1o, Tb, T1p, Tc, T2S;
Chris@42 453 T1 = cr[0];
Chris@42 454 T2R = ci[0];
Chris@42 455 {
Chris@42 456 E T3, T5, T2, T4;
Chris@42 457 T3 = cr[WS(rs, 5)];
Chris@42 458 T5 = ci[WS(rs, 5)];
Chris@42 459 T2 = W[8];
Chris@42 460 T4 = W[9];
Chris@42 461 T6 = FMA(T2, T3, T4 * T5);
Chris@42 462 T1o = FNMS(T4, T3, T2 * T5);
Chris@42 463 }
Chris@42 464 {
Chris@42 465 E T8, Ta, T7, T9;
Chris@42 466 T8 = cr[WS(rs, 10)];
Chris@42 467 Ta = ci[WS(rs, 10)];
Chris@42 468 T7 = W[18];
Chris@42 469 T9 = W[19];
Chris@42 470 Tb = FMA(T7, T8, T9 * Ta);
Chris@42 471 T1p = FNMS(T9, T8, T7 * Ta);
Chris@42 472 }
Chris@42 473 T1q = KP866025403 * (T1o - T1p);
Chris@42 474 T2Q = KP866025403 * (Tb - T6);
Chris@42 475 Tc = T6 + Tb;
Chris@42 476 Td = T1 + Tc;
Chris@42 477 T1n = FNMS(KP500000000, Tc, T1);
Chris@42 478 T2S = T1o + T1p;
Chris@42 479 T2T = FNMS(KP500000000, T2S, T2R);
Chris@42 480 T3l = T2S + T2R;
Chris@42 481 }
/* Inputs 6, 11, 1 (first triple) and 9, 14, 4 (second triple), each loaded
 * and combined with its W pair before the triples are mixed. */
Chris@42 482 {
Chris@42 483 E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
Chris@42 484 E T2i;
Chris@42 485 {
Chris@42 486 E TO, TQ, TN, TP;
Chris@42 487 TO = cr[WS(rs, 6)];
Chris@42 488 TQ = ci[WS(rs, 6)];
Chris@42 489 TN = W[10];
Chris@42 490 TP = W[11];
Chris@42 491 TR = FMA(TN, TO, TP * TQ);
Chris@42 492 T2c = FNMS(TP, TO, TN * TQ);
Chris@42 493 }
Chris@42 494 {
Chris@42 495 E T15, T17, T14, T16;
Chris@42 496 T15 = cr[WS(rs, 9)];
Chris@42 497 T17 = ci[WS(rs, 9)];
Chris@42 498 T14 = W[16];
Chris@42 499 T16 = W[17];
Chris@42 500 T18 = FMA(T14, T15, T16 * T17);
Chris@42 501 T2h = FNMS(T16, T15, T14 * T17);
Chris@42 502 }
Chris@42 503 {
Chris@42 504 E TT, TV, TS, TU;
Chris@42 505 TT = cr[WS(rs, 11)];
Chris@42 506 TV = ci[WS(rs, 11)];
Chris@42 507 TS = W[20];
Chris@42 508 TU = W[21];
Chris@42 509 TW = FMA(TS, TT, TU * TV);
Chris@42 510 T1E = FNMS(TU, TT, TS * TV);
Chris@42 511 }
Chris@42 512 {
Chris@42 513 E TY, T10, TX, TZ;
Chris@42 514 TY = cr[WS(rs, 1)];
Chris@42 515 T10 = ci[WS(rs, 1)];
Chris@42 516 TX = W[0];
Chris@42 517 TZ = W[1];
Chris@42 518 T11 = FMA(TX, TY, TZ * T10);
Chris@42 519 T1F = FNMS(TZ, TY, TX * T10);
Chris@42 520 }
Chris@42 521 T12 = TW + T11;
Chris@42 522 T2d = T1E + T1F;
Chris@42 523 {
Chris@42 524 E T1a, T1c, T19, T1b;
Chris@42 525 T1a = cr[WS(rs, 14)];
Chris@42 526 T1c = ci[WS(rs, 14)];
Chris@42 527 T19 = W[26];
Chris@42 528 T1b = W[27];
Chris@42 529 T1d = FMA(T19, T1a, T1b * T1c);
Chris@42 530 T1J = FNMS(T1b, T1a, T19 * T1c);
Chris@42 531 }
Chris@42 532 {
Chris@42 533 E T1f, T1h, T1e, T1g;
Chris@42 534 T1f = cr[WS(rs, 4)];
Chris@42 535 T1h = ci[WS(rs, 4)];
Chris@42 536 T1e = W[6];
Chris@42 537 T1g = W[7];
Chris@42 538 T1i = FMA(T1e, T1f, T1g * T1h);
Chris@42 539 T1K = FNMS(T1g, T1f, T1e * T1h);
Chris@42 540 }
Chris@42 541 T1j = T1d + T1i;
Chris@42 542 T2i = T1J + T1K;
Chris@42 543 {
Chris@42 544 E T1D, T1G, T2g, T2j;
Chris@42 545 T13 = TR + T12;
Chris@42 546 T1k = T18 + T1j;
Chris@42 547 T1l = T13 + T1k;
Chris@42 548 T2E = T2c + T2d;
Chris@42 549 T2F = T2h + T2i;
Chris@42 550 T3j = T2E + T2F;
Chris@42 551 T1D = FNMS(KP500000000, T12, TR);
Chris@42 552 T1G = KP866025403 * (T1E - T1F);
Chris@42 553 T1H = T1D - T1G;
Chris@42 554 T1T = T1D + T1G;
Chris@42 555 T2g = KP866025403 * (T1d - T1i);
Chris@42 556 T2j = FNMS(KP500000000, T2i, T2h);
Chris@42 557 T2k = T2g - T2j;
Chris@42 558 T2w = T2g + T2j;
Chris@42 559 {
Chris@42 560 E T2b, T2e, T1I, T1L;
Chris@42 561 T2b = KP866025403 * (T11 - TW);
Chris@42 562 T2e = FNMS(KP500000000, T2d, T2c);
Chris@42 563 T2f = T2b + T2e;
Chris@42 564 T2v = T2e - T2b;
Chris@42 565 T1I = FNMS(KP500000000, T1j, T18);
Chris@42 566 T1L = KP866025403 * (T1J - T1K);
Chris@42 567 T1M = T1I - T1L;
Chris@42 568 T1U = T1I + T1L;
Chris@42 569 }
Chris@42 570 }
Chris@42 571 }
/* Inputs 3, 8, 13 (first triple) and 12, 2, 7 (second triple), handled the
 * same way as the previous block. */
Chris@42 572 {
Chris@42 573 E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
Chris@42 574 E T27;
Chris@42 575 {
Chris@42 576 E Tf, Th, Te, Tg;
Chris@42 577 Tf = cr[WS(rs, 3)];
Chris@42 578 Th = ci[WS(rs, 3)];
Chris@42 579 Te = W[4];
Chris@42 580 Tg = W[5];
Chris@42 581 Ti = FMA(Te, Tf, Tg * Th);
Chris@42 582 T21 = FNMS(Tg, Tf, Te * Th);
Chris@42 583 }
Chris@42 584 {
Chris@42 585 E Tw, Ty, Tv, Tx;
Chris@42 586 Tw = cr[WS(rs, 12)];
Chris@42 587 Ty = ci[WS(rs, 12)];
Chris@42 588 Tv = W[22];
Chris@42 589 Tx = W[23];
Chris@42 590 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 591 T26 = FNMS(Tx, Tw, Tv * Ty);
Chris@42 592 }
Chris@42 593 {
Chris@42 594 E Tk, Tm, Tj, Tl;
Chris@42 595 Tk = cr[WS(rs, 8)];
Chris@42 596 Tm = ci[WS(rs, 8)];
Chris@42 597 Tj = W[14];
Chris@42 598 Tl = W[15];
Chris@42 599 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@42 600 T1t = FNMS(Tl, Tk, Tj * Tm);
Chris@42 601 }
Chris@42 602 {
Chris@42 603 E Tp, Tr, To, Tq;
Chris@42 604 Tp = cr[WS(rs, 13)];
Chris@42 605 Tr = ci[WS(rs, 13)];
Chris@42 606 To = W[24];
Chris@42 607 Tq = W[25];
Chris@42 608 Ts = FMA(To, Tp, Tq * Tr);
Chris@42 609 T1u = FNMS(Tq, Tp, To * Tr);
Chris@42 610 }
Chris@42 611 Tt = Tn + Ts;
Chris@42 612 T22 = T1t + T1u;
Chris@42 613 {
Chris@42 614 E TB, TD, TA, TC;
Chris@42 615 TB = cr[WS(rs, 2)];
Chris@42 616 TD = ci[WS(rs, 2)];
Chris@42 617 TA = W[2];
Chris@42 618 TC = W[3];
Chris@42 619 TE = FMA(TA, TB, TC * TD);
Chris@42 620 T1y = FNMS(TC, TB, TA * TD);
Chris@42 621 }
Chris@42 622 {
Chris@42 623 E TG, TI, TF, TH;
Chris@42 624 TG = cr[WS(rs, 7)];
Chris@42 625 TI = ci[WS(rs, 7)];
Chris@42 626 TF = W[12];
Chris@42 627 TH = W[13];
Chris@42 628 TJ = FMA(TF, TG, TH * TI);
Chris@42 629 T1z = FNMS(TH, TG, TF * TI);
Chris@42 630 }
Chris@42 631 TK = TE + TJ;
Chris@42 632 T27 = T1y + T1z;
Chris@42 633 {
Chris@42 634 E T1s, T1v, T25, T28;
Chris@42 635 Tu = Ti + Tt;
Chris@42 636 TL = Tz + TK;
Chris@42 637 TM = Tu + TL;
Chris@42 638 T2H = T21 + T22;
Chris@42 639 T2I = T26 + T27;
Chris@42 640 T3i = T2H + T2I;
Chris@42 641 T1s = FNMS(KP500000000, Tt, Ti);
Chris@42 642 T1v = KP866025403 * (T1t - T1u);
Chris@42 643 T1w = T1s - T1v;
Chris@42 644 T1Q = T1s + T1v;
Chris@42 645 T25 = KP866025403 * (TJ - TE);
Chris@42 646 T28 = FNMS(KP500000000, T27, T26);
Chris@42 647 T29 = T25 + T28;
Chris@42 648 T2t = T28 - T25;
Chris@42 649 {
Chris@42 650 E T20, T23, T1x, T1A;
Chris@42 651 T20 = KP866025403 * (Ts - Tn);
Chris@42 652 T23 = FNMS(KP500000000, T22, T21);
Chris@42 653 T24 = T20 + T23;
Chris@42 654 T2s = T23 - T20;
Chris@42 655 T1x = FNMS(KP500000000, TK, Tz);
Chris@42 656 T1A = KP866025403 * (T1y - T1z);
Chris@42 657 T1B = T1x - T1A;
Chris@42 658 T1R = T1x + T1A;
Chris@42 659 }
Chris@42 660 }
Chris@42 661 }
/* Stores cr slot 0, ci slot 5, cr slot 6, ci slot 2, cr slot 3. */
Chris@42 662 {
Chris@42 663 E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
Chris@42 664 T2C = KP559016994 * (TM - T1l);
Chris@42 665 T1m = TM + T1l;
Chris@42 666 T2B = FNMS(KP250000000, T1m, Td);
Chris@42 667 T2G = T2E - T2F;
Chris@42 668 T2J = T2H - T2I;
Chris@42 669 T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
Chris@42 670 T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
Chris@42 671 cr[0] = Td + T1m;
Chris@42 672 T2L = T2C + T2B;
Chris@42 673 ci[WS(rs, 5)] = T2L - T2M;
Chris@42 674 cr[WS(rs, 6)] = T2L + T2M;
Chris@42 675 T2D = T2B - T2C;
Chris@42 676 ci[WS(rs, 2)] = T2D - T2K;
Chris@42 677 cr[WS(rs, 3)] = T2D + T2K;
Chris@42 678 }
/* Stores ci slot 14, cr slot 12, ci slot 11, cr slot 9, ci slot 8. */
Chris@42 679 {
Chris@42 680 E T3k, T3m, T3n, T3h, T3p, T3f, T3g, T3q, T3o;
Chris@42 681 T3k = KP559016994 * (T3i - T3j);
Chris@42 682 T3m = T3i + T3j;
Chris@42 683 T3n = FNMS(KP250000000, T3m, T3l);
Chris@42 684 T3f = T1k - T13;
Chris@42 685 T3g = Tu - TL;
Chris@42 686 T3h = FNMS(KP951056516, T3g, KP587785252 * T3f);
Chris@42 687 T3p = FMA(KP587785252, T3g, KP951056516 * T3f);
Chris@42 688 ci[WS(rs, 14)] = T3m + T3l;
Chris@42 689 T3q = T3n - T3k;
Chris@42 690 cr[WS(rs, 12)] = T3p - T3q;
Chris@42 691 ci[WS(rs, 11)] = T3p + T3q;
Chris@42 692 T3o = T3k + T3n;
Chris@42 693 cr[WS(rs, 9)] = T3h - T3o;
Chris@42 694 ci[WS(rs, 8)] = T3h + T3o;
Chris@42 695 }
/* Stores cr slot 5, cr slot 2, ci slot 6, ci slot 0, ci slot 3. */
Chris@42 696 {
Chris@42 697 E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
Chris@42 698 {
Chris@42 699 E T2u, T2x, T1C, T1N;
Chris@42 700 T2u = T2s - T2t;
Chris@42 701 T2x = T2v - T2w;
Chris@42 702 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
Chris@42 703 T2A = FNMS(KP587785252, T2u, KP951056516 * T2x);
Chris@42 704 T1r = T1n - T1q;
Chris@42 705 T1C = T1w + T1B;
Chris@42 706 T1N = T1H + T1M;
Chris@42 707 T1O = T1C + T1N;
Chris@42 708 T2p = KP559016994 * (T1C - T1N);
Chris@42 709 T2q = FNMS(KP250000000, T1O, T1r);
Chris@42 710 }
Chris@42 711 cr[WS(rs, 5)] = T1r + T1O;
Chris@42 712 T2z = T2q - T2p;
Chris@42 713 cr[WS(rs, 2)] = T2z - T2A;
Chris@42 714 ci[WS(rs, 6)] = T2z + T2A;
Chris@42 715 T2r = T2p + T2q;
Chris@42 716 ci[0] = T2r - T2y;
Chris@42 717 ci[WS(rs, 3)] = T2r + T2y;
Chris@42 718 }
/* Stores ci slot 9, cr slot 8, ci slot 12, cr slot 11, cr slot 14. */
Chris@42 719 {
Chris@42 720 E T35, T3d, T39, T3a, T38, T3b, T3e, T3c;
Chris@42 721 {
Chris@42 722 E T33, T34, T36, T37;
Chris@42 723 T33 = T1w - T1B;
Chris@42 724 T34 = T1H - T1M;
Chris@42 725 T35 = FMA(KP951056516, T33, KP587785252 * T34);
Chris@42 726 T3d = FNMS(KP587785252, T33, KP951056516 * T34);
Chris@42 727 T39 = T2T - T2Q;
Chris@42 728 T36 = T2v + T2w;
Chris@42 729 T37 = T2s + T2t;
Chris@42 730 T3a = T37 + T36;
Chris@42 731 T38 = KP559016994 * (T36 - T37);
Chris@42 732 T3b = FNMS(KP250000000, T3a, T39);
Chris@42 733 }
Chris@42 734 ci[WS(rs, 9)] = T3a + T39;
Chris@42 735 T3e = T38 + T3b;
Chris@42 736 cr[WS(rs, 8)] = T3d - T3e;
Chris@42 737 ci[WS(rs, 12)] = T3d + T3e;
Chris@42 738 T3c = T38 - T3b;
Chris@42 739 cr[WS(rs, 11)] = T35 + T3c;
Chris@42 740 cr[WS(rs, 14)] = T3c - T35;
Chris@42 741 }
/* Stores cr slot 10, ci slot 10, ci slot 13, cr slot 13, ci slot 7. */
Chris@42 742 {
Chris@42 743 E T2X, T31, T2U, T2P, T2Y, T2Z, T32, T30;
Chris@42 744 {
Chris@42 745 E T2V, T2W, T2N, T2O;
Chris@42 746 T2V = T1T - T1U;
Chris@42 747 T2W = T1Q - T1R;
Chris@42 748 T2X = FNMS(KP587785252, T2W, KP951056516 * T2V);
Chris@42 749 T31 = FMA(KP951056516, T2W, KP587785252 * T2V);
Chris@42 750 T2U = T2Q + T2T;
Chris@42 751 T2N = T2k - T2f;
Chris@42 752 T2O = T24 + T29;
Chris@42 753 T2P = T2N - T2O;
Chris@42 754 T2Y = FMA(KP250000000, T2P, T2U);
Chris@42 755 T2Z = KP559016994 * (T2O + T2N);
Chris@42 756 }
Chris@42 757 cr[WS(rs, 10)] = T2P - T2U;
Chris@42 758 T32 = T2Z + T2Y;
Chris@42 759 ci[WS(rs, 10)] = T31 + T32;
Chris@42 760 ci[WS(rs, 13)] = T32 - T31;
Chris@42 761 T30 = T2Y - T2Z;
Chris@42 762 cr[WS(rs, 13)] = T2X - T30;
Chris@42 763 ci[WS(rs, 7)] = T2X + T30;
Chris@42 764 }
/* Stores ci slot 4, cr slot 4, cr slot 1, cr slot 7, ci slot 1. */
Chris@42 765 {
Chris@42 766 E T2m, T2o, T1P, T1W, T1X, T1Y, T1Z, T2n;
Chris@42 767 {
Chris@42 768 E T2a, T2l, T1S, T1V;
Chris@42 769 T2a = T24 - T29;
Chris@42 770 T2l = T2f + T2k;
Chris@42 771 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
Chris@42 772 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
Chris@42 773 T1P = T1n + T1q;
Chris@42 774 T1S = T1Q + T1R;
Chris@42 775 T1V = T1T + T1U;
Chris@42 776 T1W = T1S + T1V;
Chris@42 777 T1X = KP559016994 * (T1S - T1V);
Chris@42 778 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@42 779 }
Chris@42 780 ci[WS(rs, 4)] = T1P + T1W;
Chris@42 781 T1Z = T1X + T1Y;
Chris@42 782 cr[WS(rs, 4)] = T1Z - T2m;
Chris@42 783 cr[WS(rs, 1)] = T1Z + T2m;
Chris@42 784 T2n = T1Y - T1X;
Chris@42 785 cr[WS(rs, 7)] = T2n - T2o;
Chris@42 786 ci[WS(rs, 1)] = T2n + T2o;
Chris@42 787 }
Chris@42 788 }
Chris@42 789 }
Chris@42 790 }
Chris@42 791
/*
 * Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 15) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): the meaning of the numeric fields is fixed by the tw_instr
 * type, which is declared outside this file -- confirm there.
 */
Chris@42 792 static const tw_instr twinstr[] = {
Chris@42 793 {TW_FULL, 1, 15},
Chris@42 794 {TW_NEXT, 1, 0}
Chris@42 795 };
Chris@42 796
/*
 * Descriptor for this kernel: size 15, name "hf_15", the twinstr table, the
 * address of GENUS (not defined in this file; presumably supplied via hf.h --
 * confirm), and the counts {128, 56, 56, 0}, which match the "128 additions,
 * 56 multiplications, 56 fused multiply/add" figures in the header comment
 * of this non-FMA branch.
 */
Chris@42 797 static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {128, 56, 56, 0} };
Chris@42 798
/*
 * Registration entry point (non-FMA branch): hands the hf_15 kernel and its
 * descriptor to X(khc2hc_register) for planner p. Nothing else is done here.
 */
Chris@42 799 void X(codelet_hf_15) (planner *p) {
Chris@42 800 X(khc2hc_register) (p, hf_15, &desc);
Chris@42 801 }
Chris@42 802 #endif /* HAVE_FMA */