annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@82 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@82 33 * 51 stack variables, 6 constants, and 60 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf_15 -- variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined (see the #if preceding this function).
 *
 * The loop runs for m = mb .. me-1.  After every pass cr advances by ms,
 * ci moves back by ms and W advances by 28 entries; W is first offset by
 * (mb - 1) * 28.
 *
 * Each pass:
 *   - reads cr[0] and ci[0] unmodified, and reads the cr/ci pairs at
 *     WS(rs, 1) .. WS(rs, 14), each combined with two consecutive entries
 *     W[2k-2], W[2k-1] for stride index k;
 *   - then overwrites all fifteen cr slots and all fifteen ci slots
 *     (indices 0 .. 14 via WS(rs, k)).  All loads happen before the first
 *     store of the pass.
 *
 * Parameters:
 *   cr, ci  arrays that are both read and written.
 *   W       read-only table (presumably the twiddle factors described by
 *           twinstr -- confirm against the planner code).
 *   rs      stride handed to WS() and MAKE_VOLATILE_STRIDE().
 *   mb, me  first and one-past-last loop index.
 *   ms      per-iteration pointer step for cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA / FNMS / FMS / WS / DK / E / R / INT are defined outside
 * this file.  The section comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm in the headers.
 */
Chris@82 37 static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric values: sin(2*pi/5), sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sqrt(3)/2, 1/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 45 {
Chris@82 46 INT m;
Chris@82 47 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* Intermediates shared between the input sections and the output sections. */
Chris@82 48 E T1, T3i, T1G, T3l, Te, T1B, T3j, T3k, T1y, T2i, T2a, T2M, T37, T2Y, Tz;
Chris@82 49 E T2e, T1O, T2t, T39, T2U, TT, T2f, T1V, T2z, T3a, T2V, T1e, T2h, T23, T2G;
Chris@82 50 E T36, T2X;
/* Inputs 0, 5, 10: cr[0]/ci[0] are taken as-is; strides 5 and 10 are combined with W[8..9] and W[18..19]. */
Chris@82 51 {
Chris@82 52 E T7, T1D, Td, T1F;
Chris@82 53 T1 = cr[0];
Chris@82 54 T3i = ci[0];
Chris@82 55 {
Chris@82 56 E T3, T6, T4, T1C, T2, T5;
Chris@82 57 T3 = cr[WS(rs, 5)];
Chris@82 58 T6 = ci[WS(rs, 5)];
Chris@82 59 T2 = W[8];
Chris@82 60 T4 = T2 * T3;
Chris@82 61 T1C = T2 * T6;
Chris@82 62 T5 = W[9];
Chris@82 63 T7 = FMA(T5, T6, T4);
Chris@82 64 T1D = FNMS(T5, T3, T1C);
Chris@82 65 }
Chris@82 66 {
Chris@82 67 E T9, Tc, Ta, T1E, T8, Tb;
Chris@82 68 T9 = cr[WS(rs, 10)];
Chris@82 69 Tc = ci[WS(rs, 10)];
Chris@82 70 T8 = W[18];
Chris@82 71 Ta = T8 * T9;
Chris@82 72 T1E = T8 * Tc;
Chris@82 73 Tb = W[19];
Chris@82 74 Td = FMA(Tb, Tc, Ta);
Chris@82 75 T1F = FNMS(Tb, T9, T1E);
Chris@82 76 }
Chris@82 77 T1G = T1D - T1F;
Chris@82 78 T3l = Td - T7;
Chris@82 79 Te = T7 + Td;
Chris@82 80 T1B = FNMS(KP500000000, Te, T1);
Chris@82 81 T3j = T1D + T1F;
Chris@82 82 T3k = FNMS(KP500000000, T3j, T3i);
Chris@82 83 }
/* Inputs 9, 4, 14, combined with W[16..17], W[6..7], W[26..27]. */
Chris@82 84 {
Chris@82 85 E T1k, T2I, T1w, T28, T1q, T26;
Chris@82 86 {
Chris@82 87 E T1g, T1j, T1h, T2H, T1f, T1i;
Chris@82 88 T1g = cr[WS(rs, 9)];
Chris@82 89 T1j = ci[WS(rs, 9)];
Chris@82 90 T1f = W[16];
Chris@82 91 T1h = T1f * T1g;
Chris@82 92 T2H = T1f * T1j;
Chris@82 93 T1i = W[17];
Chris@82 94 T1k = FMA(T1i, T1j, T1h);
Chris@82 95 T2I = FNMS(T1i, T1g, T2H);
Chris@82 96 }
Chris@82 97 {
Chris@82 98 E T1s, T1v, T1t, T27, T1r, T1u;
Chris@82 99 T1s = cr[WS(rs, 4)];
Chris@82 100 T1v = ci[WS(rs, 4)];
Chris@82 101 T1r = W[6];
Chris@82 102 T1t = T1r * T1s;
Chris@82 103 T27 = T1r * T1v;
Chris@82 104 T1u = W[7];
Chris@82 105 T1w = FMA(T1u, T1v, T1t);
Chris@82 106 T28 = FNMS(T1u, T1s, T27);
Chris@82 107 }
Chris@82 108 {
Chris@82 109 E T1m, T1p, T1n, T25, T1l, T1o;
Chris@82 110 T1m = cr[WS(rs, 14)];
Chris@82 111 T1p = ci[WS(rs, 14)];
Chris@82 112 T1l = W[26];
Chris@82 113 T1n = T1l * T1m;
Chris@82 114 T25 = T1l * T1p;
Chris@82 115 T1o = W[27];
Chris@82 116 T1q = FMA(T1o, T1p, T1n);
Chris@82 117 T26 = FNMS(T1o, T1m, T25);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 E T29, T1x, T24, T2L, T2J, T2K;
Chris@82 121 T29 = T26 - T28;
Chris@82 122 T1x = T1q + T1w;
Chris@82 123 T24 = FNMS(KP500000000, T1x, T1k);
Chris@82 124 T1y = T1k + T1x;
Chris@82 125 T2i = FMA(KP866025403, T29, T24);
Chris@82 126 T2a = FNMS(KP866025403, T29, T24);
Chris@82 127 T2L = T1q - T1w;
Chris@82 128 T2J = T26 + T28;
Chris@82 129 T2K = FNMS(KP500000000, T2J, T2I);
Chris@82 130 T2M = FNMS(KP866025403, T2L, T2K);
Chris@82 131 T37 = T2I + T2J;
Chris@82 132 T2Y = FMA(KP866025403, T2L, T2K);
Chris@82 133 }
Chris@82 134 }
/* Inputs 3, 13, 8, combined with W[4..5], W[24..25], W[14..15]. */
Chris@82 135 {
Chris@82 136 E Tl, T2p, Tx, T1M, Tr, T1K;
Chris@82 137 {
Chris@82 138 E Th, Tk, Ti, T2o, Tg, Tj;
Chris@82 139 Th = cr[WS(rs, 3)];
Chris@82 140 Tk = ci[WS(rs, 3)];
Chris@82 141 Tg = W[4];
Chris@82 142 Ti = Tg * Th;
Chris@82 143 T2o = Tg * Tk;
Chris@82 144 Tj = W[5];
Chris@82 145 Tl = FMA(Tj, Tk, Ti);
Chris@82 146 T2p = FNMS(Tj, Th, T2o);
Chris@82 147 }
Chris@82 148 {
Chris@82 149 E Tt, Tw, Tu, T1L, Ts, Tv;
Chris@82 150 Tt = cr[WS(rs, 13)];
Chris@82 151 Tw = ci[WS(rs, 13)];
Chris@82 152 Ts = W[24];
Chris@82 153 Tu = Ts * Tt;
Chris@82 154 T1L = Ts * Tw;
Chris@82 155 Tv = W[25];
Chris@82 156 Tx = FMA(Tv, Tw, Tu);
Chris@82 157 T1M = FNMS(Tv, Tt, T1L);
Chris@82 158 }
Chris@82 159 {
Chris@82 160 E Tn, Tq, To, T1J, Tm, Tp;
Chris@82 161 Tn = cr[WS(rs, 8)];
Chris@82 162 Tq = ci[WS(rs, 8)];
Chris@82 163 Tm = W[14];
Chris@82 164 To = Tm * Tn;
Chris@82 165 T1J = Tm * Tq;
Chris@82 166 Tp = W[15];
Chris@82 167 Tr = FMA(Tp, Tq, To);
Chris@82 168 T1K = FNMS(Tp, Tn, T1J);
Chris@82 169 }
Chris@82 170 {
Chris@82 171 E T1N, Ty, T1I, T2s, T2q, T2r;
Chris@82 172 T1N = T1K - T1M;
Chris@82 173 Ty = Tr + Tx;
Chris@82 174 T1I = FNMS(KP500000000, Ty, Tl);
Chris@82 175 Tz = Tl + Ty;
Chris@82 176 T2e = FMA(KP866025403, T1N, T1I);
Chris@82 177 T1O = FNMS(KP866025403, T1N, T1I);
Chris@82 178 T2s = Tr - Tx;
Chris@82 179 T2q = T1K + T1M;
Chris@82 180 T2r = FNMS(KP500000000, T2q, T2p);
Chris@82 181 T2t = FNMS(KP866025403, T2s, T2r);
Chris@82 182 T39 = T2p + T2q;
Chris@82 183 T2U = FMA(KP866025403, T2s, T2r);
Chris@82 184 }
Chris@82 185 }
/* Inputs 12, 7, 2, combined with W[22..23], W[12..13], W[2..3]. */
Chris@82 186 {
Chris@82 187 E TF, T2v, TR, T1T, TL, T1R;
Chris@82 188 {
Chris@82 189 E TB, TE, TC, T2u, TA, TD;
Chris@82 190 TB = cr[WS(rs, 12)];
Chris@82 191 TE = ci[WS(rs, 12)];
Chris@82 192 TA = W[22];
Chris@82 193 TC = TA * TB;
Chris@82 194 T2u = TA * TE;
Chris@82 195 TD = W[23];
Chris@82 196 TF = FMA(TD, TE, TC);
Chris@82 197 T2v = FNMS(TD, TB, T2u);
Chris@82 198 }
Chris@82 199 {
Chris@82 200 E TN, TQ, TO, T1S, TM, TP;
Chris@82 201 TN = cr[WS(rs, 7)];
Chris@82 202 TQ = ci[WS(rs, 7)];
Chris@82 203 TM = W[12];
Chris@82 204 TO = TM * TN;
Chris@82 205 T1S = TM * TQ;
Chris@82 206 TP = W[13];
Chris@82 207 TR = FMA(TP, TQ, TO);
Chris@82 208 T1T = FNMS(TP, TN, T1S);
Chris@82 209 }
Chris@82 210 {
Chris@82 211 E TH, TK, TI, T1Q, TG, TJ;
Chris@82 212 TH = cr[WS(rs, 2)];
Chris@82 213 TK = ci[WS(rs, 2)];
Chris@82 214 TG = W[2];
Chris@82 215 TI = TG * TH;
Chris@82 216 T1Q = TG * TK;
Chris@82 217 TJ = W[3];
Chris@82 218 TL = FMA(TJ, TK, TI);
Chris@82 219 T1R = FNMS(TJ, TH, T1Q);
Chris@82 220 }
Chris@82 221 {
Chris@82 222 E T1U, TS, T1P, T2y, T2w, T2x;
Chris@82 223 T1U = T1R - T1T;
Chris@82 224 TS = TL + TR;
Chris@82 225 T1P = FNMS(KP500000000, TS, TF);
Chris@82 226 TT = TF + TS;
Chris@82 227 T2f = FMA(KP866025403, T1U, T1P);
Chris@82 228 T1V = FNMS(KP866025403, T1U, T1P);
Chris@82 229 T2y = TL - TR;
Chris@82 230 T2w = T1R + T1T;
Chris@82 231 T2x = FNMS(KP500000000, T2w, T2v);
Chris@82 232 T2z = FNMS(KP866025403, T2y, T2x);
Chris@82 233 T3a = T2v + T2w;
Chris@82 234 T2V = FMA(KP866025403, T2y, T2x);
Chris@82 235 }
Chris@82 236 }
/* Inputs 6, 1, 11, combined with W[10..11], W[0..1], W[20..21]. */
Chris@82 237 {
Chris@82 238 E T10, T2C, T1c, T21, T16, T1Z;
Chris@82 239 {
Chris@82 240 E TW, TZ, TX, T2B, TV, TY;
Chris@82 241 TW = cr[WS(rs, 6)];
Chris@82 242 TZ = ci[WS(rs, 6)];
Chris@82 243 TV = W[10];
Chris@82 244 TX = TV * TW;
Chris@82 245 T2B = TV * TZ;
Chris@82 246 TY = W[11];
Chris@82 247 T10 = FMA(TY, TZ, TX);
Chris@82 248 T2C = FNMS(TY, TW, T2B);
Chris@82 249 }
Chris@82 250 {
Chris@82 251 E T18, T1b, T19, T20, T17, T1a;
Chris@82 252 T18 = cr[WS(rs, 1)];
Chris@82 253 T1b = ci[WS(rs, 1)];
Chris@82 254 T17 = W[0];
Chris@82 255 T19 = T17 * T18;
Chris@82 256 T20 = T17 * T1b;
Chris@82 257 T1a = W[1];
Chris@82 258 T1c = FMA(T1a, T1b, T19);
Chris@82 259 T21 = FNMS(T1a, T18, T20);
Chris@82 260 }
Chris@82 261 {
Chris@82 262 E T12, T15, T13, T1Y, T11, T14;
Chris@82 263 T12 = cr[WS(rs, 11)];
Chris@82 264 T15 = ci[WS(rs, 11)];
Chris@82 265 T11 = W[20];
Chris@82 266 T13 = T11 * T12;
Chris@82 267 T1Y = T11 * T15;
Chris@82 268 T14 = W[21];
Chris@82 269 T16 = FMA(T14, T15, T13);
Chris@82 270 T1Z = FNMS(T14, T12, T1Y);
Chris@82 271 }
Chris@82 272 {
Chris@82 273 E T22, T1d, T1X, T2F, T2D, T2E;
Chris@82 274 T22 = T1Z - T21;
Chris@82 275 T1d = T16 + T1c;
Chris@82 276 T1X = FNMS(KP500000000, T1d, T10);
Chris@82 277 T1e = T10 + T1d;
Chris@82 278 T2h = FMA(KP866025403, T22, T1X);
Chris@82 279 T23 = FNMS(KP866025403, T22, T1X);
Chris@82 280 T2F = T16 - T1c;
Chris@82 281 T2D = T1Z + T21;
Chris@82 282 T2E = FNMS(KP500000000, T2D, T2C);
Chris@82 283 T2G = FNMS(KP866025403, T2F, T2E);
Chris@82 284 T36 = T2C + T2D;
Chris@82 285 T2X = FMA(KP866025403, T2F, T2E);
Chris@82 286 }
Chris@82 287 }
/* Stores: cr[0], ci[5], cr[6], ci[2], cr[3]. */
Chris@82 288 {
Chris@82 289 E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
Chris@82 290 {
Chris@82 291 E T38, T3b, TU, T1z;
Chris@82 292 T38 = T36 - T37;
Chris@82 293 T3b = T39 - T3a;
Chris@82 294 T3c = FNMS(KP618033988, T3b, T38);
Chris@82 295 T3e = FMA(KP618033988, T38, T3b);
Chris@82 296 Tf = T1 + Te;
Chris@82 297 TU = Tz + TT;
Chris@82 298 T1z = T1e + T1y;
Chris@82 299 T1A = TU + T1z;
Chris@82 300 T33 = FNMS(KP250000000, T1A, Tf);
Chris@82 301 T34 = TU - T1z;
Chris@82 302 }
Chris@82 303 cr[0] = Tf + T1A;
Chris@82 304 T3d = FMA(KP559016994, T34, T33);
Chris@82 305 ci[WS(rs, 5)] = FNMS(KP951056516, T3e, T3d);
Chris@82 306 cr[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
Chris@82 307 T35 = FNMS(KP559016994, T34, T33);
Chris@82 308 ci[WS(rs, 2)] = FNMS(KP951056516, T3c, T35);
Chris@82 309 cr[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
Chris@82 310 }
/* Stores: cr[5], cr[2], ci[6], ci[0], ci[3]. */
Chris@82 311 {
Chris@82 312 E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
Chris@82 313 {
Chris@82 314 E T2W, T2Z, T1W, T2b;
Chris@82 315 T2W = T2U - T2V;
Chris@82 316 T2Z = T2X - T2Y;
Chris@82 317 T30 = FMA(KP618033988, T2Z, T2W);
Chris@82 318 T32 = FNMS(KP618033988, T2W, T2Z);
Chris@82 319 T1H = FNMS(KP866025403, T1G, T1B);
Chris@82 320 T1W = T1O + T1V;
Chris@82 321 T2b = T23 + T2a;
Chris@82 322 T2c = T1W + T2b;
Chris@82 323 T2R = FNMS(KP250000000, T2c, T1H);
Chris@82 324 T2S = T1W - T2b;
Chris@82 325 }
Chris@82 326 cr[WS(rs, 5)] = T1H + T2c;
Chris@82 327 T31 = FNMS(KP559016994, T2S, T2R);
Chris@82 328 cr[WS(rs, 2)] = FNMS(KP951056516, T32, T31);
Chris@82 329 ci[WS(rs, 6)] = FMA(KP951056516, T32, T31);
Chris@82 330 T2T = FMA(KP559016994, T2S, T2R);
Chris@82 331 ci[0] = FNMS(KP951056516, T30, T2T);
Chris@82 332 ci[WS(rs, 3)] = FMA(KP951056516, T30, T2T);
Chris@82 333 }
/* Stores: ci[4], cr[4], cr[1], cr[7], ci[1]. */
Chris@82 334 {
Chris@82 335 E T2O, T2Q, T2d, T2k, T2l, T2m, T2n, T2P;
Chris@82 336 {
Chris@82 337 E T2A, T2N, T2g, T2j;
Chris@82 338 T2A = T2t - T2z;
Chris@82 339 T2N = T2G - T2M;
Chris@82 340 T2O = FMA(KP618033988, T2N, T2A);
Chris@82 341 T2Q = FNMS(KP618033988, T2A, T2N);
Chris@82 342 T2d = FMA(KP866025403, T1G, T1B);
Chris@82 343 T2g = T2e + T2f;
Chris@82 344 T2j = T2h + T2i;
Chris@82 345 T2k = T2g + T2j;
Chris@82 346 T2l = FNMS(KP250000000, T2k, T2d);
Chris@82 347 T2m = T2g - T2j;
Chris@82 348 }
Chris@82 349 ci[WS(rs, 4)] = T2d + T2k;
Chris@82 350 T2n = FMA(KP559016994, T2m, T2l);
Chris@82 351 cr[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
Chris@82 352 cr[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
Chris@82 353 T2P = FNMS(KP559016994, T2m, T2l);
Chris@82 354 cr[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
Chris@82 355 ci[WS(rs, 1)] = FMA(KP951056516, T2Q, T2P);
Chris@82 356 }
/* Stores: cr[10], ci[10], ci[13], cr[13], ci[7]. */
Chris@82 357 {
Chris@82 358 E T3s, T3u, T3m, T3h, T3n, T3o, T3t, T3p;
Chris@82 359 {
Chris@82 360 E T3q, T3r, T3f, T3g;
Chris@82 361 T3q = T2h - T2i;
Chris@82 362 T3r = T2e - T2f;
Chris@82 363 T3s = FNMS(KP618033988, T3r, T3q);
Chris@82 364 T3u = FMA(KP618033988, T3q, T3r);
Chris@82 365 T3m = FMA(KP866025403, T3l, T3k);
Chris@82 366 T3f = T2t + T2z;
Chris@82 367 T3g = T2G + T2M;
Chris@82 368 T3h = T3f + T3g;
Chris@82 369 T3n = FNMS(KP250000000, T3h, T3m);
Chris@82 370 T3o = T3f - T3g;
Chris@82 371 }
Chris@82 372 cr[WS(rs, 10)] = -(T3h + T3m);
Chris@82 373 T3t = FMA(KP559016994, T3o, T3n);
Chris@82 374 ci[WS(rs, 10)] = FMA(KP951056516, T3u, T3t);
Chris@82 375 ci[WS(rs, 13)] = FNMS(KP951056516, T3u, T3t);
Chris@82 376 T3p = FNMS(KP559016994, T3o, T3n);
Chris@82 377 cr[WS(rs, 13)] = FMS(KP951056516, T3s, T3p);
Chris@82 378 ci[WS(rs, 7)] = FMA(KP951056516, T3s, T3p);
Chris@82 379 }
/* Stores: ci[14], cr[12], ci[11], cr[9], ci[8]. */
Chris@82 380 {
Chris@82 381 E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
Chris@82 382 {
Chris@82 383 E T3O, T3P, T3I, T3J;
Chris@82 384 T3O = TT - Tz;
Chris@82 385 T3P = T1y - T1e;
Chris@82 386 T3Q = FMA(KP618033988, T3P, T3O);
Chris@82 387 T3S = FNMS(KP618033988, T3O, T3P);
Chris@82 388 T3H = T3j + T3i;
Chris@82 389 T3I = T39 + T3a;
Chris@82 390 T3J = T36 + T37;
Chris@82 391 T3K = T3I + T3J;
Chris@82 392 T3L = FNMS(KP250000000, T3K, T3H);
Chris@82 393 T3M = T3I - T3J;
Chris@82 394 }
Chris@82 395 ci[WS(rs, 14)] = T3K + T3H;
Chris@82 396 T3R = FNMS(KP559016994, T3M, T3L);
Chris@82 397 cr[WS(rs, 12)] = FMS(KP951056516, T3S, T3R);
Chris@82 398 ci[WS(rs, 11)] = FMA(KP951056516, T3S, T3R);
Chris@82 399 T3N = FMA(KP559016994, T3M, T3L);
Chris@82 400 cr[WS(rs, 9)] = FMS(KP951056516, T3Q, T3N);
Chris@82 401 ci[WS(rs, 8)] = FMA(KP951056516, T3Q, T3N);
Chris@82 402 }
/* Stores: ci[9], cr[8], ci[12], cr[11], cr[14]. */
Chris@82 403 {
Chris@82 404 E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
Chris@82 405 {
Chris@82 406 E T3C, T3D, T3w, T3x;
Chris@82 407 T3C = T1O - T1V;
Chris@82 408 T3D = T23 - T2a;
Chris@82 409 T3E = FMA(KP618033988, T3D, T3C);
Chris@82 410 T3G = FNMS(KP618033988, T3C, T3D);
Chris@82 411 T3v = FNMS(KP866025403, T3l, T3k);
Chris@82 412 T3w = T2U + T2V;
Chris@82 413 T3x = T2X + T2Y;
Chris@82 414 T3y = T3w + T3x;
Chris@82 415 T3z = FNMS(KP250000000, T3y, T3v);
Chris@82 416 T3A = T3x - T3w;
Chris@82 417 }
Chris@82 418 ci[WS(rs, 9)] = T3y + T3v;
Chris@82 419 T3F = FMA(KP559016994, T3A, T3z);
Chris@82 420 cr[WS(rs, 8)] = FMS(KP951056516, T3G, T3F);
Chris@82 421 ci[WS(rs, 12)] = FMA(KP951056516, T3G, T3F);
Chris@82 422 T3B = FNMS(KP559016994, T3A, T3z);
Chris@82 423 cr[WS(rs, 11)] = FMS(KP951056516, T3E, T3B);
Chris@82 424 cr[WS(rs, 14)] = -(FMA(KP951056516, T3E, T3B));
Chris@82 425 }
Chris@82 426 }
Chris@82 427 }
Chris@82 428 }
Chris@82 429
/*
 * Two-entry tw_instr table referenced by the hc2hc_desc initialiser that follows.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * the first entry requests the full twiddle set for size 15 and the second
 * terminates the list -- confirm against the tw_instr definition.
 */
Chris@82 430 static const tw_instr twinstr[] = {
Chris@82 431 {TW_FULL, 1, 15},
Chris@82 432 {TW_NEXT, 1, 0}
Chris@82 433 };
Chris@82 434
/*
 * Descriptor passed at registration: 15, the name "hf_15", the twinstr table,
 * &GENUS, and the counts {72, 28, 112, 0}.  The first three counts equal the
 * additions / multiplications / fused multiply-adds quoted in the generator
 * comment for this (FMA) branch.
 */
Chris@82 435 static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {72, 28, 112, 0} };
Chris@82 436
/*
 * Registration entry point for this branch: passes planner p, the hf_15
 * kernel and its descriptor to X(khc2hc_register).  X() is a name-wrapping
 * macro defined outside this file.
 */
Chris@82 437 void X(codelet_hf_15) (planner *p) {
Chris@82 438 X(khc2hc_register) (p, hf_15, &desc);
Chris@82 439 }
Chris@82 440 #else
Chris@82 441
Chris@82 442 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
Chris@82 443
Chris@82 444 /*
Chris@82 445 * This function contains 184 FP additions, 112 FP multiplications,
Chris@82 446 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@82 447 * 65 stack variables, 6 constants, and 60 memory accesses
Chris@82 448 */
Chris@82 449 #include "rdft/scalar/hf.h"
Chris@82 450
/*
 * hf_15 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * The loop runs for m = mb .. me-1.  After every pass cr advances by ms,
 * ci moves back by ms and W advances by 28 entries; W is first offset by
 * (mb - 1) * 28.
 *
 * Each pass:
 *   - reads cr[0] and ci[0] unmodified, and reads the cr/ci pairs at
 *     WS(rs, 1) .. WS(rs, 14), each combined with two consecutive entries
 *     W[2k-2], W[2k-1] for stride index k;
 *   - then overwrites all fifteen cr slots and all fifteen ci slots
 *     (indices 0 .. 14 via WS(rs, k)).  All loads happen before the first
 *     store of the pass.
 *
 * Parameters:
 *   cr, ci  arrays that are both read and written.
 *   W       read-only table (presumably the twiddle factors described by
 *           twinstr -- confirm against the planner code).
 *   rs      stride handed to WS() and MAKE_VOLATILE_STRIDE().
 *   mb, me  first and one-past-last loop index.
 *   ms      per-iteration pointer step for cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA / FNMS / WS / DK / E / R / INT are defined outside this
 * file.  The section comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
Chris@82 451 static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 452 {
/* Numeric values: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2. */
Chris@82 453 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 454 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 455 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 456 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 457 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 458 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 459 {
Chris@82 460 INT m;
Chris@82 461 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* Intermediates shared between the input sections and the output sections. */
Chris@82 462 E T1q, T2Q, Td, T1n, T2T, T3l, T13, T1k, T1l, T2E, T2F, T3j, T1H, T1T, T2k;
Chris@82 463 E T2w, T2f, T2v, T1M, T1U, Tu, TL, TM, T2H, T2I, T3i, T1w, T1Q, T29, T2t;
Chris@82 464 E T24, T2s, T1B, T1R;
/* Inputs 0, 5, 10: cr[0]/ci[0] are taken as-is; strides 5 and 10 are combined with W[8..9] and W[18..19]. */
Chris@82 465 {
Chris@82 466 E T1, T2R, T6, T1o, Tb, T1p, Tc, T2S;
Chris@82 467 T1 = cr[0];
Chris@82 468 T2R = ci[0];
Chris@82 469 {
Chris@82 470 E T3, T5, T2, T4;
Chris@82 471 T3 = cr[WS(rs, 5)];
Chris@82 472 T5 = ci[WS(rs, 5)];
Chris@82 473 T2 = W[8];
Chris@82 474 T4 = W[9];
Chris@82 475 T6 = FMA(T2, T3, T4 * T5);
Chris@82 476 T1o = FNMS(T4, T3, T2 * T5);
Chris@82 477 }
Chris@82 478 {
Chris@82 479 E T8, Ta, T7, T9;
Chris@82 480 T8 = cr[WS(rs, 10)];
Chris@82 481 Ta = ci[WS(rs, 10)];
Chris@82 482 T7 = W[18];
Chris@82 483 T9 = W[19];
Chris@82 484 Tb = FMA(T7, T8, T9 * Ta);
Chris@82 485 T1p = FNMS(T9, T8, T7 * Ta);
Chris@82 486 }
Chris@82 487 T1q = KP866025403 * (T1o - T1p);
Chris@82 488 T2Q = KP866025403 * (Tb - T6);
Chris@82 489 Tc = T6 + Tb;
Chris@82 490 Td = T1 + Tc;
Chris@82 491 T1n = FNMS(KP500000000, Tc, T1);
Chris@82 492 T2S = T1o + T1p;
Chris@82 493 T2T = FNMS(KP500000000, T2S, T2R);
Chris@82 494 T3l = T2S + T2R;
Chris@82 495 }
/* Inputs 6, 9, 11, 1, 14, 4, combined with W[10..11], W[16..17], W[20..21], W[0..1], W[26..27], W[6..7]. */
Chris@82 496 {
Chris@82 497 E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
Chris@82 498 E T2i;
Chris@82 499 {
Chris@82 500 E TO, TQ, TN, TP;
Chris@82 501 TO = cr[WS(rs, 6)];
Chris@82 502 TQ = ci[WS(rs, 6)];
Chris@82 503 TN = W[10];
Chris@82 504 TP = W[11];
Chris@82 505 TR = FMA(TN, TO, TP * TQ);
Chris@82 506 T2c = FNMS(TP, TO, TN * TQ);
Chris@82 507 }
Chris@82 508 {
Chris@82 509 E T15, T17, T14, T16;
Chris@82 510 T15 = cr[WS(rs, 9)];
Chris@82 511 T17 = ci[WS(rs, 9)];
Chris@82 512 T14 = W[16];
Chris@82 513 T16 = W[17];
Chris@82 514 T18 = FMA(T14, T15, T16 * T17);
Chris@82 515 T2h = FNMS(T16, T15, T14 * T17);
Chris@82 516 }
Chris@82 517 {
Chris@82 518 E TT, TV, TS, TU;
Chris@82 519 TT = cr[WS(rs, 11)];
Chris@82 520 TV = ci[WS(rs, 11)];
Chris@82 521 TS = W[20];
Chris@82 522 TU = W[21];
Chris@82 523 TW = FMA(TS, TT, TU * TV);
Chris@82 524 T1E = FNMS(TU, TT, TS * TV);
Chris@82 525 }
Chris@82 526 {
Chris@82 527 E TY, T10, TX, TZ;
Chris@82 528 TY = cr[WS(rs, 1)];
Chris@82 529 T10 = ci[WS(rs, 1)];
Chris@82 530 TX = W[0];
Chris@82 531 TZ = W[1];
Chris@82 532 T11 = FMA(TX, TY, TZ * T10);
Chris@82 533 T1F = FNMS(TZ, TY, TX * T10);
Chris@82 534 }
Chris@82 535 T12 = TW + T11;
Chris@82 536 T2d = T1E + T1F;
Chris@82 537 {
Chris@82 538 E T1a, T1c, T19, T1b;
Chris@82 539 T1a = cr[WS(rs, 14)];
Chris@82 540 T1c = ci[WS(rs, 14)];
Chris@82 541 T19 = W[26];
Chris@82 542 T1b = W[27];
Chris@82 543 T1d = FMA(T19, T1a, T1b * T1c);
Chris@82 544 T1J = FNMS(T1b, T1a, T19 * T1c);
Chris@82 545 }
Chris@82 546 {
Chris@82 547 E T1f, T1h, T1e, T1g;
Chris@82 548 T1f = cr[WS(rs, 4)];
Chris@82 549 T1h = ci[WS(rs, 4)];
Chris@82 550 T1e = W[6];
Chris@82 551 T1g = W[7];
Chris@82 552 T1i = FMA(T1e, T1f, T1g * T1h);
Chris@82 553 T1K = FNMS(T1g, T1f, T1e * T1h);
Chris@82 554 }
Chris@82 555 T1j = T1d + T1i;
Chris@82 556 T2i = T1J + T1K;
Chris@82 557 {
Chris@82 558 E T1D, T1G, T2g, T2j;
Chris@82 559 T13 = TR + T12;
Chris@82 560 T1k = T18 + T1j;
Chris@82 561 T1l = T13 + T1k;
Chris@82 562 T2E = T2c + T2d;
Chris@82 563 T2F = T2h + T2i;
Chris@82 564 T3j = T2E + T2F;
Chris@82 565 T1D = FNMS(KP500000000, T12, TR);
Chris@82 566 T1G = KP866025403 * (T1E - T1F);
Chris@82 567 T1H = T1D - T1G;
Chris@82 568 T1T = T1D + T1G;
Chris@82 569 T2g = KP866025403 * (T1d - T1i);
Chris@82 570 T2j = FNMS(KP500000000, T2i, T2h);
Chris@82 571 T2k = T2g - T2j;
Chris@82 572 T2w = T2g + T2j;
Chris@82 573 {
Chris@82 574 E T2b, T2e, T1I, T1L;
Chris@82 575 T2b = KP866025403 * (T11 - TW);
Chris@82 576 T2e = FNMS(KP500000000, T2d, T2c);
Chris@82 577 T2f = T2b + T2e;
Chris@82 578 T2v = T2e - T2b;
Chris@82 579 T1I = FNMS(KP500000000, T1j, T18);
Chris@82 580 T1L = KP866025403 * (T1J - T1K);
Chris@82 581 T1M = T1I - T1L;
Chris@82 582 T1U = T1I + T1L;
Chris@82 583 }
Chris@82 584 }
Chris@82 585 }
/* Inputs 3, 12, 8, 13, 2, 7, combined with W[4..5], W[22..23], W[14..15], W[24..25], W[2..3], W[12..13]. */
Chris@82 586 {
Chris@82 587 E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
Chris@82 588 E T27;
Chris@82 589 {
Chris@82 590 E Tf, Th, Te, Tg;
Chris@82 591 Tf = cr[WS(rs, 3)];
Chris@82 592 Th = ci[WS(rs, 3)];
Chris@82 593 Te = W[4];
Chris@82 594 Tg = W[5];
Chris@82 595 Ti = FMA(Te, Tf, Tg * Th);
Chris@82 596 T21 = FNMS(Tg, Tf, Te * Th);
Chris@82 597 }
Chris@82 598 {
Chris@82 599 E Tw, Ty, Tv, Tx;
Chris@82 600 Tw = cr[WS(rs, 12)];
Chris@82 601 Ty = ci[WS(rs, 12)];
Chris@82 602 Tv = W[22];
Chris@82 603 Tx = W[23];
Chris@82 604 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 605 T26 = FNMS(Tx, Tw, Tv * Ty);
Chris@82 606 }
Chris@82 607 {
Chris@82 608 E Tk, Tm, Tj, Tl;
Chris@82 609 Tk = cr[WS(rs, 8)];
Chris@82 610 Tm = ci[WS(rs, 8)];
Chris@82 611 Tj = W[14];
Chris@82 612 Tl = W[15];
Chris@82 613 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@82 614 T1t = FNMS(Tl, Tk, Tj * Tm);
Chris@82 615 }
Chris@82 616 {
Chris@82 617 E Tp, Tr, To, Tq;
Chris@82 618 Tp = cr[WS(rs, 13)];
Chris@82 619 Tr = ci[WS(rs, 13)];
Chris@82 620 To = W[24];
Chris@82 621 Tq = W[25];
Chris@82 622 Ts = FMA(To, Tp, Tq * Tr);
Chris@82 623 T1u = FNMS(Tq, Tp, To * Tr);
Chris@82 624 }
Chris@82 625 Tt = Tn + Ts;
Chris@82 626 T22 = T1t + T1u;
Chris@82 627 {
Chris@82 628 E TB, TD, TA, TC;
Chris@82 629 TB = cr[WS(rs, 2)];
Chris@82 630 TD = ci[WS(rs, 2)];
Chris@82 631 TA = W[2];
Chris@82 632 TC = W[3];
Chris@82 633 TE = FMA(TA, TB, TC * TD);
Chris@82 634 T1y = FNMS(TC, TB, TA * TD);
Chris@82 635 }
Chris@82 636 {
Chris@82 637 E TG, TI, TF, TH;
Chris@82 638 TG = cr[WS(rs, 7)];
Chris@82 639 TI = ci[WS(rs, 7)];
Chris@82 640 TF = W[12];
Chris@82 641 TH = W[13];
Chris@82 642 TJ = FMA(TF, TG, TH * TI);
Chris@82 643 T1z = FNMS(TH, TG, TF * TI);
Chris@82 644 }
Chris@82 645 TK = TE + TJ;
Chris@82 646 T27 = T1y + T1z;
Chris@82 647 {
Chris@82 648 E T1s, T1v, T25, T28;
Chris@82 649 Tu = Ti + Tt;
Chris@82 650 TL = Tz + TK;
Chris@82 651 TM = Tu + TL;
Chris@82 652 T2H = T21 + T22;
Chris@82 653 T2I = T26 + T27;
Chris@82 654 T3i = T2H + T2I;
Chris@82 655 T1s = FNMS(KP500000000, Tt, Ti);
Chris@82 656 T1v = KP866025403 * (T1t - T1u);
Chris@82 657 T1w = T1s - T1v;
Chris@82 658 T1Q = T1s + T1v;
Chris@82 659 T25 = KP866025403 * (TJ - TE);
Chris@82 660 T28 = FNMS(KP500000000, T27, T26);
Chris@82 661 T29 = T25 + T28;
Chris@82 662 T2t = T28 - T25;
Chris@82 663 {
Chris@82 664 E T20, T23, T1x, T1A;
Chris@82 665 T20 = KP866025403 * (Ts - Tn);
Chris@82 666 T23 = FNMS(KP500000000, T22, T21);
Chris@82 667 T24 = T20 + T23;
Chris@82 668 T2s = T23 - T20;
Chris@82 669 T1x = FNMS(KP500000000, TK, Tz);
Chris@82 670 T1A = KP866025403 * (T1y - T1z);
Chris@82 671 T1B = T1x - T1A;
Chris@82 672 T1R = T1x + T1A;
Chris@82 673 }
Chris@82 674 }
Chris@82 675 }
/* Stores: cr[0], ci[5], cr[6], ci[2], cr[3]. */
Chris@82 676 {
Chris@82 677 E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
Chris@82 678 T2C = KP559016994 * (TM - T1l);
Chris@82 679 T1m = TM + T1l;
Chris@82 680 T2B = FNMS(KP250000000, T1m, Td);
Chris@82 681 T2G = T2E - T2F;
Chris@82 682 T2J = T2H - T2I;
Chris@82 683 T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
Chris@82 684 T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
Chris@82 685 cr[0] = Td + T1m;
Chris@82 686 T2L = T2C + T2B;
Chris@82 687 ci[WS(rs, 5)] = T2L - T2M;
Chris@82 688 cr[WS(rs, 6)] = T2L + T2M;
Chris@82 689 T2D = T2B - T2C;
Chris@82 690 ci[WS(rs, 2)] = T2D - T2K;
Chris@82 691 cr[WS(rs, 3)] = T2D + T2K;
Chris@82 692 }
/* Stores: ci[14], cr[12], ci[11], cr[9], ci[8]. */
Chris@82 693 {
Chris@82 694 E T3k, T3m, T3n, T3h, T3p, T3f, T3g, T3q, T3o;
Chris@82 695 T3k = KP559016994 * (T3i - T3j);
Chris@82 696 T3m = T3i + T3j;
Chris@82 697 T3n = FNMS(KP250000000, T3m, T3l);
Chris@82 698 T3f = T1k - T13;
Chris@82 699 T3g = Tu - TL;
Chris@82 700 T3h = FNMS(KP951056516, T3g, KP587785252 * T3f);
Chris@82 701 T3p = FMA(KP587785252, T3g, KP951056516 * T3f);
Chris@82 702 ci[WS(rs, 14)] = T3m + T3l;
Chris@82 703 T3q = T3n - T3k;
Chris@82 704 cr[WS(rs, 12)] = T3p - T3q;
Chris@82 705 ci[WS(rs, 11)] = T3p + T3q;
Chris@82 706 T3o = T3k + T3n;
Chris@82 707 cr[WS(rs, 9)] = T3h - T3o;
Chris@82 708 ci[WS(rs, 8)] = T3h + T3o;
Chris@82 709 }
/* Stores: cr[5], cr[2], ci[6], ci[0], ci[3]. */
Chris@82 710 {
Chris@82 711 E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
Chris@82 712 {
Chris@82 713 E T2u, T2x, T1C, T1N;
Chris@82 714 T2u = T2s - T2t;
Chris@82 715 T2x = T2v - T2w;
Chris@82 716 T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
Chris@82 717 T2A = FNMS(KP587785252, T2u, KP951056516 * T2x);
Chris@82 718 T1r = T1n - T1q;
Chris@82 719 T1C = T1w + T1B;
Chris@82 720 T1N = T1H + T1M;
Chris@82 721 T1O = T1C + T1N;
Chris@82 722 T2p = KP559016994 * (T1C - T1N);
Chris@82 723 T2q = FNMS(KP250000000, T1O, T1r);
Chris@82 724 }
Chris@82 725 cr[WS(rs, 5)] = T1r + T1O;
Chris@82 726 T2z = T2q - T2p;
Chris@82 727 cr[WS(rs, 2)] = T2z - T2A;
Chris@82 728 ci[WS(rs, 6)] = T2z + T2A;
Chris@82 729 T2r = T2p + T2q;
Chris@82 730 ci[0] = T2r - T2y;
Chris@82 731 ci[WS(rs, 3)] = T2r + T2y;
Chris@82 732 }
/* Stores: ci[9], cr[8], ci[12], cr[11], cr[14]. */
Chris@82 733 {
Chris@82 734 E T35, T3d, T39, T3a, T38, T3b, T3e, T3c;
Chris@82 735 {
Chris@82 736 E T33, T34, T36, T37;
Chris@82 737 T33 = T1w - T1B;
Chris@82 738 T34 = T1H - T1M;
Chris@82 739 T35 = FMA(KP951056516, T33, KP587785252 * T34);
Chris@82 740 T3d = FNMS(KP587785252, T33, KP951056516 * T34);
Chris@82 741 T39 = T2T - T2Q;
Chris@82 742 T36 = T2v + T2w;
Chris@82 743 T37 = T2s + T2t;
Chris@82 744 T3a = T37 + T36;
Chris@82 745 T38 = KP559016994 * (T36 - T37);
Chris@82 746 T3b = FNMS(KP250000000, T3a, T39);
Chris@82 747 }
Chris@82 748 ci[WS(rs, 9)] = T3a + T39;
Chris@82 749 T3e = T38 + T3b;
Chris@82 750 cr[WS(rs, 8)] = T3d - T3e;
Chris@82 751 ci[WS(rs, 12)] = T3d + T3e;
Chris@82 752 T3c = T38 - T3b;
Chris@82 753 cr[WS(rs, 11)] = T35 + T3c;
Chris@82 754 cr[WS(rs, 14)] = T3c - T35;
Chris@82 755 }
/* Stores: cr[10], ci[10], ci[13], cr[13], ci[7]. */
Chris@82 756 {
Chris@82 757 E T2X, T31, T2U, T2P, T2Y, T2Z, T32, T30;
Chris@82 758 {
Chris@82 759 E T2V, T2W, T2N, T2O;
Chris@82 760 T2V = T1T - T1U;
Chris@82 761 T2W = T1Q - T1R;
Chris@82 762 T2X = FNMS(KP587785252, T2W, KP951056516 * T2V);
Chris@82 763 T31 = FMA(KP951056516, T2W, KP587785252 * T2V);
Chris@82 764 T2U = T2Q + T2T;
Chris@82 765 T2N = T2k - T2f;
Chris@82 766 T2O = T24 + T29;
Chris@82 767 T2P = T2N - T2O;
Chris@82 768 T2Y = FMA(KP250000000, T2P, T2U);
Chris@82 769 T2Z = KP559016994 * (T2O + T2N);
Chris@82 770 }
Chris@82 771 cr[WS(rs, 10)] = T2P - T2U;
Chris@82 772 T32 = T2Z + T2Y;
Chris@82 773 ci[WS(rs, 10)] = T31 + T32;
Chris@82 774 ci[WS(rs, 13)] = T32 - T31;
Chris@82 775 T30 = T2Y - T2Z;
Chris@82 776 cr[WS(rs, 13)] = T2X - T30;
Chris@82 777 ci[WS(rs, 7)] = T2X + T30;
Chris@82 778 }
/* Stores: ci[4], cr[4], cr[1], cr[7], ci[1]. */
Chris@82 779 {
Chris@82 780 E T2m, T2o, T1P, T1W, T1X, T1Y, T1Z, T2n;
Chris@82 781 {
Chris@82 782 E T2a, T2l, T1S, T1V;
Chris@82 783 T2a = T24 - T29;
Chris@82 784 T2l = T2f + T2k;
Chris@82 785 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
Chris@82 786 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
Chris@82 787 T1P = T1n + T1q;
Chris@82 788 T1S = T1Q + T1R;
Chris@82 789 T1V = T1T + T1U;
Chris@82 790 T1W = T1S + T1V;
Chris@82 791 T1X = KP559016994 * (T1S - T1V);
Chris@82 792 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@82 793 }
Chris@82 794 ci[WS(rs, 4)] = T1P + T1W;
Chris@82 795 T1Z = T1X + T1Y;
Chris@82 796 cr[WS(rs, 4)] = T1Z - T2m;
Chris@82 797 cr[WS(rs, 1)] = T1Z + T2m;
Chris@82 798 T2n = T1Y - T1X;
Chris@82 799 cr[WS(rs, 7)] = T2n - T2o;
Chris@82 800 ci[WS(rs, 1)] = T2n + T2o;
Chris@82 801 }
Chris@82 802 }
Chris@82 803 }
Chris@82 804 }
Chris@82 805
/*
 * Two-entry tw_instr table referenced by the hc2hc_desc initialiser that follows.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * the first entry requests the full twiddle set for size 15 and the second
 * terminates the list -- confirm against the tw_instr definition.
 */
Chris@82 806 static const tw_instr twinstr[] = {
Chris@82 807 {TW_FULL, 1, 15},
Chris@82 808 {TW_NEXT, 1, 0}
Chris@82 809 };
Chris@82 810
/*
 * Descriptor passed at registration: 15, the name "hf_15", the twinstr table,
 * &GENUS, and the counts {128, 56, 56, 0}.  The first three counts equal the
 * additions / multiplications / fused multiply-adds quoted in the generator
 * comment for this (non-FMA) branch.
 */
Chris@82 811 static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, {128, 56, 56, 0} };
Chris@82 812
/*
 * Registration entry point for this branch: passes planner p, the hf_15
 * kernel and its descriptor to X(khc2hc_register).  X() is a name-wrapping
 * macro defined outside this file.
 */
Chris@82 813 void X(codelet_hf_15) (planner *p) {
Chris@82 814 X(khc2hc_register) (p, hf_15, &desc);
Chris@82 815 }
Chris@82 816 #endif