annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:32 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@82 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@82 33 * 78 stack variables, 6 constants, and 60 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb_15 (FMA variant): size-15 codelet emitted by gen_hc2hc with
 * "-fma -sign 1 -n 15 -dif" (see the "Generated by" comment above).
 *
 * For every m in [mb, me) the loop reads 15 values from cr and 15 values
 * from ci (element k is addressed as WS(rs, k)), combines them, and
 * overwrites the same 30 slots in place.  Index 0 is stored directly;
 * every index k in 1..14 is stored after being combined with the
 * coefficient pair W[2k-2], W[2k-1].
 *
 * Per iteration cr advances by ms, ci moves back by ms and W advances by
 * 28 entries; W is offset by (mb - 1) * 28 before the first iteration.
 *
 * Parameters:
 *   cr, ci  - data arrays, read and written in place
 *   W       - coefficient table, read only
 *   rs      - stride handed to WS() for element addressing
 *   mb, me  - half-open iteration range
 *   ms      - per-iteration step applied to cr (+ms) and ci (-ms)
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined in project
 * headers that are not visible here; presumably a*b + c and c - a*b --
 * confirm before relying on the formulas.
 */
Chris@82 37 static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180); /* numerically (sqrt(5)-1)/2 */
Chris@82 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@82 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@82 45 {
Chris@82 46 INT m;
Chris@82 47 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@82 48 E T5, T11, T1C, T2U, T2f, T3f, TH, T19, T18, TS, T12, T13, T14, T3a, T3g;
Chris@82 49 E Ts, Tv, T37, T3h, T28, T2h, T21, T2g, T2V, T2W, T2X, T2Y, T2Z, T30, T31;
Chris@82 50 E T1F, T1I, T1J, T1M, T1P, T1Q, T1R;
/* Load cr[0], ci[14], cr[5], ci[4], ci[9], cr[10] and form their 3-term combinations. */
Chris@82 51 {
Chris@82 52 E T1, TX, T4, T2e, T10, T1B, T1A, T2d;
Chris@82 53 T1 = cr[0];
Chris@82 54 TX = ci[WS(rs, 14)];
Chris@82 55 {
Chris@82 56 E T2, T3, TY, TZ;
Chris@82 57 T2 = cr[WS(rs, 5)];
Chris@82 58 T3 = ci[WS(rs, 4)];
Chris@82 59 T4 = T2 + T3;
Chris@82 60 T2e = T2 - T3;
Chris@82 61 TY = ci[WS(rs, 9)];
Chris@82 62 TZ = cr[WS(rs, 10)];
Chris@82 63 T10 = TY - TZ;
Chris@82 64 T1B = TY + TZ;
Chris@82 65 }
Chris@82 66 T5 = T1 + T4;
Chris@82 67 T11 = TX + T10;
Chris@82 68 T1A = FNMS(KP500000000, T4, T1);
Chris@82 69 T1C = FNMS(KP866025403, T1B, T1A);
Chris@82 70 T2U = FMA(KP866025403, T1B, T1A);
Chris@82 71 T2d = FNMS(KP500000000, T10, TX);
Chris@82 72 T2f = FMA(KP866025403, T2e, T2d);
Chris@82 73 T3f = FNMS(KP866025403, T2e, T2d);
Chris@82 74 }
/* Load the remaining 24 inputs, three per inner block, and form their sums and differences. */
Chris@82 75 {
Chris@82 76 E Ta, T1W, T1D, Tl, T23, T1K, Tf, T1Z, T1G, TR, T1Y, T1H, Tq, T26, T1N;
Chris@82 77 E TG, T25, T1O, TM, T1V, T1E, TB, T22, T1L, T38, T39;
Chris@82 78 {
Chris@82 79 E T6, T7, T8, T9;
Chris@82 80 T6 = cr[WS(rs, 3)];
Chris@82 81 T7 = ci[WS(rs, 6)];
Chris@82 82 T8 = ci[WS(rs, 1)];
Chris@82 83 T9 = T7 + T8;
Chris@82 84 Ta = T6 + T9;
Chris@82 85 T1W = T7 - T8;
Chris@82 86 T1D = FNMS(KP500000000, T9, T6);
Chris@82 87 }
Chris@82 88 {
Chris@82 89 E Th, Ti, Tj, Tk;
Chris@82 90 Th = cr[WS(rs, 6)];
Chris@82 91 Ti = ci[WS(rs, 3)];
Chris@82 92 Tj = cr[WS(rs, 1)];
Chris@82 93 Tk = Ti + Tj;
Chris@82 94 Tl = Th + Tk;
Chris@82 95 T23 = Ti - Tj;
Chris@82 96 T1K = FNMS(KP500000000, Tk, Th);
Chris@82 97 }
Chris@82 98 {
Chris@82 99 E Tb, Tc, Td, Te;
Chris@82 100 Tb = ci[WS(rs, 2)];
Chris@82 101 Tc = cr[WS(rs, 2)];
Chris@82 102 Td = cr[WS(rs, 7)];
Chris@82 103 Te = Tc + Td;
Chris@82 104 Tf = Tb + Te;
Chris@82 105 T1Z = Tc - Td;
Chris@82 106 T1G = FNMS(KP500000000, Te, Tb);
Chris@82 107 }
Chris@82 108 {
Chris@82 109 E TQ, TN, TO, TP;
Chris@82 110 TQ = cr[WS(rs, 12)];
Chris@82 111 TN = ci[WS(rs, 12)];
Chris@82 112 TO = ci[WS(rs, 7)];
Chris@82 113 TP = TN + TO;
Chris@82 114 TR = TP - TQ;
Chris@82 115 T1Y = FMA(KP500000000, TP, TQ);
Chris@82 116 T1H = TO - TN;
Chris@82 117 }
Chris@82 118 {
Chris@82 119 E Tm, Tn, To, Tp;
Chris@82 120 Tm = ci[WS(rs, 5)];
Chris@82 121 Tn = ci[0];
Chris@82 122 To = cr[WS(rs, 4)];
Chris@82 123 Tp = Tn + To;
Chris@82 124 Tq = Tm + Tp;
Chris@82 125 T26 = Tn - To;
Chris@82 126 T1N = FNMS(KP500000000, Tp, Tm);
Chris@82 127 }
Chris@82 128 {
Chris@82 129 E TF, TC, TD, TE;
Chris@82 130 TF = cr[WS(rs, 9)];
Chris@82 131 TC = ci[WS(rs, 10)];
Chris@82 132 TD = cr[WS(rs, 14)];
Chris@82 133 TE = TC - TD;
Chris@82 134 TG = TE - TF;
Chris@82 135 T25 = FMA(KP500000000, TE, TF);
Chris@82 136 T1O = TC + TD;
Chris@82 137 }
Chris@82 138 {
Chris@82 139 E TI, TJ, TK, TL;
Chris@82 140 TI = ci[WS(rs, 11)];
Chris@82 141 TJ = cr[WS(rs, 8)];
Chris@82 142 TK = cr[WS(rs, 13)];
Chris@82 143 TL = TJ + TK;
Chris@82 144 TM = TI - TL;
Chris@82 145 T1V = FMA(KP500000000, TL, TI);
Chris@82 146 T1E = TJ - TK;
Chris@82 147 }
Chris@82 148 {
Chris@82 149 E Tx, Ty, Tz, TA;
Chris@82 150 Tx = ci[WS(rs, 8)];
Chris@82 151 Ty = ci[WS(rs, 13)];
Chris@82 152 Tz = cr[WS(rs, 11)];
Chris@82 153 TA = Ty - Tz;
Chris@82 154 TB = Tx + TA;
Chris@82 155 T22 = FNMS(KP500000000, TA, Tx);
Chris@82 156 T1L = Ty + Tz;
Chris@82 157 }
Chris@82 158 TH = TB - TG;
Chris@82 159 T19 = Ta - Tf;
Chris@82 160 T18 = Tl - Tq;
Chris@82 161 TS = TM - TR;
Chris@82 162 T12 = TM + TR;
Chris@82 163 T13 = TB + TG;
Chris@82 164 T14 = T12 + T13;
Chris@82 165 T38 = FNMS(KP866025403, T1W, T1V);
Chris@82 166 T39 = FMA(KP866025403, T1Z, T1Y);
Chris@82 167 T3a = T38 + T39;
Chris@82 168 T3g = T38 - T39;
Chris@82 169 {
Chris@82 170 E Tg, Tr, T1X, T20;
Chris@82 171 Tg = Ta + Tf;
Chris@82 172 Tr = Tl + Tq;
Chris@82 173 Ts = Tg + Tr;
Chris@82 174 Tv = Tg - Tr;
Chris@82 175 {
Chris@82 176 E T35, T36, T24, T27;
Chris@82 177 T35 = FNMS(KP866025403, T23, T22);
Chris@82 178 T36 = FMA(KP866025403, T26, T25);
Chris@82 179 T37 = T35 + T36;
Chris@82 180 T3h = T35 - T36;
Chris@82 181 T24 = FMA(KP866025403, T23, T22);
Chris@82 182 T27 = FNMS(KP866025403, T26, T25);
Chris@82 183 T28 = T24 + T27;
Chris@82 184 T2h = T24 - T27;
Chris@82 185 }
Chris@82 186 T1X = FMA(KP866025403, T1W, T1V);
Chris@82 187 T20 = FNMS(KP866025403, T1Z, T1Y);
Chris@82 188 T21 = T1X + T20;
Chris@82 189 T2g = T1X - T20;
Chris@82 190 T2V = FNMS(KP866025403, T1E, T1D);
Chris@82 191 T2W = FNMS(KP866025403, T1H, T1G);
Chris@82 192 T2X = T2V + T2W;
Chris@82 193 T2Y = FNMS(KP866025403, T1L, T1K);
Chris@82 194 T2Z = FNMS(KP866025403, T1O, T1N);
Chris@82 195 T30 = T2Y + T2Z;
Chris@82 196 T31 = T2X + T30;
Chris@82 197 T1F = FMA(KP866025403, T1E, T1D);
Chris@82 198 T1I = FMA(KP866025403, T1H, T1G);
Chris@82 199 T1J = T1F + T1I;
Chris@82 200 T1M = FMA(KP866025403, T1L, T1K);
Chris@82 201 T1P = FMA(KP866025403, T1O, T1N);
Chris@82 202 T1Q = T1M + T1P;
Chris@82 203 T1R = T1J + T1Q;
Chris@82 204 }
Chris@82 205 }
/* Index 0 is stored without any W coefficient. */
Chris@82 206 cr[0] = T5 + Ts;
Chris@82 207 ci[0] = T11 + T14;
/* Outputs at indices 3, 9, 12 and 6 (W[4..5], W[16..17], W[22..23], W[10..11]). */
Chris@82 208 {
Chris@82 209 E T1a, T1q, T17, T1p, TU, T1u, T1e, T1m, T15, T16;
Chris@82 210 T1a = FNMS(KP618033988, T19, T18);
Chris@82 211 T1q = FMA(KP618033988, T18, T19);
Chris@82 212 T15 = FNMS(KP250000000, T14, T11);
Chris@82 213 T16 = T12 - T13;
Chris@82 214 T17 = FNMS(KP559016994, T16, T15);
Chris@82 215 T1p = FMA(KP559016994, T16, T15);
Chris@82 216 {
Chris@82 217 E TT, T1l, Tw, T1k, Tu;
Chris@82 218 TT = FNMS(KP618033988, TS, TH);
Chris@82 219 T1l = FMA(KP618033988, TH, TS);
Chris@82 220 Tu = FNMS(KP250000000, Ts, T5);
Chris@82 221 Tw = FNMS(KP559016994, Tv, Tu);
Chris@82 222 T1k = FMA(KP559016994, Tv, Tu);
Chris@82 223 TU = FNMS(KP951056516, TT, Tw);
Chris@82 224 T1u = FMA(KP951056516, T1l, T1k);
Chris@82 225 T1e = FMA(KP951056516, TT, Tw);
Chris@82 226 T1m = FNMS(KP951056516, T1l, T1k);
Chris@82 227 }
Chris@82 228 {
Chris@82 229 E T1b, TW, T1c, Tt, TV;
Chris@82 230 T1b = FMA(KP951056516, T1a, T17);
Chris@82 231 TW = W[5];
Chris@82 232 T1c = TW * TU;
Chris@82 233 Tt = W[4];
Chris@82 234 TV = Tt * TU;
Chris@82 235 cr[WS(rs, 3)] = FNMS(TW, T1b, TV);
Chris@82 236 ci[WS(rs, 3)] = FMA(Tt, T1b, T1c);
Chris@82 237 }
Chris@82 238 {
Chris@82 239 E T1x, T1w, T1y, T1t, T1v;
Chris@82 240 T1x = FNMS(KP951056516, T1q, T1p);
Chris@82 241 T1w = W[17];
Chris@82 242 T1y = T1w * T1u;
Chris@82 243 T1t = W[16];
Chris@82 244 T1v = T1t * T1u;
Chris@82 245 cr[WS(rs, 9)] = FNMS(T1w, T1x, T1v);
Chris@82 246 ci[WS(rs, 9)] = FMA(T1t, T1x, T1y);
Chris@82 247 }
Chris@82 248 {
Chris@82 249 E T1h, T1g, T1i, T1d, T1f;
Chris@82 250 T1h = FNMS(KP951056516, T1a, T17);
Chris@82 251 T1g = W[23];
Chris@82 252 T1i = T1g * T1e;
Chris@82 253 T1d = W[22];
Chris@82 254 T1f = T1d * T1e;
Chris@82 255 cr[WS(rs, 12)] = FNMS(T1g, T1h, T1f);
Chris@82 256 ci[WS(rs, 12)] = FMA(T1d, T1h, T1i);
Chris@82 257 }
Chris@82 258 {
Chris@82 259 E T1r, T1o, T1s, T1j, T1n;
Chris@82 260 T1r = FMA(KP951056516, T1q, T1p);
Chris@82 261 T1o = W[11];
Chris@82 262 T1s = T1o * T1m;
Chris@82 263 T1j = W[10];
Chris@82 264 T1n = T1j * T1m;
Chris@82 265 cr[WS(rs, 6)] = FNMS(T1o, T1r, T1n);
Chris@82 266 ci[WS(rs, 6)] = FMA(T1j, T1r, T1s);
Chris@82 267 }
Chris@82 268 }
/* Outputs at indices 10, 1, 13, 7 and 4. */
Chris@82 269 {
Chris@82 270 E T2o, T2E, T2N, T2P, T2Q, T2S, T2l, T2R, T2D, T2a, T2I, T2s, T2A;
Chris@82 271 {
Chris@82 272 E T2m, T2n, T2O, T2k, T2i, T2j;
Chris@82 273 T2m = T1F - T1I;
Chris@82 274 T2n = T1M - T1P;
Chris@82 275 T2o = FMA(KP618033988, T2n, T2m);
Chris@82 276 T2E = FNMS(KP618033988, T2m, T2n);
Chris@82 277 T2O = T1C + T1R;
Chris@82 278 T2N = W[18];
Chris@82 279 T2P = T2N * T2O;
Chris@82 280 T2Q = W[19];
Chris@82 281 T2S = T2Q * T2O;
Chris@82 282 T2k = T2g - T2h;
Chris@82 283 T2i = T2g + T2h;
Chris@82 284 T2j = FNMS(KP250000000, T2i, T2f);
Chris@82 285 T2l = FMA(KP559016994, T2k, T2j);
Chris@82 286 T2R = T2f + T2i;
Chris@82 287 T2D = FNMS(KP559016994, T2k, T2j);
Chris@82 288 {
Chris@82 289 E T29, T2z, T1U, T2y, T1S, T1T;
Chris@82 290 T29 = FMA(KP618033988, T28, T21);
Chris@82 291 T2z = FNMS(KP618033988, T21, T28);
Chris@82 292 T1S = FNMS(KP250000000, T1R, T1C);
Chris@82 293 T1T = T1J - T1Q;
Chris@82 294 T1U = FMA(KP559016994, T1T, T1S);
Chris@82 295 T2y = FNMS(KP559016994, T1T, T1S);
Chris@82 296 T2a = FNMS(KP951056516, T29, T1U);
Chris@82 297 T2I = FNMS(KP951056516, T2z, T2y);
Chris@82 298 T2s = FMA(KP951056516, T29, T1U);
Chris@82 299 T2A = FMA(KP951056516, T2z, T2y);
Chris@82 300 }
Chris@82 301 }
Chris@82 302 cr[WS(rs, 10)] = FNMS(T2Q, T2R, T2P);
Chris@82 303 ci[WS(rs, 10)] = FMA(T2N, T2R, T2S);
Chris@82 304 {
Chris@82 305 E T2p, T2c, T2q, T1z, T2b;
Chris@82 306 T2p = FMA(KP951056516, T2o, T2l);
Chris@82 307 T2c = W[1];
Chris@82 308 T2q = T2c * T2a;
Chris@82 309 T1z = W[0];
Chris@82 310 T2b = T1z * T2a;
Chris@82 311 cr[WS(rs, 1)] = FNMS(T2c, T2p, T2b);
Chris@82 312 ci[WS(rs, 1)] = FMA(T1z, T2p, T2q);
Chris@82 313 }
Chris@82 314 {
Chris@82 315 E T2L, T2K, T2M, T2H, T2J;
Chris@82 316 T2L = FMA(KP951056516, T2E, T2D);
Chris@82 317 T2K = W[25];
Chris@82 318 T2M = T2K * T2I;
Chris@82 319 T2H = W[24];
Chris@82 320 T2J = T2H * T2I;
Chris@82 321 cr[WS(rs, 13)] = FNMS(T2K, T2L, T2J);
Chris@82 322 ci[WS(rs, 13)] = FMA(T2H, T2L, T2M);
Chris@82 323 }
Chris@82 324 {
Chris@82 325 E T2F, T2C, T2G, T2x, T2B;
Chris@82 326 T2F = FNMS(KP951056516, T2E, T2D);
Chris@82 327 T2C = W[13];
Chris@82 328 T2G = T2C * T2A;
Chris@82 329 T2x = W[12];
Chris@82 330 T2B = T2x * T2A;
Chris@82 331 cr[WS(rs, 7)] = FNMS(T2C, T2F, T2B);
Chris@82 332 ci[WS(rs, 7)] = FMA(T2x, T2F, T2G);
Chris@82 333 }
Chris@82 334 {
Chris@82 335 E T2v, T2u, T2w, T2r, T2t;
Chris@82 336 T2v = FNMS(KP951056516, T2o, T2l);
Chris@82 337 T2u = W[7];
Chris@82 338 T2w = T2u * T2s;
Chris@82 339 T2r = W[6];
Chris@82 340 T2t = T2r * T2s;
Chris@82 341 cr[WS(rs, 4)] = FNMS(T2u, T2v, T2t);
Chris@82 342 ci[WS(rs, 4)] = FMA(T2r, T2v, T2w);
Chris@82 343 }
Chris@82 344 }
/* Outputs at indices 5, 2, 14, 11 and 8. */
Chris@82 345 {
Chris@82 346 E T3o, T3E, T3N, T3P, T3Q, T3S, T3l, T3R, T3D, T3c, T3I, T3s, T3A;
Chris@82 347 {
Chris@82 348 E T3m, T3n, T3O, T3k, T3i, T3j;
Chris@82 349 T3m = T2Y - T2Z;
Chris@82 350 T3n = T2V - T2W;
Chris@82 351 T3o = FNMS(KP618033988, T3n, T3m);
Chris@82 352 T3E = FMA(KP618033988, T3m, T3n);
Chris@82 353 T3O = T2U + T31;
Chris@82 354 T3N = W[8];
Chris@82 355 T3P = T3N * T3O;
Chris@82 356 T3Q = W[9];
Chris@82 357 T3S = T3Q * T3O;
Chris@82 358 T3k = T3g - T3h;
Chris@82 359 T3i = T3g + T3h;
Chris@82 360 T3j = FNMS(KP250000000, T3i, T3f);
Chris@82 361 T3l = FNMS(KP559016994, T3k, T3j);
Chris@82 362 T3R = T3f + T3i;
Chris@82 363 T3D = FMA(KP559016994, T3k, T3j);
Chris@82 364 {
Chris@82 365 E T3b, T3z, T34, T3y, T32, T33;
Chris@82 366 T3b = FNMS(KP618033988, T3a, T37);
Chris@82 367 T3z = FMA(KP618033988, T37, T3a);
Chris@82 368 T32 = FNMS(KP250000000, T31, T2U);
Chris@82 369 T33 = T2X - T30;
Chris@82 370 T34 = FNMS(KP559016994, T33, T32);
Chris@82 371 T3y = FMA(KP559016994, T33, T32);
Chris@82 372 T3c = FMA(KP951056516, T3b, T34);
Chris@82 373 T3I = FMA(KP951056516, T3z, T3y);
Chris@82 374 T3s = FNMS(KP951056516, T3b, T34);
Chris@82 375 T3A = FNMS(KP951056516, T3z, T3y);
Chris@82 376 }
Chris@82 377 }
Chris@82 378 cr[WS(rs, 5)] = FNMS(T3Q, T3R, T3P);
Chris@82 379 ci[WS(rs, 5)] = FMA(T3N, T3R, T3S);
Chris@82 380 {
Chris@82 381 E T3p, T3e, T3q, T2T, T3d;
Chris@82 382 T3p = FNMS(KP951056516, T3o, T3l);
Chris@82 383 T3e = W[3];
Chris@82 384 T3q = T3e * T3c;
Chris@82 385 T2T = W[2];
Chris@82 386 T3d = T2T * T3c;
Chris@82 387 cr[WS(rs, 2)] = FNMS(T3e, T3p, T3d);
Chris@82 388 ci[WS(rs, 2)] = FMA(T2T, T3p, T3q);
Chris@82 389 }
Chris@82 390 {
Chris@82 391 E T3L, T3K, T3M, T3H, T3J;
Chris@82 392 T3L = FNMS(KP951056516, T3E, T3D);
Chris@82 393 T3K = W[27];
Chris@82 394 T3M = T3K * T3I;
Chris@82 395 T3H = W[26];
Chris@82 396 T3J = T3H * T3I;
Chris@82 397 cr[WS(rs, 14)] = FNMS(T3K, T3L, T3J);
Chris@82 398 ci[WS(rs, 14)] = FMA(T3H, T3L, T3M);
Chris@82 399 }
Chris@82 400 {
Chris@82 401 E T3F, T3C, T3G, T3x, T3B;
Chris@82 402 T3F = FMA(KP951056516, T3E, T3D);
Chris@82 403 T3C = W[21];
Chris@82 404 T3G = T3C * T3A;
Chris@82 405 T3x = W[20];
Chris@82 406 T3B = T3x * T3A;
Chris@82 407 cr[WS(rs, 11)] = FNMS(T3C, T3F, T3B);
Chris@82 408 ci[WS(rs, 11)] = FMA(T3x, T3F, T3G);
Chris@82 409 }
Chris@82 410 {
Chris@82 411 E T3v, T3u, T3w, T3r, T3t;
Chris@82 412 T3v = FMA(KP951056516, T3o, T3l);
Chris@82 413 T3u = W[15];
Chris@82 414 T3w = T3u * T3s;
Chris@82 415 T3r = W[14];
Chris@82 416 T3t = T3r * T3s;
Chris@82 417 cr[WS(rs, 8)] = FNMS(T3u, T3v, T3t);
Chris@82 418 ci[WS(rs, 8)] = FMA(T3r, T3v, T3w);
Chris@82 419 }
Chris@82 420 }
Chris@82 421 }
Chris@82 422 }
Chris@82 423 }
Chris@82 424
/*
 * Two-entry tw_instr table referenced by desc for this codelet.
 * NOTE(review): presumably TW_FULL with size 15 requests the 14 complex
 * coefficients (28 reals, matching the W step of 28 in hb_15) and TW_NEXT
 * ends the list -- confirm against the tw_instr definition.
 */
Chris@82 425 static const tw_instr twinstr[] = {
Chris@82 426 {TW_FULL, 1, 15},
Chris@82 427 {TW_NEXT, 1, 0}
Chris@82 428 };
Chris@82 429
/*
 * Descriptor for the FMA variant: size 15, name "hb_15", the twinstr table,
 * the GENUS object, and four counts whose first three match the header
 * comment (72 additions, 28 multiplications, 112 fused multiply/adds).
 */
Chris@82 430 static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {72, 28, 112, 0} };
Chris@82 431
/* Registers the FMA hb_15 kernel and its descriptor with planner p via X(khc2hc_register). */
Chris@82 432 void X(codelet_hb_15) (planner *p) {
Chris@82 433 X(khc2hc_register) (p, hb_15, &desc);
Chris@82 434 }
Chris@82 435 #else
Chris@82 436
Chris@82 437 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include rdft/scalar/hb.h */
Chris@82 438
Chris@82 439 /*
Chris@82 440 * This function contains 184 FP additions, 112 FP multiplications,
Chris@82 441 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@82 442 * 75 stack variables, 6 constants, and 60 memory accesses
Chris@82 443 */
Chris@82 444 #include "rdft/scalar/hb.h"
Chris@82 445
/*
 * hb_15 (non-FMA variant): size-15 codelet emitted by gen_hc2hc with
 * "-sign 1 -n 15 -dif" (see the "Generated by" comment above).  It is
 * compiled only when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop reads 15 values from cr and 15 values
 * from ci (element k is addressed as WS(rs, k)), combines them, and
 * overwrites the same 30 slots in place.  Index 0 is stored directly;
 * every index k in 1..14 is stored after being combined with the
 * coefficient pair W[2k-2], W[2k-1].
 *
 * Per iteration cr advances by ms, ci moves back by ms and W advances by
 * 28 entries; W is offset by (mb - 1) * 28 before the first iteration.
 *
 * Parameters:
 *   cr, ci  - data arrays, read and written in place
 *   W       - coefficient table, read only
 *   rs      - stride handed to WS() for element addressing
 *   mb, me  - half-open iteration range
 *   ms      - per-iteration step applied to cr (+ms) and ci (-ms)
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined in project
 * headers that are not visible here; presumably a*b + c and c - a*b --
 * confirm before relying on the formulas.
 */
Chris@82 446 static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 447 {
Chris@82 448 DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* numerically sqrt(5)/4 */
Chris@82 449 DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
Chris@82 450 DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* numerically sin(2*pi/5) */
Chris@82 451 DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* numerically sin(pi/5) */
Chris@82 452 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@82 453 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@82 454 {
Chris@82 455 INT m;
Chris@82 456 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@82 457 E T5, T10, T1J, T2C, T2c, T2M, TH, T18, T17, TS, T2Q, T2R, T2S, Tg, Tr;
Chris@82 458 E Ts, T11, T12, T13, T2N, T2O, T2P, T1u, T1x, T1y, T1W, T1Z, T28, T1P, T1S;
Chris@82 459 E T27, T1B, T1E, T1F, T2G, T2H, T2I, T2D, T2E, T2F;
/* Load cr[0], ci[14], cr[5], ci[4], ci[9], cr[10] and form their 3-term combinations. */
Chris@82 460 {
Chris@82 461 E T1, TW, T4, T2a, TZ, T1I, T1H, T2b;
Chris@82 462 T1 = cr[0];
Chris@82 463 TW = ci[WS(rs, 14)];
Chris@82 464 {
Chris@82 465 E T2, T3, TX, TY;
Chris@82 466 T2 = cr[WS(rs, 5)];
Chris@82 467 T3 = ci[WS(rs, 4)];
Chris@82 468 T4 = T2 + T3;
Chris@82 469 T2a = KP866025403 * (T2 - T3);
Chris@82 470 TX = ci[WS(rs, 9)];
Chris@82 471 TY = cr[WS(rs, 10)];
Chris@82 472 TZ = TX - TY;
Chris@82 473 T1I = KP866025403 * (TX + TY);
Chris@82 474 }
Chris@82 475 T5 = T1 + T4;
Chris@82 476 T10 = TW + TZ;
Chris@82 477 T1H = FNMS(KP500000000, T4, T1);
Chris@82 478 T1J = T1H - T1I;
Chris@82 479 T2C = T1H + T1I;
Chris@82 480 T2b = FNMS(KP500000000, TZ, TW);
Chris@82 481 T2c = T2a + T2b;
Chris@82 482 T2M = T2b - T2a;
Chris@82 483 }
/* Load the remaining 24 inputs, three per inner block, and form their sums and differences. */
Chris@82 484 {
Chris@82 485 E Ta, T1N, T1s, Tl, T1U, T1z, Tf, T1Q, T1v, TG, T1R, T1w, Tq, T1X, T1C;
Chris@82 486 E TM, T1V, T1A, TB, T1O, T1t, TR, T1Y, T1D;
Chris@82 487 {
Chris@82 488 E T6, T7, T8, T9;
Chris@82 489 T6 = cr[WS(rs, 3)];
Chris@82 490 T7 = ci[WS(rs, 6)];
Chris@82 491 T8 = ci[WS(rs, 1)];
Chris@82 492 T9 = T7 + T8;
Chris@82 493 Ta = T6 + T9;
Chris@82 494 T1N = KP866025403 * (T7 - T8);
Chris@82 495 T1s = FNMS(KP500000000, T9, T6);
Chris@82 496 }
Chris@82 497 {
Chris@82 498 E Th, Ti, Tj, Tk;
Chris@82 499 Th = cr[WS(rs, 6)];
Chris@82 500 Ti = ci[WS(rs, 3)];
Chris@82 501 Tj = cr[WS(rs, 1)];
Chris@82 502 Tk = Ti + Tj;
Chris@82 503 Tl = Th + Tk;
Chris@82 504 T1U = KP866025403 * (Ti - Tj);
Chris@82 505 T1z = FNMS(KP500000000, Tk, Th);
Chris@82 506 }
Chris@82 507 {
Chris@82 508 E Tb, Tc, Td, Te;
Chris@82 509 Tb = ci[WS(rs, 2)];
Chris@82 510 Tc = cr[WS(rs, 2)];
Chris@82 511 Td = cr[WS(rs, 7)];
Chris@82 512 Te = Tc + Td;
Chris@82 513 Tf = Tb + Te;
Chris@82 514 T1Q = KP866025403 * (Tc - Td);
Chris@82 515 T1v = FNMS(KP500000000, Te, Tb);
Chris@82 516 }
Chris@82 517 {
Chris@82 518 E TF, TC, TD, TE;
Chris@82 519 TF = cr[WS(rs, 12)];
Chris@82 520 TC = ci[WS(rs, 12)];
Chris@82 521 TD = ci[WS(rs, 7)];
Chris@82 522 TE = TC + TD;
Chris@82 523 TG = TE - TF;
Chris@82 524 T1R = FMA(KP500000000, TE, TF);
Chris@82 525 T1w = KP866025403 * (TD - TC);
Chris@82 526 }
Chris@82 527 {
Chris@82 528 E Tm, Tn, To, Tp;
Chris@82 529 Tm = ci[WS(rs, 5)];
Chris@82 530 Tn = ci[0];
Chris@82 531 To = cr[WS(rs, 4)];
Chris@82 532 Tp = Tn + To;
Chris@82 533 Tq = Tm + Tp;
Chris@82 534 T1X = KP866025403 * (Tn - To);
Chris@82 535 T1C = FNMS(KP500000000, Tp, Tm);
Chris@82 536 }
Chris@82 537 {
Chris@82 538 E TI, TJ, TK, TL;
Chris@82 539 TI = ci[WS(rs, 8)];
Chris@82 540 TJ = ci[WS(rs, 13)];
Chris@82 541 TK = cr[WS(rs, 11)];
Chris@82 542 TL = TJ - TK;
Chris@82 543 TM = TI + TL;
Chris@82 544 T1V = FNMS(KP500000000, TL, TI);
Chris@82 545 T1A = KP866025403 * (TJ + TK);
Chris@82 546 }
Chris@82 547 {
Chris@82 548 E Tx, Ty, Tz, TA;
Chris@82 549 Tx = ci[WS(rs, 11)];
Chris@82 550 Ty = cr[WS(rs, 8)];
Chris@82 551 Tz = cr[WS(rs, 13)];
Chris@82 552 TA = Ty + Tz;
Chris@82 553 TB = Tx - TA;
Chris@82 554 T1O = FMA(KP500000000, TA, Tx);
Chris@82 555 T1t = KP866025403 * (Ty - Tz);
Chris@82 556 }
Chris@82 557 {
Chris@82 558 E TQ, TN, TO, TP;
Chris@82 559 TQ = cr[WS(rs, 9)];
Chris@82 560 TN = ci[WS(rs, 10)];
Chris@82 561 TO = cr[WS(rs, 14)];
Chris@82 562 TP = TN - TO;
Chris@82 563 TR = TP - TQ;
Chris@82 564 T1Y = FMA(KP500000000, TP, TQ);
Chris@82 565 T1D = KP866025403 * (TN + TO);
Chris@82 566 }
Chris@82 567 TH = TB - TG;
Chris@82 568 T18 = Tl - Tq;
Chris@82 569 T17 = Ta - Tf;
Chris@82 570 TS = TM - TR;
Chris@82 571 T2Q = T1V - T1U;
Chris@82 572 T2R = T1X + T1Y;
Chris@82 573 T2S = T2Q - T2R;
Chris@82 574 Tg = Ta + Tf;
Chris@82 575 Tr = Tl + Tq;
Chris@82 576 Ts = Tg + Tr;
Chris@82 577 T11 = TB + TG;
Chris@82 578 T12 = TM + TR;
Chris@82 579 T13 = T11 + T12;
Chris@82 580 T2N = T1O - T1N;
Chris@82 581 T2O = T1Q + T1R;
Chris@82 582 T2P = T2N - T2O;
Chris@82 583 T1u = T1s + T1t;
Chris@82 584 T1x = T1v + T1w;
Chris@82 585 T1y = T1u + T1x;
Chris@82 586 T1W = T1U + T1V;
Chris@82 587 T1Z = T1X - T1Y;
Chris@82 588 T28 = T1W + T1Z;
Chris@82 589 T1P = T1N + T1O;
Chris@82 590 T1S = T1Q - T1R;
Chris@82 591 T27 = T1P + T1S;
Chris@82 592 T1B = T1z + T1A;
Chris@82 593 T1E = T1C + T1D;
Chris@82 594 T1F = T1B + T1E;
Chris@82 595 T2G = T1z - T1A;
Chris@82 596 T2H = T1C - T1D;
Chris@82 597 T2I = T2G + T2H;
Chris@82 598 T2D = T1s - T1t;
Chris@82 599 T2E = T1v - T1w;
Chris@82 600 T2F = T2D + T2E;
Chris@82 601 }
/* Index 0 is stored without any W coefficient. */
Chris@82 602 cr[0] = T5 + Ts;
Chris@82 603 ci[0] = T10 + T13;
/* Outputs at indices 3, 9, 12 and 6 (W[4..5], W[16..17], W[22..23], W[10..11]). */
Chris@82 604 {
Chris@82 605 E TT, T19, T1k, T1h, T16, T1l, Tw, T1g;
Chris@82 606 TT = FNMS(KP951056516, TS, KP587785252 * TH);
Chris@82 607 T19 = FNMS(KP951056516, T18, KP587785252 * T17);
Chris@82 608 T1k = FMA(KP951056516, T17, KP587785252 * T18);
Chris@82 609 T1h = FMA(KP951056516, TH, KP587785252 * TS);
Chris@82 610 {
Chris@82 611 E T14, T15, Tu, Tv;
Chris@82 612 T14 = FNMS(KP250000000, T13, T10);
Chris@82 613 T15 = KP559016994 * (T11 - T12);
Chris@82 614 T16 = T14 - T15;
Chris@82 615 T1l = T15 + T14;
Chris@82 616 Tu = FNMS(KP250000000, Ts, T5);
Chris@82 617 Tv = KP559016994 * (Tg - Tr);
Chris@82 618 Tw = Tu - Tv;
Chris@82 619 T1g = Tv + Tu;
Chris@82 620 }
Chris@82 621 {
Chris@82 622 E TU, T1a, Tt, TV;
Chris@82 623 TU = Tw + TT;
Chris@82 624 T1a = T16 - T19;
Chris@82 625 Tt = W[4];
Chris@82 626 TV = W[5];
Chris@82 627 cr[WS(rs, 3)] = FNMS(TV, T1a, Tt * TU);
Chris@82 628 ci[WS(rs, 3)] = FMA(TV, TU, Tt * T1a);
Chris@82 629 }
Chris@82 630 {
Chris@82 631 E T1o, T1q, T1n, T1p;
Chris@82 632 T1o = T1g + T1h;
Chris@82 633 T1q = T1l - T1k;
Chris@82 634 T1n = W[16];
Chris@82 635 T1p = W[17];
Chris@82 636 cr[WS(rs, 9)] = FNMS(T1p, T1q, T1n * T1o);
Chris@82 637 ci[WS(rs, 9)] = FMA(T1p, T1o, T1n * T1q);
Chris@82 638 }
Chris@82 639 {
Chris@82 640 E T1c, T1e, T1b, T1d;
Chris@82 641 T1c = Tw - TT;
Chris@82 642 T1e = T19 + T16;
Chris@82 643 T1b = W[22];
Chris@82 644 T1d = W[23];
Chris@82 645 cr[WS(rs, 12)] = FNMS(T1d, T1e, T1b * T1c);
Chris@82 646 ci[WS(rs, 12)] = FMA(T1d, T1c, T1b * T1e);
Chris@82 647 }
Chris@82 648 {
Chris@82 649 E T1i, T1m, T1f, T1j;
Chris@82 650 T1i = T1g - T1h;
Chris@82 651 T1m = T1k + T1l;
Chris@82 652 T1f = W[10];
Chris@82 653 T1j = W[11];
Chris@82 654 cr[WS(rs, 6)] = FNMS(T1j, T1m, T1f * T1i);
Chris@82 655 ci[WS(rs, 6)] = FMA(T1j, T1i, T1f * T1m);
Chris@82 656 }
Chris@82 657 }
/* Outputs at indices 10, 13, 1, 4 and 7. */
Chris@82 658 {
Chris@82 659 E T21, T2n, T26, T2q, T1M, T2y, T2m, T2f, T2A, T2r, T2x, T2z;
Chris@82 660 {
Chris@82 661 E T1T, T20, T24, T25;
Chris@82 662 T1T = T1P - T1S;
Chris@82 663 T20 = T1W - T1Z;
Chris@82 664 T21 = FMA(KP951056516, T1T, KP587785252 * T20);
Chris@82 665 T2n = FNMS(KP951056516, T20, KP587785252 * T1T);
Chris@82 666 T24 = T1u - T1x;
Chris@82 667 T25 = T1B - T1E;
Chris@82 668 T26 = FMA(KP951056516, T24, KP587785252 * T25);
Chris@82 669 T2q = FNMS(KP951056516, T25, KP587785252 * T24);
Chris@82 670 }
Chris@82 671 {
Chris@82 672 E T1G, T1K, T1L, T29, T2d, T2e;
Chris@82 673 T1G = KP559016994 * (T1y - T1F);
Chris@82 674 T1K = T1y + T1F;
Chris@82 675 T1L = FNMS(KP250000000, T1K, T1J);
Chris@82 676 T1M = T1G + T1L;
Chris@82 677 T2y = T1J + T1K;
Chris@82 678 T2m = T1L - T1G;
Chris@82 679 T29 = KP559016994 * (T27 - T28);
Chris@82 680 T2d = T27 + T28;
Chris@82 681 T2e = FNMS(KP250000000, T2d, T2c);
Chris@82 682 T2f = T29 + T2e;
Chris@82 683 T2A = T2c + T2d;
Chris@82 684 T2r = T2e - T29;
Chris@82 685 }
Chris@82 686 T2x = W[18];
Chris@82 687 T2z = W[19];
Chris@82 688 cr[WS(rs, 10)] = FNMS(T2z, T2A, T2x * T2y);
Chris@82 689 ci[WS(rs, 10)] = FMA(T2z, T2y, T2x * T2A);
Chris@82 690 {
Chris@82 691 E T2u, T2w, T2t, T2v;
Chris@82 692 T2u = T2m + T2n;
Chris@82 693 T2w = T2r - T2q;
Chris@82 694 T2t = W[24];
Chris@82 695 T2v = W[25];
Chris@82 696 cr[WS(rs, 13)] = FNMS(T2v, T2w, T2t * T2u);
Chris@82 697 ci[WS(rs, 13)] = FMA(T2v, T2u, T2t * T2w);
Chris@82 698 }
Chris@82 699 {
Chris@82 700 E T22, T2g, T1r, T23;
Chris@82 701 T22 = T1M - T21;
Chris@82 702 T2g = T26 + T2f;
Chris@82 703 T1r = W[0];
Chris@82 704 T23 = W[1];
Chris@82 705 cr[WS(rs, 1)] = FNMS(T23, T2g, T1r * T22);
Chris@82 706 ci[WS(rs, 1)] = FMA(T23, T22, T1r * T2g);
Chris@82 707 }
Chris@82 708 {
Chris@82 709 E T2i, T2k, T2h, T2j;
Chris@82 710 T2i = T1M + T21;
Chris@82 711 T2k = T2f - T26;
Chris@82 712 T2h = W[6];
Chris@82 713 T2j = W[7];
Chris@82 714 cr[WS(rs, 4)] = FNMS(T2j, T2k, T2h * T2i);
Chris@82 715 ci[WS(rs, 4)] = FMA(T2j, T2i, T2h * T2k);
Chris@82 716 }
Chris@82 717 {
Chris@82 718 E T2o, T2s, T2l, T2p;
Chris@82 719 T2o = T2m - T2n;
Chris@82 720 T2s = T2q + T2r;
Chris@82 721 T2l = W[12];
Chris@82 722 T2p = W[13];
Chris@82 723 cr[WS(rs, 7)] = FNMS(T2p, T2s, T2l * T2o);
Chris@82 724 ci[WS(rs, 7)] = FMA(T2p, T2o, T2l * T2s);
Chris@82 725 }
Chris@82 726 }
/* Outputs at indices 5, 14, 2, 8 and 11. */
Chris@82 727 {
Chris@82 728 E T31, T3h, T36, T3k, T2K, T3g, T2Y, T2U, T3l, T39, T2B, T2L;
Chris@82 729 {
Chris@82 730 E T2Z, T30, T34, T35;
Chris@82 731 T2Z = T2N + T2O;
Chris@82 732 T30 = T2Q + T2R;
Chris@82 733 T31 = FNMS(KP951056516, T30, KP587785252 * T2Z);
Chris@82 734 T3h = FMA(KP951056516, T2Z, KP587785252 * T30);
Chris@82 735 T34 = T2D - T2E;
Chris@82 736 T35 = T2G - T2H;
Chris@82 737 T36 = FNMS(KP951056516, T35, KP587785252 * T34);
Chris@82 738 T3k = FMA(KP951056516, T34, KP587785252 * T35);
Chris@82 739 }
Chris@82 740 {
Chris@82 741 E T2X, T2J, T2W, T38, T2T, T37;
Chris@82 742 T2X = KP559016994 * (T2F - T2I);
Chris@82 743 T2J = T2F + T2I;
Chris@82 744 T2W = FNMS(KP250000000, T2J, T2C);
Chris@82 745 T2K = T2C + T2J;
Chris@82 746 T3g = T2X + T2W;
Chris@82 747 T2Y = T2W - T2X;
Chris@82 748 T38 = KP559016994 * (T2P - T2S);
Chris@82 749 T2T = T2P + T2S;
Chris@82 750 T37 = FNMS(KP250000000, T2T, T2M);
Chris@82 751 T2U = T2M + T2T;
Chris@82 752 T3l = T38 + T37;
Chris@82 753 T39 = T37 - T38;
Chris@82 754 }
Chris@82 755 T2B = W[8];
Chris@82 756 T2L = W[9];
Chris@82 757 cr[WS(rs, 5)] = FNMS(T2L, T2U, T2B * T2K);
Chris@82 758 ci[WS(rs, 5)] = FMA(T2L, T2K, T2B * T2U);
Chris@82 759 {
Chris@82 760 E T3o, T3q, T3n, T3p;
Chris@82 761 T3o = T3g + T3h;
Chris@82 762 T3q = T3l - T3k;
Chris@82 763 T3n = W[26];
Chris@82 764 T3p = W[27];
Chris@82 765 cr[WS(rs, 14)] = FNMS(T3p, T3q, T3n * T3o);
Chris@82 766 ci[WS(rs, 14)] = FMA(T3n, T3q, T3p * T3o);
Chris@82 767 }
Chris@82 768 {
Chris@82 769 E T32, T3a, T2V, T33;
Chris@82 770 T32 = T2Y - T31;
Chris@82 771 T3a = T36 + T39;
Chris@82 772 T2V = W[2];
Chris@82 773 T33 = W[3];
Chris@82 774 cr[WS(rs, 2)] = FNMS(T33, T3a, T2V * T32);
Chris@82 775 ci[WS(rs, 2)] = FMA(T2V, T3a, T33 * T32);
Chris@82 776 }
Chris@82 777 {
Chris@82 778 E T3c, T3e, T3b, T3d;
Chris@82 779 T3c = T2Y + T31;
Chris@82 780 T3e = T39 - T36;
Chris@82 781 T3b = W[14];
Chris@82 782 T3d = W[15];
Chris@82 783 cr[WS(rs, 8)] = FNMS(T3d, T3e, T3b * T3c);
Chris@82 784 ci[WS(rs, 8)] = FMA(T3b, T3e, T3d * T3c);
Chris@82 785 }
Chris@82 786 {
Chris@82 787 E T3i, T3m, T3f, T3j;
Chris@82 788 T3i = T3g - T3h;
Chris@82 789 T3m = T3k + T3l;
Chris@82 790 T3f = W[20];
Chris@82 791 T3j = W[21];
Chris@82 792 cr[WS(rs, 11)] = FNMS(T3j, T3m, T3f * T3i);
Chris@82 793 ci[WS(rs, 11)] = FMA(T3f, T3m, T3j * T3i);
Chris@82 794 }
Chris@82 795 }
Chris@82 796 }
Chris@82 797 }
Chris@82 798 }
Chris@82 799
/*
 * Two-entry tw_instr table referenced by desc for this codelet.
 * NOTE(review): presumably TW_FULL with size 15 requests the 14 complex
 * coefficients (28 reals, matching the W step of 28 in hb_15) and TW_NEXT
 * ends the list -- confirm against the tw_instr definition.
 */
Chris@82 800 static const tw_instr twinstr[] = {
Chris@82 801 {TW_FULL, 1, 15},
Chris@82 802 {TW_NEXT, 1, 0}
Chris@82 803 };
Chris@82 804
/*
 * Descriptor for the non-FMA variant: size 15, name "hb_15", the twinstr
 * table, the GENUS object, and four counts whose first three match the
 * header comment (128 additions, 56 multiplications, 56 fused multiply/adds).
 */
Chris@82 805 static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {128, 56, 56, 0} };
Chris@82 806
/* Registers the non-FMA hb_15 kernel and its descriptor with planner p via X(khc2hc_register). */
Chris@82 807 void X(codelet_hb_15) (planner *p) {
Chris@82 808 X(khc2hc_register) (p, hb_15, &desc);
Chris@82 809 }
Chris@82 810 #endif