annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:45 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@42 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@42 33 * 93 stack variables, 6 constants, and 60 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb_15 (HAVE_FMA variant): one in-place pass over the arrays cr and ci.
 *
 * For each m in [mb, me) the loop body reads the 15 elements
 * cr[WS(rs, 0..14)] and the 15 elements ci[WS(rs, 0..14)], combines them
 * using the constants declared via DK, and writes 15 results back to each
 * array. Element 0 of both arrays is stored directly; every other element
 * k is stored after being combined with the coefficient pair
 * W[2k - 2], W[2k - 1].
 *
 * Parameters:
 *   cr, ci - arrays updated in place; after every iteration cr moves
 *            forward by ms and ci moves backward by ms.
 *   W      - coefficient table; offset by (mb - 1) * 28 when the loop
 *            starts and advanced by 28 per iteration (W[0]..W[27] are
 *            read in each iteration).
 *   rs     - stride handed to WS() to address the 15 elements.
 *   mb, me - half-open iteration range.
 *   ms     - per-iteration pointer step for cr and ci.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers not visible here; presumably
 * FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm against
 * the FFTW headers. The generator command line in the comment above this
 * function identifies it as the size-15 DIF codelet (-n 15 -dif -sign 1).
 */
Chris@42 37 static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants; the values equal sin(2*pi/5), sqrt(5)/4, 1/4,
 * (sqrt(5) - 1)/2, sqrt(3)/2 and 1/2 respectively. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 45 {
Chris@42 46 INT m;
Chris@42 47 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* These five temporaries feed the element-8 stores at the very end of the
 * loop body, after all nested scopes have closed, so they live at loop scope. */
Chris@42 48 E T3v, T3u, T3r, T3w, T3t;
Chris@42 49 {
Chris@42 50 E T5, T11, T1C, T2U, T2f, T3f, T19, T18, TS, TH, T14, T16, T3g, T3a, Ts;
Chris@42 51 E Tv, T37, T3h, T28, T2h, T1M, T21, T2g, T3n, T2X, T1P, T30, T3m, T1J, T2m;
Chris@42 52 {
/* Load phase: read the inputs from cr/ci and form the sums and
 * differences consumed by the output stages below. */
Chris@42 53 E T1, TX, T2, T3, TY, TZ;
Chris@42 54 T1 = cr[0];
Chris@42 55 TX = ci[WS(rs, 14)];
Chris@42 56 T2 = cr[WS(rs, 5)];
Chris@42 57 T3 = ci[WS(rs, 4)];
Chris@42 58 TY = ci[WS(rs, 9)];
Chris@42 59 TZ = cr[WS(rs, 10)];
Chris@42 60 {
Chris@42 61 E T1W, T23, T1D, Ta, Tl, T1K, T1Z, T1H, T1G, Tf, TR, T1Y, T26, TI, T1O;
Chris@42 62 E T1N, Tq, TG, T25, Tx, Ty, Tz, TL, T1E;
Chris@42 63 {
Chris@42 64 E Tb, TQ, TN, TO, Te;
Chris@42 65 {
Chris@42 66 E T6, Th, Ti, Tj, T9, Tc, Td, Tk;
Chris@42 67 {
Chris@42 68 E T7, T8, T2e, T4;
Chris@42 69 T6 = cr[WS(rs, 3)];
Chris@42 70 T2e = T2 - T3;
Chris@42 71 T4 = T2 + T3;
Chris@42 72 {
Chris@42 73 E T1B, T10, T1A, T2d;
Chris@42 74 T1B = TY + TZ;
Chris@42 75 T10 = TY - TZ;
Chris@42 76 T7 = ci[WS(rs, 6)];
Chris@42 77 T5 = T1 + T4;
Chris@42 78 T1A = FNMS(KP500000000, T4, T1);
Chris@42 79 T11 = TX + T10;
Chris@42 80 T2d = FNMS(KP500000000, T10, TX);
Chris@42 81 T1C = FNMS(KP866025403, T1B, T1A);
Chris@42 82 T2U = FMA(KP866025403, T1B, T1A);
Chris@42 83 T2f = FMA(KP866025403, T2e, T2d);
Chris@42 84 T3f = FNMS(KP866025403, T2e, T2d);
Chris@42 85 T8 = ci[WS(rs, 1)];
Chris@42 86 }
Chris@42 87 Th = cr[WS(rs, 6)];
Chris@42 88 Ti = ci[WS(rs, 3)];
Chris@42 89 Tj = cr[WS(rs, 1)];
Chris@42 90 T9 = T7 + T8;
Chris@42 91 T1W = T7 - T8;
Chris@42 92 }
Chris@42 93 Tb = ci[WS(rs, 2)];
Chris@42 94 T23 = Ti - Tj;
Chris@42 95 Tk = Ti + Tj;
Chris@42 96 T1D = FNMS(KP500000000, T9, T6);
Chris@42 97 Ta = T6 + T9;
Chris@42 98 Tc = cr[WS(rs, 2)];
Chris@42 99 Tl = Th + Tk;
Chris@42 100 T1K = FNMS(KP500000000, Tk, Th);
Chris@42 101 Td = cr[WS(rs, 7)];
Chris@42 102 TQ = cr[WS(rs, 12)];
Chris@42 103 TN = ci[WS(rs, 12)];
Chris@42 104 TO = ci[WS(rs, 7)];
Chris@42 105 Te = Tc + Td;
Chris@42 106 T1Z = Tc - Td;
Chris@42 107 }
Chris@42 108 {
Chris@42 109 E Tm, TF, TC, TD, Tp, Tn, To, TP, TJ, TK, TE;
Chris@42 110 Tm = ci[WS(rs, 5)];
Chris@42 111 T1H = TO - TN;
Chris@42 112 TP = TN + TO;
Chris@42 113 T1G = FNMS(KP500000000, Te, Tb);
Chris@42 114 Tf = Tb + Te;
Chris@42 115 Tn = ci[0];
Chris@42 116 TR = TP - TQ;
Chris@42 117 T1Y = FMA(KP500000000, TP, TQ);
Chris@42 118 To = cr[WS(rs, 4)];
Chris@42 119 TF = cr[WS(rs, 9)];
Chris@42 120 TC = ci[WS(rs, 10)];
Chris@42 121 TD = cr[WS(rs, 14)];
Chris@42 122 Tp = Tn + To;
Chris@42 123 T26 = Tn - To;
Chris@42 124 TI = ci[WS(rs, 11)];
Chris@42 125 T1O = TC + TD;
Chris@42 126 TE = TC - TD;
Chris@42 127 T1N = FNMS(KP500000000, Tp, Tm);
Chris@42 128 Tq = Tm + Tp;
Chris@42 129 TJ = cr[WS(rs, 8)];
Chris@42 130 TG = TE - TF;
Chris@42 131 T25 = FMA(KP500000000, TE, TF);
Chris@42 132 TK = cr[WS(rs, 13)];
Chris@42 133 Tx = ci[WS(rs, 8)];
Chris@42 134 Ty = ci[WS(rs, 13)];
Chris@42 135 Tz = cr[WS(rs, 11)];
Chris@42 136 TL = TJ + TK;
Chris@42 137 T1E = TJ - TK;
Chris@42 138 }
Chris@42 139 }
Chris@42 140 {
Chris@42 141 E Tg, T1L, Tr, T22, T12, T1X, T38, T13, T39, T20;
Chris@42 142 {
Chris@42 143 E TA, T1V, TM, TB;
Chris@42 144 Tg = Ta + Tf;
Chris@42 145 T19 = Ta - Tf;
Chris@42 146 T1L = Ty + Tz;
Chris@42 147 TA = Ty - Tz;
Chris@42 148 T1V = FMA(KP500000000, TL, TI);
Chris@42 149 TM = TI - TL;
Chris@42 150 T18 = Tl - Tq;
Chris@42 151 Tr = Tl + Tq;
Chris@42 152 TB = Tx + TA;
Chris@42 153 T22 = FNMS(KP500000000, TA, Tx);
Chris@42 154 T12 = TM + TR;
Chris@42 155 TS = TM - TR;
Chris@42 156 T1X = FMA(KP866025403, T1W, T1V);
Chris@42 157 T38 = FNMS(KP866025403, T1W, T1V);
Chris@42 158 T13 = TB + TG;
Chris@42 159 TH = TB - TG;
Chris@42 160 T39 = FMA(KP866025403, T1Z, T1Y);
Chris@42 161 T20 = FNMS(KP866025403, T1Z, T1Y);
Chris@42 162 }
Chris@42 163 {
Chris@42 164 E T35, T24, T27, T36;
Chris@42 165 T14 = T12 + T13;
Chris@42 166 T16 = T12 - T13;
Chris@42 167 T3g = T38 - T39;
Chris@42 168 T3a = T38 + T39;
Chris@42 169 T35 = FNMS(KP866025403, T23, T22);
Chris@42 170 T24 = FMA(KP866025403, T23, T22);
Chris@42 171 Ts = Tg + Tr;
Chris@42 172 Tv = Tg - Tr;
Chris@42 173 T27 = FNMS(KP866025403, T26, T25);
Chris@42 174 T36 = FMA(KP866025403, T26, T25);
Chris@42 175 T37 = T35 + T36;
Chris@42 176 T3h = T35 - T36;
Chris@42 177 T28 = T24 + T27;
Chris@42 178 T2h = T24 - T27;
Chris@42 179 {
Chris@42 180 E T1F, T1I, T2Y, T2Z, T2V, T2W;
Chris@42 181 T2V = FNMS(KP866025403, T1E, T1D);
Chris@42 182 T1F = FMA(KP866025403, T1E, T1D);
Chris@42 183 T1I = FMA(KP866025403, T1H, T1G);
Chris@42 184 T2W = FNMS(KP866025403, T1H, T1G);
Chris@42 185 T2Y = FNMS(KP866025403, T1L, T1K);
Chris@42 186 T1M = FMA(KP866025403, T1L, T1K);
Chris@42 187 T21 = T1X + T20;
Chris@42 188 T2g = T1X - T20;
Chris@42 189 T3n = T2V - T2W;
Chris@42 190 T2X = T2V + T2W;
Chris@42 191 T2Z = FNMS(KP866025403, T1O, T1N);
Chris@42 192 T1P = FMA(KP866025403, T1O, T1N);
Chris@42 193 T30 = T2Y + T2Z;
Chris@42 194 T3m = T2Y - T2Z;
Chris@42 195 T1J = T1F + T1I;
Chris@42 196 T2m = T1F - T1I;
Chris@42 197 }
Chris@42 198 }
Chris@42 199 }
Chris@42 200 }
Chris@42 201 }
Chris@42 202 {
/* Output phase. Element 0 of cr and ci is stored without any W
 * coefficient; all other elements are combined with a W pair first. */
Chris@42 203 E T31, T33, T2n, T1Q;
Chris@42 204 cr[0] = T5 + Ts;
Chris@42 205 T31 = T2X + T30;
Chris@42 206 T33 = T2X - T30;
Chris@42 207 T2n = T1M - T1P;
Chris@42 208 T1Q = T1M + T1P;
Chris@42 209 ci[0] = T11 + T14;
Chris@42 210 {
Chris@42 211 E T1T, T1R, T1r, T1o, T1n;
Chris@42 212 {
Chris@42 213 E T1q, T1a, TT, T1l, Tu, T17, T1p, T15;
Chris@42 214 T1q = FMA(KP618033988, T18, T19);
Chris@42 215 T1a = FNMS(KP618033988, T19, T18);
Chris@42 216 T1T = T1J - T1Q;
Chris@42 217 T1R = T1J + T1Q;
Chris@42 218 T15 = FNMS(KP250000000, T14, T11);
Chris@42 219 TT = FNMS(KP618033988, TS, TH);
Chris@42 220 T1l = FMA(KP618033988, TH, TS);
Chris@42 221 Tu = FNMS(KP250000000, Ts, T5);
Chris@42 222 T17 = FNMS(KP559016994, T16, T15);
Chris@42 223 T1p = FMA(KP559016994, T16, T15);
Chris@42 224 {
Chris@42 225 E T1h, T1m, T1e, T1x, T1w, T1v, T1g, T1d;
Chris@42 226 {
Chris@42 227 E TW, T1b, Tt, T1u, TU, T1k, Tw;
Chris@42 228 TW = W[5];
Chris@42 229 T1k = FMA(KP559016994, Tv, Tu);
Chris@42 230 Tw = FNMS(KP559016994, Tv, Tu);
Chris@42 231 T1b = FMA(KP951056516, T1a, T17);
Chris@42 232 T1h = FNMS(KP951056516, T1a, T17);
Chris@42 233 Tt = W[4];
Chris@42 234 T1m = FNMS(KP951056516, T1l, T1k);
Chris@42 235 T1u = FMA(KP951056516, T1l, T1k);
Chris@42 236 T1e = FMA(KP951056516, TT, Tw);
Chris@42 237 TU = FNMS(KP951056516, TT, Tw);
Chris@42 238 {
Chris@42 239 E T1t, TV, T1c, T1y;
Chris@42 240 T1x = FNMS(KP951056516, T1q, T1p);
Chris@42 241 T1r = FMA(KP951056516, T1q, T1p);
Chris@42 242 T1w = W[17];
Chris@42 243 T1t = W[16];
Chris@42 244 TV = Tt * TU;
Chris@42 245 T1c = TW * TU;
Chris@42 246 T1y = T1w * T1u;
Chris@42 247 T1v = T1t * T1u;
Chris@42 248 cr[WS(rs, 3)] = FNMS(TW, T1b, TV);
Chris@42 249 ci[WS(rs, 3)] = FMA(Tt, T1b, T1c);
Chris@42 250 ci[WS(rs, 9)] = FMA(T1t, T1x, T1y);
Chris@42 251 }
Chris@42 252 }
Chris@42 253 cr[WS(rs, 9)] = FNMS(T1w, T1x, T1v);
Chris@42 254 T1g = W[23];
Chris@42 255 T1d = W[22];
Chris@42 256 {
Chris@42 257 E T1j, T1s, T1i, T1f;
Chris@42 258 T1o = W[11];
Chris@42 259 T1i = T1g * T1e;
Chris@42 260 T1f = T1d * T1e;
Chris@42 261 T1j = W[10];
Chris@42 262 T1s = T1o * T1m;
Chris@42 263 ci[WS(rs, 12)] = FMA(T1d, T1h, T1i);
Chris@42 264 cr[WS(rs, 12)] = FNMS(T1g, T1h, T1f);
Chris@42 265 T1n = T1j * T1m;
Chris@42 266 ci[WS(rs, 6)] = FMA(T1j, T1r, T1s);
Chris@42 267 }
Chris@42 268 }
Chris@42 269 }
Chris@42 270 {
Chris@42 271 E T2v, T2u, T2r, T2w, T2t;
Chris@42 272 {
Chris@42 273 E T1S, T2N, T2o, T2E, T2Q, T2P, T2k, T2S, T29, T2z, T2R, T2j, T2O, T2i;
Chris@42 274 cr[WS(rs, 6)] = FNMS(T1o, T1r, T1n);
Chris@42 275 T1S = FNMS(KP250000000, T1R, T1C);
Chris@42 276 T2O = T1C + T1R;
Chris@42 277 T2N = W[18];
Chris@42 278 T2o = FMA(KP618033988, T2n, T2m);
Chris@42 279 T2E = FNMS(KP618033988, T2m, T2n);
Chris@42 280 T2Q = W[19];
Chris@42 281 T2P = T2N * T2O;
Chris@42 282 T2i = T2g + T2h;
Chris@42 283 T2k = T2g - T2h;
Chris@42 284 T2S = T2Q * T2O;
Chris@42 285 T29 = FMA(KP618033988, T28, T21);
Chris@42 286 T2z = FNMS(KP618033988, T21, T28);
Chris@42 287 T2R = T2f + T2i;
Chris@42 288 T2j = FNMS(KP250000000, T2i, T2f);
Chris@42 289 {
Chris@42 290 E T2D, T2p, T2I, T2A, T2a, T2s, T2c, T1z, T2l, T1U, T2y;
Chris@42 291 cr[WS(rs, 10)] = FNMS(T2Q, T2R, T2P);
Chris@42 292 T2l = FMA(KP559016994, T2k, T2j);
Chris@42 293 T2D = FNMS(KP559016994, T2k, T2j);
Chris@42 294 T1U = FMA(KP559016994, T1T, T1S);
Chris@42 295 T2y = FNMS(KP559016994, T1T, T1S);
Chris@42 296 ci[WS(rs, 10)] = FMA(T2N, T2R, T2S);
Chris@42 297 T2p = FMA(KP951056516, T2o, T2l);
Chris@42 298 T2v = FNMS(KP951056516, T2o, T2l);
Chris@42 299 T2I = FNMS(KP951056516, T2z, T2y);
Chris@42 300 T2A = FMA(KP951056516, T2z, T2y);
Chris@42 301 T2a = FNMS(KP951056516, T29, T1U);
Chris@42 302 T2s = FMA(KP951056516, T29, T1U);
Chris@42 303 T2c = W[1];
Chris@42 304 T1z = W[0];
Chris@42 305 {
Chris@42 306 E T2F, T2L, T2K, T2J;
Chris@42 307 {
Chris@42 308 E T2H, T2M, T2q, T2b;
Chris@42 309 T2F = FNMS(KP951056516, T2E, T2D);
Chris@42 310 T2L = FMA(KP951056516, T2E, T2D);
Chris@42 311 T2K = W[25];
Chris@42 312 T2q = T2c * T2a;
Chris@42 313 T2b = T1z * T2a;
Chris@42 314 T2H = W[24];
Chris@42 315 T2M = T2K * T2I;
Chris@42 316 ci[WS(rs, 1)] = FMA(T1z, T2p, T2q);
Chris@42 317 cr[WS(rs, 1)] = FNMS(T2c, T2p, T2b);
Chris@42 318 T2J = T2H * T2I;
Chris@42 319 ci[WS(rs, 13)] = FMA(T2H, T2L, T2M);
Chris@42 320 }
Chris@42 321 {
Chris@42 322 E T2C, T2x, T2G, T2B;
Chris@42 323 T2C = W[13];
Chris@42 324 cr[WS(rs, 13)] = FNMS(T2K, T2L, T2J);
Chris@42 325 T2x = W[12];
Chris@42 326 T2G = T2C * T2A;
Chris@42 327 T2u = W[7];
Chris@42 328 T2B = T2x * T2A;
Chris@42 329 T2r = W[6];
Chris@42 330 ci[WS(rs, 7)] = FMA(T2x, T2F, T2G);
Chris@42 331 T2w = T2u * T2s;
Chris@42 332 cr[WS(rs, 7)] = FNMS(T2C, T2F, T2B);
Chris@42 333 T2t = T2r * T2s;
Chris@42 334 }
Chris@42 335 }
Chris@42 336 }
Chris@42 337 }
Chris@42 338 {
Chris@42 339 E T32, T3N, T3E, T3o, T3Q, T3P, T3k, T3S, T3z, T3b, T3j, T3R, T3O, T3i;
Chris@42 340 ci[WS(rs, 4)] = FMA(T2r, T2v, T2w);
Chris@42 341 cr[WS(rs, 4)] = FNMS(T2u, T2v, T2t);
Chris@42 342 T3O = T2U + T31;
Chris@42 343 T32 = FNMS(KP250000000, T31, T2U);
Chris@42 344 T3N = W[8];
Chris@42 345 T3E = FMA(KP618033988, T3m, T3n);
Chris@42 346 T3o = FNMS(KP618033988, T3n, T3m);
Chris@42 347 T3Q = W[9];
Chris@42 348 T3P = T3N * T3O;
Chris@42 349 T3k = T3g - T3h;
Chris@42 350 T3i = T3g + T3h;
Chris@42 351 T3S = T3Q * T3O;
Chris@42 352 T3z = FMA(KP618033988, T37, T3a);
Chris@42 353 T3b = FNMS(KP618033988, T3a, T37);
Chris@42 354 T3j = FNMS(KP250000000, T3i, T3f);
Chris@42 355 T3R = T3f + T3i;
Chris@42 356 {
Chris@42 357 E T3D, T3p, T3A, T3I, T3s, T3c, T3e, T2T, T3l, T3y, T34;
Chris@42 358 cr[WS(rs, 5)] = FNMS(T3Q, T3R, T3P);
Chris@42 359 T3D = FMA(KP559016994, T3k, T3j);
Chris@42 360 T3l = FNMS(KP559016994, T3k, T3j);
Chris@42 361 T3y = FMA(KP559016994, T33, T32);
Chris@42 362 T34 = FNMS(KP559016994, T33, T32);
Chris@42 363 ci[WS(rs, 5)] = FMA(T3N, T3R, T3S);
Chris@42 364 T3v = FMA(KP951056516, T3o, T3l);
Chris@42 365 T3p = FNMS(KP951056516, T3o, T3l);
Chris@42 366 T3A = FNMS(KP951056516, T3z, T3y);
Chris@42 367 T3I = FMA(KP951056516, T3z, T3y);
Chris@42 368 T3s = FNMS(KP951056516, T3b, T34);
Chris@42 369 T3c = FMA(KP951056516, T3b, T34);
Chris@42 370 T3e = W[3];
Chris@42 371 T2T = W[2];
Chris@42 372 {
Chris@42 373 E T3L, T3F, T3K, T3J;
Chris@42 374 {
Chris@42 375 E T3H, T3M, T3q, T3d;
Chris@42 376 T3L = FNMS(KP951056516, T3E, T3D);
Chris@42 377 T3F = FMA(KP951056516, T3E, T3D);
Chris@42 378 T3K = W[27];
Chris@42 379 T3q = T3e * T3c;
Chris@42 380 T3d = T2T * T3c;
Chris@42 381 T3H = W[26];
Chris@42 382 T3M = T3K * T3I;
Chris@42 383 ci[WS(rs, 2)] = FMA(T2T, T3p, T3q);
Chris@42 384 cr[WS(rs, 2)] = FNMS(T3e, T3p, T3d);
Chris@42 385 T3J = T3H * T3I;
Chris@42 386 ci[WS(rs, 14)] = FMA(T3H, T3L, T3M);
Chris@42 387 }
Chris@42 388 {
Chris@42 389 E T3C, T3x, T3G, T3B;
Chris@42 390 T3C = W[21];
Chris@42 391 cr[WS(rs, 14)] = FNMS(T3K, T3L, T3J);
Chris@42 392 T3x = W[20];
Chris@42 393 T3G = T3C * T3A;
Chris@42 394 T3u = W[15];
Chris@42 395 T3B = T3x * T3A;
Chris@42 396 T3r = W[14];
Chris@42 397 ci[WS(rs, 11)] = FMA(T3x, T3F, T3G);
Chris@42 398 T3w = T3u * T3s;
Chris@42 399 cr[WS(rs, 11)] = FNMS(T3C, T3F, T3B);
Chris@42 400 T3t = T3r * T3s;
Chris@42 401 }
Chris@42 402 }
Chris@42 403 }
Chris@42 404 }
Chris@42 405 }
Chris@42 406 }
Chris@42 407 }
Chris@42 408 }
/* Final stores of element 8, using the loop-scope temporaries set above. */
Chris@42 409 ci[WS(rs, 8)] = FMA(T3r, T3v, T3w);
Chris@42 410 cr[WS(rs, 8)] = FNMS(T3u, T3v, T3t);
Chris@42 411 }
Chris@42 412 }
Chris@42 413 }
Chris@42 414
/*
 * Instruction table referenced by the descriptor `desc` below: a TW_FULL
 * entry with arguments (1, 15) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere;
 * presumably TW_FULL requests the full coefficient set for size 15 and
 * TW_NEXT ends the list -- confirm against their definitions.
 */
Chris@42 415 static const tw_instr twinstr[] = {
Chris@42 416 {TW_FULL, 1, 15},
Chris@42 417 {TW_NEXT, 1, 0}
Chris@42 418 };
Chris@42 419
/*
 * Descriptor handed to the registration call: size 15, the name "hb_15",
 * the twinstr table, &GENUS, and the counts {72, 28, 112, 0}. The first
 * three counts match the 72 additions, 28 multiplications and 112 fused
 * multiply/adds reported in the generated comment for this variant.
 * NOTE(review): hc2hc_desc and GENUS are declared elsewhere; the meaning
 * of the trailing 0 is not visible here.
 */
Chris@42 420 static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {72, 28, 112, 0} };
Chris@42 421
/*
 * Registration entry point: passes the planner p, the hb_15 function and
 * its descriptor to X(khc2hc_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-prefixing wrapper -- confirm.
 */
Chris@42 422 void X(codelet_hb_15) (planner *p) {
Chris@42 423 X(khc2hc_register) (p, hb_15, &desc);
Chris@42 424 }
Chris@42 425 #else /* HAVE_FMA */
Chris@42 426
Chris@42 427 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -dif -name hb_15 -include hb.h */
Chris@42 428
Chris@42 429 /*
Chris@42 430 * This function contains 184 FP additions, 112 FP multiplications,
Chris@42 431 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@42 432 * 75 stack variables, 6 constants, and 60 memory accesses
Chris@42 433 */
Chris@42 434 #include "hb.h"
Chris@42 435
/*
 * hb_15 (variant compiled when HAVE_FMA is not defined): one in-place pass
 * over the arrays cr and ci.
 *
 * For each m in [mb, me) the loop body reads the 15 elements
 * cr[WS(rs, 0..14)] and the 15 elements ci[WS(rs, 0..14)], combines them
 * using the constants declared via DK, and writes 15 results back to each
 * array. Element 0 of both arrays is stored directly; every other element
 * k is stored after being combined with the coefficient pair
 * W[2k - 2], W[2k - 1].
 *
 * Parameters:
 *   cr, ci - arrays updated in place; after every iteration cr moves
 *            forward by ms and ci moves backward by ms.
 *   W      - coefficient table; offset by (mb - 1) * 28 when the loop
 *            starts and advanced by 28 per iteration (W[0]..W[27] are
 *            read in each iteration).
 *   rs     - stride handed to WS() to address the 15 elements.
 *   mb, me - half-open iteration range.
 *   ms     - per-iteration pointer step for cr and ci.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers not visible here; presumably
 * FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm against
 * the FFTW headers. The generator command line in the comment above this
 * function identifies it as the size-15 DIF codelet (-n 15 -dif -sign 1).
 */
Chris@42 436 static void hb_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 437 {
/* Numeric constants; the values equal sqrt(5)/4, 1/4, sin(2*pi/5),
 * sin(pi/5), 1/2 and sqrt(3)/2 respectively. */
Chris@42 438 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 439 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 440 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 441 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 442 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 443 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 444 {
Chris@42 445 INT m;
Chris@42 446 for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* Intermediates shared between the load phase and the output groups. */
Chris@42 447 E T5, T10, T1J, T2C, T2c, T2M, TH, T18, T17, TS, T2Q, T2R, T2S, Tg, Tr;
Chris@42 448 E Ts, T11, T12, T13, T2N, T2O, T2P, T1u, T1x, T1y, T1W, T1Z, T28, T1P, T1S;
Chris@42 449 E T27, T1B, T1E, T1F, T2G, T2H, T2I, T2D, T2E, T2F;
/* Load phase, part 1: elements cr[0], ci[14], cr[5], ci[4], ci[9], cr[10]. */
Chris@42 450 {
Chris@42 451 E T1, TW, T4, T2a, TZ, T1I, T1H, T2b;
Chris@42 452 T1 = cr[0];
Chris@42 453 TW = ci[WS(rs, 14)];
Chris@42 454 {
Chris@42 455 E T2, T3, TX, TY;
Chris@42 456 T2 = cr[WS(rs, 5)];
Chris@42 457 T3 = ci[WS(rs, 4)];
Chris@42 458 T4 = T2 + T3;
Chris@42 459 T2a = KP866025403 * (T2 - T3);
Chris@42 460 TX = ci[WS(rs, 9)];
Chris@42 461 TY = cr[WS(rs, 10)];
Chris@42 462 TZ = TX - TY;
Chris@42 463 T1I = KP866025403 * (TX + TY);
Chris@42 464 }
Chris@42 465 T5 = T1 + T4;
Chris@42 466 T10 = TW + TZ;
Chris@42 467 T1H = FNMS(KP500000000, T4, T1);
Chris@42 468 T1J = T1H - T1I;
Chris@42 469 T2C = T1H + T1I;
Chris@42 470 T2b = FNMS(KP500000000, TZ, TW);
Chris@42 471 T2c = T2a + T2b;
Chris@42 472 T2M = T2b - T2a;
Chris@42 473 }
/* Load phase, part 2: the remaining 24 inputs, read three at a time in
 * eight small scopes, followed by the cross-group sums and differences. */
Chris@42 474 {
Chris@42 475 E Ta, T1N, T1s, Tl, T1U, T1z, Tf, T1Q, T1v, TG, T1R, T1w, Tq, T1X, T1C;
Chris@42 476 E TM, T1V, T1A, TB, T1O, T1t, TR, T1Y, T1D;
Chris@42 477 {
Chris@42 478 E T6, T7, T8, T9;
Chris@42 479 T6 = cr[WS(rs, 3)];
Chris@42 480 T7 = ci[WS(rs, 6)];
Chris@42 481 T8 = ci[WS(rs, 1)];
Chris@42 482 T9 = T7 + T8;
Chris@42 483 Ta = T6 + T9;
Chris@42 484 T1N = KP866025403 * (T7 - T8);
Chris@42 485 T1s = FNMS(KP500000000, T9, T6);
Chris@42 486 }
Chris@42 487 {
Chris@42 488 E Th, Ti, Tj, Tk;
Chris@42 489 Th = cr[WS(rs, 6)];
Chris@42 490 Ti = ci[WS(rs, 3)];
Chris@42 491 Tj = cr[WS(rs, 1)];
Chris@42 492 Tk = Ti + Tj;
Chris@42 493 Tl = Th + Tk;
Chris@42 494 T1U = KP866025403 * (Ti - Tj);
Chris@42 495 T1z = FNMS(KP500000000, Tk, Th);
Chris@42 496 }
Chris@42 497 {
Chris@42 498 E Tb, Tc, Td, Te;
Chris@42 499 Tb = ci[WS(rs, 2)];
Chris@42 500 Tc = cr[WS(rs, 2)];
Chris@42 501 Td = cr[WS(rs, 7)];
Chris@42 502 Te = Tc + Td;
Chris@42 503 Tf = Tb + Te;
Chris@42 504 T1Q = KP866025403 * (Tc - Td);
Chris@42 505 T1v = FNMS(KP500000000, Te, Tb);
Chris@42 506 }
Chris@42 507 {
Chris@42 508 E TF, TC, TD, TE;
Chris@42 509 TF = cr[WS(rs, 12)];
Chris@42 510 TC = ci[WS(rs, 12)];
Chris@42 511 TD = ci[WS(rs, 7)];
Chris@42 512 TE = TC + TD;
Chris@42 513 TG = TE - TF;
Chris@42 514 T1R = FMA(KP500000000, TE, TF);
Chris@42 515 T1w = KP866025403 * (TD - TC);
Chris@42 516 }
Chris@42 517 {
Chris@42 518 E Tm, Tn, To, Tp;
Chris@42 519 Tm = ci[WS(rs, 5)];
Chris@42 520 Tn = ci[0];
Chris@42 521 To = cr[WS(rs, 4)];
Chris@42 522 Tp = Tn + To;
Chris@42 523 Tq = Tm + Tp;
Chris@42 524 T1X = KP866025403 * (Tn - To);
Chris@42 525 T1C = FNMS(KP500000000, Tp, Tm);
Chris@42 526 }
Chris@42 527 {
Chris@42 528 E TI, TJ, TK, TL;
Chris@42 529 TI = ci[WS(rs, 8)];
Chris@42 530 TJ = ci[WS(rs, 13)];
Chris@42 531 TK = cr[WS(rs, 11)];
Chris@42 532 TL = TJ - TK;
Chris@42 533 TM = TI + TL;
Chris@42 534 T1V = FNMS(KP500000000, TL, TI);
Chris@42 535 T1A = KP866025403 * (TJ + TK);
Chris@42 536 }
Chris@42 537 {
Chris@42 538 E Tx, Ty, Tz, TA;
Chris@42 539 Tx = ci[WS(rs, 11)];
Chris@42 540 Ty = cr[WS(rs, 8)];
Chris@42 541 Tz = cr[WS(rs, 13)];
Chris@42 542 TA = Ty + Tz;
Chris@42 543 TB = Tx - TA;
Chris@42 544 T1O = FMA(KP500000000, TA, Tx);
Chris@42 545 T1t = KP866025403 * (Ty - Tz);
Chris@42 546 }
Chris@42 547 {
Chris@42 548 E TQ, TN, TO, TP;
Chris@42 549 TQ = cr[WS(rs, 9)];
Chris@42 550 TN = ci[WS(rs, 10)];
Chris@42 551 TO = cr[WS(rs, 14)];
Chris@42 552 TP = TN - TO;
Chris@42 553 TR = TP - TQ;
Chris@42 554 T1Y = FMA(KP500000000, TP, TQ);
Chris@42 555 T1D = KP866025403 * (TN + TO);
Chris@42 556 }
Chris@42 557 TH = TB - TG;
Chris@42 558 T18 = Tl - Tq;
Chris@42 559 T17 = Ta - Tf;
Chris@42 560 TS = TM - TR;
Chris@42 561 T2Q = T1V - T1U;
Chris@42 562 T2R = T1X + T1Y;
Chris@42 563 T2S = T2Q - T2R;
Chris@42 564 Tg = Ta + Tf;
Chris@42 565 Tr = Tl + Tq;
Chris@42 566 Ts = Tg + Tr;
Chris@42 567 T11 = TB + TG;
Chris@42 568 T12 = TM + TR;
Chris@42 569 T13 = T11 + T12;
Chris@42 570 T2N = T1O - T1N;
Chris@42 571 T2O = T1Q + T1R;
Chris@42 572 T2P = T2N - T2O;
Chris@42 573 T1u = T1s + T1t;
Chris@42 574 T1x = T1v + T1w;
Chris@42 575 T1y = T1u + T1x;
Chris@42 576 T1W = T1U + T1V;
Chris@42 577 T1Z = T1X - T1Y;
Chris@42 578 T28 = T1W + T1Z;
Chris@42 579 T1P = T1N + T1O;
Chris@42 580 T1S = T1Q - T1R;
Chris@42 581 T27 = T1P + T1S;
Chris@42 582 T1B = T1z + T1A;
Chris@42 583 T1E = T1C + T1D;
Chris@42 584 T1F = T1B + T1E;
Chris@42 585 T2G = T1z - T1A;
Chris@42 586 T2H = T1C - T1D;
Chris@42 587 T2I = T2G + T2H;
Chris@42 588 T2D = T1s - T1t;
Chris@42 589 T2E = T1v - T1w;
Chris@42 590 T2F = T2D + T2E;
Chris@42 591 }
/* Element 0 of cr and ci is stored without any W coefficient. */
Chris@42 592 cr[0] = T5 + Ts;
Chris@42 593 ci[0] = T10 + T13;
/* Output group writing elements 3, 9, 12 and 6. */
Chris@42 594 {
Chris@42 595 E TT, T19, T1k, T1h, T16, T1l, Tw, T1g;
Chris@42 596 TT = FNMS(KP951056516, TS, KP587785252 * TH);
Chris@42 597 T19 = FNMS(KP951056516, T18, KP587785252 * T17);
Chris@42 598 T1k = FMA(KP951056516, T17, KP587785252 * T18);
Chris@42 599 T1h = FMA(KP951056516, TH, KP587785252 * TS);
Chris@42 600 {
Chris@42 601 E T14, T15, Tu, Tv;
Chris@42 602 T14 = FNMS(KP250000000, T13, T10);
Chris@42 603 T15 = KP559016994 * (T11 - T12);
Chris@42 604 T16 = T14 - T15;
Chris@42 605 T1l = T15 + T14;
Chris@42 606 Tu = FNMS(KP250000000, Ts, T5);
Chris@42 607 Tv = KP559016994 * (Tg - Tr);
Chris@42 608 Tw = Tu - Tv;
Chris@42 609 T1g = Tv + Tu;
Chris@42 610 }
Chris@42 611 {
Chris@42 612 E TU, T1a, Tt, TV;
Chris@42 613 TU = Tw + TT;
Chris@42 614 T1a = T16 - T19;
Chris@42 615 Tt = W[4];
Chris@42 616 TV = W[5];
Chris@42 617 cr[WS(rs, 3)] = FNMS(TV, T1a, Tt * TU);
Chris@42 618 ci[WS(rs, 3)] = FMA(TV, TU, Tt * T1a);
Chris@42 619 }
Chris@42 620 {
Chris@42 621 E T1o, T1q, T1n, T1p;
Chris@42 622 T1o = T1g + T1h;
Chris@42 623 T1q = T1l - T1k;
Chris@42 624 T1n = W[16];
Chris@42 625 T1p = W[17];
Chris@42 626 cr[WS(rs, 9)] = FNMS(T1p, T1q, T1n * T1o);
Chris@42 627 ci[WS(rs, 9)] = FMA(T1p, T1o, T1n * T1q);
Chris@42 628 }
Chris@42 629 {
Chris@42 630 E T1c, T1e, T1b, T1d;
Chris@42 631 T1c = Tw - TT;
Chris@42 632 T1e = T19 + T16;
Chris@42 633 T1b = W[22];
Chris@42 634 T1d = W[23];
Chris@42 635 cr[WS(rs, 12)] = FNMS(T1d, T1e, T1b * T1c);
Chris@42 636 ci[WS(rs, 12)] = FMA(T1d, T1c, T1b * T1e);
Chris@42 637 }
Chris@42 638 {
Chris@42 639 E T1i, T1m, T1f, T1j;
Chris@42 640 T1i = T1g - T1h;
Chris@42 641 T1m = T1k + T1l;
Chris@42 642 T1f = W[10];
Chris@42 643 T1j = W[11];
Chris@42 644 cr[WS(rs, 6)] = FNMS(T1j, T1m, T1f * T1i);
Chris@42 645 ci[WS(rs, 6)] = FMA(T1j, T1i, T1f * T1m);
Chris@42 646 }
Chris@42 647 }
/* Output group writing elements 10, 13, 1, 4 and 7. */
Chris@42 648 {
Chris@42 649 E T21, T2n, T26, T2q, T1M, T2y, T2m, T2f, T2A, T2r, T2x, T2z;
Chris@42 650 {
Chris@42 651 E T1T, T20, T24, T25;
Chris@42 652 T1T = T1P - T1S;
Chris@42 653 T20 = T1W - T1Z;
Chris@42 654 T21 = FMA(KP951056516, T1T, KP587785252 * T20);
Chris@42 655 T2n = FNMS(KP951056516, T20, KP587785252 * T1T);
Chris@42 656 T24 = T1u - T1x;
Chris@42 657 T25 = T1B - T1E;
Chris@42 658 T26 = FMA(KP951056516, T24, KP587785252 * T25);
Chris@42 659 T2q = FNMS(KP951056516, T25, KP587785252 * T24);
Chris@42 660 }
Chris@42 661 {
Chris@42 662 E T1G, T1K, T1L, T29, T2d, T2e;
Chris@42 663 T1G = KP559016994 * (T1y - T1F);
Chris@42 664 T1K = T1y + T1F;
Chris@42 665 T1L = FNMS(KP250000000, T1K, T1J);
Chris@42 666 T1M = T1G + T1L;
Chris@42 667 T2y = T1J + T1K;
Chris@42 668 T2m = T1L - T1G;
Chris@42 669 T29 = KP559016994 * (T27 - T28);
Chris@42 670 T2d = T27 + T28;
Chris@42 671 T2e = FNMS(KP250000000, T2d, T2c);
Chris@42 672 T2f = T29 + T2e;
Chris@42 673 T2A = T2c + T2d;
Chris@42 674 T2r = T2e - T29;
Chris@42 675 }
Chris@42 676 T2x = W[18];
Chris@42 677 T2z = W[19];
Chris@42 678 cr[WS(rs, 10)] = FNMS(T2z, T2A, T2x * T2y);
Chris@42 679 ci[WS(rs, 10)] = FMA(T2z, T2y, T2x * T2A);
Chris@42 680 {
Chris@42 681 E T2u, T2w, T2t, T2v;
Chris@42 682 T2u = T2m + T2n;
Chris@42 683 T2w = T2r - T2q;
Chris@42 684 T2t = W[24];
Chris@42 685 T2v = W[25];
Chris@42 686 cr[WS(rs, 13)] = FNMS(T2v, T2w, T2t * T2u);
Chris@42 687 ci[WS(rs, 13)] = FMA(T2v, T2u, T2t * T2w);
Chris@42 688 }
Chris@42 689 {
Chris@42 690 E T22, T2g, T1r, T23;
Chris@42 691 T22 = T1M - T21;
Chris@42 692 T2g = T26 + T2f;
Chris@42 693 T1r = W[0];
Chris@42 694 T23 = W[1];
Chris@42 695 cr[WS(rs, 1)] = FNMS(T23, T2g, T1r * T22);
Chris@42 696 ci[WS(rs, 1)] = FMA(T23, T22, T1r * T2g);
Chris@42 697 }
Chris@42 698 {
Chris@42 699 E T2i, T2k, T2h, T2j;
Chris@42 700 T2i = T1M + T21;
Chris@42 701 T2k = T2f - T26;
Chris@42 702 T2h = W[6];
Chris@42 703 T2j = W[7];
Chris@42 704 cr[WS(rs, 4)] = FNMS(T2j, T2k, T2h * T2i);
Chris@42 705 ci[WS(rs, 4)] = FMA(T2j, T2i, T2h * T2k);
Chris@42 706 }
Chris@42 707 {
Chris@42 708 E T2o, T2s, T2l, T2p;
Chris@42 709 T2o = T2m - T2n;
Chris@42 710 T2s = T2q + T2r;
Chris@42 711 T2l = W[12];
Chris@42 712 T2p = W[13];
Chris@42 713 cr[WS(rs, 7)] = FNMS(T2p, T2s, T2l * T2o);
Chris@42 714 ci[WS(rs, 7)] = FMA(T2p, T2o, T2l * T2s);
Chris@42 715 }
Chris@42 716 }
/* Output group writing elements 5, 14, 2, 8 and 11. */
Chris@42 717 {
Chris@42 718 E T31, T3h, T36, T3k, T2K, T3g, T2Y, T2U, T3l, T39, T2B, T2L;
Chris@42 719 {
Chris@42 720 E T2Z, T30, T34, T35;
Chris@42 721 T2Z = T2N + T2O;
Chris@42 722 T30 = T2Q + T2R;
Chris@42 723 T31 = FNMS(KP951056516, T30, KP587785252 * T2Z);
Chris@42 724 T3h = FMA(KP951056516, T2Z, KP587785252 * T30);
Chris@42 725 T34 = T2D - T2E;
Chris@42 726 T35 = T2G - T2H;
Chris@42 727 T36 = FNMS(KP951056516, T35, KP587785252 * T34);
Chris@42 728 T3k = FMA(KP951056516, T34, KP587785252 * T35);
Chris@42 729 }
Chris@42 730 {
Chris@42 731 E T2X, T2J, T2W, T38, T2T, T37;
Chris@42 732 T2X = KP559016994 * (T2F - T2I);
Chris@42 733 T2J = T2F + T2I;
Chris@42 734 T2W = FNMS(KP250000000, T2J, T2C);
Chris@42 735 T2K = T2C + T2J;
Chris@42 736 T3g = T2X + T2W;
Chris@42 737 T2Y = T2W - T2X;
Chris@42 738 T38 = KP559016994 * (T2P - T2S);
Chris@42 739 T2T = T2P + T2S;
Chris@42 740 T37 = FNMS(KP250000000, T2T, T2M);
Chris@42 741 T2U = T2M + T2T;
Chris@42 742 T3l = T38 + T37;
Chris@42 743 T39 = T37 - T38;
Chris@42 744 }
Chris@42 745 T2B = W[8];
Chris@42 746 T2L = W[9];
Chris@42 747 cr[WS(rs, 5)] = FNMS(T2L, T2U, T2B * T2K);
Chris@42 748 ci[WS(rs, 5)] = FMA(T2L, T2K, T2B * T2U);
Chris@42 749 {
Chris@42 750 E T3o, T3q, T3n, T3p;
Chris@42 751 T3o = T3g + T3h;
Chris@42 752 T3q = T3l - T3k;
Chris@42 753 T3n = W[26];
Chris@42 754 T3p = W[27];
Chris@42 755 cr[WS(rs, 14)] = FNMS(T3p, T3q, T3n * T3o);
Chris@42 756 ci[WS(rs, 14)] = FMA(T3n, T3q, T3p * T3o);
Chris@42 757 }
Chris@42 758 {
Chris@42 759 E T32, T3a, T2V, T33;
Chris@42 760 T32 = T2Y - T31;
Chris@42 761 T3a = T36 + T39;
Chris@42 762 T2V = W[2];
Chris@42 763 T33 = W[3];
Chris@42 764 cr[WS(rs, 2)] = FNMS(T33, T3a, T2V * T32);
Chris@42 765 ci[WS(rs, 2)] = FMA(T2V, T3a, T33 * T32);
Chris@42 766 }
Chris@42 767 {
Chris@42 768 E T3c, T3e, T3b, T3d;
Chris@42 769 T3c = T2Y + T31;
Chris@42 770 T3e = T39 - T36;
Chris@42 771 T3b = W[14];
Chris@42 772 T3d = W[15];
Chris@42 773 cr[WS(rs, 8)] = FNMS(T3d, T3e, T3b * T3c);
Chris@42 774 ci[WS(rs, 8)] = FMA(T3b, T3e, T3d * T3c);
Chris@42 775 }
Chris@42 776 {
Chris@42 777 E T3i, T3m, T3f, T3j;
Chris@42 778 T3i = T3g - T3h;
Chris@42 779 T3m = T3k + T3l;
Chris@42 780 T3f = W[20];
Chris@42 781 T3j = W[21];
Chris@42 782 cr[WS(rs, 11)] = FNMS(T3j, T3m, T3f * T3i);
Chris@42 783 ci[WS(rs, 11)] = FMA(T3f, T3m, T3j * T3i);
Chris@42 784 }
Chris@42 785 }
Chris@42 786 }
Chris@42 787 }
Chris@42 788 }
Chris@42 789
/*
 * Instruction table referenced by the descriptor `desc` below: a TW_FULL
 * entry with arguments (1, 15) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere;
 * presumably TW_FULL requests the full coefficient set for size 15 and
 * TW_NEXT ends the list -- confirm against their definitions.
 */
Chris@42 790 static const tw_instr twinstr[] = {
Chris@42 791 {TW_FULL, 1, 15},
Chris@42 792 {TW_NEXT, 1, 0}
Chris@42 793 };
Chris@42 794
/*
 * Descriptor handed to the registration call: size 15, the name "hb_15",
 * the twinstr table, &GENUS, and the counts {128, 56, 56, 0}. The first
 * three counts match the 128 additions, 56 multiplications and 56 fused
 * multiply/adds reported in the generated comment for this variant.
 * NOTE(review): hc2hc_desc and GENUS are declared elsewhere; the meaning
 * of the trailing 0 is not visible here.
 */
Chris@42 795 static const hc2hc_desc desc = { 15, "hb_15", twinstr, &GENUS, {128, 56, 56, 0} };
Chris@42 796
/*
 * Registration entry point: passes the planner p, the hb_15 function and
 * its descriptor to X(khc2hc_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-prefixing wrapper -- confirm.
 */
Chris@42 797 void X(codelet_hb_15) (planner *p) {
Chris@42 798 X(khc2hc_register) (p, hb_15, &desc);
Chris@42 799 }
Chris@42 800 #endif /* HAVE_FMA */