annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:11 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@42 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@42 33 * 114 stack variables, 3 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb2_16 (HAVE_FMA variant) -- machine-generated codelet; per the generator
 * command line recorded in this file: -n 16, -dif, -sign 1, -twiddle-log3.
 *
 * Parameters, as used by the body below:
 *   cr, ci  arrays that are both read and overwritten in place. Elements
 *           cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15 are each read once
 *           and written once per iteration (32 loads + 32 stores).
 *   W       twiddle table; eight values W[0]..W[7] are consumed per
 *           iteration. W is first advanced by (mb - 1) * 8 and then by 8
 *           after every iteration.
 *   rs      stride handed to WS() for element addressing.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration step: cr moves by +ms while ci moves by -ms.
 *
 * Constants (numerically): KP923879532 ~= cos(pi/8), KP707106781 ~= 1/sqrt(2),
 * KP414213562 ~= sqrt(2) - 1.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are declared outside this file (presumably via codelet-rdft.h / hb.h);
 * FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and a*b - c -- confirm
 * against those headers.
 */
Chris@42 37 static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 42 {
Chris@42 43 INT m;
Chris@42 44 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 45 E Tv, TB, TF, Ty, T1J, T1O, T1N, T1K;
Chris@42 46 {
Chris@42 47 E Tw, T2z, T2C, Tx, T3f, T3l, T2F, T3r, Tz;
/* Read the eight twiddle values W[0]..W[7] and start forming their pairwise products. */
Chris@42 48 Tv = W[0];
Chris@42 49 Tw = W[2];
Chris@42 50 T2z = W[6];
Chris@42 51 T2C = W[7];
Chris@42 52 TB = W[4];
Chris@42 53 Tx = Tv * Tw;
Chris@42 54 T3f = Tv * T2z;
Chris@42 55 T3l = Tv * T2C;
Chris@42 56 T2F = Tv * TB;
Chris@42 57 T3r = Tw * TB;
Chris@42 58 TF = W[5];
Chris@42 59 Ty = W[1];
Chris@42 60 Tz = W[3];
Chris@42 61 {
Chris@42 62 E T2G, T3z, T3m, T3g, T3L, T3s, T1V, TA, T3w, T3Q, T30, T3C, TE, T1X, T1D;
Chris@42 63 E TG, T1G, T1o, T2p, T1Y, T2u, T2c, T1Z, TL, T1t, T2d, T35, T3n, T3R, T3F;
Chris@42 64 E T20, T1w, T3M, Tf, T3h, T2L, T2e, TW, T3N, T3I, T2Q, T36, T2V, T37, T1d;
Chris@42 65 E Tu, T3S, T18, T1z, T1i, T24, T2g, T27, T2h, TQ, TV;
Chris@42 66 {
Chris@42 67 E TH, T3, T2I, TU, T32, T1s, T1p, T6, TM, Ta, Tb, T33, TK, T2J, TP;
Chris@42 68 E Tc, T4, T5;
Chris@42 69 {
Chris@42 70 E TS, TT, T1q, T1r;
Chris@42 71 {
Chris@42 72 E T1, T1n, TC, T2b, T1W, T2, T3v, T2Z, TD;
/* Remaining twiddle products, interleaved with the first input loads. */
Chris@42 73 T1 = cr[0];
Chris@42 74 T3v = Tw * TF;
Chris@42 75 T2Z = Tv * TF;
Chris@42 76 T2G = FNMS(Ty, TF, T2F);
Chris@42 77 T3z = FMA(Ty, TF, T2F);
Chris@42 78 T3m = FNMS(Ty, T2z, T3l);
Chris@42 79 T3g = FMA(Ty, T2C, T3f);
Chris@42 80 T3L = FNMS(Tz, TF, T3r);
Chris@42 81 T3s = FMA(Tz, TF, T3r);
Chris@42 82 T1V = FMA(Ty, Tz, Tx);
Chris@42 83 TA = FNMS(Ty, Tz, Tx);
Chris@42 84 TD = Tv * Tz;
Chris@42 85 T3w = FNMS(Tz, TB, T3v);
Chris@42 86 T3Q = FMA(Tz, TB, T3v);
Chris@42 87 T30 = FMA(Ty, TB, T2Z);
Chris@42 88 T3C = FNMS(Ty, TB, T2Z);
Chris@42 89 T1n = TA * TF;
Chris@42 90 TC = TA * TB;
Chris@42 91 T2b = T1V * TF;
Chris@42 92 T1W = T1V * TB;
Chris@42 93 TE = FMA(Ty, Tw, TD);
Chris@42 94 T1X = FNMS(Ty, Tw, TD);
Chris@42 95 T2 = ci[WS(rs, 7)];
Chris@42 96 TS = ci[WS(rs, 9)];
Chris@42 97 T1D = FMA(TE, TF, TC);
Chris@42 98 TG = FNMS(TE, TF, TC);
Chris@42 99 T1G = FNMS(TE, TB, T1n);
Chris@42 100 T1o = FMA(TE, TB, T1n);
Chris@42 101 T2p = FMA(T1X, TF, T1W);
Chris@42 102 T1Y = FNMS(T1X, TF, T1W);
Chris@42 103 T2u = FNMS(T1X, TB, T2b);
Chris@42 104 T2c = FMA(T1X, TB, T2b);
Chris@42 105 TH = T1 - T2;
Chris@42 106 T3 = T1 + T2;
Chris@42 107 TT = cr[WS(rs, 14)];
Chris@42 108 }
Chris@42 109 T1q = ci[WS(rs, 15)];
Chris@42 110 T1r = cr[WS(rs, 8)];
Chris@42 111 T4 = cr[WS(rs, 4)];
Chris@42 112 T2I = TS - TT;
Chris@42 113 TU = TS + TT;
Chris@42 114 T32 = T1q - T1r;
Chris@42 115 T1s = T1q + T1r;
Chris@42 116 T5 = ci[WS(rs, 3)];
Chris@42 117 }
Chris@42 118 {
Chris@42 119 E TI, TJ, T8, T9, TN, TO;
Chris@42 120 T8 = cr[WS(rs, 2)];
Chris@42 121 T9 = ci[WS(rs, 5)];
Chris@42 122 TI = ci[WS(rs, 11)];
Chris@42 123 T1p = T4 - T5;
Chris@42 124 T6 = T4 + T5;
Chris@42 125 TM = T8 - T9;
Chris@42 126 Ta = T8 + T9;
Chris@42 127 TJ = cr[WS(rs, 12)];
Chris@42 128 TN = ci[WS(rs, 13)];
Chris@42 129 TO = cr[WS(rs, 10)];
Chris@42 130 Tb = ci[WS(rs, 1)];
Chris@42 131 T33 = TI - TJ;
Chris@42 132 TK = TI + TJ;
Chris@42 133 T2J = TN - TO;
Chris@42 134 TP = TN + TO;
Chris@42 135 Tc = cr[WS(rs, 6)];
Chris@42 136 }
Chris@42 137 {
Chris@42 138 E TR, Td, T3D, T34;
Chris@42 139 T1Z = TH + TK;
Chris@42 140 TL = TH - TK;
Chris@42 141 T1t = T1p + T1s;
Chris@42 142 T2d = T1s - T1p;
Chris@42 143 TR = Tb - Tc;
Chris@42 144 Td = Tb + Tc;
Chris@42 145 T3D = T32 + T33;
Chris@42 146 T34 = T32 - T33;
Chris@42 147 {
Chris@42 148 E Te, T2K, T1u, T1v, T31, T3E, T2H, T7;
Chris@42 149 Te = Ta + Td;
Chris@42 150 T31 = Ta - Td;
Chris@42 151 T3E = T2J + T2I;
Chris@42 152 T2K = T2I - T2J;
Chris@42 153 TQ = TM - TP;
Chris@42 154 T1u = TM + TP;
Chris@42 155 T1v = TR + TU;
Chris@42 156 TV = TR - TU;
Chris@42 157 T35 = T31 + T34;
Chris@42 158 T3n = T34 - T31;
Chris@42 159 T3R = T3D - T3E;
Chris@42 160 T3F = T3D + T3E;
Chris@42 161 T2H = T3 - T6;
Chris@42 162 T7 = T3 + T6;
Chris@42 163 T20 = T1u + T1v;
Chris@42 164 T1w = T1u - T1v;
Chris@42 165 T3M = T7 - Te;
Chris@42 166 Tf = T7 + Te;
Chris@42 167 T3h = T2H - T2K;
Chris@42 168 T2L = T2H + T2K;
Chris@42 169 }
Chris@42 170 }
Chris@42 171 }
Chris@42 172 {
Chris@42 173 E T1e, Ti, T2N, T1c, T2O, T1h, T19, Tl, T13, Tp, Tq, T2S, T11, T2T, T16;
Chris@42 174 E Tr, Tj, Tk, Tm, TY, Tt;
Chris@42 175 {
Chris@42 176 E T1a, T1b, Tg, Th, T1f, T1g;
Chris@42 177 Tg = cr[WS(rs, 1)];
Chris@42 178 Th = ci[WS(rs, 6)];
Chris@42 179 T1a = ci[WS(rs, 14)];
Chris@42 180 T2e = TQ - TV;
Chris@42 181 TW = TQ + TV;
Chris@42 182 T1e = Tg - Th;
Chris@42 183 Ti = Tg + Th;
Chris@42 184 T1b = cr[WS(rs, 9)];
Chris@42 185 T1f = ci[WS(rs, 10)];
Chris@42 186 T1g = cr[WS(rs, 13)];
Chris@42 187 Tj = cr[WS(rs, 5)];
Chris@42 188 T2N = T1a - T1b;
Chris@42 189 T1c = T1a + T1b;
Chris@42 190 T2O = T1f - T1g;
Chris@42 191 T1h = T1f + T1g;
Chris@42 192 Tk = ci[WS(rs, 2)];
Chris@42 193 }
Chris@42 194 {
Chris@42 195 E TZ, T10, Tn, To, T14, T15;
Chris@42 196 Tn = ci[0];
Chris@42 197 To = cr[WS(rs, 7)];
Chris@42 198 TZ = ci[WS(rs, 8)];
Chris@42 199 T19 = Tj - Tk;
Chris@42 200 Tl = Tj + Tk;
Chris@42 201 T13 = Tn - To;
Chris@42 202 Tp = Tn + To;
Chris@42 203 T10 = cr[WS(rs, 15)];
Chris@42 204 T14 = ci[WS(rs, 12)];
Chris@42 205 T15 = cr[WS(rs, 11)];
Chris@42 206 Tq = cr[WS(rs, 3)];
Chris@42 207 T2S = TZ - T10;
Chris@42 208 T11 = TZ + T10;
Chris@42 209 T2T = T14 - T15;
Chris@42 210 T16 = T14 + T15;
Chris@42 211 Tr = ci[WS(rs, 4)];
Chris@42 212 }
Chris@42 213 {
Chris@42 214 E T2P, T2U, T2M, Ts, T3G, T3H, T2R;
Chris@42 215 T2P = T2N - T2O;
Chris@42 216 T3G = T2N + T2O;
Chris@42 217 T3H = T2S + T2T;
Chris@42 218 T2U = T2S - T2T;
Chris@42 219 Tm = Ti + Tl;
Chris@42 220 T2M = Ti - Tl;
Chris@42 221 TY = Tq - Tr;
Chris@42 222 Ts = Tq + Tr;
Chris@42 223 T3N = T3H - T3G;
Chris@42 224 T3I = T3G + T3H;
Chris@42 225 Tt = Tp + Ts;
Chris@42 226 T2R = Tp - Ts;
Chris@42 227 T2Q = T2M - T2P;
Chris@42 228 T36 = T2M + T2P;
Chris@42 229 T2V = T2R + T2U;
Chris@42 230 T37 = T2U - T2R;
Chris@42 231 }
Chris@42 232 {
Chris@42 233 E T25, T26, T22, T23, T12, T17;
Chris@42 234 T12 = TY - T11;
Chris@42 235 T25 = TY + T11;
Chris@42 236 T26 = T13 + T16;
Chris@42 237 T17 = T13 - T16;
Chris@42 238 T22 = T1c - T19;
Chris@42 239 T1d = T19 + T1c;
Chris@42 240 Tu = Tm + Tt;
Chris@42 241 T3S = Tm - Tt;
Chris@42 242 T18 = FNMS(KP414213562, T17, T12);
Chris@42 243 T1z = FMA(KP414213562, T12, T17);
Chris@42 244 T1i = T1e - T1h;
Chris@42 245 T23 = T1e + T1h;
Chris@42 246 T24 = FNMS(KP414213562, T23, T22);
Chris@42 247 T2g = FMA(KP414213562, T22, T23);
Chris@42 248 T27 = FNMS(KP414213562, T26, T25);
Chris@42 249 T2h = FMA(KP414213562, T25, T26);
Chris@42 250 }
Chris@42 251 }
Chris@42 252 {
Chris@42 253 E T1j, T1y, T3V, T3X, T3W, T38, T3i, T3o, T2W, T3K, T3B, T3A;
/* Stores begin here; cr[0] and ci[0] are written as plain sums, with no twiddle multiplication. */
Chris@42 254 cr[0] = Tf + Tu;
Chris@42 255 T3A = Tf - Tu;
Chris@42 256 T1j = FMA(KP414213562, T1i, T1d);
Chris@42 257 T1y = FNMS(KP414213562, T1d, T1i);
Chris@42 258 T3K = T3C * T3A;
Chris@42 259 T3B = T3z * T3A;
Chris@42 260 {
Chris@42 261 E T3O, T3T, T3J, T3P, T3U;
Chris@42 262 T3O = T3M - T3N;
Chris@42 263 T3V = T3M + T3N;
Chris@42 264 T3X = T3S + T3R;
Chris@42 265 T3T = T3R - T3S;
Chris@42 266 ci[0] = T3F + T3I;
Chris@42 267 T3J = T3F - T3I;
Chris@42 268 T3P = T3L * T3O;
Chris@42 269 T3U = T3L * T3T;
Chris@42 270 T3W = TA * T3V;
Chris@42 271 cr[WS(rs, 8)] = FNMS(T3C, T3J, T3B);
Chris@42 272 ci[WS(rs, 8)] = FMA(T3z, T3J, T3K);
Chris@42 273 cr[WS(rs, 12)] = FNMS(T3Q, T3T, T3P);
Chris@42 274 ci[WS(rs, 12)] = FMA(T3Q, T3O, T3U);
Chris@42 275 T38 = T36 + T37;
Chris@42 276 T3i = T37 - T36;
Chris@42 277 T3o = T2Q - T2V;
Chris@42 278 T2W = T2Q + T2V;
Chris@42 279 }
Chris@42 280 {
Chris@42 281 E T2q, T21, T28, T2w, T2v, T2f, T2i, T2r;
Chris@42 282 {
Chris@42 283 E T2Y, T3a, T3c, T3d, T39, T3e, T3b, T2X, T3Y;
Chris@42 284 cr[WS(rs, 4)] = FNMS(TE, T3X, T3W);
Chris@42 285 T3Y = TA * T3X;
Chris@42 286 {
Chris@42 287 E T3t, T3j, T3x, T3p;
Chris@42 288 T3t = FMA(KP707106781, T3i, T3h);
Chris@42 289 T3j = FNMS(KP707106781, T3i, T3h);
Chris@42 290 T3x = FMA(KP707106781, T3o, T3n);
Chris@42 291 T3p = FNMS(KP707106781, T3o, T3n);
Chris@42 292 ci[WS(rs, 4)] = FMA(TE, T3V, T3Y);
Chris@42 293 {
Chris@42 294 E T3u, T3k, T3y, T3q;
Chris@42 295 T3u = T3s * T3t;
Chris@42 296 T3k = T3g * T3j;
Chris@42 297 T3y = T3s * T3x;
Chris@42 298 T3q = T3g * T3p;
Chris@42 299 cr[WS(rs, 6)] = FNMS(T3w, T3x, T3u);
Chris@42 300 cr[WS(rs, 14)] = FNMS(T3m, T3p, T3k);
Chris@42 301 ci[WS(rs, 6)] = FMA(T3w, T3t, T3y);
Chris@42 302 ci[WS(rs, 14)] = FMA(T3m, T3j, T3q);
Chris@42 303 T3b = FMA(KP707106781, T2W, T2L);
Chris@42 304 T2X = FNMS(KP707106781, T2W, T2L);
Chris@42 305 }
Chris@42 306 }
Chris@42 307 T2Y = T2G * T2X;
Chris@42 308 T3a = T30 * T2X;
Chris@42 309 T3c = T1V * T3b;
Chris@42 310 T3d = FMA(KP707106781, T38, T35);
Chris@42 311 T39 = FNMS(KP707106781, T38, T35);
Chris@42 312 T3e = T1X * T3b;
Chris@42 313 T2q = FMA(KP707106781, T20, T1Z);
Chris@42 314 T21 = FNMS(KP707106781, T20, T1Z);
Chris@42 315 cr[WS(rs, 2)] = FNMS(T1X, T3d, T3c);
Chris@42 316 ci[WS(rs, 10)] = FMA(T2G, T39, T3a);
Chris@42 317 cr[WS(rs, 10)] = FNMS(T30, T39, T2Y);
Chris@42 318 ci[WS(rs, 2)] = FMA(T1V, T3d, T3e);
Chris@42 319 T28 = T24 + T27;
Chris@42 320 T2w = T27 - T24;
Chris@42 321 T2v = FNMS(KP707106781, T2e, T2d);
Chris@42 322 T2f = FMA(KP707106781, T2e, T2d);
Chris@42 323 T2i = T2g - T2h;
Chris@42 324 T2r = T2g + T2h;
Chris@42 325 }
Chris@42 326 {
Chris@42 327 E TX, T1k, T1x, T1A;
Chris@42 328 T1J = FMA(KP707106781, TW, TL);
Chris@42 329 TX = FNMS(KP707106781, TW, TL);
Chris@42 330 {
Chris@42 331 E T2l, T29, T2n, T2j;
Chris@42 332 T2l = FNMS(KP923879532, T28, T21);
Chris@42 333 T29 = FMA(KP923879532, T28, T21);
Chris@42 334 T2n = FMA(KP923879532, T2i, T2f);
Chris@42 335 T2j = FNMS(KP923879532, T2i, T2f);
Chris@42 336 {
Chris@42 337 E T2o, T2m, T2k, T2a;
Chris@42 338 T2o = Tz * T2l;
Chris@42 339 T2m = Tw * T2l;
Chris@42 340 T2k = T2c * T29;
Chris@42 341 T2a = T1Y * T29;
Chris@42 342 ci[WS(rs, 3)] = FMA(Tw, T2n, T2o);
Chris@42 343 cr[WS(rs, 3)] = FNMS(Tz, T2n, T2m);
Chris@42 344 ci[WS(rs, 11)] = FMA(T1Y, T2j, T2k);
Chris@42 345 cr[WS(rs, 11)] = FNMS(T2c, T2j, T2a);
Chris@42 346 T1k = T18 - T1j;
Chris@42 347 T1O = T1j + T18;
Chris@42 348 }
Chris@42 349 }
Chris@42 350 T1N = FMA(KP707106781, T1w, T1t);
Chris@42 351 T1x = FNMS(KP707106781, T1w, T1t);
Chris@42 352 T1A = T1y - T1z;
Chris@42 353 T1K = T1y + T1z;
Chris@42 354 {
Chris@42 355 E T1E, T1l, T1H, T1B;
Chris@42 356 T1E = FMA(KP923879532, T1k, TX);
Chris@42 357 T1l = FNMS(KP923879532, T1k, TX);
Chris@42 358 T1H = FMA(KP923879532, T1A, T1x);
Chris@42 359 T1B = FNMS(KP923879532, T1A, T1x);
Chris@42 360 {
Chris@42 361 E T1I, T1F, T1C, T1m;
Chris@42 362 T1I = T1G * T1E;
Chris@42 363 T1F = T1D * T1E;
Chris@42 364 T1C = T1o * T1l;
Chris@42 365 T1m = TG * T1l;
Chris@42 366 ci[WS(rs, 5)] = FMA(T1D, T1H, T1I);
Chris@42 367 cr[WS(rs, 5)] = FNMS(T1G, T1H, T1F);
Chris@42 368 ci[WS(rs, 13)] = FMA(TG, T1B, T1C);
Chris@42 369 cr[WS(rs, 13)] = FNMS(T1o, T1B, T1m);
Chris@42 370 }
Chris@42 371 }
Chris@42 372 {
Chris@42 373 E T2A, T2s, T2D, T2x;
Chris@42 374 T2A = FMA(KP923879532, T2r, T2q);
Chris@42 375 T2s = FNMS(KP923879532, T2r, T2q);
Chris@42 376 T2D = FNMS(KP923879532, T2w, T2v);
Chris@42 377 T2x = FMA(KP923879532, T2w, T2v);
Chris@42 378 {
Chris@42 379 E T2B, T2t, T2E, T2y;
Chris@42 380 T2B = T2z * T2A;
Chris@42 381 T2t = T2p * T2s;
Chris@42 382 T2E = T2z * T2D;
Chris@42 383 T2y = T2p * T2x;
Chris@42 384 cr[WS(rs, 15)] = FNMS(T2C, T2D, T2B);
Chris@42 385 cr[WS(rs, 7)] = FNMS(T2u, T2x, T2t);
Chris@42 386 ci[WS(rs, 15)] = FMA(T2C, T2A, T2E);
Chris@42 387 ci[WS(rs, 7)] = FMA(T2u, T2s, T2y);
Chris@42 388 }
Chris@42 389 }
Chris@42 390 }
Chris@42 391 }
Chris@42 392 }
Chris@42 393 }
Chris@42 394 }
/* Outputs 1 and 9, built from T1J/T1K/T1N/T1O, which are declared at loop scope and assigned in the block above. */
Chris@42 395 {
Chris@42 396 E T1L, T1R, T1P, T1T;
Chris@42 397 T1L = FNMS(KP923879532, T1K, T1J);
Chris@42 398 T1R = FMA(KP923879532, T1K, T1J);
Chris@42 399 T1P = FNMS(KP923879532, T1O, T1N);
Chris@42 400 T1T = FMA(KP923879532, T1O, T1N);
Chris@42 401 {
Chris@42 402 E T1S, T1M, T1U, T1Q;
Chris@42 403 T1S = Tv * T1R;
Chris@42 404 T1M = TB * T1L;
Chris@42 405 T1U = Tv * T1T;
Chris@42 406 T1Q = TB * T1P;
Chris@42 407 cr[WS(rs, 1)] = FNMS(Ty, T1T, T1S);
Chris@42 408 cr[WS(rs, 9)] = FNMS(TF, T1P, T1M);
Chris@42 409 ci[WS(rs, 1)] = FMA(Ty, T1R, T1U);
Chris@42 410 ci[WS(rs, 9)] = FMA(TF, T1L, T1Q);
Chris@42 411 }
Chris@42 412 }
Chris@42 413 }
Chris@42 414 }
Chris@42 415 }
Chris@42 416
/*
 * Twiddle instruction table referenced by the codelet descriptor: four
 * TW_CEXP entries whose last field is 1, 3, 9 and 15, followed by a TW_NEXT
 * entry that ends the list.
 * NOTE(review): tw_instr, TW_CEXP and TW_NEXT are defined outside this file;
 * presumably each TW_CEXP entry supplies one (real, imaginary) pair, which
 * would match the eight W values hb2_16 reads per iteration -- confirm.
 */
Chris@42 417 static const tw_instr twinstr[] = {
Chris@42 418 {TW_CEXP, 1, 1},
Chris@42 419 {TW_CEXP, 1, 3},
Chris@42 420 {TW_CEXP, 1, 9},
Chris@42 421 {TW_CEXP, 1, 15},
Chris@42 422 {TW_NEXT, 1, 0}
Chris@42 423 };
Chris@42 424
/*
 * Descriptor for the HAVE_FMA codelet: 16, the name "hb2_16", the twiddle
 * table, &GENUS, and the counts {104, 42, 92, 0}. The first three counts match
 * the generated comment above the function (104 additions, 42
 * multiplications, 92 fused multiply/adds).
 * NOTE(review): hc2hc_desc and GENUS are defined outside this file; the
 * meaning of the fourth count is not shown here.
 */
Chris@42 425 static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {104, 42, 92, 0} };
Chris@42 426
/*
 * Registration entry point (HAVE_FMA build): passes the planner p, the
 * hb2_16 kernel and its descriptor to X(khc2hc_register).
 * NOTE(review): X() and planner are defined outside this file.
 */
Chris@42 427 void X(codelet_hb2_16) (planner *p) {
Chris@42 428 X(khc2hc_register) (p, hb2_16, &desc);
Chris@42 429 }
Chris@42 430 #else /* HAVE_FMA */
Chris@42 431
Chris@42 432 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 16 -dif -name hb2_16 -include hb.h */
Chris@42 433
Chris@42 434 /*
Chris@42 435 * This function contains 196 FP additions, 108 FP multiplications,
Chris@42 436 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@42 437 * 80 stack variables, 3 constants, and 64 memory accesses
Chris@42 438 */
Chris@42 439 #include "hb.h"
Chris@42 440
/*
 * hb2_16 (non-FMA variant) -- machine-generated codelet; per the generator
 * command line recorded in this file: -n 16, -dif, -sign 1, -twiddle-log3.
 * It has the same signature as the HAVE_FMA variant and reads and writes the
 * same cr/ci elements.
 *
 * Parameters, as used by the body below:
 *   cr, ci  arrays that are both read and overwritten in place. Elements
 *           cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15 are each read once
 *           and written once per iteration (32 loads + 32 stores).
 *   W       twiddle table; eight values W[0]..W[7] are consumed per
 *           iteration. W is first advanced by (mb - 1) * 8 and then by 8
 *           after every iteration.
 *   rs      stride handed to WS() for element addressing.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration step: cr moves by +ms while ci moves by -ms.
 *
 * Constants (numerically): KP382683432 ~= sin(pi/8), KP923879532 ~= cos(pi/8),
 * KP707106781 ~= 1/sqrt(2).
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are declared outside this file (presumably via codelet-rdft.h / hb.h);
 * FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and a*b - c -- confirm
 * against those headers.
 */
Chris@42 441 static void hb2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 442 {
Chris@42 443 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 444 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 445 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 446 {
Chris@42 447 INT m;
Chris@42 448 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 449 E Tv, Ty, T1l, T1n, T1p, T1t, T27, T25, Tz, Tw, TB, T21, T1P, T1H, T1X;
Chris@42 450 E T17, T1L, T1N, T1v, T1w, T1x, T1B, T2F, T2T, T2b, T2R, T3j, T3x, T35, T3t;
/* Twiddle setup: loads W[0]..W[7] and forms the sum/difference products kept in the loop-scope variables declared above. */
Chris@42 451 {
Chris@42 452 E TA, T1J, T15, T1G, Tx, T1K, T16, T1F;
Chris@42 453 {
Chris@42 454 E T1m, T1s, T1o, T1r;
Chris@42 455 Tv = W[0];
Chris@42 456 Ty = W[1];
Chris@42 457 T1l = W[2];
Chris@42 458 T1n = W[3];
Chris@42 459 T1m = Tv * T1l;
Chris@42 460 T1s = Ty * T1l;
Chris@42 461 T1o = Ty * T1n;
Chris@42 462 T1r = Tv * T1n;
Chris@42 463 T1p = T1m + T1o;
Chris@42 464 T1t = T1r - T1s;
Chris@42 465 T27 = T1r + T1s;
Chris@42 466 T25 = T1m - T1o;
Chris@42 467 Tz = W[5];
Chris@42 468 TA = Ty * Tz;
Chris@42 469 T1J = T1l * Tz;
Chris@42 470 T15 = Tv * Tz;
Chris@42 471 T1G = T1n * Tz;
Chris@42 472 Tw = W[4];
Chris@42 473 Tx = Tv * Tw;
Chris@42 474 T1K = T1n * Tw;
Chris@42 475 T16 = Ty * Tw;
Chris@42 476 T1F = T1l * Tw;
Chris@42 477 }
Chris@42 478 TB = Tx - TA;
Chris@42 479 T21 = T1J + T1K;
Chris@42 480 T1P = T15 - T16;
Chris@42 481 T1H = T1F + T1G;
Chris@42 482 T1X = T1F - T1G;
Chris@42 483 T17 = T15 + T16;
Chris@42 484 T1L = T1J - T1K;
Chris@42 485 T1N = Tx + TA;
Chris@42 486 T1v = W[6];
Chris@42 487 T1w = W[7];
Chris@42 488 T1x = FMA(Tv, T1v, Ty * T1w);
Chris@42 489 T1B = FNMS(Ty, T1v, Tv * T1w);
Chris@42 490 {
Chris@42 491 E T2D, T2E, T29, T2a;
Chris@42 492 T2D = T25 * Tz;
Chris@42 493 T2E = T27 * Tw;
Chris@42 494 T2F = T2D + T2E;
Chris@42 495 T2T = T2D - T2E;
Chris@42 496 T29 = T25 * Tw;
Chris@42 497 T2a = T27 * Tz;
Chris@42 498 T2b = T29 - T2a;
Chris@42 499 T2R = T29 + T2a;
Chris@42 500 }
Chris@42 501 {
Chris@42 502 E T3h, T3i, T33, T34;
Chris@42 503 T3h = T1p * Tz;
Chris@42 504 T3i = T1t * Tw;
Chris@42 505 T3j = T3h + T3i;
Chris@42 506 T3x = T3h - T3i;
Chris@42 507 T33 = T1p * Tw;
Chris@42 508 T34 = T1t * Tz;
Chris@42 509 T35 = T33 - T34;
Chris@42 510 T3t = T33 + T34;
Chris@42 511 }
Chris@42 512 }
/* Load the 32 inputs, combine them with additions/subtractions and the KP constants, then store the 32 outputs. */
Chris@42 513 {
Chris@42 514 E T7, T36, T3k, TC, T1f, T2e, T2I, T1Q, Te, TJ, T1R, T18, T2L, T37, T2l;
Chris@42 515 E T3l, Tm, T1T, TT, T1h, T2A, T2N, T3b, T3n, Tt, T1U, T12, T1i, T2t, T2O;
Chris@42 516 E T3e, T3o;
Chris@42 517 {
Chris@42 518 E T3, T2c, T1e, T2d, T6, T2G, T1b, T2H;
Chris@42 519 {
Chris@42 520 E T1, T2, T1c, T1d;
Chris@42 521 T1 = cr[0];
Chris@42 522 T2 = ci[WS(rs, 7)];
Chris@42 523 T3 = T1 + T2;
Chris@42 524 T2c = T1 - T2;
Chris@42 525 T1c = ci[WS(rs, 11)];
Chris@42 526 T1d = cr[WS(rs, 12)];
Chris@42 527 T1e = T1c - T1d;
Chris@42 528 T2d = T1c + T1d;
Chris@42 529 }
Chris@42 530 {
Chris@42 531 E T4, T5, T19, T1a;
Chris@42 532 T4 = cr[WS(rs, 4)];
Chris@42 533 T5 = ci[WS(rs, 3)];
Chris@42 534 T6 = T4 + T5;
Chris@42 535 T2G = T4 - T5;
Chris@42 536 T19 = ci[WS(rs, 15)];
Chris@42 537 T1a = cr[WS(rs, 8)];
Chris@42 538 T1b = T19 - T1a;
Chris@42 539 T2H = T19 + T1a;
Chris@42 540 }
Chris@42 541 T7 = T3 + T6;
Chris@42 542 T36 = T2c + T2d;
Chris@42 543 T3k = T2H - T2G;
Chris@42 544 TC = T3 - T6;
Chris@42 545 T1f = T1b - T1e;
Chris@42 546 T2e = T2c - T2d;
Chris@42 547 T2I = T2G + T2H;
Chris@42 548 T1Q = T1b + T1e;
Chris@42 549 }
Chris@42 550 {
Chris@42 551 E Ta, T2f, TI, T2g, Td, T2i, TF, T2j;
Chris@42 552 {
Chris@42 553 E T8, T9, TG, TH;
Chris@42 554 T8 = cr[WS(rs, 2)];
Chris@42 555 T9 = ci[WS(rs, 5)];
Chris@42 556 Ta = T8 + T9;
Chris@42 557 T2f = T8 - T9;
Chris@42 558 TG = ci[WS(rs, 13)];
Chris@42 559 TH = cr[WS(rs, 10)];
Chris@42 560 TI = TG - TH;
Chris@42 561 T2g = TG + TH;
Chris@42 562 }
Chris@42 563 {
Chris@42 564 E Tb, Tc, TD, TE;
Chris@42 565 Tb = ci[WS(rs, 1)];
Chris@42 566 Tc = cr[WS(rs, 6)];
Chris@42 567 Td = Tb + Tc;
Chris@42 568 T2i = Tb - Tc;
Chris@42 569 TD = ci[WS(rs, 9)];
Chris@42 570 TE = cr[WS(rs, 14)];
Chris@42 571 TF = TD - TE;
Chris@42 572 T2j = TD + TE;
Chris@42 573 }
Chris@42 574 Te = Ta + Td;
Chris@42 575 TJ = TF - TI;
Chris@42 576 T1R = TI + TF;
Chris@42 577 T18 = Ta - Td;
Chris@42 578 {
Chris@42 579 E T2J, T2K, T2h, T2k;
Chris@42 580 T2J = T2f + T2g;
Chris@42 581 T2K = T2i + T2j;
Chris@42 582 T2L = KP707106781 * (T2J - T2K);
Chris@42 583 T37 = KP707106781 * (T2J + T2K);
Chris@42 584 T2h = T2f - T2g;
Chris@42 585 T2k = T2i - T2j;
Chris@42 586 T2l = KP707106781 * (T2h + T2k);
Chris@42 587 T3l = KP707106781 * (T2h - T2k);
Chris@42 588 }
Chris@42 589 }
Chris@42 590 {
Chris@42 591 E Ti, T2x, TR, T2y, Tl, T2u, TO, T2v, TL, TS;
Chris@42 592 {
Chris@42 593 E Tg, Th, TP, TQ;
Chris@42 594 Tg = cr[WS(rs, 1)];
Chris@42 595 Th = ci[WS(rs, 6)];
Chris@42 596 Ti = Tg + Th;
Chris@42 597 T2x = Tg - Th;
Chris@42 598 TP = ci[WS(rs, 10)];
Chris@42 599 TQ = cr[WS(rs, 13)];
Chris@42 600 TR = TP - TQ;
Chris@42 601 T2y = TP + TQ;
Chris@42 602 }
Chris@42 603 {
Chris@42 604 E Tj, Tk, TM, TN;
Chris@42 605 Tj = cr[WS(rs, 5)];
Chris@42 606 Tk = ci[WS(rs, 2)];
Chris@42 607 Tl = Tj + Tk;
Chris@42 608 T2u = Tj - Tk;
Chris@42 609 TM = ci[WS(rs, 14)];
Chris@42 610 TN = cr[WS(rs, 9)];
Chris@42 611 TO = TM - TN;
Chris@42 612 T2v = TM + TN;
Chris@42 613 }
Chris@42 614 Tm = Ti + Tl;
Chris@42 615 T1T = TO + TR;
Chris@42 616 TL = Ti - Tl;
Chris@42 617 TS = TO - TR;
Chris@42 618 TT = TL - TS;
Chris@42 619 T1h = TL + TS;
Chris@42 620 {
Chris@42 621 E T2w, T2z, T39, T3a;
Chris@42 622 T2w = T2u + T2v;
Chris@42 623 T2z = T2x - T2y;
Chris@42 624 T2A = FMA(KP923879532, T2w, KP382683432 * T2z);
Chris@42 625 T2N = FNMS(KP382683432, T2w, KP923879532 * T2z);
Chris@42 626 T39 = T2x + T2y;
Chris@42 627 T3a = T2v - T2u;
Chris@42 628 T3b = FNMS(KP923879532, T3a, KP382683432 * T39);
Chris@42 629 T3n = FMA(KP382683432, T3a, KP923879532 * T39);
Chris@42 630 }
Chris@42 631 }
Chris@42 632 {
Chris@42 633 E Tp, T2q, T10, T2r, Ts, T2n, TX, T2o, TU, T11;
Chris@42 634 {
Chris@42 635 E Tn, To, TY, TZ;
Chris@42 636 Tn = ci[0];
Chris@42 637 To = cr[WS(rs, 7)];
Chris@42 638 Tp = Tn + To;
Chris@42 639 T2q = Tn - To;
Chris@42 640 TY = ci[WS(rs, 12)];
Chris@42 641 TZ = cr[WS(rs, 11)];
Chris@42 642 T10 = TY - TZ;
Chris@42 643 T2r = TY + TZ;
Chris@42 644 }
Chris@42 645 {
Chris@42 646 E Tq, Tr, TV, TW;
Chris@42 647 Tq = cr[WS(rs, 3)];
Chris@42 648 Tr = ci[WS(rs, 4)];
Chris@42 649 Ts = Tq + Tr;
Chris@42 650 T2n = Tq - Tr;
Chris@42 651 TV = ci[WS(rs, 8)];
Chris@42 652 TW = cr[WS(rs, 15)];
Chris@42 653 TX = TV - TW;
Chris@42 654 T2o = TV + TW;
Chris@42 655 }
Chris@42 656 Tt = Tp + Ts;
Chris@42 657 T1U = TX + T10;
Chris@42 658 TU = Tp - Ts;
Chris@42 659 T11 = TX - T10;
Chris@42 660 T12 = TU + T11;
Chris@42 661 T1i = T11 - TU;
Chris@42 662 {
Chris@42 663 E T2p, T2s, T3c, T3d;
Chris@42 664 T2p = T2n - T2o;
Chris@42 665 T2s = T2q - T2r;
Chris@42 666 T2t = FNMS(KP382683432, T2s, KP923879532 * T2p);
Chris@42 667 T2O = FMA(KP382683432, T2p, KP923879532 * T2s);
Chris@42 668 T3c = T2q + T2r;
Chris@42 669 T3d = T2n + T2o;
Chris@42 670 T3e = FNMS(KP923879532, T3d, KP382683432 * T3c);
Chris@42 671 T3o = FMA(KP382683432, T3d, KP923879532 * T3c);
Chris@42 672 }
Chris@42 673 }
/* Outputs 0 and 8; cr[0] and ci[0] are plain sums with no twiddle factor applied. */
Chris@42 674 {
Chris@42 675 E Tf, Tu, T1O, T1S, T1V, T1W;
Chris@42 676 Tf = T7 + Te;
Chris@42 677 Tu = Tm + Tt;
Chris@42 678 T1O = Tf - Tu;
Chris@42 679 T1S = T1Q + T1R;
Chris@42 680 T1V = T1T + T1U;
Chris@42 681 T1W = T1S - T1V;
Chris@42 682 cr[0] = Tf + Tu;
Chris@42 683 ci[0] = T1S + T1V;
Chris@42 684 cr[WS(rs, 8)] = FNMS(T1P, T1W, T1N * T1O);
Chris@42 685 ci[WS(rs, 8)] = FMA(T1P, T1O, T1N * T1W);
Chris@42 686 }
Chris@42 687 {
Chris@42 688 E T3g, T3r, T3q, T3s;
Chris@42 689 {
Chris@42 690 E T38, T3f, T3m, T3p;
Chris@42 691 T38 = T36 - T37;
Chris@42 692 T3f = T3b + T3e;
Chris@42 693 T3g = T38 - T3f;
Chris@42 694 T3r = T38 + T3f;
Chris@42 695 T3m = T3k + T3l;
Chris@42 696 T3p = T3n - T3o;
Chris@42 697 T3q = T3m - T3p;
Chris@42 698 T3s = T3m + T3p;
Chris@42 699 }
Chris@42 700 cr[WS(rs, 11)] = FNMS(T3j, T3q, T35 * T3g);
Chris@42 701 ci[WS(rs, 11)] = FMA(T3j, T3g, T35 * T3q);
Chris@42 702 cr[WS(rs, 3)] = FNMS(T1n, T3s, T1l * T3r);
Chris@42 703 ci[WS(rs, 3)] = FMA(T1n, T3r, T1l * T3s);
Chris@42 704 }
Chris@42 705 {
Chris@42 706 E T3w, T3B, T3A, T3C;
Chris@42 707 {
Chris@42 708 E T3u, T3v, T3y, T3z;
Chris@42 709 T3u = T36 + T37;
Chris@42 710 T3v = T3n + T3o;
Chris@42 711 T3w = T3u - T3v;
Chris@42 712 T3B = T3u + T3v;
Chris@42 713 T3y = T3k - T3l;
Chris@42 714 T3z = T3b - T3e;
Chris@42 715 T3A = T3y + T3z;
Chris@42 716 T3C = T3y - T3z;
Chris@42 717 }
Chris@42 718 cr[WS(rs, 7)] = FNMS(T3x, T3A, T3t * T3w);
Chris@42 719 ci[WS(rs, 7)] = FMA(T3t, T3A, T3x * T3w);
Chris@42 720 cr[WS(rs, 15)] = FNMS(T1w, T3C, T1v * T3B);
Chris@42 721 ci[WS(rs, 15)] = FMA(T1v, T3C, T1w * T3B);
Chris@42 722 }
Chris@42 723 {
Chris@42 724 E T14, T1q, T1k, T1u;
Chris@42 725 {
Chris@42 726 E TK, T13, T1g, T1j;
Chris@42 727 TK = TC + TJ;
Chris@42 728 T13 = KP707106781 * (TT + T12);
Chris@42 729 T14 = TK - T13;
Chris@42 730 T1q = TK + T13;
Chris@42 731 T1g = T18 + T1f;
Chris@42 732 T1j = KP707106781 * (T1h + T1i);
Chris@42 733 T1k = T1g - T1j;
Chris@42 734 T1u = T1g + T1j;
Chris@42 735 }
Chris@42 736 cr[WS(rs, 10)] = FNMS(T17, T1k, TB * T14);
Chris@42 737 ci[WS(rs, 10)] = FMA(T17, T14, TB * T1k);
Chris@42 738 cr[WS(rs, 2)] = FNMS(T1t, T1u, T1p * T1q);
Chris@42 739 ci[WS(rs, 2)] = FMA(T1t, T1q, T1p * T1u);
Chris@42 740 }
Chris@42 741 {
Chris@42 742 E T1A, T1I, T1E, T1M;
Chris@42 743 {
Chris@42 744 E T1y, T1z, T1C, T1D;
Chris@42 745 T1y = TC - TJ;
Chris@42 746 T1z = KP707106781 * (T1i - T1h);
Chris@42 747 T1A = T1y - T1z;
Chris@42 748 T1I = T1y + T1z;
Chris@42 749 T1C = T1f - T18;
Chris@42 750 T1D = KP707106781 * (TT - T12);
Chris@42 751 T1E = T1C - T1D;
Chris@42 752 T1M = T1C + T1D;
Chris@42 753 }
Chris@42 754 cr[WS(rs, 14)] = FNMS(T1B, T1E, T1x * T1A);
Chris@42 755 ci[WS(rs, 14)] = FMA(T1x, T1E, T1B * T1A);
Chris@42 756 cr[WS(rs, 6)] = FNMS(T1L, T1M, T1H * T1I);
Chris@42 757 ci[WS(rs, 6)] = FMA(T1H, T1M, T1L * T1I);
Chris@42 758 }
Chris@42 759 {
Chris@42 760 E T2C, T2S, T2Q, T2U;
Chris@42 761 {
Chris@42 762 E T2m, T2B, T2M, T2P;
Chris@42 763 T2m = T2e - T2l;
Chris@42 764 T2B = T2t - T2A;
Chris@42 765 T2C = T2m - T2B;
Chris@42 766 T2S = T2m + T2B;
Chris@42 767 T2M = T2I - T2L;
Chris@42 768 T2P = T2N - T2O;
Chris@42 769 T2Q = T2M - T2P;
Chris@42 770 T2U = T2M + T2P;
Chris@42 771 }
Chris@42 772 cr[WS(rs, 13)] = FNMS(T2F, T2Q, T2b * T2C);
Chris@42 773 ci[WS(rs, 13)] = FMA(T2F, T2C, T2b * T2Q);
Chris@42 774 cr[WS(rs, 5)] = FNMS(T2T, T2U, T2R * T2S);
Chris@42 775 ci[WS(rs, 5)] = FMA(T2T, T2S, T2R * T2U);
Chris@42 776 }
Chris@42 777 {
Chris@42 778 E T2X, T31, T30, T32;
Chris@42 779 {
Chris@42 780 E T2V, T2W, T2Y, T2Z;
Chris@42 781 T2V = T2e + T2l;
Chris@42 782 T2W = T2N + T2O;
Chris@42 783 T2X = T2V - T2W;
Chris@42 784 T31 = T2V + T2W;
Chris@42 785 T2Y = T2I + T2L;
Chris@42 786 T2Z = T2A + T2t;
Chris@42 787 T30 = T2Y - T2Z;
Chris@42 788 T32 = T2Y + T2Z;
Chris@42 789 }
Chris@42 790 cr[WS(rs, 9)] = FNMS(Tz, T30, Tw * T2X);
Chris@42 791 ci[WS(rs, 9)] = FMA(Tw, T30, Tz * T2X);
Chris@42 792 cr[WS(rs, 1)] = FNMS(Ty, T32, Tv * T31);
Chris@42 793 ci[WS(rs, 1)] = FMA(Tv, T32, Ty * T31);
Chris@42 794 }
Chris@42 795 {
Chris@42 796 E T20, T26, T24, T28;
Chris@42 797 {
Chris@42 798 E T1Y, T1Z, T22, T23;
Chris@42 799 T1Y = T7 - Te;
Chris@42 800 T1Z = T1U - T1T;
Chris@42 801 T20 = T1Y - T1Z;
Chris@42 802 T26 = T1Y + T1Z;
Chris@42 803 T22 = T1Q - T1R;
Chris@42 804 T23 = Tm - Tt;
Chris@42 805 T24 = T22 - T23;
Chris@42 806 T28 = T23 + T22;
Chris@42 807 }
Chris@42 808 cr[WS(rs, 12)] = FNMS(T21, T24, T1X * T20);
Chris@42 809 ci[WS(rs, 12)] = FMA(T1X, T24, T21 * T20);
Chris@42 810 cr[WS(rs, 4)] = FNMS(T27, T28, T25 * T26);
Chris@42 811 ci[WS(rs, 4)] = FMA(T25, T28, T27 * T26);
Chris@42 812 }
Chris@42 813 }
Chris@42 814 }
Chris@42 815 }
Chris@42 816 }
Chris@42 817
/*
 * Twiddle instruction table referenced by the non-FMA codelet descriptor:
 * four TW_CEXP entries whose last field is 1, 3, 9 and 15, followed by a
 * TW_NEXT entry that ends the list.
 * NOTE(review): tw_instr, TW_CEXP and TW_NEXT are defined outside this file;
 * presumably each TW_CEXP entry supplies one (real, imaginary) pair, which
 * would match the eight W values hb2_16 reads per iteration -- confirm.
 */
Chris@42 818 static const tw_instr twinstr[] = {
Chris@42 819 {TW_CEXP, 1, 1},
Chris@42 820 {TW_CEXP, 1, 3},
Chris@42 821 {TW_CEXP, 1, 9},
Chris@42 822 {TW_CEXP, 1, 15},
Chris@42 823 {TW_NEXT, 1, 0}
Chris@42 824 };
Chris@42 825
/*
 * Descriptor for the non-FMA codelet: 16, the name "hb2_16", the twiddle
 * table, &GENUS, and the counts {156, 68, 40, 0}. The first three counts match
 * the generated comment above the function (156 additions, 68
 * multiplications, 40 fused multiply/adds).
 * NOTE(review): hc2hc_desc and GENUS are defined outside this file; the
 * meaning of the fourth count is not shown here.
 */
Chris@42 826 static const hc2hc_desc desc = { 16, "hb2_16", twinstr, &GENUS, {156, 68, 40, 0} };
Chris@42 827
/*
 * Registration entry point (non-FMA build): passes the planner p, the
 * hb2_16 kernel and its descriptor to X(khc2hc_register).
 * NOTE(review): X() and planner are defined outside this file.
 */
Chris@42 828 void X(codelet_hb2_16) (planner *p) {
Chris@42 829 X(khc2hc_register) (p, hb2_16, &desc);
Chris@42 830 }
Chris@42 831 #endif /* HAVE_FMA */