annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:32 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@82 32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
Chris@82 33 * 63 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb_16, FMA-oriented variant (this branch is compiled when ARCH_PREFERS_FMA
 * or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Size-16 hc2hc codelet emitted by genfft (gen_hc2hc -n 16 -dif -sign 1).
 * Each loop iteration reads cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15,
 * runs them through the butterfly network below, and stores all 32 values
 * back into the same locations.  For k = 1..15 the result pair written to
 * (cr[k], ci[k]) is complex-multiplied by (W[2k-2], W[2k-1]); the k = 0
 * pair is stored without a twiddle multiply.
 *
 *   cr, ci - data arrays, read and written in place; cr steps forward by ms
 *            and ci steps backward by ms after every iteration
 *   W      - twiddle table; offset by (mb - 1) * 30 before the first
 *            iteration and advanced by 30 entries per iteration
 *   rs     - stride passed to WS() to index the 16 elements
 *   mb, me - half-open iteration range [mb, me)
 *   ms     - per-iteration pointer step for cr / ci
 */
Chris@82 37 static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric values: ~cos(pi/8), ~sqrt(1/2), ~tan(pi/8) = sqrt(2) - 1. */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 42 {
Chris@82 43 INT m;
/* One 16-point butterfly per m; the increment clause moves cr, ci and W together. */
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 45 E TA, T1O, T21, T1h, T2P, T2S, T3b, T3p, T3q, T3D, T1k, T1P, Tf, T3y, T2A;
Chris@82 46 E T36, TL, T22, T3s, T3t, T3z, T2F, T2U, T2K, T2V, Tu, T3E, TX, T1n, T1T;
Chris@82 47 E T24, T1W, T25, T18, T1m;
/* First input group: even-indexed cr entries and odd-indexed ci entries. */
Chris@82 48 {
Chris@82 49 E T3, Tw, TJ, T2x, T1g, T2Q, T6, T1d, Ta, TB, Tz, T2R, TE, T2y, Td;
Chris@82 50 E TG;
Chris@82 51 {
Chris@82 52 E T1, T2, TH, TI;
Chris@82 53 T1 = cr[0];
Chris@82 54 T2 = ci[WS(rs, 7)];
Chris@82 55 T3 = T1 + T2;
Chris@82 56 Tw = T1 - T2;
Chris@82 57 TH = ci[WS(rs, 9)];
Chris@82 58 TI = cr[WS(rs, 14)];
Chris@82 59 TJ = TH + TI;
Chris@82 60 T2x = TH - TI;
Chris@82 61 }
Chris@82 62 {
Chris@82 63 E T1e, T1f, T4, T5;
Chris@82 64 T1e = ci[WS(rs, 15)];
Chris@82 65 T1f = cr[WS(rs, 8)];
Chris@82 66 T1g = T1e + T1f;
Chris@82 67 T2Q = T1e - T1f;
Chris@82 68 T4 = cr[WS(rs, 4)];
Chris@82 69 T5 = ci[WS(rs, 3)];
Chris@82 70 T6 = T4 + T5;
Chris@82 71 T1d = T4 - T5;
Chris@82 72 }
Chris@82 73 {
Chris@82 74 E T8, T9, Tx, Ty;
Chris@82 75 T8 = cr[WS(rs, 2)];
Chris@82 76 T9 = ci[WS(rs, 5)];
Chris@82 77 Ta = T8 + T9;
Chris@82 78 TB = T8 - T9;
Chris@82 79 Tx = ci[WS(rs, 11)];
Chris@82 80 Ty = cr[WS(rs, 12)];
Chris@82 81 Tz = Tx + Ty;
Chris@82 82 T2R = Tx - Ty;
Chris@82 83 }
Chris@82 84 {
Chris@82 85 E TC, TD, Tb, Tc;
Chris@82 86 TC = ci[WS(rs, 13)];
Chris@82 87 TD = cr[WS(rs, 10)];
Chris@82 88 TE = TC + TD;
Chris@82 89 T2y = TC - TD;
Chris@82 90 Tb = ci[WS(rs, 1)];
Chris@82 91 Tc = cr[WS(rs, 6)];
Chris@82 92 Td = Tb + Tc;
Chris@82 93 TG = Tb - Tc;
Chris@82 94 }
Chris@82 95 TA = Tw - Tz;
Chris@82 96 T1O = Tw + Tz;
Chris@82 97 T21 = T1g - T1d;
Chris@82 98 T1h = T1d + T1g;
Chris@82 99 T2P = Ta - Td;
Chris@82 100 T2S = T2Q - T2R;
Chris@82 101 T3b = T2S - T2P;
Chris@82 102 {
Chris@82 103 E T1i, T1j, T7, Te;
Chris@82 104 T3p = T2Q + T2R;
Chris@82 105 T3q = T2y + T2x;
Chris@82 106 T3D = T3p - T3q;
Chris@82 107 T1i = TB + TE;
Chris@82 108 T1j = TG + TJ;
Chris@82 109 T1k = T1i - T1j;
Chris@82 110 T1P = T1i + T1j;
Chris@82 111 T7 = T3 + T6;
Chris@82 112 Te = Ta + Td;
Chris@82 113 Tf = T7 + Te;
Chris@82 114 T3y = T7 - Te;
Chris@82 115 {
Chris@82 116 E T2w, T2z, TF, TK;
Chris@82 117 T2w = T3 - T6;
Chris@82 118 T2z = T2x - T2y;
Chris@82 119 T2A = T2w + T2z;
Chris@82 120 T36 = T2w - T2z;
Chris@82 121 TF = TB - TE;
Chris@82 122 TK = TG - TJ;
Chris@82 123 TL = TF + TK;
Chris@82 124 T22 = TF - TK;
Chris@82 125 }
Chris@82 126 }
Chris@82 127 }
/* Second input group: odd-indexed cr entries and even-indexed ci entries. */
Chris@82 128 {
Chris@82 129 E Ti, T13, T11, T2C, T16, T2D, Tl, TY, Tp, TS, TQ, T2H, TV, T2I, Ts;
Chris@82 130 E TN, T2B, T2E;
Chris@82 131 {
Chris@82 132 E Tg, Th, TZ, T10;
Chris@82 133 Tg = cr[WS(rs, 1)];
Chris@82 134 Th = ci[WS(rs, 6)];
Chris@82 135 Ti = Tg + Th;
Chris@82 136 T13 = Tg - Th;
Chris@82 137 TZ = ci[WS(rs, 14)];
Chris@82 138 T10 = cr[WS(rs, 9)];
Chris@82 139 T11 = TZ + T10;
Chris@82 140 T2C = TZ - T10;
Chris@82 141 }
Chris@82 142 {
Chris@82 143 E T14, T15, Tj, Tk;
Chris@82 144 T14 = ci[WS(rs, 10)];
Chris@82 145 T15 = cr[WS(rs, 13)];
Chris@82 146 T16 = T14 + T15;
Chris@82 147 T2D = T14 - T15;
Chris@82 148 Tj = cr[WS(rs, 5)];
Chris@82 149 Tk = ci[WS(rs, 2)];
Chris@82 150 Tl = Tj + Tk;
Chris@82 151 TY = Tj - Tk;
Chris@82 152 }
Chris@82 153 {
Chris@82 154 E Tn, To, TO, TP;
Chris@82 155 Tn = ci[0];
Chris@82 156 To = cr[WS(rs, 7)];
Chris@82 157 Tp = Tn + To;
Chris@82 158 TS = Tn - To;
Chris@82 159 TO = ci[WS(rs, 8)];
Chris@82 160 TP = cr[WS(rs, 15)];
Chris@82 161 TQ = TO + TP;
Chris@82 162 T2H = TO - TP;
Chris@82 163 }
Chris@82 164 {
Chris@82 165 E TT, TU, Tq, Tr;
Chris@82 166 TT = ci[WS(rs, 12)];
Chris@82 167 TU = cr[WS(rs, 11)];
Chris@82 168 TV = TT + TU;
Chris@82 169 T2I = TT - TU;
Chris@82 170 Tq = cr[WS(rs, 3)];
Chris@82 171 Tr = ci[WS(rs, 4)];
Chris@82 172 Ts = Tq + Tr;
Chris@82 173 TN = Tq - Tr;
Chris@82 174 }
Chris@82 175 T3s = T2C + T2D;
Chris@82 176 T3t = T2H + T2I;
Chris@82 177 T3z = T3t - T3s;
Chris@82 178 T2B = Ti - Tl;
Chris@82 179 T2E = T2C - T2D;
Chris@82 180 T2F = T2B - T2E;
Chris@82 181 T2U = T2B + T2E;
Chris@82 182 {
Chris@82 183 E T2G, T2J, Tm, Tt;
Chris@82 184 T2G = Tp - Ts;
Chris@82 185 T2J = T2H - T2I;
Chris@82 186 T2K = T2G + T2J;
Chris@82 187 T2V = T2J - T2G;
Chris@82 188 Tm = Ti + Tl;
Chris@82 189 Tt = Tp + Ts;
Chris@82 190 Tu = Tm + Tt;
Chris@82 191 T3E = Tm - Tt;
Chris@82 192 }
Chris@82 193 {
Chris@82 194 E TR, TW, T1R, T1S;
Chris@82 195 TR = TN - TQ;
Chris@82 196 TW = TS - TV;
Chris@82 197 TX = FNMS(KP414213562, TW, TR);
Chris@82 198 T1n = FMA(KP414213562, TR, TW);
Chris@82 199 T1R = T11 - TY;
Chris@82 200 T1S = T13 + T16;
Chris@82 201 T1T = FNMS(KP414213562, T1S, T1R);
Chris@82 202 T24 = FMA(KP414213562, T1R, T1S);
Chris@82 203 }
Chris@82 204 {
Chris@82 205 E T1U, T1V, T12, T17;
Chris@82 206 T1U = TN + TQ;
Chris@82 207 T1V = TS + TV;
Chris@82 208 T1W = FNMS(KP414213562, T1V, T1U);
Chris@82 209 T25 = FMA(KP414213562, T1U, T1V);
Chris@82 210 T12 = TY + T11;
Chris@82 211 T17 = T13 - T16;
Chris@82 212 T18 = FMA(KP414213562, T17, T12);
Chris@82 213 T1m = FNMS(KP414213562, T12, T17);
Chris@82 214 }
Chris@82 215 }
/* Outputs 0 and 8: index 0 is stored untwiddled, index 8 uses W[14], W[15]. */
Chris@82 216 cr[0] = Tf + Tu;
Chris@82 217 {
Chris@82 218 E T3r, T3u, T3v, T3l, T3n, T3o, T3w, T3m;
Chris@82 219 T3r = T3p + T3q;
Chris@82 220 T3u = T3s + T3t;
Chris@82 221 T3v = T3r - T3u;
Chris@82 222 T3m = Tf - Tu;
Chris@82 223 T3l = W[14];
Chris@82 224 T3n = T3l * T3m;
Chris@82 225 T3o = W[15];
Chris@82 226 T3w = T3o * T3m;
Chris@82 227 ci[0] = T3r + T3u;
Chris@82 228 ci[WS(rs, 8)] = FMA(T3l, T3v, T3w);
Chris@82 229 cr[WS(rs, 8)] = FNMS(T3o, T3v, T3n);
Chris@82 230 }
/* Output 12 (W[22], W[23]). */
Chris@82 231 {
Chris@82 232 E T3A, T3F, T3B, T3G, T3x, T3C;
Chris@82 233 T3A = T3y - T3z;
Chris@82 234 T3F = T3D - T3E;
Chris@82 235 T3x = W[22];
Chris@82 236 T3B = T3x * T3A;
Chris@82 237 T3G = T3x * T3F;
Chris@82 238 T3C = W[23];
Chris@82 239 cr[WS(rs, 12)] = FNMS(T3C, T3F, T3B);
Chris@82 240 ci[WS(rs, 12)] = FMA(T3C, T3A, T3G);
Chris@82 241 }
/* Output 4 (W[6], W[7]). */
Chris@82 242 {
Chris@82 243 E T3I, T3L, T3J, T3M, T3H, T3K;
Chris@82 244 T3I = T3y + T3z;
Chris@82 245 T3L = T3E + T3D;
Chris@82 246 T3H = W[6];
Chris@82 247 T3J = T3H * T3I;
Chris@82 248 T3M = T3H * T3L;
Chris@82 249 T3K = W[7];
Chris@82 250 cr[WS(rs, 4)] = FNMS(T3K, T3L, T3J);
Chris@82 251 ci[WS(rs, 4)] = FMA(T3K, T3I, T3M);
Chris@82 252 }
/* Outputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@82 253 {
Chris@82 254 E T38, T3g, T3d, T3j, T37, T3c;
Chris@82 255 T37 = T2V - T2U;
Chris@82 256 T38 = FNMS(KP707106781, T37, T36);
Chris@82 257 T3g = FMA(KP707106781, T37, T36);
Chris@82 258 T3c = T2F - T2K;
Chris@82 259 T3d = FNMS(KP707106781, T3c, T3b);
Chris@82 260 T3j = FMA(KP707106781, T3c, T3b);
Chris@82 261 {
Chris@82 262 E T39, T3e, T35, T3a;
Chris@82 263 T35 = W[26];
Chris@82 264 T39 = T35 * T38;
Chris@82 265 T3e = T35 * T3d;
Chris@82 266 T3a = W[27];
Chris@82 267 cr[WS(rs, 14)] = FNMS(T3a, T3d, T39);
Chris@82 268 ci[WS(rs, 14)] = FMA(T3a, T38, T3e);
Chris@82 269 }
Chris@82 270 {
Chris@82 271 E T3h, T3k, T3f, T3i;
Chris@82 272 T3f = W[10];
Chris@82 273 T3h = T3f * T3g;
Chris@82 274 T3k = T3f * T3j;
Chris@82 275 T3i = W[11];
Chris@82 276 cr[WS(rs, 6)] = FNMS(T3i, T3j, T3h);
Chris@82 277 ci[WS(rs, 6)] = FMA(T3i, T3g, T3k);
Chris@82 278 }
Chris@82 279 }
/* Outputs 10 (W[18], W[19]) and 2 (W[2], W[3]). */
Chris@82 280 {
Chris@82 281 E T2M, T30, T2X, T33, T2L, T2T, T2W;
Chris@82 282 T2L = T2F + T2K;
Chris@82 283 T2M = FNMS(KP707106781, T2L, T2A);
Chris@82 284 T30 = FMA(KP707106781, T2L, T2A);
Chris@82 285 T2T = T2P + T2S;
Chris@82 286 T2W = T2U + T2V;
Chris@82 287 T2X = FNMS(KP707106781, T2W, T2T);
Chris@82 288 T33 = FMA(KP707106781, T2W, T2T);
Chris@82 289 {
Chris@82 290 E T2v, T2N, T2O, T2Y;
Chris@82 291 T2v = W[18];
Chris@82 292 T2N = T2v * T2M;
Chris@82 293 T2O = W[19];
Chris@82 294 T2Y = T2O * T2M;
Chris@82 295 cr[WS(rs, 10)] = FNMS(T2O, T2X, T2N);
Chris@82 296 ci[WS(rs, 10)] = FMA(T2v, T2X, T2Y);
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E T2Z, T31, T32, T34;
Chris@82 300 T2Z = W[2];
Chris@82 301 T31 = T2Z * T30;
Chris@82 302 T32 = W[3];
Chris@82 303 T34 = T32 * T30;
Chris@82 304 cr[WS(rs, 2)] = FNMS(T32, T33, T31);
Chris@82 305 ci[WS(rs, 2)] = FMA(T2Z, T33, T34);
Chris@82 306 }
Chris@82 307 }
/* Outputs 11 (W[20], W[21]) and 3 (W[4], W[5]). */
Chris@82 308 {
Chris@82 309 E T1Y, T2a, T27, T2d;
Chris@82 310 {
Chris@82 311 E T1Q, T1X, T23, T26;
Chris@82 312 T1Q = FNMS(KP707106781, T1P, T1O);
Chris@82 313 T1X = T1T + T1W;
Chris@82 314 T1Y = FMA(KP923879532, T1X, T1Q);
Chris@82 315 T2a = FNMS(KP923879532, T1X, T1Q);
Chris@82 316 T23 = FMA(KP707106781, T22, T21);
Chris@82 317 T26 = T24 - T25;
Chris@82 318 T27 = FNMS(KP923879532, T26, T23);
Chris@82 319 T2d = FMA(KP923879532, T26, T23);
Chris@82 320 }
Chris@82 321 {
Chris@82 322 E T1N, T1Z, T20, T28;
Chris@82 323 T1N = W[20];
Chris@82 324 T1Z = T1N * T1Y;
Chris@82 325 T20 = W[21];
Chris@82 326 T28 = T20 * T1Y;
Chris@82 327 cr[WS(rs, 11)] = FNMS(T20, T27, T1Z);
Chris@82 328 ci[WS(rs, 11)] = FMA(T1N, T27, T28);
Chris@82 329 }
Chris@82 330 {
Chris@82 331 E T29, T2b, T2c, T2e;
Chris@82 332 T29 = W[4];
Chris@82 333 T2b = T29 * T2a;
Chris@82 334 T2c = W[5];
Chris@82 335 T2e = T2c * T2a;
Chris@82 336 cr[WS(rs, 3)] = FNMS(T2c, T2d, T2b);
Chris@82 337 ci[WS(rs, 3)] = FMA(T29, T2d, T2e);
Chris@82 338 }
Chris@82 339 }
/* Outputs 13 (W[24], W[25]) and 5 (W[8], W[9]). */
Chris@82 340 {
Chris@82 341 E T1a, T1s, T1p, T1v;
Chris@82 342 {
Chris@82 343 E TM, T19, T1l, T1o;
Chris@82 344 TM = FNMS(KP707106781, TL, TA);
Chris@82 345 T19 = TX - T18;
Chris@82 346 T1a = FNMS(KP923879532, T19, TM);
Chris@82 347 T1s = FMA(KP923879532, T19, TM);
Chris@82 348 T1l = FNMS(KP707106781, T1k, T1h);
Chris@82 349 T1o = T1m - T1n;
Chris@82 350 T1p = FNMS(KP923879532, T1o, T1l);
Chris@82 351 T1v = FMA(KP923879532, T1o, T1l);
Chris@82 352 }
Chris@82 353 {
Chris@82 354 E Tv, T1b, T1c, T1q;
Chris@82 355 Tv = W[24];
Chris@82 356 T1b = Tv * T1a;
Chris@82 357 T1c = W[25];
Chris@82 358 T1q = T1c * T1a;
Chris@82 359 cr[WS(rs, 13)] = FNMS(T1c, T1p, T1b);
Chris@82 360 ci[WS(rs, 13)] = FMA(Tv, T1p, T1q);
Chris@82 361 }
Chris@82 362 {
Chris@82 363 E T1r, T1t, T1u, T1w;
Chris@82 364 T1r = W[8];
Chris@82 365 T1t = T1r * T1s;
Chris@82 366 T1u = W[9];
Chris@82 367 T1w = T1u * T1s;
Chris@82 368 cr[WS(rs, 5)] = FNMS(T1u, T1v, T1t);
Chris@82 369 ci[WS(rs, 5)] = FMA(T1r, T1v, T1w);
Chris@82 370 }
Chris@82 371 }
/* Outputs 7 (W[12], W[13]) and 15 (W[28], W[29]). */
Chris@82 372 {
Chris@82 373 E T2i, T2q, T2n, T2t;
Chris@82 374 {
Chris@82 375 E T2g, T2h, T2l, T2m;
Chris@82 376 T2g = FMA(KP707106781, T1P, T1O);
Chris@82 377 T2h = T24 + T25;
Chris@82 378 T2i = FNMS(KP923879532, T2h, T2g);
Chris@82 379 T2q = FMA(KP923879532, T2h, T2g);
Chris@82 380 T2l = FNMS(KP707106781, T22, T21);
Chris@82 381 T2m = T1W - T1T;
Chris@82 382 T2n = FMA(KP923879532, T2m, T2l);
Chris@82 383 T2t = FNMS(KP923879532, T2m, T2l);
Chris@82 384 }
Chris@82 385 {
Chris@82 386 E T2j, T2o, T2f, T2k;
Chris@82 387 T2f = W[12];
Chris@82 388 T2j = T2f * T2i;
Chris@82 389 T2o = T2f * T2n;
Chris@82 390 T2k = W[13];
Chris@82 391 cr[WS(rs, 7)] = FNMS(T2k, T2n, T2j);
Chris@82 392 ci[WS(rs, 7)] = FMA(T2k, T2i, T2o);
Chris@82 393 }
Chris@82 394 {
Chris@82 395 E T2r, T2u, T2p, T2s;
Chris@82 396 T2p = W[28];
Chris@82 397 T2r = T2p * T2q;
Chris@82 398 T2u = T2p * T2t;
Chris@82 399 T2s = W[29];
Chris@82 400 cr[WS(rs, 15)] = FNMS(T2s, T2t, T2r);
Chris@82 401 ci[WS(rs, 15)] = FMA(T2s, T2q, T2u);
Chris@82 402 }
Chris@82 403 }
/* Outputs 9 (W[16], W[17]) and 1 (W[0], W[1]). */
Chris@82 404 {
Chris@82 405 E T1A, T1I, T1F, T1L;
Chris@82 406 {
Chris@82 407 E T1y, T1z, T1D, T1E;
Chris@82 408 T1y = FMA(KP707106781, TL, TA);
Chris@82 409 T1z = T1m + T1n;
Chris@82 410 T1A = FNMS(KP923879532, T1z, T1y);
Chris@82 411 T1I = FMA(KP923879532, T1z, T1y);
Chris@82 412 T1D = FMA(KP707106781, T1k, T1h);
Chris@82 413 T1E = T18 + TX;
Chris@82 414 T1F = FNMS(KP923879532, T1E, T1D);
Chris@82 415 T1L = FMA(KP923879532, T1E, T1D);
Chris@82 416 }
Chris@82 417 {
Chris@82 418 E T1B, T1G, T1x, T1C;
Chris@82 419 T1x = W[16];
Chris@82 420 T1B = T1x * T1A;
Chris@82 421 T1G = T1x * T1F;
Chris@82 422 T1C = W[17];
Chris@82 423 cr[WS(rs, 9)] = FNMS(T1C, T1F, T1B);
Chris@82 424 ci[WS(rs, 9)] = FMA(T1C, T1A, T1G);
Chris@82 425 }
Chris@82 426 {
Chris@82 427 E T1J, T1M, T1H, T1K;
Chris@82 428 T1H = W[0];
Chris@82 429 T1J = T1H * T1I;
Chris@82 430 T1M = T1H * T1L;
Chris@82 431 T1K = W[1];
Chris@82 432 cr[WS(rs, 1)] = FNMS(T1K, T1L, T1J);
Chris@82 433 ci[WS(rs, 1)] = FMA(T1K, T1I, T1M);
Chris@82 434 }
Chris@82 435 }
Chris@82 436 }
Chris@82 437 }
Chris@82 438 }
Chris@82 439
/*
 * Twiddle instruction list referenced by desc: one TW_FULL entry with
 * operands (1, 16) followed by a TW_NEXT entry with operands (1, 0).
 * NOTE(review): presumably TW_FULL requests the complete twiddle set for
 * radix 16 and TW_NEXT ends the list; confirm against the tw_instr
 * definition in the FFTW kernel headers.
 */
Chris@82 440 static const tw_instr twinstr[] = {
Chris@82 441 {TW_FULL, 1, 16},
Chris@82 442 {TW_NEXT, 1, 0}
Chris@82 443 };
Chris@82 444
/*
 * Descriptor handed to the planner for the FMA variant: size 16, the name
 * string "hb_16", the twinstr table, &GENUS, and the counts {104, 30, 70, 0},
 * whose first three values equal the additions / multiplications / fused
 * multiply-adds quoted in the generator comment for this branch.
 * NOTE(review): the meaning of the fourth count is not visible here.
 */
Chris@82 445 static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {104, 30, 70, 0} };
Chris@82 446
/* Registration entry point: hands hb_16 and its descriptor to planner p via X(khc2hc_register). */
Chris@82 447 void X(codelet_hb_16) (planner *p) {
Chris@82 448 X(khc2hc_register) (p, hb_16, &desc);
Chris@82 449 }
Chris@82 450 #else
Chris@82 451
Chris@82 452 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include rdft/scalar/hb.h */
Chris@82 453
Chris@82 454 /*
Chris@82 455 * This function contains 174 FP additions, 84 FP multiplications,
Chris@82 456 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
Chris@82 457 * 50 stack variables, 3 constants, and 64 memory accesses
Chris@82 458 */
Chris@82 459 #include "rdft/scalar/hb.h"
Chris@82 460
/*
 * hb_16, non-FMA variant (this branch is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Size-16 hc2hc codelet emitted by genfft (gen_hc2hc -n 16 -dif -sign 1).
 * Each loop iteration reads cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15,
 * runs them through the butterfly network below, and stores all 32 values
 * back into the same locations.  For k = 1..15 the result pair written to
 * (cr[k], ci[k]) is complex-multiplied by (W[2k-2], W[2k-1]); the k = 0
 * pair is stored without a twiddle multiply.
 *
 *   cr, ci - data arrays, read and written in place; cr steps forward by ms
 *            and ci steps backward by ms after every iteration
 *   W      - twiddle table; offset by (mb - 1) * 30 before the first
 *            iteration and advanced by 30 entries per iteration
 *   rs     - stride passed to WS() to index the 16 elements
 *   mb, me - half-open iteration range [mb, me)
 *   ms     - per-iteration pointer step for cr / ci
 */
Chris@82 461 static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 462 {
/* Numeric values: ~sin(pi/8), ~cos(pi/8), ~sqrt(1/2). */
Chris@82 463 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 464 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 465 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 466 {
Chris@82 467 INT m;
/* One 16-point butterfly per m; the increment clause moves cr, ci and W together. */
Chris@82 468 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 469 E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
Chris@82 470 E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
Chris@82 471 E T2S, T30;
/* Input group 1: cr[0], cr[4], cr[8], cr[12] with ci[3], ci[7], ci[11], ci[15]. */
Chris@82 472 {
Chris@82 473 E T3, T1Q, T16, T1R, T6, T2i, T13, T2j;
Chris@82 474 {
Chris@82 475 E T1, T2, T14, T15;
Chris@82 476 T1 = cr[0];
Chris@82 477 T2 = ci[WS(rs, 7)];
Chris@82 478 T3 = T1 + T2;
Chris@82 479 T1Q = T1 - T2;
Chris@82 480 T14 = ci[WS(rs, 11)];
Chris@82 481 T15 = cr[WS(rs, 12)];
Chris@82 482 T16 = T14 - T15;
Chris@82 483 T1R = T14 + T15;
Chris@82 484 }
Chris@82 485 {
Chris@82 486 E T4, T5, T11, T12;
Chris@82 487 T4 = cr[WS(rs, 4)];
Chris@82 488 T5 = ci[WS(rs, 3)];
Chris@82 489 T6 = T4 + T5;
Chris@82 490 T2i = T4 - T5;
Chris@82 491 T11 = ci[WS(rs, 15)];
Chris@82 492 T12 = cr[WS(rs, 8)];
Chris@82 493 T13 = T11 - T12;
Chris@82 494 T2j = T11 + T12;
Chris@82 495 }
Chris@82 496 T7 = T3 + T6;
Chris@82 497 T2K = T1Q + T1R;
Chris@82 498 T2W = T2j - T2i;
Chris@82 499 Tw = T3 - T6;
Chris@82 500 T17 = T13 - T16;
Chris@82 501 T1S = T1Q - T1R;
Chris@82 502 T2k = T2i + T2j;
Chris@82 503 T1w = T13 + T16;
Chris@82 504 }
/* Input group 2: cr[2], cr[6], cr[10], cr[14] with ci[1], ci[5], ci[9], ci[13]. */
Chris@82 505 {
Chris@82 506 E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
Chris@82 507 {
Chris@82 508 E T8, T9, TA, TB;
Chris@82 509 T8 = cr[WS(rs, 2)];
Chris@82 510 T9 = ci[WS(rs, 5)];
Chris@82 511 Ta = T8 + T9;
Chris@82 512 T1T = T8 - T9;
Chris@82 513 TA = ci[WS(rs, 13)];
Chris@82 514 TB = cr[WS(rs, 10)];
Chris@82 515 TC = TA - TB;
Chris@82 516 T1U = TA + TB;
Chris@82 517 }
Chris@82 518 {
Chris@82 519 E Tb, Tc, Tx, Ty;
Chris@82 520 Tb = ci[WS(rs, 1)];
Chris@82 521 Tc = cr[WS(rs, 6)];
Chris@82 522 Td = Tb + Tc;
Chris@82 523 T1W = Tb - Tc;
Chris@82 524 Tx = ci[WS(rs, 9)];
Chris@82 525 Ty = cr[WS(rs, 14)];
Chris@82 526 Tz = Tx - Ty;
Chris@82 527 T1X = Tx + Ty;
Chris@82 528 }
Chris@82 529 Te = Ta + Td;
Chris@82 530 TD = Tz - TC;
Chris@82 531 T1x = TC + Tz;
Chris@82 532 T10 = Ta - Td;
Chris@82 533 {
Chris@82 534 E T2l, T2m, T1V, T1Y;
Chris@82 535 T2l = T1T + T1U;
Chris@82 536 T2m = T1W + T1X;
Chris@82 537 T2n = KP707106781 * (T2l - T2m);
Chris@82 538 T2L = KP707106781 * (T2l + T2m);
Chris@82 539 T1V = T1T - T1U;
Chris@82 540 T1Y = T1W - T1X;
Chris@82 541 T1Z = KP707106781 * (T1V + T1Y);
Chris@82 542 T2X = KP707106781 * (T1V - T1Y);
Chris@82 543 }
Chris@82 544 }
/* Input group 3: cr[1], cr[5], cr[9], cr[13] with ci[2], ci[6], ci[10], ci[14]. */
Chris@82 545 {
Chris@82 546 E Ti, T2b, TL, T2c, Tl, T28, TI, T29, TF, TM;
Chris@82 547 {
Chris@82 548 E Tg, Th, TJ, TK;
Chris@82 549 Tg = cr[WS(rs, 1)];
Chris@82 550 Th = ci[WS(rs, 6)];
Chris@82 551 Ti = Tg + Th;
Chris@82 552 T2b = Tg - Th;
Chris@82 553 TJ = ci[WS(rs, 10)];
Chris@82 554 TK = cr[WS(rs, 13)];
Chris@82 555 TL = TJ - TK;
Chris@82 556 T2c = TJ + TK;
Chris@82 557 }
Chris@82 558 {
Chris@82 559 E Tj, Tk, TG, TH;
Chris@82 560 Tj = cr[WS(rs, 5)];
Chris@82 561 Tk = ci[WS(rs, 2)];
Chris@82 562 Tl = Tj + Tk;
Chris@82 563 T28 = Tj - Tk;
Chris@82 564 TG = ci[WS(rs, 14)];
Chris@82 565 TH = cr[WS(rs, 9)];
Chris@82 566 TI = TG - TH;
Chris@82 567 T29 = TG + TH;
Chris@82 568 }
Chris@82 569 Tm = Ti + Tl;
Chris@82 570 T1z = TI + TL;
Chris@82 571 TF = Ti - Tl;
Chris@82 572 TM = TI - TL;
Chris@82 573 TN = TF - TM;
Chris@82 574 T19 = TF + TM;
Chris@82 575 {
Chris@82 576 E T2a, T2d, T2N, T2O;
Chris@82 577 T2a = T28 + T29;
Chris@82 578 T2d = T2b - T2c;
Chris@82 579 T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
Chris@82 580 T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
Chris@82 581 T2N = T2b + T2c;
Chris@82 582 T2O = T29 - T28;
Chris@82 583 T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
Chris@82 584 T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
Chris@82 585 }
Chris@82 586 }
/* Input group 4: cr[3], cr[7], cr[11], cr[15] with ci[0], ci[4], ci[8], ci[12]. */
Chris@82 587 {
Chris@82 588 E Tp, T24, TU, T25, Ts, T21, TR, T22, TO, TV;
Chris@82 589 {
Chris@82 590 E Tn, To, TS, TT;
Chris@82 591 Tn = ci[0];
Chris@82 592 To = cr[WS(rs, 7)];
Chris@82 593 Tp = Tn + To;
Chris@82 594 T24 = Tn - To;
Chris@82 595 TS = ci[WS(rs, 12)];
Chris@82 596 TT = cr[WS(rs, 11)];
Chris@82 597 TU = TS - TT;
Chris@82 598 T25 = TS + TT;
Chris@82 599 }
Chris@82 600 {
Chris@82 601 E Tq, Tr, TP, TQ;
Chris@82 602 Tq = cr[WS(rs, 3)];
Chris@82 603 Tr = ci[WS(rs, 4)];
Chris@82 604 Ts = Tq + Tr;
Chris@82 605 T21 = Tq - Tr;
Chris@82 606 TP = ci[WS(rs, 8)];
Chris@82 607 TQ = cr[WS(rs, 15)];
Chris@82 608 TR = TP - TQ;
Chris@82 609 T22 = TP + TQ;
Chris@82 610 }
Chris@82 611 Tt = Tp + Ts;
Chris@82 612 T1A = TR + TU;
Chris@82 613 TO = Tp - Ts;
Chris@82 614 TV = TR - TU;
Chris@82 615 TW = TO + TV;
Chris@82 616 T1a = TV - TO;
Chris@82 617 {
Chris@82 618 E T23, T26, T2Q, T2R;
Chris@82 619 T23 = T21 - T22;
Chris@82 620 T26 = T24 - T25;
Chris@82 621 T27 = FNMS(KP382683432, T26, KP923879532 * T23);
Chris@82 622 T2q = FMA(KP382683432, T23, KP923879532 * T26);
Chris@82 623 T2Q = T24 + T25;
Chris@82 624 T2R = T21 + T22;
Chris@82 625 T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
Chris@82 626 T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
Chris@82 627 }
Chris@82 628 }
/* Outputs 0 and 8: index 0 is stored untwiddled, index 8 uses W[14], W[15]. */
Chris@82 629 {
Chris@82 630 E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
Chris@82 631 Tf = T7 + Te;
Chris@82 632 Tu = Tm + Tt;
Chris@82 633 T1u = Tf - Tu;
Chris@82 634 T1y = T1w + T1x;
Chris@82 635 T1B = T1z + T1A;
Chris@82 636 T1C = T1y - T1B;
Chris@82 637 cr[0] = Tf + Tu;
Chris@82 638 ci[0] = T1y + T1B;
Chris@82 639 T1t = W[14];
Chris@82 640 T1v = W[15];
Chris@82 641 cr[WS(rs, 8)] = FNMS(T1v, T1C, T1t * T1u);
Chris@82 642 ci[WS(rs, 8)] = FMA(T1v, T1u, T1t * T1C);
Chris@82 643 }
/* Outputs 11 (W[20], W[21]) and 3 (W[4], W[5]). */
Chris@82 644 {
Chris@82 645 E T2U, T34, T32, T36;
Chris@82 646 {
Chris@82 647 E T2M, T2T, T2Y, T31;
Chris@82 648 T2M = T2K - T2L;
Chris@82 649 T2T = T2P + T2S;
Chris@82 650 T2U = T2M - T2T;
Chris@82 651 T34 = T2M + T2T;
Chris@82 652 T2Y = T2W + T2X;
Chris@82 653 T31 = T2Z - T30;
Chris@82 654 T32 = T2Y - T31;
Chris@82 655 T36 = T2Y + T31;
Chris@82 656 }
Chris@82 657 {
Chris@82 658 E T2J, T2V, T33, T35;
Chris@82 659 T2J = W[20];
Chris@82 660 T2V = W[21];
Chris@82 661 cr[WS(rs, 11)] = FNMS(T2V, T32, T2J * T2U);
Chris@82 662 ci[WS(rs, 11)] = FMA(T2V, T2U, T2J * T32);
Chris@82 663 T33 = W[4];
Chris@82 664 T35 = W[5];
Chris@82 665 cr[WS(rs, 3)] = FNMS(T35, T36, T33 * T34);
Chris@82 666 ci[WS(rs, 3)] = FMA(T35, T34, T33 * T36);
Chris@82 667 }
Chris@82 668 }
/* Outputs 7 (W[12], W[13]) and 15 (W[28], W[29]). */
Chris@82 669 {
Chris@82 670 E T3a, T3g, T3e, T3i;
Chris@82 671 {
Chris@82 672 E T38, T39, T3c, T3d;
Chris@82 673 T38 = T2K + T2L;
Chris@82 674 T39 = T2Z + T30;
Chris@82 675 T3a = T38 - T39;
Chris@82 676 T3g = T38 + T39;
Chris@82 677 T3c = T2W - T2X;
Chris@82 678 T3d = T2P - T2S;
Chris@82 679 T3e = T3c + T3d;
Chris@82 680 T3i = T3c - T3d;
Chris@82 681 }
Chris@82 682 {
Chris@82 683 E T37, T3b, T3f, T3h;
Chris@82 684 T37 = W[12];
Chris@82 685 T3b = W[13];
Chris@82 686 cr[WS(rs, 7)] = FNMS(T3b, T3e, T37 * T3a);
Chris@82 687 ci[WS(rs, 7)] = FMA(T37, T3e, T3b * T3a);
Chris@82 688 T3f = W[28];
Chris@82 689 T3h = W[29];
Chris@82 690 cr[WS(rs, 15)] = FNMS(T3h, T3i, T3f * T3g);
Chris@82 691 ci[WS(rs, 15)] = FMA(T3f, T3i, T3h * T3g);
Chris@82 692 }
Chris@82 693 }
/* Outputs 10 (W[18], W[19]) and 2 (W[2], W[3]). */
Chris@82 694 {
Chris@82 695 E TY, T1e, T1c, T1g;
Chris@82 696 {
Chris@82 697 E TE, TX, T18, T1b;
Chris@82 698 TE = Tw + TD;
Chris@82 699 TX = KP707106781 * (TN + TW);
Chris@82 700 TY = TE - TX;
Chris@82 701 T1e = TE + TX;
Chris@82 702 T18 = T10 + T17;
Chris@82 703 T1b = KP707106781 * (T19 + T1a);
Chris@82 704 T1c = T18 - T1b;
Chris@82 705 T1g = T18 + T1b;
Chris@82 706 }
Chris@82 707 {
Chris@82 708 E Tv, TZ, T1d, T1f;
Chris@82 709 Tv = W[18];
Chris@82 710 TZ = W[19];
Chris@82 711 cr[WS(rs, 10)] = FNMS(TZ, T1c, Tv * TY);
Chris@82 712 ci[WS(rs, 10)] = FMA(TZ, TY, Tv * T1c);
Chris@82 713 T1d = W[2];
Chris@82 714 T1f = W[3];
Chris@82 715 cr[WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
Chris@82 716 ci[WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
Chris@82 717 }
Chris@82 718 }
/* Outputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@82 719 {
Chris@82 720 E T1k, T1q, T1o, T1s;
Chris@82 721 {
Chris@82 722 E T1i, T1j, T1m, T1n;
Chris@82 723 T1i = Tw - TD;
Chris@82 724 T1j = KP707106781 * (T1a - T19);
Chris@82 725 T1k = T1i - T1j;
Chris@82 726 T1q = T1i + T1j;
Chris@82 727 T1m = T17 - T10;
Chris@82 728 T1n = KP707106781 * (TN - TW);
Chris@82 729 T1o = T1m - T1n;
Chris@82 730 T1s = T1m + T1n;
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T1h, T1l, T1p, T1r;
Chris@82 734 T1h = W[26];
Chris@82 735 T1l = W[27];
Chris@82 736 cr[WS(rs, 14)] = FNMS(T1l, T1o, T1h * T1k);
Chris@82 737 ci[WS(rs, 14)] = FMA(T1h, T1o, T1l * T1k);
Chris@82 738 T1p = W[10];
Chris@82 739 T1r = W[11];
Chris@82 740 cr[WS(rs, 6)] = FNMS(T1r, T1s, T1p * T1q);
Chris@82 741 ci[WS(rs, 6)] = FMA(T1p, T1s, T1r * T1q);
Chris@82 742 }
Chris@82 743 }
/* Outputs 13 (W[24], W[25]) and 5 (W[8], W[9]). */
Chris@82 744 {
Chris@82 745 E T2g, T2u, T2s, T2w;
Chris@82 746 {
Chris@82 747 E T20, T2f, T2o, T2r;
Chris@82 748 T20 = T1S - T1Z;
Chris@82 749 T2f = T27 - T2e;
Chris@82 750 T2g = T20 - T2f;
Chris@82 751 T2u = T20 + T2f;
Chris@82 752 T2o = T2k - T2n;
Chris@82 753 T2r = T2p - T2q;
Chris@82 754 T2s = T2o - T2r;
Chris@82 755 T2w = T2o + T2r;
Chris@82 756 }
Chris@82 757 {
Chris@82 758 E T1P, T2h, T2t, T2v;
Chris@82 759 T1P = W[24];
Chris@82 760 T2h = W[25];
Chris@82 761 cr[WS(rs, 13)] = FNMS(T2h, T2s, T1P * T2g);
Chris@82 762 ci[WS(rs, 13)] = FMA(T2h, T2g, T1P * T2s);
Chris@82 763 T2t = W[8];
Chris@82 764 T2v = W[9];
Chris@82 765 cr[WS(rs, 5)] = FNMS(T2v, T2w, T2t * T2u);
Chris@82 766 ci[WS(rs, 5)] = FMA(T2v, T2u, T2t * T2w);
Chris@82 767 }
Chris@82 768 }
/* Outputs 9 (W[16], W[17]) and 1 (W[0], W[1]). */
Chris@82 769 {
Chris@82 770 E T2A, T2G, T2E, T2I;
Chris@82 771 {
Chris@82 772 E T2y, T2z, T2C, T2D;
Chris@82 773 T2y = T1S + T1Z;
Chris@82 774 T2z = T2p + T2q;
Chris@82 775 T2A = T2y - T2z;
Chris@82 776 T2G = T2y + T2z;
Chris@82 777 T2C = T2k + T2n;
Chris@82 778 T2D = T2e + T27;
Chris@82 779 T2E = T2C - T2D;
Chris@82 780 T2I = T2C + T2D;
Chris@82 781 }
Chris@82 782 {
Chris@82 783 E T2x, T2B, T2F, T2H;
Chris@82 784 T2x = W[16];
Chris@82 785 T2B = W[17];
Chris@82 786 cr[WS(rs, 9)] = FNMS(T2B, T2E, T2x * T2A);
Chris@82 787 ci[WS(rs, 9)] = FMA(T2x, T2E, T2B * T2A);
Chris@82 788 T2F = W[0];
Chris@82 789 T2H = W[1];
Chris@82 790 cr[WS(rs, 1)] = FNMS(T2H, T2I, T2F * T2G);
Chris@82 791 ci[WS(rs, 1)] = FMA(T2F, T2I, T2H * T2G);
Chris@82 792 }
Chris@82 793 }
/* Outputs 12 (W[22], W[23]) and 4 (W[6], W[7]). */
Chris@82 794 {
Chris@82 795 E T1G, T1M, T1K, T1O;
Chris@82 796 {
Chris@82 797 E T1E, T1F, T1I, T1J;
Chris@82 798 T1E = T7 - Te;
Chris@82 799 T1F = T1A - T1z;
Chris@82 800 T1G = T1E - T1F;
Chris@82 801 T1M = T1E + T1F;
Chris@82 802 T1I = T1w - T1x;
Chris@82 803 T1J = Tm - Tt;
Chris@82 804 T1K = T1I - T1J;
Chris@82 805 T1O = T1J + T1I;
Chris@82 806 }
Chris@82 807 {
Chris@82 808 E T1D, T1H, T1L, T1N;
Chris@82 809 T1D = W[22];
Chris@82 810 T1H = W[23];
Chris@82 811 cr[WS(rs, 12)] = FNMS(T1H, T1K, T1D * T1G);
Chris@82 812 ci[WS(rs, 12)] = FMA(T1D, T1K, T1H * T1G);
Chris@82 813 T1L = W[6];
Chris@82 814 T1N = W[7];
Chris@82 815 cr[WS(rs, 4)] = FNMS(T1N, T1O, T1L * T1M);
Chris@82 816 ci[WS(rs, 4)] = FMA(T1L, T1O, T1N * T1M);
Chris@82 817 }
Chris@82 818 }
Chris@82 819 }
Chris@82 820 }
Chris@82 821 }
Chris@82 822
/*
 * Twiddle instruction list referenced by desc: one TW_FULL entry with
 * operands (1, 16) followed by a TW_NEXT entry with operands (1, 0).
 * NOTE(review): presumably TW_FULL requests the complete twiddle set for
 * radix 16 and TW_NEXT ends the list; confirm against the tw_instr
 * definition in the FFTW kernel headers.
 */
Chris@82 823 static const tw_instr twinstr[] = {
Chris@82 824 {TW_FULL, 1, 16},
Chris@82 825 {TW_NEXT, 1, 0}
Chris@82 826 };
Chris@82 827
/*
 * Descriptor handed to the planner for the non-FMA variant: size 16, the
 * name string "hb_16", the twinstr table, &GENUS, and the counts
 * {136, 46, 38, 0}, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generator comment
 * for this branch.
 * NOTE(review): the meaning of the fourth count is not visible here.
 */
Chris@82 828 static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {136, 46, 38, 0} };
Chris@82 829
/* Registration entry point: hands hb_16 and its descriptor to planner p via X(khc2hc_register). */
Chris@82 830 void X(codelet_hb_16) (planner *p) {
Chris@82 831 X(khc2hc_register) (p, hb_16, &desc);
Chris@82 832 }
Chris@82 833 #endif