annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:45 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@42 32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
Chris@42 33 * 78 stack variables, 3 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb_16, HAVE_FMA variant: machine-generated codelet (see the genfft command
 * line above: gen_hc2hc, -n 16, -dif, -sign 1, -fma).
 *
 * For every m in [mb, me) the loop body loads the 16 values cr[WS(rs, k)]
 * and the 16 values ci[WS(rs, k)], k = 0..15, combines them through
 * add/subtract butterflies and the three constants declared below, and then
 * stores 16 + 16 results back into the same cr / ci slots.  All loads are
 * issued before the first store.  Outputs cr[0] and ci[0] are stored without
 * a twiddle; every other output pair k is combined with the twiddle pair
 * W[2 * (k - 1)], W[2 * (k - 1) + 1] before being stored.
 *
 * Parameters:
 *   cr, ci  in/out arrays; per iteration cr moves forward by ms and ci moves
 *           backward by ms.
 *   W       twiddle table; first offset by (mb - 1) * 30, then advanced by
 *           30 entries per iteration.
 *   rs      stride handed to WS(rs, k) to address the 16 elements.
 *   mb, me  half-open range of the loop counter m.
 *   ms      per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file.  Presumably FMA(a, b, c) is a * b + c and
 * FNMS(a, b, c) is c - a * b -- confirm against the FFTW headers.
 */
Chris@42 37 static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numerically: cos(pi/8), sqrt(2)/2 and tan(pi/8) = sqrt(2) - 1. */
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 42 {
Chris@42 43 INT m;
/* One pass of the butterfly network per m; cr, ci and W are stepped in the loop header. */
Chris@42 44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Declared at loop scope because output 1 is stored at the very end of the body. */
Chris@42 45 E T1I, T1L, T1K, T1M, T1J;
Chris@42 46 {
Chris@42 47 E T1O, TA, T1h, T21, T3b, T2T, T3D, T3r, T1k, T1P, T3y, Tf, T36, T2A, T22;
Chris@42 48 E TL, T3z, T3u, T2U, T2F, T2K, T2V, T12, Tu, T3E, TX, T1n, T17, T1T, T24;
Chris@42 49 E T1W, T25;
/* First half of the loads: cr[0,2,4,6,8,10,12,14] and ci[1,3,5,7,9,11,13,15]. */
Chris@42 50 {
Chris@42 51 E T2z, TF, TK, T2w;
Chris@42 52 {
Chris@42 53 E Tw, T3, T2x, TJ, T2Q, T1g, T1d, T6, TC, TB, Ta, T2R, Tz, TD, Tb;
Chris@42 54 E Tc;
Chris@42 55 {
Chris@42 56 E T1e, T1f, T4, T5;
Chris@42 57 {
Chris@42 58 E T1, T2, TH, TI;
Chris@42 59 T1 = cr[0];
Chris@42 60 T2 = ci[WS(rs, 7)];
Chris@42 61 TH = ci[WS(rs, 9)];
Chris@42 62 TI = cr[WS(rs, 14)];
Chris@42 63 T1e = ci[WS(rs, 15)];
Chris@42 64 Tw = T1 - T2;
Chris@42 65 T3 = T1 + T2;
Chris@42 66 T2x = TH - TI;
Chris@42 67 TJ = TH + TI;
Chris@42 68 T1f = cr[WS(rs, 8)];
Chris@42 69 T4 = cr[WS(rs, 4)];
Chris@42 70 T5 = ci[WS(rs, 3)];
Chris@42 71 }
Chris@42 72 {
Chris@42 73 E T8, T9, Tx, Ty;
Chris@42 74 T8 = cr[WS(rs, 2)];
Chris@42 75 T2Q = T1e - T1f;
Chris@42 76 T1g = T1e + T1f;
Chris@42 77 T1d = T4 - T5;
Chris@42 78 T6 = T4 + T5;
Chris@42 79 T9 = ci[WS(rs, 5)];
Chris@42 80 Tx = ci[WS(rs, 11)];
Chris@42 81 Ty = cr[WS(rs, 12)];
Chris@42 82 TC = ci[WS(rs, 13)];
Chris@42 83 TB = T8 - T9;
Chris@42 84 Ta = T8 + T9;
Chris@42 85 T2R = Tx - Ty;
Chris@42 86 Tz = Tx + Ty;
Chris@42 87 TD = cr[WS(rs, 10)];
Chris@42 88 Tb = ci[WS(rs, 1)];
Chris@42 89 Tc = cr[WS(rs, 6)];
Chris@42 90 }
Chris@42 91 }
Chris@42 92 {
Chris@42 93 E T2y, TE, TG, Te, T2P, T2S, T3p, Td;
Chris@42 94 T1O = Tw + Tz;
Chris@42 95 TA = Tw - Tz;
Chris@42 96 T2y = TC - TD;
Chris@42 97 TE = TC + TD;
Chris@42 98 TG = Tb - Tc;
Chris@42 99 Td = Tb + Tc;
Chris@42 100 T1h = T1d + T1g;
Chris@42 101 T21 = T1g - T1d;
Chris@42 102 Te = Ta + Td;
Chris@42 103 T2P = Ta - Td;
Chris@42 104 T2S = T2Q - T2R;
Chris@42 105 T3p = T2Q + T2R;
Chris@42 106 {
Chris@42 107 E T1i, T1j, T3q, T7;
Chris@42 108 T3q = T2y + T2x;
Chris@42 109 T2z = T2x - T2y;
Chris@42 110 TF = TB - TE;
Chris@42 111 T1i = TB + TE;
Chris@42 112 T3b = T2S - T2P;
Chris@42 113 T2T = T2P + T2S;
Chris@42 114 TK = TG - TJ;
Chris@42 115 T1j = TG + TJ;
Chris@42 116 T3D = T3p - T3q;
Chris@42 117 T3r = T3p + T3q;
Chris@42 118 T2w = T3 - T6;
Chris@42 119 T7 = T3 + T6;
Chris@42 120 T1k = T1i - T1j;
Chris@42 121 T1P = T1i + T1j;
Chris@42 122 T3y = T7 - Te;
Chris@42 123 Tf = T7 + Te;
Chris@42 124 }
Chris@42 125 }
Chris@42 126 }
/* Second half of the loads: cr[1,3,5,7,9,11,13,15] and ci[0,2,4,6,8,10,12,14]. */
Chris@42 127 {
Chris@42 128 E T13, Ti, T2C, T11, T2D, T16, TY, Tl, TT, TS, Tp, T2H, TQ, TU, Tq;
Chris@42 129 E Tr;
Chris@42 130 {
Chris@42 131 E T14, T15, Tj, Tk;
Chris@42 132 {
Chris@42 133 E Tg, Th, TZ, T10;
Chris@42 134 Tg = cr[WS(rs, 1)];
Chris@42 135 T36 = T2w - T2z;
Chris@42 136 T2A = T2w + T2z;
Chris@42 137 T22 = TF - TK;
Chris@42 138 TL = TF + TK;
Chris@42 139 Th = ci[WS(rs, 6)];
Chris@42 140 TZ = ci[WS(rs, 14)];
Chris@42 141 T10 = cr[WS(rs, 9)];
Chris@42 142 T14 = ci[WS(rs, 10)];
Chris@42 143 T13 = Tg - Th;
Chris@42 144 Ti = Tg + Th;
Chris@42 145 T2C = TZ - T10;
Chris@42 146 T11 = TZ + T10;
Chris@42 147 T15 = cr[WS(rs, 13)];
Chris@42 148 Tj = cr[WS(rs, 5)];
Chris@42 149 Tk = ci[WS(rs, 2)];
Chris@42 150 }
Chris@42 151 {
Chris@42 152 E Tn, To, TO, TP;
Chris@42 153 Tn = ci[0];
Chris@42 154 T2D = T14 - T15;
Chris@42 155 T16 = T14 + T15;
Chris@42 156 TY = Tj - Tk;
Chris@42 157 Tl = Tj + Tk;
Chris@42 158 To = cr[WS(rs, 7)];
Chris@42 159 TO = ci[WS(rs, 8)];
Chris@42 160 TP = cr[WS(rs, 15)];
Chris@42 161 TT = ci[WS(rs, 12)];
Chris@42 162 TS = Tn - To;
Chris@42 163 Tp = Tn + To;
Chris@42 164 T2H = TO - TP;
Chris@42 165 TQ = TO + TP;
Chris@42 166 TU = cr[WS(rs, 11)];
Chris@42 167 Tq = cr[WS(rs, 3)];
Chris@42 168 Tr = ci[WS(rs, 4)];
Chris@42 169 }
Chris@42 170 }
Chris@42 171 {
Chris@42 172 E TV, TN, Tm, Tt;
Chris@42 173 {
Chris@42 174 E T2E, T3s, Ts, T3t, T2J, T2B, T2I, T2G;
Chris@42 175 T2E = T2C - T2D;
Chris@42 176 T3s = T2C + T2D;
Chris@42 177 T2I = TT - TU;
Chris@42 178 TV = TT + TU;
Chris@42 179 TN = Tq - Tr;
Chris@42 180 Ts = Tq + Tr;
Chris@42 181 T3t = T2H + T2I;
Chris@42 182 T2J = T2H - T2I;
Chris@42 183 Tm = Ti + Tl;
Chris@42 184 T2B = Ti - Tl;
Chris@42 185 Tt = Tp + Ts;
Chris@42 186 T2G = Tp - Ts;
Chris@42 187 T3z = T3t - T3s;
Chris@42 188 T3u = T3s + T3t;
Chris@42 189 T2U = T2B + T2E;
Chris@42 190 T2F = T2B - T2E;
Chris@42 191 T2K = T2G + T2J;
Chris@42 192 T2V = T2J - T2G;
Chris@42 193 }
Chris@42 194 {
Chris@42 195 E T1U, T1V, T1R, T1S, TR, TW;
Chris@42 196 TR = TN - TQ;
Chris@42 197 T1U = TN + TQ;
Chris@42 198 T1V = TS + TV;
Chris@42 199 TW = TS - TV;
Chris@42 200 T1R = T11 - TY;
Chris@42 201 T12 = TY + T11;
Chris@42 202 Tu = Tm + Tt;
Chris@42 203 T3E = Tm - Tt;
Chris@42 204 TX = FNMS(KP414213562, TW, TR);
Chris@42 205 T1n = FMA(KP414213562, TR, TW);
Chris@42 206 T17 = T13 - T16;
Chris@42 207 T1S = T13 + T16;
Chris@42 208 T1T = FNMS(KP414213562, T1S, T1R);
Chris@42 209 T24 = FMA(KP414213562, T1R, T1S);
Chris@42 210 T1W = FNMS(KP414213562, T1V, T1U);
Chris@42 211 T25 = FMA(KP414213562, T1U, T1V);
Chris@42 212 }
Chris@42 213 }
Chris@42 214 }
Chris@42 215 }
/* All 32 loads are complete; from here on results are twiddled and stored. */
Chris@42 216 {
Chris@42 217 E T18, T1m, T2W, T2L, T3j, T3i, T3h;
Chris@42 218 {
Chris@42 219 E T3m, T3v, T3l, T3o;
/* Output 0 is stored without a twiddle factor (cr[0] here, ci[0] a few lines below). */
Chris@42 220 cr[0] = Tf + Tu;
Chris@42 221 T18 = FMA(KP414213562, T17, T12);
Chris@42 222 T1m = FNMS(KP414213562, T12, T17);
Chris@42 223 T3m = Tf - Tu;
Chris@42 224 T3v = T3r - T3u;
Chris@42 225 T3l = W[14];
Chris@42 226 T3o = W[15];
Chris@42 227 ci[0] = T3r + T3u;
Chris@42 228 {
Chris@42 229 E T3A, T3I, T3L, T3F, T3C, T3G, T3B, T3x, T3n, T3w, T3H, T3K;
Chris@42 230 T3A = T3y - T3z;
Chris@42 231 T3I = T3y + T3z;
Chris@42 232 T3n = T3l * T3m;
Chris@42 233 T3w = T3o * T3m;
Chris@42 234 T3L = T3E + T3D;
Chris@42 235 T3F = T3D - T3E;
Chris@42 236 T3x = W[22];
Chris@42 237 cr[WS(rs, 8)] = FNMS(T3o, T3v, T3n);
Chris@42 238 ci[WS(rs, 8)] = FMA(T3l, T3v, T3w);
Chris@42 239 T3C = W[23];
Chris@42 240 T3G = T3x * T3F;
Chris@42 241 T3B = T3x * T3A;
Chris@42 242 ci[WS(rs, 12)] = FMA(T3C, T3A, T3G);
Chris@42 243 cr[WS(rs, 12)] = FNMS(T3C, T3F, T3B);
Chris@42 244 T3H = W[6];
Chris@42 245 T3K = W[7];
Chris@42 246 {
Chris@42 247 E T3g, T38, T3d, T35, T3a;
Chris@42 248 {
Chris@42 249 E T37, T3c, T3M, T3J;
Chris@42 250 T37 = T2V - T2U;
Chris@42 251 T2W = T2U + T2V;
Chris@42 252 T2L = T2F + T2K;
Chris@42 253 T3c = T2F - T2K;
Chris@42 254 T3M = T3H * T3L;
Chris@42 255 T3J = T3H * T3I;
Chris@42 256 T3g = FMA(KP707106781, T37, T36);
Chris@42 257 T38 = FNMS(KP707106781, T37, T36);
Chris@42 258 ci[WS(rs, 4)] = FMA(T3K, T3I, T3M);
Chris@42 259 cr[WS(rs, 4)] = FNMS(T3K, T3L, T3J);
Chris@42 260 T3d = FNMS(KP707106781, T3c, T3b);
Chris@42 261 T3j = FMA(KP707106781, T3c, T3b);
Chris@42 262 }
Chris@42 263 T35 = W[26];
Chris@42 264 T3a = W[27];
Chris@42 265 {
Chris@42 266 E T3f, T3e, T39, T3k;
Chris@42 267 T3f = W[10];
Chris@42 268 T3i = W[11];
Chris@42 269 T3e = T35 * T3d;
Chris@42 270 T39 = T35 * T38;
Chris@42 271 T3k = T3f * T3j;
Chris@42 272 T3h = T3f * T3g;
Chris@42 273 ci[WS(rs, 14)] = FMA(T3a, T38, T3e);
Chris@42 274 cr[WS(rs, 14)] = FNMS(T3a, T3d, T39);
Chris@42 275 ci[WS(rs, 6)] = FMA(T3i, T3g, T3k);
Chris@42 276 }
Chris@42 277 }
Chris@42 278 }
Chris@42 279 }
Chris@42 280 cr[WS(rs, 6)] = FNMS(T3i, T3j, T3h);
Chris@42 281 {
Chris@42 282 E T2g, T2m, T2l, T2h, T2d, T29, T2c, T2b, T2e;
Chris@42 283 {
Chris@42 284 E T33, T2Z, T32, T31, T34;
Chris@42 285 {
Chris@42 286 E T2v, T30, T2M, T2X, T2O, T2N, T2Y;
Chris@42 287 T2v = W[18];
Chris@42 288 T30 = FMA(KP707106781, T2L, T2A);
Chris@42 289 T2M = FNMS(KP707106781, T2L, T2A);
Chris@42 290 T33 = FMA(KP707106781, T2W, T2T);
Chris@42 291 T2X = FNMS(KP707106781, T2W, T2T);
Chris@42 292 T2O = W[19];
Chris@42 293 T2N = T2v * T2M;
Chris@42 294 T2Z = W[2];
Chris@42 295 T32 = W[3];
Chris@42 296 T2Y = T2O * T2M;
Chris@42 297 cr[WS(rs, 10)] = FNMS(T2O, T2X, T2N);
Chris@42 298 T31 = T2Z * T30;
Chris@42 299 T34 = T32 * T30;
Chris@42 300 ci[WS(rs, 10)] = FMA(T2v, T2X, T2Y);
Chris@42 301 }
Chris@42 302 {
Chris@42 303 E T1Q, T1X, T23, T26;
Chris@42 304 T2g = FMA(KP707106781, T1P, T1O);
Chris@42 305 T1Q = FNMS(KP707106781, T1P, T1O);
Chris@42 306 cr[WS(rs, 2)] = FNMS(T32, T33, T31);
Chris@42 307 ci[WS(rs, 2)] = FMA(T2Z, T33, T34);
Chris@42 308 T1X = T1T + T1W;
Chris@42 309 T2m = T1W - T1T;
Chris@42 310 T2l = FNMS(KP707106781, T22, T21);
Chris@42 311 T23 = FMA(KP707106781, T22, T21);
Chris@42 312 T26 = T24 - T25;
Chris@42 313 T2h = T24 + T25;
Chris@42 314 {
Chris@42 315 E T1N, T2a, T1Y, T27, T20, T1Z, T28;
Chris@42 316 T1N = W[20];
Chris@42 317 T2a = FNMS(KP923879532, T1X, T1Q);
Chris@42 318 T1Y = FMA(KP923879532, T1X, T1Q);
Chris@42 319 T2d = FMA(KP923879532, T26, T23);
Chris@42 320 T27 = FNMS(KP923879532, T26, T23);
Chris@42 321 T20 = W[21];
Chris@42 322 T1Z = T1N * T1Y;
Chris@42 323 T29 = W[4];
Chris@42 324 T2c = W[5];
Chris@42 325 T28 = T20 * T1Y;
Chris@42 326 cr[WS(rs, 11)] = FNMS(T20, T27, T1Z);
Chris@42 327 T2b = T29 * T2a;
Chris@42 328 T2e = T2c * T2a;
Chris@42 329 ci[WS(rs, 11)] = FMA(T1N, T27, T28);
Chris@42 330 }
Chris@42 331 }
Chris@42 332 }
Chris@42 333 {
Chris@42 334 E T1y, T1E, T1D, T1z, T1v, T1r, T1u, T1t, T1w;
Chris@42 335 {
Chris@42 336 E TM, T19, T1l, T1o;
Chris@42 337 T1y = FMA(KP707106781, TL, TA);
Chris@42 338 TM = FNMS(KP707106781, TL, TA);
Chris@42 339 cr[WS(rs, 3)] = FNMS(T2c, T2d, T2b);
Chris@42 340 ci[WS(rs, 3)] = FMA(T29, T2d, T2e);
Chris@42 341 T19 = TX - T18;
Chris@42 342 T1E = T18 + TX;
Chris@42 343 T1D = FMA(KP707106781, T1k, T1h);
Chris@42 344 T1l = FNMS(KP707106781, T1k, T1h);
Chris@42 345 T1o = T1m - T1n;
Chris@42 346 T1z = T1m + T1n;
Chris@42 347 {
Chris@42 348 E Tv, T1s, T1a, T1p, T1c, T1b, T1q;
Chris@42 349 Tv = W[24];
Chris@42 350 T1s = FMA(KP923879532, T19, TM);
Chris@42 351 T1a = FNMS(KP923879532, T19, TM);
Chris@42 352 T1v = FMA(KP923879532, T1o, T1l);
Chris@42 353 T1p = FNMS(KP923879532, T1o, T1l);
Chris@42 354 T1c = W[25];
Chris@42 355 T1b = Tv * T1a;
Chris@42 356 T1r = W[8];
Chris@42 357 T1u = W[9];
Chris@42 358 T1q = T1c * T1a;
Chris@42 359 cr[WS(rs, 13)] = FNMS(T1c, T1p, T1b);
Chris@42 360 T1t = T1r * T1s;
Chris@42 361 T1w = T1u * T1s;
Chris@42 362 ci[WS(rs, 13)] = FMA(Tv, T1p, T1q);
Chris@42 363 }
Chris@42 364 }
Chris@42 365 {
Chris@42 366 E T2q, T2t, T2s, T2u, T2r;
Chris@42 367 cr[WS(rs, 5)] = FNMS(T1u, T1v, T1t);
Chris@42 368 ci[WS(rs, 5)] = FMA(T1r, T1v, T1w);
Chris@42 369 {
Chris@42 370 E T2f, T2i, T2n, T2k, T2j, T2p, T2o;
Chris@42 371 T2f = W[12];
Chris@42 372 T2q = FMA(KP923879532, T2h, T2g);
Chris@42 373 T2i = FNMS(KP923879532, T2h, T2g);
Chris@42 374 T2t = FNMS(KP923879532, T2m, T2l);
Chris@42 375 T2n = FMA(KP923879532, T2m, T2l);
Chris@42 376 T2k = W[13];
Chris@42 377 T2j = T2f * T2i;
Chris@42 378 T2p = W[28];
Chris@42 379 T2o = T2f * T2n;
Chris@42 380 T2s = W[29];
Chris@42 381 cr[WS(rs, 7)] = FNMS(T2k, T2n, T2j);
Chris@42 382 T2u = T2p * T2t;
Chris@42 383 T2r = T2p * T2q;
Chris@42 384 ci[WS(rs, 7)] = FMA(T2k, T2i, T2o);
Chris@42 385 }
Chris@42 386 ci[WS(rs, 15)] = FMA(T2s, T2q, T2u);
Chris@42 387 cr[WS(rs, 15)] = FNMS(T2s, T2t, T2r);
Chris@42 388 {
Chris@42 389 E T1x, T1A, T1F, T1C, T1B, T1H, T1G;
Chris@42 390 T1x = W[16];
Chris@42 391 T1I = FMA(KP923879532, T1z, T1y);
Chris@42 392 T1A = FNMS(KP923879532, T1z, T1y);
Chris@42 393 T1L = FMA(KP923879532, T1E, T1D);
Chris@42 394 T1F = FNMS(KP923879532, T1E, T1D);
Chris@42 395 T1C = W[17];
Chris@42 396 T1B = T1x * T1A;
Chris@42 397 T1H = W[0];
Chris@42 398 T1G = T1x * T1F;
Chris@42 399 T1K = W[1];
Chris@42 400 cr[WS(rs, 9)] = FNMS(T1C, T1F, T1B);
Chris@42 401 T1M = T1H * T1L;
Chris@42 402 T1J = T1H * T1I;
Chris@42 403 ci[WS(rs, 9)] = FMA(T1C, T1A, T1G);
Chris@42 404 }
Chris@42 405 }
Chris@42 406 }
Chris@42 407 }
Chris@42 408 }
Chris@42 409 }
/* Output 1 (twiddle pair W[0], W[1]) is the last pair stored in each iteration. */
Chris@42 410 ci[WS(rs, 1)] = FMA(T1K, T1I, T1M);
Chris@42 411 cr[WS(rs, 1)] = FNMS(T1K, T1L, T1J);
Chris@42 412 }
Chris@42 413 }
Chris@42 414 }
Chris@42 415
/*
 * Twiddle-instruction table referenced by the descriptor that follows.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; presumably
 * {TW_FULL, 1, 16} requests the full radix-16 twiddle set (hb_16 consumes 30
 * W entries per iteration) and TW_NEXT terminates the list -- confirm.
 */
Chris@42 416 static const tw_instr twinstr[] = {
Chris@42 417 {TW_FULL, 1, 16},
Chris@42 418 {TW_NEXT, 1, 0}
Chris@42 419 };
Chris@42 420
/*
 * Codelet descriptor for the HAVE_FMA variant: size 16, name "hb_16", the
 * twinstr table, &GENUS, and the record {104, 30, 70, 0}, whose first three
 * values equal the addition / multiplication / fused multiply-add counts
 * quoted in the generated header comment for this variant.
 */
Chris@42 421 static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {104, 30, 70, 0} };
Chris@42 422
/*
 * Registration entry point (HAVE_FMA build): passes hb_16 and its descriptor
 * to X(khc2hc_register) for the given planner p.
 */
Chris@42 423 void X(codelet_hb_16) (planner *p) {
Chris@42 424 X(khc2hc_register) (p, hb_16, &desc);
Chris@42 425 }
Chris@42 426 #else /* HAVE_FMA */
Chris@42 427
Chris@42 428 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 16 -dif -name hb_16 -include hb.h */
Chris@42 429
Chris@42 430 /*
Chris@42 431 * This function contains 174 FP additions, 84 FP multiplications,
Chris@42 432 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
Chris@42 433 * 50 stack variables, 3 constants, and 64 memory accesses
Chris@42 434 */
Chris@42 435 #include "hb.h"
Chris@42 436
/*
 * hb_16, non-FMA variant: machine-generated codelet (see the genfft command
 * line above: gen_hc2hc, -n 16, -dif, -sign 1, without -fma).
 *
 * For every m in [mb, me) the loop body loads the 16 values cr[WS(rs, k)]
 * and the 16 values ci[WS(rs, k)], k = 0..15, combines them through
 * add/subtract butterflies and the three constants declared below, and then
 * stores 16 + 16 results back into the same cr / ci slots.  All loads are
 * issued before the first store.  Outputs cr[0] and ci[0] are stored without
 * a twiddle; every other output pair k is combined with the twiddle pair
 * W[2 * (k - 1)], W[2 * (k - 1) + 1] before being stored.
 *
 * Parameters:
 *   cr, ci  in/out arrays; per iteration cr moves forward by ms and ci moves
 *           backward by ms.
 *   W       twiddle table; first offset by (mb - 1) * 30, then advanced by
 *           30 entries per iteration.
 *   rs      stride handed to WS(rs, k) to address the 16 elements.
 *   mb, me  half-open range of the loop counter m.
 *   ms      per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file.  Presumably FMA(a, b, c) is a * b + c and
 * FNMS(a, b, c) is c - a * b -- confirm against the FFTW headers.
 */
Chris@42 437 static void hb_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 438 {
/* Numerically: sin(pi/8), cos(pi/8) and sqrt(2)/2. */
Chris@42 439 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 440 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 441 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 442 {
Chris@42 443 INT m;
/* One pass of the butterfly network per m; cr, ci and W are stepped in the loop header. */
Chris@42 444 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 445 E T7, T2K, T2W, Tw, T17, T1S, T2k, T1w, Te, TD, T1x, T10, T2n, T2L, T1Z;
Chris@42 446 E T2X, Tm, T1z, TN, T19, T2e, T2p, T2P, T2Z, Tt, T1A, TW, T1a, T27, T2q;
Chris@42 447 E T2S, T30;
/* Loads cr[0], cr[4], cr[8], cr[12] and ci[3], ci[7], ci[11], ci[15]. */
Chris@42 448 {
Chris@42 449 E T3, T1Q, T16, T1R, T6, T2i, T13, T2j;
Chris@42 450 {
Chris@42 451 E T1, T2, T14, T15;
Chris@42 452 T1 = cr[0];
Chris@42 453 T2 = ci[WS(rs, 7)];
Chris@42 454 T3 = T1 + T2;
Chris@42 455 T1Q = T1 - T2;
Chris@42 456 T14 = ci[WS(rs, 11)];
Chris@42 457 T15 = cr[WS(rs, 12)];
Chris@42 458 T16 = T14 - T15;
Chris@42 459 T1R = T14 + T15;
Chris@42 460 }
Chris@42 461 {
Chris@42 462 E T4, T5, T11, T12;
Chris@42 463 T4 = cr[WS(rs, 4)];
Chris@42 464 T5 = ci[WS(rs, 3)];
Chris@42 465 T6 = T4 + T5;
Chris@42 466 T2i = T4 - T5;
Chris@42 467 T11 = ci[WS(rs, 15)];
Chris@42 468 T12 = cr[WS(rs, 8)];
Chris@42 469 T13 = T11 - T12;
Chris@42 470 T2j = T11 + T12;
Chris@42 471 }
Chris@42 472 T7 = T3 + T6;
Chris@42 473 T2K = T1Q + T1R;
Chris@42 474 T2W = T2j - T2i;
Chris@42 475 Tw = T3 - T6;
Chris@42 476 T17 = T13 - T16;
Chris@42 477 T1S = T1Q - T1R;
Chris@42 478 T2k = T2i + T2j;
Chris@42 479 T1w = T13 + T16;
Chris@42 480 }
/* Loads cr[2], cr[6], cr[10], cr[14] and ci[1], ci[5], ci[9], ci[13]. */
Chris@42 481 {
Chris@42 482 E Ta, T1T, TC, T1U, Td, T1W, Tz, T1X;
Chris@42 483 {
Chris@42 484 E T8, T9, TA, TB;
Chris@42 485 T8 = cr[WS(rs, 2)];
Chris@42 486 T9 = ci[WS(rs, 5)];
Chris@42 487 Ta = T8 + T9;
Chris@42 488 T1T = T8 - T9;
Chris@42 489 TA = ci[WS(rs, 13)];
Chris@42 490 TB = cr[WS(rs, 10)];
Chris@42 491 TC = TA - TB;
Chris@42 492 T1U = TA + TB;
Chris@42 493 }
Chris@42 494 {
Chris@42 495 E Tb, Tc, Tx, Ty;
Chris@42 496 Tb = ci[WS(rs, 1)];
Chris@42 497 Tc = cr[WS(rs, 6)];
Chris@42 498 Td = Tb + Tc;
Chris@42 499 T1W = Tb - Tc;
Chris@42 500 Tx = ci[WS(rs, 9)];
Chris@42 501 Ty = cr[WS(rs, 14)];
Chris@42 502 Tz = Tx - Ty;
Chris@42 503 T1X = Tx + Ty;
Chris@42 504 }
Chris@42 505 Te = Ta + Td;
Chris@42 506 TD = Tz - TC;
Chris@42 507 T1x = TC + Tz;
Chris@42 508 T10 = Ta - Td;
Chris@42 509 {
Chris@42 510 E T2l, T2m, T1V, T1Y;
Chris@42 511 T2l = T1T + T1U;
Chris@42 512 T2m = T1W + T1X;
Chris@42 513 T2n = KP707106781 * (T2l - T2m);
Chris@42 514 T2L = KP707106781 * (T2l + T2m);
Chris@42 515 T1V = T1T - T1U;
Chris@42 516 T1Y = T1W - T1X;
Chris@42 517 T1Z = KP707106781 * (T1V + T1Y);
Chris@42 518 T2X = KP707106781 * (T1V - T1Y);
Chris@42 519 }
Chris@42 520 }
/* Loads cr[1], cr[5], cr[9], cr[13] and ci[2], ci[6], ci[10], ci[14]. */
Chris@42 521 {
Chris@42 522 E Ti, T2b, TL, T2c, Tl, T28, TI, T29, TF, TM;
Chris@42 523 {
Chris@42 524 E Tg, Th, TJ, TK;
Chris@42 525 Tg = cr[WS(rs, 1)];
Chris@42 526 Th = ci[WS(rs, 6)];
Chris@42 527 Ti = Tg + Th;
Chris@42 528 T2b = Tg - Th;
Chris@42 529 TJ = ci[WS(rs, 10)];
Chris@42 530 TK = cr[WS(rs, 13)];
Chris@42 531 TL = TJ - TK;
Chris@42 532 T2c = TJ + TK;
Chris@42 533 }
Chris@42 534 {
Chris@42 535 E Tj, Tk, TG, TH;
Chris@42 536 Tj = cr[WS(rs, 5)];
Chris@42 537 Tk = ci[WS(rs, 2)];
Chris@42 538 Tl = Tj + Tk;
Chris@42 539 T28 = Tj - Tk;
Chris@42 540 TG = ci[WS(rs, 14)];
Chris@42 541 TH = cr[WS(rs, 9)];
Chris@42 542 TI = TG - TH;
Chris@42 543 T29 = TG + TH;
Chris@42 544 }
Chris@42 545 Tm = Ti + Tl;
Chris@42 546 T1z = TI + TL;
Chris@42 547 TF = Ti - Tl;
Chris@42 548 TM = TI - TL;
Chris@42 549 TN = TF - TM;
Chris@42 550 T19 = TF + TM;
Chris@42 551 {
Chris@42 552 E T2a, T2d, T2N, T2O;
Chris@42 553 T2a = T28 + T29;
Chris@42 554 T2d = T2b - T2c;
Chris@42 555 T2e = FMA(KP923879532, T2a, KP382683432 * T2d);
Chris@42 556 T2p = FNMS(KP382683432, T2a, KP923879532 * T2d);
Chris@42 557 T2N = T2b + T2c;
Chris@42 558 T2O = T29 - T28;
Chris@42 559 T2P = FNMS(KP923879532, T2O, KP382683432 * T2N);
Chris@42 560 T2Z = FMA(KP382683432, T2O, KP923879532 * T2N);
Chris@42 561 }
Chris@42 562 }
/* Loads cr[3], cr[7], cr[11], cr[15] and ci[0], ci[4], ci[8], ci[12]. */
Chris@42 563 {
Chris@42 564 E Tp, T24, TU, T25, Ts, T21, TR, T22, TO, TV;
Chris@42 565 {
Chris@42 566 E Tn, To, TS, TT;
Chris@42 567 Tn = ci[0];
Chris@42 568 To = cr[WS(rs, 7)];
Chris@42 569 Tp = Tn + To;
Chris@42 570 T24 = Tn - To;
Chris@42 571 TS = ci[WS(rs, 12)];
Chris@42 572 TT = cr[WS(rs, 11)];
Chris@42 573 TU = TS - TT;
Chris@42 574 T25 = TS + TT;
Chris@42 575 }
Chris@42 576 {
Chris@42 577 E Tq, Tr, TP, TQ;
Chris@42 578 Tq = cr[WS(rs, 3)];
Chris@42 579 Tr = ci[WS(rs, 4)];
Chris@42 580 Ts = Tq + Tr;
Chris@42 581 T21 = Tq - Tr;
Chris@42 582 TP = ci[WS(rs, 8)];
Chris@42 583 TQ = cr[WS(rs, 15)];
Chris@42 584 TR = TP - TQ;
Chris@42 585 T22 = TP + TQ;
Chris@42 586 }
Chris@42 587 Tt = Tp + Ts;
Chris@42 588 T1A = TR + TU;
Chris@42 589 TO = Tp - Ts;
Chris@42 590 TV = TR - TU;
Chris@42 591 TW = TO + TV;
Chris@42 592 T1a = TV - TO;
Chris@42 593 {
Chris@42 594 E T23, T26, T2Q, T2R;
Chris@42 595 T23 = T21 - T22;
Chris@42 596 T26 = T24 - T25;
Chris@42 597 T27 = FNMS(KP382683432, T26, KP923879532 * T23);
Chris@42 598 T2q = FMA(KP382683432, T23, KP923879532 * T26);
Chris@42 599 T2Q = T24 + T25;
Chris@42 600 T2R = T21 + T22;
Chris@42 601 T2S = FNMS(KP923879532, T2R, KP382683432 * T2Q);
Chris@42 602 T30 = FMA(KP382683432, T2R, KP923879532 * T2Q);
Chris@42 603 }
Chris@42 604 }
/* Stores outputs 0 (no twiddle) and 8 (W[14], W[15]). */
Chris@42 605 {
Chris@42 606 E Tf, Tu, T1u, T1y, T1B, T1C, T1t, T1v;
Chris@42 607 Tf = T7 + Te;
Chris@42 608 Tu = Tm + Tt;
Chris@42 609 T1u = Tf - Tu;
Chris@42 610 T1y = T1w + T1x;
Chris@42 611 T1B = T1z + T1A;
Chris@42 612 T1C = T1y - T1B;
Chris@42 613 cr[0] = Tf + Tu;
Chris@42 614 ci[0] = T1y + T1B;
Chris@42 615 T1t = W[14];
Chris@42 616 T1v = W[15];
Chris@42 617 cr[WS(rs, 8)] = FNMS(T1v, T1C, T1t * T1u);
Chris@42 618 ci[WS(rs, 8)] = FMA(T1v, T1u, T1t * T1C);
Chris@42 619 }
/* Stores outputs 11 (W[20], W[21]) and 3 (W[4], W[5]). */
Chris@42 620 {
Chris@42 621 E T2U, T34, T32, T36;
Chris@42 622 {
Chris@42 623 E T2M, T2T, T2Y, T31;
Chris@42 624 T2M = T2K - T2L;
Chris@42 625 T2T = T2P + T2S;
Chris@42 626 T2U = T2M - T2T;
Chris@42 627 T34 = T2M + T2T;
Chris@42 628 T2Y = T2W + T2X;
Chris@42 629 T31 = T2Z - T30;
Chris@42 630 T32 = T2Y - T31;
Chris@42 631 T36 = T2Y + T31;
Chris@42 632 }
Chris@42 633 {
Chris@42 634 E T2J, T2V, T33, T35;
Chris@42 635 T2J = W[20];
Chris@42 636 T2V = W[21];
Chris@42 637 cr[WS(rs, 11)] = FNMS(T2V, T32, T2J * T2U);
Chris@42 638 ci[WS(rs, 11)] = FMA(T2V, T2U, T2J * T32);
Chris@42 639 T33 = W[4];
Chris@42 640 T35 = W[5];
Chris@42 641 cr[WS(rs, 3)] = FNMS(T35, T36, T33 * T34);
Chris@42 642 ci[WS(rs, 3)] = FMA(T35, T34, T33 * T36);
Chris@42 643 }
Chris@42 644 }
/* Stores outputs 7 (W[12], W[13]) and 15 (W[28], W[29]). */
Chris@42 645 {
Chris@42 646 E T3a, T3g, T3e, T3i;
Chris@42 647 {
Chris@42 648 E T38, T39, T3c, T3d;
Chris@42 649 T38 = T2K + T2L;
Chris@42 650 T39 = T2Z + T30;
Chris@42 651 T3a = T38 - T39;
Chris@42 652 T3g = T38 + T39;
Chris@42 653 T3c = T2W - T2X;
Chris@42 654 T3d = T2P - T2S;
Chris@42 655 T3e = T3c + T3d;
Chris@42 656 T3i = T3c - T3d;
Chris@42 657 }
Chris@42 658 {
Chris@42 659 E T37, T3b, T3f, T3h;
Chris@42 660 T37 = W[12];
Chris@42 661 T3b = W[13];
Chris@42 662 cr[WS(rs, 7)] = FNMS(T3b, T3e, T37 * T3a);
Chris@42 663 ci[WS(rs, 7)] = FMA(T37, T3e, T3b * T3a);
Chris@42 664 T3f = W[28];
Chris@42 665 T3h = W[29];
Chris@42 666 cr[WS(rs, 15)] = FNMS(T3h, T3i, T3f * T3g);
Chris@42 667 ci[WS(rs, 15)] = FMA(T3f, T3i, T3h * T3g);
Chris@42 668 }
Chris@42 669 }
/* Stores outputs 10 (W[18], W[19]) and 2 (W[2], W[3]). */
Chris@42 670 {
Chris@42 671 E TY, T1e, T1c, T1g;
Chris@42 672 {
Chris@42 673 E TE, TX, T18, T1b;
Chris@42 674 TE = Tw + TD;
Chris@42 675 TX = KP707106781 * (TN + TW);
Chris@42 676 TY = TE - TX;
Chris@42 677 T1e = TE + TX;
Chris@42 678 T18 = T10 + T17;
Chris@42 679 T1b = KP707106781 * (T19 + T1a);
Chris@42 680 T1c = T18 - T1b;
Chris@42 681 T1g = T18 + T1b;
Chris@42 682 }
Chris@42 683 {
Chris@42 684 E Tv, TZ, T1d, T1f;
Chris@42 685 Tv = W[18];
Chris@42 686 TZ = W[19];
Chris@42 687 cr[WS(rs, 10)] = FNMS(TZ, T1c, Tv * TY);
Chris@42 688 ci[WS(rs, 10)] = FMA(TZ, TY, Tv * T1c);
Chris@42 689 T1d = W[2];
Chris@42 690 T1f = W[3];
Chris@42 691 cr[WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
Chris@42 692 ci[WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
Chris@42 693 }
Chris@42 694 }
/* Stores outputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@42 695 {
Chris@42 696 E T1k, T1q, T1o, T1s;
Chris@42 697 {
Chris@42 698 E T1i, T1j, T1m, T1n;
Chris@42 699 T1i = Tw - TD;
Chris@42 700 T1j = KP707106781 * (T1a - T19);
Chris@42 701 T1k = T1i - T1j;
Chris@42 702 T1q = T1i + T1j;
Chris@42 703 T1m = T17 - T10;
Chris@42 704 T1n = KP707106781 * (TN - TW);
Chris@42 705 T1o = T1m - T1n;
Chris@42 706 T1s = T1m + T1n;
Chris@42 707 }
Chris@42 708 {
Chris@42 709 E T1h, T1l, T1p, T1r;
Chris@42 710 T1h = W[26];
Chris@42 711 T1l = W[27];
Chris@42 712 cr[WS(rs, 14)] = FNMS(T1l, T1o, T1h * T1k);
Chris@42 713 ci[WS(rs, 14)] = FMA(T1h, T1o, T1l * T1k);
Chris@42 714 T1p = W[10];
Chris@42 715 T1r = W[11];
Chris@42 716 cr[WS(rs, 6)] = FNMS(T1r, T1s, T1p * T1q);
Chris@42 717 ci[WS(rs, 6)] = FMA(T1p, T1s, T1r * T1q);
Chris@42 718 }
Chris@42 719 }
/* Stores outputs 13 (W[24], W[25]) and 5 (W[8], W[9]). */
Chris@42 720 {
Chris@42 721 E T2g, T2u, T2s, T2w;
Chris@42 722 {
Chris@42 723 E T20, T2f, T2o, T2r;
Chris@42 724 T20 = T1S - T1Z;
Chris@42 725 T2f = T27 - T2e;
Chris@42 726 T2g = T20 - T2f;
Chris@42 727 T2u = T20 + T2f;
Chris@42 728 T2o = T2k - T2n;
Chris@42 729 T2r = T2p - T2q;
Chris@42 730 T2s = T2o - T2r;
Chris@42 731 T2w = T2o + T2r;
Chris@42 732 }
Chris@42 733 {
Chris@42 734 E T1P, T2h, T2t, T2v;
Chris@42 735 T1P = W[24];
Chris@42 736 T2h = W[25];
Chris@42 737 cr[WS(rs, 13)] = FNMS(T2h, T2s, T1P * T2g);
Chris@42 738 ci[WS(rs, 13)] = FMA(T2h, T2g, T1P * T2s);
Chris@42 739 T2t = W[8];
Chris@42 740 T2v = W[9];
Chris@42 741 cr[WS(rs, 5)] = FNMS(T2v, T2w, T2t * T2u);
Chris@42 742 ci[WS(rs, 5)] = FMA(T2v, T2u, T2t * T2w);
Chris@42 743 }
Chris@42 744 }
/* Stores outputs 9 (W[16], W[17]) and 1 (W[0], W[1]). */
Chris@42 745 {
Chris@42 746 E T2A, T2G, T2E, T2I;
Chris@42 747 {
Chris@42 748 E T2y, T2z, T2C, T2D;
Chris@42 749 T2y = T1S + T1Z;
Chris@42 750 T2z = T2p + T2q;
Chris@42 751 T2A = T2y - T2z;
Chris@42 752 T2G = T2y + T2z;
Chris@42 753 T2C = T2k + T2n;
Chris@42 754 T2D = T2e + T27;
Chris@42 755 T2E = T2C - T2D;
Chris@42 756 T2I = T2C + T2D;
Chris@42 757 }
Chris@42 758 {
Chris@42 759 E T2x, T2B, T2F, T2H;
Chris@42 760 T2x = W[16];
Chris@42 761 T2B = W[17];
Chris@42 762 cr[WS(rs, 9)] = FNMS(T2B, T2E, T2x * T2A);
Chris@42 763 ci[WS(rs, 9)] = FMA(T2x, T2E, T2B * T2A);
Chris@42 764 T2F = W[0];
Chris@42 765 T2H = W[1];
Chris@42 766 cr[WS(rs, 1)] = FNMS(T2H, T2I, T2F * T2G);
Chris@42 767 ci[WS(rs, 1)] = FMA(T2F, T2I, T2H * T2G);
Chris@42 768 }
Chris@42 769 }
/* Stores outputs 12 (W[22], W[23]) and 4 (W[6], W[7]). */
Chris@42 770 {
Chris@42 771 E T1G, T1M, T1K, T1O;
Chris@42 772 {
Chris@42 773 E T1E, T1F, T1I, T1J;
Chris@42 774 T1E = T7 - Te;
Chris@42 775 T1F = T1A - T1z;
Chris@42 776 T1G = T1E - T1F;
Chris@42 777 T1M = T1E + T1F;
Chris@42 778 T1I = T1w - T1x;
Chris@42 779 T1J = Tm - Tt;
Chris@42 780 T1K = T1I - T1J;
Chris@42 781 T1O = T1J + T1I;
Chris@42 782 }
Chris@42 783 {
Chris@42 784 E T1D, T1H, T1L, T1N;
Chris@42 785 T1D = W[22];
Chris@42 786 T1H = W[23];
Chris@42 787 cr[WS(rs, 12)] = FNMS(T1H, T1K, T1D * T1G);
Chris@42 788 ci[WS(rs, 12)] = FMA(T1D, T1K, T1H * T1G);
Chris@42 789 T1L = W[6];
Chris@42 790 T1N = W[7];
Chris@42 791 cr[WS(rs, 4)] = FNMS(T1N, T1O, T1L * T1M);
Chris@42 792 ci[WS(rs, 4)] = FMA(T1L, T1O, T1N * T1M);
Chris@42 793 }
Chris@42 794 }
Chris@42 795 }
Chris@42 796 }
Chris@42 797 }
Chris@42 798
/*
 * Twiddle-instruction table referenced by the descriptor that follows.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; presumably
 * {TW_FULL, 1, 16} requests the full radix-16 twiddle set (hb_16 consumes 30
 * W entries per iteration) and TW_NEXT terminates the list -- confirm.
 */
Chris@42 799 static const tw_instr twinstr[] = {
Chris@42 800 {TW_FULL, 1, 16},
Chris@42 801 {TW_NEXT, 1, 0}
Chris@42 802 };
Chris@42 803
/*
 * Codelet descriptor for the non-FMA variant: size 16, name "hb_16", the
 * twinstr table, &GENUS, and the record {136, 46, 38, 0}, whose first three
 * values equal the addition / multiplication / fused multiply-add counts
 * quoted in the generated header comment for this variant.
 */
Chris@42 804 static const hc2hc_desc desc = { 16, "hb_16", twinstr, &GENUS, {136, 46, 38, 0} };
Chris@42 805
/*
 * Registration entry point (non-FMA build): passes hb_16 and its descriptor
 * to X(khc2hc_register) for the given planner p.
 */
Chris@42 806 void X(codelet_hb_16) (planner *p) {
Chris@42 807 X(khc2hc_register) (p, hb_16, &desc);
Chris@42 808 }
Chris@42 809 #endif /* HAVE_FMA */