annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:30 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@82 32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
Chris@82 33 * 60 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf_16 (FMA-oriented variant): machine-generated 16-point codelet; the
 * generator command in the comment above lists "-fma -n 16 -dit".
 * This variant is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * cr, ci : arrays that are read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..15.
 * W      : coefficient table (presumably twiddle factors); 30 values,
 *          W[0]..W[29], are read per iteration. The pointer is first
 *          offset by (mb - 1) * 30.
 * rs     : stride argument handed to WS().
 * mb, me : the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms and ci by -ms.
 *
 * Within one iteration all 32 loads happen before the first store, so the
 * in-place update does not read a value it has already overwritten.
 *
 * NOTE(review): E, R, INT, stride, WS, DK, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The comments below
 * assume FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- confirm against the project headers.
 */
Chris@82 37 static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants: 0.9238... equals cos(pi/8), 0.4142... equals tan(pi/8) (= sqrt(2) - 1), 0.7071... equals sqrt(1/2). */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 {
Chris@82 43 INT m;
/* One pass of the body per m; cr walks forward, ci walks backward, W advances by 30 coefficients. */
Chris@82 44 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that outlive the load stages below and feed the output stages. */
Chris@82 45 E T8, T3A, T1I, T3o, T1s, T35, T2k, T2w, T1F, T36, T2p, T2r, Tl, T3z, T1N;
Chris@82 46 E T3k, Tz, T2W, T1P, T1U, T11, T30, T25, T2g, T1e, T31, T2a, T2h, TM, T2V;
Chris@82 47 E T1W, T21;
/*
 * Load stages. For each input index k = 1..15 the pair (cr_k, ci_k) is
 * combined with (W[2k-2], W[2k-1]) as
 *   W[2k-2]*cr_k + W[2k-1]*ci_k   and   W[2k-2]*ci_k - W[2k-1]*cr_k
 * (under the FMA/FNMS assumption stated in the header). Index 0 is used
 * without a coefficient. Each stage then stores sums and differences of
 * two such results.
 */
/* Inputs 0 and 8 (coefficients W[14], W[15]). */
Chris@82 48 {
Chris@82 49 E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
Chris@82 50 T1 = cr[0];
Chris@82 51 T3n = ci[0];
Chris@82 52 T3 = cr[WS(rs, 8)];
Chris@82 53 T6 = ci[WS(rs, 8)];
Chris@82 54 T2 = W[14];
Chris@82 55 T4 = T2 * T3;
Chris@82 56 T3l = T2 * T6;
Chris@82 57 T5 = W[15];
Chris@82 58 T7 = FMA(T5, T6, T4);
Chris@82 59 T3m = FNMS(T5, T3, T3l);
Chris@82 60 T8 = T1 + T7;
Chris@82 61 T3A = T3n - T3m;
Chris@82 62 T1I = T1 - T7;
Chris@82 63 T3o = T3m + T3n;
Chris@82 64 }
/* Inputs 15 (W[28], W[29]) and 7 (W[12], W[13]). */
Chris@82 65 {
Chris@82 66 E T1h, T1k, T1i, T2s, T1n, T1q, T1o, T2u, T1g, T1m;
Chris@82 67 T1h = cr[WS(rs, 15)];
Chris@82 68 T1k = ci[WS(rs, 15)];
Chris@82 69 T1g = W[28];
Chris@82 70 T1i = T1g * T1h;
Chris@82 71 T2s = T1g * T1k;
Chris@82 72 T1n = cr[WS(rs, 7)];
Chris@82 73 T1q = ci[WS(rs, 7)];
Chris@82 74 T1m = W[12];
Chris@82 75 T1o = T1m * T1n;
Chris@82 76 T2u = T1m * T1q;
Chris@82 77 {
Chris@82 78 E T1l, T2t, T1r, T2v, T1j, T1p;
Chris@82 79 T1j = W[29];
Chris@82 80 T1l = FMA(T1j, T1k, T1i);
Chris@82 81 T2t = FNMS(T1j, T1h, T2s);
Chris@82 82 T1p = W[13];
Chris@82 83 T1r = FMA(T1p, T1q, T1o);
Chris@82 84 T2v = FNMS(T1p, T1n, T2u);
Chris@82 85 T1s = T1l + T1r;
Chris@82 86 T35 = T2t + T2v;
Chris@82 87 T2k = T1l - T1r;
Chris@82 88 T2w = T2t - T2v;
Chris@82 89 }
Chris@82 90 }
/* Inputs 3 (W[4], W[5]) and 11 (W[20], W[21]). */
Chris@82 91 {
Chris@82 92 E T1u, T1x, T1v, T2l, T1A, T1D, T1B, T2n, T1t, T1z;
Chris@82 93 T1u = cr[WS(rs, 3)];
Chris@82 94 T1x = ci[WS(rs, 3)];
Chris@82 95 T1t = W[4];
Chris@82 96 T1v = T1t * T1u;
Chris@82 97 T2l = T1t * T1x;
Chris@82 98 T1A = cr[WS(rs, 11)];
Chris@82 99 T1D = ci[WS(rs, 11)];
Chris@82 100 T1z = W[20];
Chris@82 101 T1B = T1z * T1A;
Chris@82 102 T2n = T1z * T1D;
Chris@82 103 {
Chris@82 104 E T1y, T2m, T1E, T2o, T1w, T1C;
Chris@82 105 T1w = W[5];
Chris@82 106 T1y = FMA(T1w, T1x, T1v);
Chris@82 107 T2m = FNMS(T1w, T1u, T2l);
Chris@82 108 T1C = W[21];
Chris@82 109 T1E = FMA(T1C, T1D, T1B);
Chris@82 110 T2o = FNMS(T1C, T1A, T2n);
Chris@82 111 T1F = T1y + T1E;
Chris@82 112 T36 = T2m + T2o;
Chris@82 113 T2p = T2m - T2o;
Chris@82 114 T2r = T1E - T1y;
Chris@82 115 }
Chris@82 116 }
/* Inputs 4 (W[6], W[7]) and 12 (W[22], W[23]). */
Chris@82 117 {
Chris@82 118 E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Chris@82 119 Ta = cr[WS(rs, 4)];
Chris@82 120 Td = ci[WS(rs, 4)];
Chris@82 121 T9 = W[6];
Chris@82 122 Tb = T9 * Ta;
Chris@82 123 T1J = T9 * Td;
Chris@82 124 Tg = cr[WS(rs, 12)];
Chris@82 125 Tj = ci[WS(rs, 12)];
Chris@82 126 Tf = W[22];
Chris@82 127 Th = Tf * Tg;
Chris@82 128 T1L = Tf * Tj;
Chris@82 129 {
Chris@82 130 E Te, T1K, Tk, T1M, Tc, Ti;
Chris@82 131 Tc = W[7];
Chris@82 132 Te = FMA(Tc, Td, Tb);
Chris@82 133 T1K = FNMS(Tc, Ta, T1J);
Chris@82 134 Ti = W[23];
Chris@82 135 Tk = FMA(Ti, Tj, Th);
Chris@82 136 T1M = FNMS(Ti, Tg, T1L);
Chris@82 137 Tl = Te + Tk;
Chris@82 138 T3z = Te - Tk;
Chris@82 139 T1N = T1K - T1M;
Chris@82 140 T3k = T1K + T1M;
Chris@82 141 }
Chris@82 142 }
/* Inputs 2 (W[2], W[3]) and 10 (W[18], W[19]). */
Chris@82 143 {
Chris@82 144 E To, Tr, Tp, T1Q, Tu, Tx, Tv, T1S, Tn, Tt;
Chris@82 145 To = cr[WS(rs, 2)];
Chris@82 146 Tr = ci[WS(rs, 2)];
Chris@82 147 Tn = W[2];
Chris@82 148 Tp = Tn * To;
Chris@82 149 T1Q = Tn * Tr;
Chris@82 150 Tu = cr[WS(rs, 10)];
Chris@82 151 Tx = ci[WS(rs, 10)];
Chris@82 152 Tt = W[18];
Chris@82 153 Tv = Tt * Tu;
Chris@82 154 T1S = Tt * Tx;
Chris@82 155 {
Chris@82 156 E Ts, T1R, Ty, T1T, Tq, Tw;
Chris@82 157 Tq = W[3];
Chris@82 158 Ts = FMA(Tq, Tr, Tp);
Chris@82 159 T1R = FNMS(Tq, To, T1Q);
Chris@82 160 Tw = W[19];
Chris@82 161 Ty = FMA(Tw, Tx, Tv);
Chris@82 162 T1T = FNMS(Tw, Tu, T1S);
Chris@82 163 Tz = Ts + Ty;
Chris@82 164 T2W = T1R + T1T;
Chris@82 165 T1P = Ts - Ty;
Chris@82 166 T1U = T1R - T1T;
Chris@82 167 }
Chris@82 168 }
/* Inputs 1 (W[0], W[1]) and 9 (W[16], W[17]). */
Chris@82 169 {
Chris@82 170 E TQ, TT, TR, T2c, TW, TZ, TX, T2e, TP, TV;
Chris@82 171 TQ = cr[WS(rs, 1)];
Chris@82 172 TT = ci[WS(rs, 1)];
Chris@82 173 TP = W[0];
Chris@82 174 TR = TP * TQ;
Chris@82 175 T2c = TP * TT;
Chris@82 176 TW = cr[WS(rs, 9)];
Chris@82 177 TZ = ci[WS(rs, 9)];
Chris@82 178 TV = W[16];
Chris@82 179 TX = TV * TW;
Chris@82 180 T2e = TV * TZ;
Chris@82 181 {
Chris@82 182 E TU, T2d, T10, T2f, TS, TY;
Chris@82 183 TS = W[1];
Chris@82 184 TU = FMA(TS, TT, TR);
Chris@82 185 T2d = FNMS(TS, TQ, T2c);
Chris@82 186 TY = W[17];
Chris@82 187 T10 = FMA(TY, TZ, TX);
Chris@82 188 T2f = FNMS(TY, TW, T2e);
Chris@82 189 T11 = TU + T10;
Chris@82 190 T30 = T2d + T2f;
Chris@82 191 T25 = TU - T10;
Chris@82 192 T2g = T2d - T2f;
Chris@82 193 }
Chris@82 194 }
/* Inputs 5 (W[8], W[9]) and 13 (W[24], W[25]). */
Chris@82 195 {
Chris@82 196 E T13, T16, T14, T26, T19, T1c, T1a, T28, T12, T18;
Chris@82 197 T13 = cr[WS(rs, 5)];
Chris@82 198 T16 = ci[WS(rs, 5)];
Chris@82 199 T12 = W[8];
Chris@82 200 T14 = T12 * T13;
Chris@82 201 T26 = T12 * T16;
Chris@82 202 T19 = cr[WS(rs, 13)];
Chris@82 203 T1c = ci[WS(rs, 13)];
Chris@82 204 T18 = W[24];
Chris@82 205 T1a = T18 * T19;
Chris@82 206 T28 = T18 * T1c;
Chris@82 207 {
Chris@82 208 E T17, T27, T1d, T29, T15, T1b;
Chris@82 209 T15 = W[9];
Chris@82 210 T17 = FMA(T15, T16, T14);
Chris@82 211 T27 = FNMS(T15, T13, T26);
Chris@82 212 T1b = W[25];
Chris@82 213 T1d = FMA(T1b, T1c, T1a);
Chris@82 214 T29 = FNMS(T1b, T19, T28);
Chris@82 215 T1e = T17 + T1d;
Chris@82 216 T31 = T27 + T29;
Chris@82 217 T2a = T27 - T29;
Chris@82 218 T2h = T17 - T1d;
Chris@82 219 }
Chris@82 220 }
/* Inputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@82 221 {
Chris@82 222 E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
Chris@82 223 TB = cr[WS(rs, 14)];
Chris@82 224 TE = ci[WS(rs, 14)];
Chris@82 225 TA = W[26];
Chris@82 226 TC = TA * TB;
Chris@82 227 T1X = TA * TE;
Chris@82 228 TH = cr[WS(rs, 6)];
Chris@82 229 TK = ci[WS(rs, 6)];
Chris@82 230 TG = W[10];
Chris@82 231 TI = TG * TH;
Chris@82 232 T1Z = TG * TK;
Chris@82 233 {
Chris@82 234 E TF, T1Y, TL, T20, TD, TJ;
Chris@82 235 TD = W[27];
Chris@82 236 TF = FMA(TD, TE, TC);
Chris@82 237 T1Y = FNMS(TD, TB, T1X);
Chris@82 238 TJ = W[11];
Chris@82 239 TL = FMA(TJ, TK, TI);
Chris@82 240 T20 = FNMS(TJ, TH, T1Z);
Chris@82 241 TM = TF + TL;
Chris@82 242 T2V = T1Y + T20;
Chris@82 243 T1W = TF - TL;
Chris@82 244 T21 = T1Y - T20;
Chris@82 245 }
Chris@82 246 }
/* Output stage 1: cr[0], cr[4], cr[8], cr[12] and ci[3], ci[7], ci[11], ci[15]; built from additions and subtractions only. */
Chris@82 247 {
Chris@82 248 E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
Chris@82 249 {
Chris@82 250 E Tm, TN, T3j, T3p;
Chris@82 251 Tm = T8 + Tl;
Chris@82 252 TN = Tz + TM;
Chris@82 253 TO = Tm + TN;
Chris@82 254 T3e = Tm - TN;
Chris@82 255 T3j = T2W + T2V;
Chris@82 256 T3p = T3k + T3o;
Chris@82 257 T3q = T3j + T3p;
Chris@82 258 T3s = T3p - T3j;
Chris@82 259 }
Chris@82 260 {
Chris@82 261 E T1f, T1G, T3f, T3g;
Chris@82 262 T1f = T11 + T1e;
Chris@82 263 T1G = T1s + T1F;
Chris@82 264 T1H = T1f + T1G;
Chris@82 265 T3r = T1G - T1f;
Chris@82 266 T3f = T35 + T36;
Chris@82 267 T3g = T30 + T31;
Chris@82 268 T3h = T3f - T3g;
Chris@82 269 T3i = T3g + T3f;
Chris@82 270 }
Chris@82 271 ci[WS(rs, 7)] = TO - T1H;
Chris@82 272 cr[WS(rs, 12)] = T3r - T3s;
Chris@82 273 ci[WS(rs, 11)] = T3r + T3s;
Chris@82 274 cr[0] = TO + T1H;
Chris@82 275 cr[WS(rs, 4)] = T3e - T3h;
Chris@82 276 cr[WS(rs, 8)] = T3i - T3q;
Chris@82 277 ci[WS(rs, 15)] = T3i + T3q;
Chris@82 278 ci[WS(rs, 3)] = T3e + T3h;
Chris@82 279 }
/* Output stage 2: cr[2], cr[6], cr[10], cr[14] and ci[1], ci[5], ci[9], ci[13]; each is a base term plus or minus KP707106781 times a combined term. */
Chris@82 280 {
Chris@82 281 E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
Chris@82 282 {
Chris@82 283 E T2U, T2X, T3t, T3u;
Chris@82 284 T2U = T8 - Tl;
Chris@82 285 T2X = T2V - T2W;
Chris@82 286 T2Y = T2U - T2X;
Chris@82 287 T3a = T2U + T2X;
Chris@82 288 T3t = Tz - TM;
Chris@82 289 T3u = T3o - T3k;
Chris@82 290 T3v = T3t + T3u;
Chris@82 291 T3x = T3u - T3t;
Chris@82 292 }
Chris@82 293 {
Chris@82 294 E T2Z, T32, T34, T37;
Chris@82 295 T2Z = T11 - T1e;
Chris@82 296 T32 = T30 - T31;
Chris@82 297 T33 = T2Z + T32;
Chris@82 298 T3b = T2Z - T32;
Chris@82 299 T34 = T1s - T1F;
Chris@82 300 T37 = T35 - T36;
Chris@82 301 T38 = T34 - T37;
Chris@82 302 T3c = T34 + T37;
Chris@82 303 }
Chris@82 304 {
Chris@82 305 E T39, T3y, T3d, T3w;
Chris@82 306 T39 = T33 + T38;
Chris@82 307 ci[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
Chris@82 308 cr[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
Chris@82 309 T3y = T3c - T3b;
Chris@82 310 cr[WS(rs, 10)] = FMS(KP707106781, T3y, T3x);
Chris@82 311 ci[WS(rs, 13)] = FMA(KP707106781, T3y, T3x);
Chris@82 312 T3d = T3b + T3c;
Chris@82 313 cr[WS(rs, 6)] = FNMS(KP707106781, T3d, T3a);
Chris@82 314 ci[WS(rs, 1)] = FMA(KP707106781, T3d, T3a);
Chris@82 315 T3w = T38 - T33;
Chris@82 316 cr[WS(rs, 14)] = FMS(KP707106781, T3w, T3v);
Chris@82 317 ci[WS(rs, 9)] = FMA(KP707106781, T3w, T3v);
Chris@82 318 }
Chris@82 319 }
/* Output stage 3: the odd-indexed cr[] slots (1, 3, ..., 15) and even-indexed ci[] slots (0, 2, ..., 14); uses all three constants. */
Chris@82 320 {
Chris@82 321 E T1O, T3B, T3H, T2E, T23, T3I, T2O, T2R, T2H, T3C, T2j, T2B, T2L, T2S, T2y;
Chris@82 322 E T2C;
Chris@82 323 {
Chris@82 324 E T1V, T22, T2b, T2i;
Chris@82 325 T1O = T1I - T1N;
Chris@82 326 T3B = T3z + T3A;
Chris@82 327 T3H = T3A - T3z;
Chris@82 328 T2E = T1I + T1N;
Chris@82 329 T1V = T1P - T1U;
Chris@82 330 T22 = T1W + T21;
Chris@82 331 T23 = T1V + T22;
Chris@82 332 T3I = T22 - T1V;
Chris@82 333 {
Chris@82 334 E T2M, T2N, T2F, T2G;
Chris@82 335 T2M = T2k + T2p;
Chris@82 336 T2N = T2w + T2r;
Chris@82 337 T2O = FNMS(KP414213562, T2N, T2M);
Chris@82 338 T2R = FMA(KP414213562, T2M, T2N);
Chris@82 339 T2F = T1P + T1U;
Chris@82 340 T2G = T1W - T21;
Chris@82 341 T2H = T2F + T2G;
Chris@82 342 T3C = T2F - T2G;
Chris@82 343 }
Chris@82 344 T2b = T25 - T2a;
Chris@82 345 T2i = T2g + T2h;
Chris@82 346 T2j = FNMS(KP414213562, T2i, T2b);
Chris@82 347 T2B = FMA(KP414213562, T2b, T2i);
Chris@82 348 {
Chris@82 349 E T2J, T2K, T2q, T2x;
Chris@82 350 T2J = T25 + T2a;
Chris@82 351 T2K = T2g - T2h;
Chris@82 352 T2L = FMA(KP414213562, T2K, T2J);
Chris@82 353 T2S = FNMS(KP414213562, T2J, T2K);
Chris@82 354 T2q = T2k - T2p;
Chris@82 355 T2x = T2r - T2w;
Chris@82 356 T2y = FNMS(KP414213562, T2x, T2q);
Chris@82 357 T2C = FMA(KP414213562, T2q, T2x);
Chris@82 358 }
Chris@82 359 }
/* Stores cr[7], ci[0], cr[9], ci[14]. */
Chris@82 360 {
Chris@82 361 E T24, T2z, T3J, T3K;
Chris@82 362 T24 = FMA(KP707106781, T23, T1O);
Chris@82 363 T2z = T2j + T2y;
Chris@82 364 cr[WS(rs, 7)] = FNMS(KP923879532, T2z, T24);
Chris@82 365 ci[0] = FMA(KP923879532, T2z, T24);
Chris@82 366 T3J = FMA(KP707106781, T3I, T3H);
Chris@82 367 T3K = T2S + T2R;
Chris@82 368 cr[WS(rs, 9)] = FMS(KP923879532, T3K, T3J);
Chris@82 369 ci[WS(rs, 14)] = FMA(KP923879532, T3K, T3J);
Chris@82 370 }
/* Stores cr[13], ci[10], ci[4], cr[3]. */
Chris@82 371 {
Chris@82 372 E T3L, T3M, T2A, T2D;
Chris@82 373 T3L = FNMS(KP707106781, T3I, T3H);
Chris@82 374 T3M = T2O - T2L;
Chris@82 375 cr[WS(rs, 13)] = FMS(KP923879532, T3M, T3L);
Chris@82 376 ci[WS(rs, 10)] = FMA(KP923879532, T3M, T3L);
Chris@82 377 T2A = FNMS(KP707106781, T23, T1O);
Chris@82 378 T2D = T2B + T2C;
Chris@82 379 ci[WS(rs, 4)] = FNMS(KP923879532, T2D, T2A);
Chris@82 380 cr[WS(rs, 3)] = FMA(KP923879532, T2D, T2A);
Chris@82 381 }
/* Stores ci[6], cr[1], cr[15], ci[8]. */
Chris@82 382 {
Chris@82 383 E T2I, T2P, T3D, T3E;
Chris@82 384 T2I = FMA(KP707106781, T2H, T2E);
Chris@82 385 T2P = T2L + T2O;
Chris@82 386 ci[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
Chris@82 387 cr[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
Chris@82 388 T3D = FMA(KP707106781, T3C, T3B);
Chris@82 389 T3E = T2C - T2B;
Chris@82 390 cr[WS(rs, 15)] = FMS(KP923879532, T3E, T3D);
Chris@82 391 ci[WS(rs, 8)] = FMA(KP923879532, T3E, T3D);
Chris@82 392 }
/* Stores cr[11], ci[12], cr[5], ci[2]. */
Chris@82 393 {
Chris@82 394 E T3F, T3G, T2Q, T2T;
Chris@82 395 T3F = FNMS(KP707106781, T3C, T3B);
Chris@82 396 T3G = T2y - T2j;
Chris@82 397 cr[WS(rs, 11)] = FMS(KP923879532, T3G, T3F);
Chris@82 398 ci[WS(rs, 12)] = FMA(KP923879532, T3G, T3F);
Chris@82 399 T2Q = FNMS(KP707106781, T2H, T2E);
Chris@82 400 T2T = T2R - T2S;
Chris@82 401 cr[WS(rs, 5)] = FNMS(KP923879532, T2T, T2Q);
Chris@82 402 ci[WS(rs, 2)] = FMA(KP923879532, T2T, T2Q);
Chris@82 403 }
Chris@82 404 }
Chris@82 405 }
Chris@82 406 }
Chris@82 407 }
Chris@82 408
/*
 * Two-entry tw_instr table referenced by the descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 16} requests the complete
 * coefficient set for size 16 and {TW_NEXT, 1, 0} ends the list -- confirm
 * against the tw_instr definition, which is not visible in this file.
 */
Chris@82 409 static const tw_instr twinstr[] = {
Chris@82 410 {TW_FULL, 1, 16},
Chris@82 411 {TW_NEXT, 1, 0}
Chris@82 412 };
Chris@82 413
/*
 * Descriptor passed to the registration call: the value 16, the name
 * "hf_16", the twinstr table, a pointer to GENUS, and the tuple
 * {104, 30, 70, 0}. The first three tuple values equal the addition /
 * multiplication / fused multiply-add counts quoted in the generator
 * comment for this variant. NOTE(review): field meanings are inferred from
 * position; hc2hc_desc is declared elsewhere -- confirm.
 */
Chris@82 414 static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, {104, 30, 70, 0} };
Chris@82 415
/*
 * Registration entry point: hands hf_16 and its descriptor to planner p
 * through X(khc2hc_register). X() is a project macro defined elsewhere
 * (presumably a symbol-name prefix -- confirm).
 */
Chris@82 416 void X(codelet_hf_16) (planner *p) {
Chris@82 417 X(khc2hc_register) (p, hf_16, &desc);
Chris@82 418 }
Chris@82 419 #else
Chris@82 420
Chris@82 421 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
Chris@82 422
Chris@82 423 /*
Chris@82 424 * This function contains 174 FP additions, 84 FP multiplications,
Chris@82 425 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
Chris@82 426 * 52 stack variables, 3 constants, and 64 memory accesses
Chris@82 427 */
Chris@82 428 #include "rdft/scalar/hf.h"
Chris@82 429
/*
 * hf_16 (non-FMA variant): machine-generated 16-point codelet; the
 * generator command in the comment above lists "-n 16 -dit" without
 * "-fma". This variant is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * cr, ci : arrays that are read and then overwritten in place; element k is
 *          addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..15.
 * W      : coefficient table (presumably twiddle factors); 30 values,
 *          W[0]..W[29], are read per iteration. The pointer is first
 *          offset by (mb - 1) * 30.
 * rs     : stride argument handed to WS().
 * mb, me : the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms     : per-iteration step; cr moves by +ms and ci by -ms.
 *
 * Within one iteration all 32 loads happen before the first store, so the
 * in-place update does not read a value it has already overwritten.
 *
 * NOTE(review): E, R, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file. The comments below
 * assume FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm against
 * the project headers.
 */
Chris@82 430 static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 431 {
/* Numeric constants: 0.3826... equals sin(pi/8), 0.9238... equals cos(pi/8), 0.7071... equals sqrt(1/2). */
Chris@82 432 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 433 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 434 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 435 {
Chris@82 436 INT m;
/* One pass of the body per m; cr walks forward, ci walks backward, W advances by 30 coefficients. */
Chris@82 437 for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that outlive the load stages below and feed the output stages. */
Chris@82 438 E T7, T38, T1t, T2U, Ti, T37, T1w, T2R, Tu, T2t, T1C, T2c, TF, T2s, T1H;
Chris@82 439 E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2k, T24, T2j, TS, T13, T2w, T2x;
Chris@82 440 E T2y, T2z, T1O, T2h, T1T, T2g;
/*
 * Load stages. For each input index k = 1..15 the pair (cr_k, ci_k) is
 * combined with (W[2k-2], W[2k-1]) as
 *   W[2k-2]*cr_k + W[2k-1]*ci_k   and   W[2k-2]*ci_k - W[2k-1]*cr_k
 * (under the FMA/FNMS assumption stated in the header). Index 0 is used
 * without a coefficient.
 */
/* Inputs 0 and 8 (coefficients W[14], W[15]). */
Chris@82 441 {
Chris@82 442 E T1, T2T, T6, T2S;
Chris@82 443 T1 = cr[0];
Chris@82 444 T2T = ci[0];
Chris@82 445 {
Chris@82 446 E T3, T5, T2, T4;
Chris@82 447 T3 = cr[WS(rs, 8)];
Chris@82 448 T5 = ci[WS(rs, 8)];
Chris@82 449 T2 = W[14];
Chris@82 450 T4 = W[15];
Chris@82 451 T6 = FMA(T2, T3, T4 * T5);
Chris@82 452 T2S = FNMS(T4, T3, T2 * T5);
Chris@82 453 }
Chris@82 454 T7 = T1 + T6;
Chris@82 455 T38 = T2T - T2S;
Chris@82 456 T1t = T1 - T6;
Chris@82 457 T2U = T2S + T2T;
Chris@82 458 }
/* Inputs 4 (W[6], W[7]) and 12 (W[22], W[23]). */
Chris@82 459 {
Chris@82 460 E Tc, T1u, Th, T1v;
Chris@82 461 {
Chris@82 462 E T9, Tb, T8, Ta;
Chris@82 463 T9 = cr[WS(rs, 4)];
Chris@82 464 Tb = ci[WS(rs, 4)];
Chris@82 465 T8 = W[6];
Chris@82 466 Ta = W[7];
Chris@82 467 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 468 T1u = FNMS(Ta, T9, T8 * Tb);
Chris@82 469 }
Chris@82 470 {
Chris@82 471 E Te, Tg, Td, Tf;
Chris@82 472 Te = cr[WS(rs, 12)];
Chris@82 473 Tg = ci[WS(rs, 12)];
Chris@82 474 Td = W[22];
Chris@82 475 Tf = W[23];
Chris@82 476 Th = FMA(Td, Te, Tf * Tg);
Chris@82 477 T1v = FNMS(Tf, Te, Td * Tg);
Chris@82 478 }
Chris@82 479 Ti = Tc + Th;
Chris@82 480 T37 = Tc - Th;
Chris@82 481 T1w = T1u - T1v;
Chris@82 482 T2R = T1u + T1v;
Chris@82 483 }
/* Inputs 2 (W[2], W[3]) and 10 (W[18], W[19]). */
Chris@82 484 {
Chris@82 485 E To, T1z, Tt, T1A, T1y, T1B;
Chris@82 486 {
Chris@82 487 E Tl, Tn, Tk, Tm;
Chris@82 488 Tl = cr[WS(rs, 2)];
Chris@82 489 Tn = ci[WS(rs, 2)];
Chris@82 490 Tk = W[2];
Chris@82 491 Tm = W[3];
Chris@82 492 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 493 T1z = FNMS(Tm, Tl, Tk * Tn);
Chris@82 494 }
Chris@82 495 {
Chris@82 496 E Tq, Ts, Tp, Tr;
Chris@82 497 Tq = cr[WS(rs, 10)];
Chris@82 498 Ts = ci[WS(rs, 10)];
Chris@82 499 Tp = W[18];
Chris@82 500 Tr = W[19];
Chris@82 501 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 502 T1A = FNMS(Tr, Tq, Tp * Ts);
Chris@82 503 }
Chris@82 504 Tu = To + Tt;
Chris@82 505 T2t = T1z + T1A;
Chris@82 506 T1y = To - Tt;
Chris@82 507 T1B = T1z - T1A;
Chris@82 508 T1C = T1y - T1B;
Chris@82 509 T2c = T1y + T1B;
Chris@82 510 }
/* Inputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@82 511 {
Chris@82 512 E Tz, T1E, TE, T1F, T1D, T1G;
Chris@82 513 {
Chris@82 514 E Tw, Ty, Tv, Tx;
Chris@82 515 Tw = cr[WS(rs, 14)];
Chris@82 516 Ty = ci[WS(rs, 14)];
Chris@82 517 Tv = W[26];
Chris@82 518 Tx = W[27];
Chris@82 519 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 520 T1E = FNMS(Tx, Tw, Tv * Ty);
Chris@82 521 }
Chris@82 522 {
Chris@82 523 E TB, TD, TA, TC;
Chris@82 524 TB = cr[WS(rs, 6)];
Chris@82 525 TD = ci[WS(rs, 6)];
Chris@82 526 TA = W[10];
Chris@82 527 TC = W[11];
Chris@82 528 TE = FMA(TA, TB, TC * TD);
Chris@82 529 T1F = FNMS(TC, TB, TA * TD);
Chris@82 530 }
Chris@82 531 TF = Tz + TE;
Chris@82 532 T2s = T1E + T1F;
Chris@82 533 T1D = Tz - TE;
Chris@82 534 T1G = T1E - T1F;
Chris@82 535 T1H = T1D + T1G;
Chris@82 536 T2d = T1D - T1G;
Chris@82 537 }
/* Inputs 15 (W[28], W[29]), 11 (W[20], W[21]), 7 (W[12], W[13]) and 3 (W[4], W[5]). */
Chris@82 538 {
Chris@82 539 E T19, T1V, T1p, T22, T1e, T1W, T1k, T21;
Chris@82 540 {
Chris@82 541 E T16, T18, T15, T17;
Chris@82 542 T16 = cr[WS(rs, 15)];
Chris@82 543 T18 = ci[WS(rs, 15)];
Chris@82 544 T15 = W[28];
Chris@82 545 T17 = W[29];
Chris@82 546 T19 = FMA(T15, T16, T17 * T18);
Chris@82 547 T1V = FNMS(T17, T16, T15 * T18);
Chris@82 548 }
Chris@82 549 {
Chris@82 550 E T1m, T1o, T1l, T1n;
Chris@82 551 T1m = cr[WS(rs, 11)];
Chris@82 552 T1o = ci[WS(rs, 11)];
Chris@82 553 T1l = W[20];
Chris@82 554 T1n = W[21];
Chris@82 555 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@82 556 T22 = FNMS(T1n, T1m, T1l * T1o);
Chris@82 557 }
Chris@82 558 {
Chris@82 559 E T1b, T1d, T1a, T1c;
Chris@82 560 T1b = cr[WS(rs, 7)];
Chris@82 561 T1d = ci[WS(rs, 7)];
Chris@82 562 T1a = W[12];
Chris@82 563 T1c = W[13];
Chris@82 564 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@82 565 T1W = FNMS(T1c, T1b, T1a * T1d);
Chris@82 566 }
Chris@82 567 {
Chris@82 568 E T1h, T1j, T1g, T1i;
Chris@82 569 T1h = cr[WS(rs, 3)];
Chris@82 570 T1j = ci[WS(rs, 3)];
Chris@82 571 T1g = W[4];
Chris@82 572 T1i = W[5];
Chris@82 573 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@82 574 T21 = FNMS(T1i, T1h, T1g * T1j);
Chris@82 575 }
Chris@82 576 T1f = T19 + T1e;
Chris@82 577 T1q = T1k + T1p;
Chris@82 578 T2B = T1f - T1q;
Chris@82 579 T2C = T1V + T1W;
Chris@82 580 T2D = T21 + T22;
Chris@82 581 T2E = T2C - T2D;
Chris@82 582 {
Chris@82 583 E T1X, T1Y, T20, T23;
Chris@82 584 T1X = T1V - T1W;
Chris@82 585 T1Y = T1k - T1p;
Chris@82 586 T1Z = T1X + T1Y;
Chris@82 587 T2k = T1X - T1Y;
Chris@82 588 T20 = T19 - T1e;
Chris@82 589 T23 = T21 - T22;
Chris@82 590 T24 = T20 - T23;
Chris@82 591 T2j = T20 + T23;
Chris@82 592 }
Chris@82 593 }
/* Inputs 1 (W[0], W[1]), 13 (W[24], W[25]), 9 (W[16], W[17]) and 5 (W[8], W[9]). */
Chris@82 594 {
Chris@82 595 E TM, T1P, T12, T1M, TR, T1Q, TX, T1L;
Chris@82 596 {
Chris@82 597 E TJ, TL, TI, TK;
Chris@82 598 TJ = cr[WS(rs, 1)];
Chris@82 599 TL = ci[WS(rs, 1)];
Chris@82 600 TI = W[0];
Chris@82 601 TK = W[1];
Chris@82 602 TM = FMA(TI, TJ, TK * TL);
Chris@82 603 T1P = FNMS(TK, TJ, TI * TL);
Chris@82 604 }
Chris@82 605 {
Chris@82 606 E TZ, T11, TY, T10;
Chris@82 607 TZ = cr[WS(rs, 13)];
Chris@82 608 T11 = ci[WS(rs, 13)];
Chris@82 609 TY = W[24];
Chris@82 610 T10 = W[25];
Chris@82 611 T12 = FMA(TY, TZ, T10 * T11);
Chris@82 612 T1M = FNMS(T10, TZ, TY * T11);
Chris@82 613 }
Chris@82 614 {
Chris@82 615 E TO, TQ, TN, TP;
Chris@82 616 TO = cr[WS(rs, 9)];
Chris@82 617 TQ = ci[WS(rs, 9)];
Chris@82 618 TN = W[16];
Chris@82 619 TP = W[17];
Chris@82 620 TR = FMA(TN, TO, TP * TQ);
Chris@82 621 T1Q = FNMS(TP, TO, TN * TQ);
Chris@82 622 }
Chris@82 623 {
Chris@82 624 E TU, TW, TT, TV;
Chris@82 625 TU = cr[WS(rs, 5)];
Chris@82 626 TW = ci[WS(rs, 5)];
Chris@82 627 TT = W[8];
Chris@82 628 TV = W[9];
Chris@82 629 TX = FMA(TT, TU, TV * TW);
Chris@82 630 T1L = FNMS(TV, TU, TT * TW);
Chris@82 631 }
Chris@82 632 TS = TM + TR;
Chris@82 633 T13 = TX + T12;
Chris@82 634 T2w = TS - T13;
Chris@82 635 T2x = T1P + T1Q;
Chris@82 636 T2y = T1L + T1M;
Chris@82 637 T2z = T2x - T2y;
Chris@82 638 {
Chris@82 639 E T1K, T1N, T1R, T1S;
Chris@82 640 T1K = TM - TR;
Chris@82 641 T1N = T1L - T1M;
Chris@82 642 T1O = T1K - T1N;
Chris@82 643 T2h = T1K + T1N;
Chris@82 644 T1R = T1P - T1Q;
Chris@82 645 T1S = TX - T12;
Chris@82 646 T1T = T1R + T1S;
Chris@82 647 T2g = T1R - T1S;
Chris@82 648 }
Chris@82 649 }
/* Output stage: cr[3], cr[7], cr[11], cr[15] and ci[0], ci[4], ci[8], ci[12]; uses all three constants. */
Chris@82 650 {
Chris@82 651 E T1J, T27, T3a, T3c, T26, T3b, T2a, T35;
Chris@82 652 {
Chris@82 653 E T1x, T1I, T36, T39;
Chris@82 654 T1x = T1t - T1w;
Chris@82 655 T1I = KP707106781 * (T1C + T1H);
Chris@82 656 T1J = T1x + T1I;
Chris@82 657 T27 = T1x - T1I;
Chris@82 658 T36 = KP707106781 * (T2c - T2d);
Chris@82 659 T39 = T37 + T38;
Chris@82 660 T3a = T36 + T39;
Chris@82 661 T3c = T39 - T36;
Chris@82 662 }
Chris@82 663 {
Chris@82 664 E T1U, T25, T28, T29;
Chris@82 665 T1U = FNMS(KP382683432, T1T, KP923879532 * T1O);
Chris@82 666 T25 = FMA(KP382683432, T1Z, KP923879532 * T24);
Chris@82 667 T26 = T1U + T25;
Chris@82 668 T3b = T25 - T1U;
Chris@82 669 T28 = FMA(KP923879532, T1T, KP382683432 * T1O);
Chris@82 670 T29 = FNMS(KP923879532, T1Z, KP382683432 * T24);
Chris@82 671 T2a = T28 + T29;
Chris@82 672 T35 = T29 - T28;
Chris@82 673 }
Chris@82 674 cr[WS(rs, 7)] = T1J - T26;
Chris@82 675 cr[WS(rs, 11)] = T3b - T3c;
Chris@82 676 ci[WS(rs, 12)] = T3b + T3c;
Chris@82 677 ci[0] = T1J + T26;
Chris@82 678 ci[WS(rs, 4)] = T27 - T2a;
Chris@82 679 cr[WS(rs, 15)] = T35 - T3a;
Chris@82 680 ci[WS(rs, 8)] = T35 + T3a;
Chris@82 681 cr[WS(rs, 3)] = T27 + T2a;
Chris@82 682 }
/* Output stage: cr[0], cr[4], cr[8], cr[12] and ci[3], ci[7], ci[11], ci[15]; built from additions and subtractions only. */
Chris@82 683 {
Chris@82 684 E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
Chris@82 685 {
Chris@82 686 E Tj, TG, T2Q, T2V;
Chris@82 687 Tj = T7 + Ti;
Chris@82 688 TG = Tu + TF;
Chris@82 689 TH = Tj + TG;
Chris@82 690 T2L = Tj - TG;
Chris@82 691 T2Q = T2t + T2s;
Chris@82 692 T2V = T2R + T2U;
Chris@82 693 T2W = T2Q + T2V;
Chris@82 694 T2Y = T2V - T2Q;
Chris@82 695 }
Chris@82 696 {
Chris@82 697 E T14, T1r, T2M, T2N;
Chris@82 698 T14 = TS + T13;
Chris@82 699 T1r = T1f + T1q;
Chris@82 700 T1s = T14 + T1r;
Chris@82 701 T2X = T1r - T14;
Chris@82 702 T2M = T2C + T2D;
Chris@82 703 T2N = T2x + T2y;
Chris@82 704 T2O = T2M - T2N;
Chris@82 705 T2P = T2N + T2M;
Chris@82 706 }
Chris@82 707 ci[WS(rs, 7)] = TH - T1s;
Chris@82 708 cr[WS(rs, 12)] = T2X - T2Y;
Chris@82 709 ci[WS(rs, 11)] = T2X + T2Y;
Chris@82 710 cr[0] = TH + T1s;
Chris@82 711 cr[WS(rs, 4)] = T2L - T2O;
Chris@82 712 cr[WS(rs, 8)] = T2P - T2W;
Chris@82 713 ci[WS(rs, 15)] = T2P + T2W;
Chris@82 714 ci[WS(rs, 3)] = T2L + T2O;
Chris@82 715 }
/* Output stage: cr[1], cr[5], cr[9], cr[13] and ci[2], ci[6], ci[10], ci[14]; uses all three constants. */
Chris@82 716 {
Chris@82 717 E T2f, T2n, T3g, T3i, T2m, T3h, T2q, T3d;
Chris@82 718 {
Chris@82 719 E T2b, T2e, T3e, T3f;
Chris@82 720 T2b = T1t + T1w;
Chris@82 721 T2e = KP707106781 * (T2c + T2d);
Chris@82 722 T2f = T2b + T2e;
Chris@82 723 T2n = T2b - T2e;
Chris@82 724 T3e = KP707106781 * (T1H - T1C);
Chris@82 725 T3f = T38 - T37;
Chris@82 726 T3g = T3e + T3f;
Chris@82 727 T3i = T3f - T3e;
Chris@82 728 }
Chris@82 729 {
Chris@82 730 E T2i, T2l, T2o, T2p;
Chris@82 731 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
Chris@82 732 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
Chris@82 733 T2m = T2i + T2l;
Chris@82 734 T3h = T2l - T2i;
Chris@82 735 T2o = FNMS(KP923879532, T2g, KP382683432 * T2h);
Chris@82 736 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
Chris@82 737 T2q = T2o + T2p;
Chris@82 738 T3d = T2p - T2o;
Chris@82 739 }
Chris@82 740 ci[WS(rs, 6)] = T2f - T2m;
Chris@82 741 cr[WS(rs, 13)] = T3h - T3i;
Chris@82 742 ci[WS(rs, 10)] = T3h + T3i;
Chris@82 743 cr[WS(rs, 1)] = T2f + T2m;
Chris@82 744 cr[WS(rs, 5)] = T2n - T2q;
Chris@82 745 cr[WS(rs, 9)] = T3d - T3g;
Chris@82 746 ci[WS(rs, 14)] = T3d + T3g;
Chris@82 747 ci[WS(rs, 2)] = T2n + T2q;
Chris@82 748 }
/* Output stage: cr[2], cr[6], cr[10], cr[14] and ci[1], ci[5], ci[9], ci[13]; uses KP707106781 only. */
Chris@82 749 {
Chris@82 750 E T2v, T2H, T32, T34, T2G, T2Z, T2K, T33;
Chris@82 751 {
Chris@82 752 E T2r, T2u, T30, T31;
Chris@82 753 T2r = T7 - Ti;
Chris@82 754 T2u = T2s - T2t;
Chris@82 755 T2v = T2r - T2u;
Chris@82 756 T2H = T2r + T2u;
Chris@82 757 T30 = Tu - TF;
Chris@82 758 T31 = T2U - T2R;
Chris@82 759 T32 = T30 + T31;
Chris@82 760 T34 = T31 - T30;
Chris@82 761 }
Chris@82 762 {
Chris@82 763 E T2A, T2F, T2I, T2J;
Chris@82 764 T2A = T2w + T2z;
Chris@82 765 T2F = T2B - T2E;
Chris@82 766 T2G = KP707106781 * (T2A + T2F);
Chris@82 767 T2Z = KP707106781 * (T2F - T2A);
Chris@82 768 T2I = T2w - T2z;
Chris@82 769 T2J = T2B + T2E;
Chris@82 770 T2K = KP707106781 * (T2I + T2J);
Chris@82 771 T33 = KP707106781 * (T2J - T2I);
Chris@82 772 }
Chris@82 773 ci[WS(rs, 5)] = T2v - T2G;
Chris@82 774 cr[WS(rs, 10)] = T33 - T34;
Chris@82 775 ci[WS(rs, 13)] = T33 + T34;
Chris@82 776 cr[WS(rs, 2)] = T2v + T2G;
Chris@82 777 cr[WS(rs, 6)] = T2H - T2K;
Chris@82 778 cr[WS(rs, 14)] = T2Z - T32;
Chris@82 779 ci[WS(rs, 9)] = T2Z + T32;
Chris@82 780 ci[WS(rs, 1)] = T2H + T2K;
Chris@82 781 }
Chris@82 782 }
Chris@82 783 }
Chris@82 784 }
Chris@82 785
/*
 * Two-entry tw_instr table referenced by the descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 16} requests the complete
 * coefficient set for size 16 and {TW_NEXT, 1, 0} ends the list -- confirm
 * against the tw_instr definition, which is not visible in this file.
 */
Chris@82 786 static const tw_instr twinstr[] = {
Chris@82 787 {TW_FULL, 1, 16},
Chris@82 788 {TW_NEXT, 1, 0}
Chris@82 789 };
Chris@82 790
/*
 * Descriptor passed to the registration call: the value 16, the name
 * "hf_16", the twinstr table, a pointer to GENUS, and the tuple
 * {136, 46, 38, 0}. The first three tuple values equal the addition /
 * multiplication / fused multiply-add counts quoted in the generator
 * comment for this variant. NOTE(review): field meanings are inferred from
 * position; hc2hc_desc is declared elsewhere -- confirm.
 */
Chris@82 791 static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, {136, 46, 38, 0} };
Chris@82 792
/*
 * Registration entry point: hands hf_16 and its descriptor to planner p
 * through X(khc2hc_register). X() is a project macro defined elsewhere
 * (presumably a symbol-name prefix -- confirm).
 */
Chris@82 793 void X(codelet_hf_16) (planner *p) {
Chris@82 794 X(khc2hc_register) (p, hf_16, &desc);
Chris@82 795 }
Chris@82 796 #endif