annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:18 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@42 33 * 61 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf_8 -- variant compiled when HAVE_FMA is defined.
 *
 * For each m in [mb, me), reads the 16 values cr[0], ci[0] and
 * cr[WS(rs, k)], ci[WS(rs, k)] for k = 1..7, combines them with the 14
 * coefficients W[0..13], and stores 16 results back into the same
 * locations.  Within one iteration every load precedes the first store.
 *
 * Parameters:
 *   cr, ci  arrays updated in place; cr advances by ms and ci moves back
 *           by ms after each iteration
 *   W       coefficient table, read-only; 14 entries are consumed per
 *           iteration, starting at offset (mb - 1) * 14
 *   rs      stride handed to WS() to locate element k
 *   mb, me  half-open range of the loop counter m
 *   ms      per-iteration step applied to cr (+ms) and ci (-ms)
 *
 * NOTE(review): FMA, FNMS, FMS, WS, DK and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments below assume FMA(a, b, c) = a*b + c,
 * FNMS(a, b, c) = c - a*b and FMS(a, b, c) = a*b - c -- confirm against the
 * headers.  Under that assumption element k (k = 1..7) enters the
 * butterflies as
 *     re_k = W[2k-2] * cr_k + W[2k-1] * ci_k
 *     im_k = W[2k-2] * ci_k - W[2k-1] * cr_k
 * while element 0 is used exactly as loaded.
 */
Chris@42 37 static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The only constant used; its value is numerically 1/sqrt(2). */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/*
 * One pass of the loop body per m.  W starts (mb - 1) * 14 entries in and
 * moves on by 14 each time; cr steps forward and ci steps backward by ms.
 * NOTE(review): MAKE_VOLATILE_STRIDE(16, rs) is an opaque macro here; the
 * 16 presumably relates to the 16 strided locations touched -- confirm.
 */
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 43 E T1f, T1g, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
Chris@42 44 {
Chris@42 45 E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
Chris@42 46 E Tp, Tx, Tt, Tq, Tw;
Chris@42 47 {
Chris@42 48 E T3, T6, T2, T5;
/* Load element 0 (no coefficient applied) and element 4 with W[6], W[7]. */
Chris@42 49 T1 = cr[0];
Chris@42 50 T1m = ci[0];
Chris@42 51 T3 = cr[WS(rs, 4)];
Chris@42 52 T6 = ci[WS(rs, 4)];
Chris@42 53 T2 = W[6];
Chris@42 54 T5 = W[7];
Chris@42 55 {
Chris@42 56 E Ta, Td, T9, Tc;
Chris@42 57 {
Chris@42 58 E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
/*
 * Finish element 4 (T7, T1l), form element 6 with W[10], W[11] (Tk, TS),
 * and load element 2 together with W[2], W[3].
 */
Chris@42 59 Tg = cr[WS(rs, 6)];
Chris@42 60 Tj = ci[WS(rs, 6)];
Chris@42 61 T1k = T2 * T6;
Chris@42 62 T4 = T2 * T3;
Chris@42 63 Tf = W[10];
Chris@42 64 Ti = W[11];
Chris@42 65 T1l = FNMS(T5, T3, T1k);
Chris@42 66 T7 = FMA(T5, T6, T4);
Chris@42 67 TR = Tf * Tj;
Chris@42 68 Th = Tf * Tg;
Chris@42 69 Ta = cr[WS(rs, 2)];
Chris@42 70 Td = ci[WS(rs, 2)];
Chris@42 71 TS = FNMS(Ti, Tg, TR);
Chris@42 72 Tk = FMA(Ti, Tj, Th);
Chris@42 73 T9 = W[2];
Chris@42 74 Tc = W[3];
Chris@42 75 }
Chris@42 76 {
Chris@42 77 E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
/*
 * Finish element 2 (Te, TQ); form element 7 with W[12], W[13] and element 3
 * with W[4], W[5], then take their sums and differences.  The loads and
 * first products for elements 1 (W[0], W[1]) and 5 (W[8], W[9]) are issued
 * here as well; those two elements are completed in the next block.
 */
Chris@42 78 TB = cr[WS(rs, 7)];
Chris@42 79 TE = ci[WS(rs, 7)];
Chris@42 80 TP = T9 * Td;
Chris@42 81 Tb = T9 * Ta;
Chris@42 82 TA = W[12];
Chris@42 83 TH = cr[WS(rs, 3)];
Chris@42 84 TQ = FNMS(Tc, Ta, TP);
Chris@42 85 Te = FMA(Tc, Td, Tb);
Chris@42 86 T13 = TA * TE;
Chris@42 87 TC = TA * TB;
Chris@42 88 TK = ci[WS(rs, 3)];
Chris@42 89 TG = W[4];
Chris@42 90 TD = W[13];
Chris@42 91 TJ = W[5];
Chris@42 92 {
Chris@42 93 E T14, TF, T16, TL, T15, TI;
Chris@42 94 To = cr[WS(rs, 1)];
Chris@42 95 T15 = TG * TK;
Chris@42 96 TI = TG * TH;
Chris@42 97 T14 = FNMS(TD, TB, T13);
Chris@42 98 TF = FMA(TD, TE, TC);
Chris@42 99 T16 = FNMS(TJ, TH, T15);
Chris@42 100 TL = FMA(TJ, TK, TI);
Chris@42 101 Tr = ci[WS(rs, 1)];
Chris@42 102 Tn = W[0];
Chris@42 103 T17 = T14 - T16;
Chris@42 104 T1f = T14 + T16;
Chris@42 105 TM = TF + TL;
Chris@42 106 T12 = TF - TL;
Chris@42 107 }
Chris@42 108 Tu = cr[WS(rs, 5)];
Chris@42 109 TW = Tn * Tr;
Chris@42 110 Tp = Tn * To;
Chris@42 111 Tx = ci[WS(rs, 5)];
Chris@42 112 Tt = W[8];
Chris@42 113 Tq = W[1];
Chris@42 114 Tw = W[9];
Chris@42 115 }
Chris@42 116 }
Chris@42 117 }
Chris@42 118 {
Chris@42 119 E T8, T1j, Tl, Tz, T1a, TU, T1n, T1b, T1c, T1v, T1t, T1u, T19, T1w, T1d;
Chris@42 120 {
Chris@42 121 E T1r, T10, TV, T1s, T11, T18;
Chris@42 122 {
Chris@42 123 E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
/* Complete elements 1 and 5, then pair up all elements by sum/difference. */
Chris@42 124 T8 = T1 + T7;
Chris@42 125 TO = T1 - T7;
Chris@42 126 TY = Tt * Tx;
Chris@42 127 Tv = Tt * Tu;
Chris@42 128 TX = FNMS(Tq, To, TW);
Chris@42 129 Ts = FMA(Tq, Tr, Tp);
Chris@42 130 TZ = FNMS(Tw, Tu, TY);
Chris@42 131 Ty = FMA(Tw, Tx, Tv);
Chris@42 132 TT = TQ - TS;
Chris@42 133 T1j = TQ + TS;
Chris@42 134 Tl = Te + Tk;
Chris@42 135 T1r = Te - Tk;
Chris@42 136 T10 = TX - TZ;
Chris@42 137 T1g = TX + TZ;
Chris@42 138 Tz = Ts + Ty;
Chris@42 139 TV = Ts - Ty;
Chris@42 140 T1a = TO - TT;
Chris@42 141 TU = TO + TT;
Chris@42 142 T1s = T1m - T1l;
Chris@42 143 T1n = T1l + T1m;
Chris@42 144 }
Chris@42 145 T1b = TV - T10;
Chris@42 146 T11 = TV + T10;
Chris@42 147 T18 = T12 - T17;
Chris@42 148 T1c = T12 + T17;
Chris@42 149 T1v = T1s - T1r;
Chris@42 150 T1t = T1r + T1s;
Chris@42 151 T1u = T18 - T11;
Chris@42 152 T19 = T11 + T18;
Chris@42 153 }
/*
 * Eight of the sixteen outputs carry the KP707106781 factor: cr elements
 * 1, 3, 5, 7 and ci elements 0, 2, 4, 6.
 */
Chris@42 154 ci[WS(rs, 4)] = FMA(KP707106781, T1u, T1t);
Chris@42 155 cr[WS(rs, 7)] = FMS(KP707106781, T1u, T1t);
Chris@42 156 cr[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Chris@42 157 ci[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
Chris@42 158 T1w = T1c - T1b;
Chris@42 159 T1d = T1b + T1c;
Chris@42 160 ci[WS(rs, 6)] = FMA(KP707106781, T1w, T1v);
Chris@42 161 cr[WS(rs, 5)] = FMS(KP707106781, T1w, T1v);
Chris@42 162 ci[0] = FMA(KP707106781, T1d, T1a);
Chris@42 163 cr[WS(rs, 3)] = FNMS(KP707106781, T1d, T1a);
Chris@42 164 T1e = T8 - Tl;
Chris@42 165 Tm = T8 + Tl;
Chris@42 166 T1q = T1n - T1j;
Chris@42 167 T1o = T1j + T1n;
Chris@42 168 T1p = TM - Tz;
Chris@42 169 TN = Tz + TM;
Chris@42 170 }
Chris@42 171 }
/*
 * The remaining eight outputs (cr elements 0, 2, 4, 6 and ci elements
 * 1, 3, 5, 7) are plain sums and differences of earlier intermediates.
 */
Chris@42 172 ci[WS(rs, 5)] = T1p + T1q;
Chris@42 173 cr[WS(rs, 6)] = T1p - T1q;
Chris@42 174 cr[0] = Tm + TN;
Chris@42 175 ci[WS(rs, 3)] = Tm - TN;
Chris@42 176 T1h = T1f - T1g;
Chris@42 177 T1i = T1g + T1f;
Chris@42 178 ci[WS(rs, 7)] = T1i + T1o;
Chris@42 179 cr[WS(rs, 4)] = T1i - T1o;
Chris@42 180 ci[WS(rs, 1)] = T1e + T1h;
Chris@42 181 cr[WS(rs, 2)] = T1e - T1h;
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
Chris@42 185
/*
 * Two-entry tw_instr table handed to the hc2hc_desc initializer that follows.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere.
 * Presumably {TW_FULL, 1, 8} asks for the full coefficient set of a size-8
 * transform (hf_8 reads 14 W entries per iteration) and the TW_NEXT entry
 * ends the list -- confirm against the tw_instr declaration.
 */
Chris@42 186 static const tw_instr twinstr[] = {
Chris@42 187 {TW_FULL, 1, 8},
Chris@42 188 {TW_NEXT, 1, 0}
Chris@42 189 };
Chris@42 190
/*
 * Descriptor passed along when hf_8 is registered.  It holds the value 8,
 * the name string "hf_8", the twinstr table, the address of GENUS and four
 * counts.  The first three counts (44, 14, 22) equal the addition,
 * multiplication and fused multiply/add totals quoted in the generator
 * comment for the HAVE_FMA variant.
 * NOTE(review): the field names of hc2hc_desc are not visible in this file.
 */
Chris@42 191 static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@42 192
/*
 * Registration entry point for the HAVE_FMA build: passes the planner p,
 * the hf_8 function and its descriptor to X(khc2hc_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-prefixing wrapper -- confirm.
 */
Chris@42 193 void X(codelet_hf_8) (planner *p) {
Chris@42 194 X(khc2hc_register) (p, hf_8, &desc);
Chris@42 195 }
Chris@42 196 #else /* HAVE_FMA */
Chris@42 197
Chris@42 198 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include hf.h */
Chris@42 199
Chris@42 200 /*
Chris@42 201 * This function contains 66 FP additions, 32 FP multiplications,
Chris@42 202 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@42 203 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@42 204 */
Chris@42 205 #include "hf.h"
Chris@42 206
/*
 * hf_8 -- variant compiled when HAVE_FMA is not defined.
 *
 * For each m in [mb, me), reads the 16 values cr[0], ci[0] and
 * cr[WS(rs, k)], ci[WS(rs, k)] for k = 1..7, combines them with the 14
 * coefficients W[0..13], and stores 16 results back into the same
 * locations.  Within one iteration every load precedes the first store.
 *
 * Parameters:
 *   cr, ci  arrays updated in place; cr advances by ms and ci moves back
 *           by ms after each iteration
 *   W       coefficient table, read-only; 14 entries are consumed per
 *           iteration, starting at offset (mb - 1) * 14
 *   rs      stride handed to WS() to locate element k
 *   mb, me  half-open range of the loop counter m
 *   ms      per-iteration step applied to cr (+ms) and ci (-ms)
 *
 * NOTE(review): FMA, FNMS, WS, DK and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm against the headers.  Under that
 * assumption element k (k = 1..7) enters the butterflies as
 *     re_k = W[2k-2] * cr_k + W[2k-1] * ci_k
 *     im_k = W[2k-2] * ci_k - W[2k-1] * cr_k
 * while element 0 is used exactly as loaded.
 */
Chris@42 207 static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 208 {
/* The only constant used; its value is numerically 1/sqrt(2). */
Chris@42 209 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 210 {
Chris@42 211 INT m;
/*
 * One pass of the loop body per m.  W starts (mb - 1) * 14 entries in and
 * moves on by 14 each time; cr steps forward and ci steps backward by ms.
 * NOTE(review): MAKE_VOLATILE_STRIDE(16, rs) is an opaque macro here; the
 * 16 presumably relates to the 16 strided locations touched -- confirm.
 */
Chris@42 212 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 213 E T7, T1f, TH, T19, TF, T12, TR, TU, Ti, T1e, TK, T16, Tu, T13, TM;
Chris@42 214 E TP;
/* Element 0 (used as loaded) paired with element 4 (W[6], W[7]). */
Chris@42 215 {
Chris@42 216 E T1, T18, T6, T17;
Chris@42 217 T1 = cr[0];
Chris@42 218 T18 = ci[0];
Chris@42 219 {
Chris@42 220 E T3, T5, T2, T4;
Chris@42 221 T3 = cr[WS(rs, 4)];
Chris@42 222 T5 = ci[WS(rs, 4)];
Chris@42 223 T2 = W[6];
Chris@42 224 T4 = W[7];
Chris@42 225 T6 = FMA(T2, T3, T4 * T5);
Chris@42 226 T17 = FNMS(T4, T3, T2 * T5);
Chris@42 227 }
Chris@42 228 T7 = T1 + T6;
Chris@42 229 T1f = T18 - T17;
Chris@42 230 TH = T1 - T6;
Chris@42 231 T19 = T17 + T18;
Chris@42 232 }
/* Element 7 (W[12], W[13]) paired with element 3 (W[4], W[5]). */
Chris@42 233 {
Chris@42 234 E Tz, TS, TE, TT;
Chris@42 235 {
Chris@42 236 E Tw, Ty, Tv, Tx;
Chris@42 237 Tw = cr[WS(rs, 7)];
Chris@42 238 Ty = ci[WS(rs, 7)];
Chris@42 239 Tv = W[12];
Chris@42 240 Tx = W[13];
Chris@42 241 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 242 TS = FNMS(Tx, Tw, Tv * Ty);
Chris@42 243 }
Chris@42 244 {
Chris@42 245 E TB, TD, TA, TC;
Chris@42 246 TB = cr[WS(rs, 3)];
Chris@42 247 TD = ci[WS(rs, 3)];
Chris@42 248 TA = W[4];
Chris@42 249 TC = W[5];
Chris@42 250 TE = FMA(TA, TB, TC * TD);
Chris@42 251 TT = FNMS(TC, TB, TA * TD);
Chris@42 252 }
Chris@42 253 TF = Tz + TE;
Chris@42 254 T12 = TS + TT;
Chris@42 255 TR = Tz - TE;
Chris@42 256 TU = TS - TT;
Chris@42 257 }
/* Element 2 (W[2], W[3]) paired with element 6 (W[10], W[11]). */
Chris@42 258 {
Chris@42 259 E Tc, TI, Th, TJ;
Chris@42 260 {
Chris@42 261 E T9, Tb, T8, Ta;
Chris@42 262 T9 = cr[WS(rs, 2)];
Chris@42 263 Tb = ci[WS(rs, 2)];
Chris@42 264 T8 = W[2];
Chris@42 265 Ta = W[3];
Chris@42 266 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 267 TI = FNMS(Ta, T9, T8 * Tb);
Chris@42 268 }
Chris@42 269 {
Chris@42 270 E Te, Tg, Td, Tf;
Chris@42 271 Te = cr[WS(rs, 6)];
Chris@42 272 Tg = ci[WS(rs, 6)];
Chris@42 273 Td = W[10];
Chris@42 274 Tf = W[11];
Chris@42 275 Th = FMA(Td, Te, Tf * Tg);
Chris@42 276 TJ = FNMS(Tf, Te, Td * Tg);
Chris@42 277 }
Chris@42 278 Ti = Tc + Th;
Chris@42 279 T1e = Tc - Th;
Chris@42 280 TK = TI - TJ;
Chris@42 281 T16 = TI + TJ;
Chris@42 282 }
/* Element 1 (W[0], W[1]) paired with element 5 (W[8], W[9]). */
Chris@42 283 {
Chris@42 284 E To, TN, Tt, TO;
Chris@42 285 {
Chris@42 286 E Tl, Tn, Tk, Tm;
Chris@42 287 Tl = cr[WS(rs, 1)];
Chris@42 288 Tn = ci[WS(rs, 1)];
Chris@42 289 Tk = W[0];
Chris@42 290 Tm = W[1];
Chris@42 291 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 292 TN = FNMS(Tm, Tl, Tk * Tn);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 E Tq, Ts, Tp, Tr;
Chris@42 296 Tq = cr[WS(rs, 5)];
Chris@42 297 Ts = ci[WS(rs, 5)];
Chris@42 298 Tp = W[8];
Chris@42 299 Tr = W[9];
Chris@42 300 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 301 TO = FNMS(Tr, Tq, Tp * Ts);
Chris@42 302 }
Chris@42 303 Tu = To + Tt;
Chris@42 304 T13 = TN + TO;
Chris@42 305 TM = To - Tt;
Chris@42 306 TP = TN - TO;
Chris@42 307 }
/*
 * First eight stores: cr elements 0 and 6 and ci elements 3 and 5 are plain
 * sums/differences; cr elements 3 and 5 and ci elements 0 and 6 involve
 * KP707106781.
 */
Chris@42 308 {
Chris@42 309 E Tj, TG, T1b, T1c;
Chris@42 310 Tj = T7 + Ti;
Chris@42 311 TG = Tu + TF;
Chris@42 312 ci[WS(rs, 3)] = Tj - TG;
Chris@42 313 cr[0] = Tj + TG;
Chris@42 314 T1b = TF - Tu;
Chris@42 315 T1c = T19 - T16;
Chris@42 316 cr[WS(rs, 6)] = T1b - T1c;
Chris@42 317 ci[WS(rs, 5)] = T1b + T1c;
Chris@42 318 {
Chris@42 319 E TX, T1i, T10, T1h, TY, TZ;
Chris@42 320 TX = TH - TK;
Chris@42 321 T1i = T1f - T1e;
Chris@42 322 TY = TM - TP;
Chris@42 323 TZ = TR + TU;
Chris@42 324 T10 = KP707106781 * (TY + TZ);
Chris@42 325 T1h = KP707106781 * (TZ - TY);
Chris@42 326 cr[WS(rs, 3)] = TX - T10;
Chris@42 327 ci[WS(rs, 6)] = T1h + T1i;
Chris@42 328 ci[0] = TX + T10;
Chris@42 329 cr[WS(rs, 5)] = T1h - T1i;
Chris@42 330 }
Chris@42 331 }
/*
 * Remaining eight stores: cr elements 2 and 4 and ci elements 1 and 7 are
 * plain sums/differences; cr elements 1 and 7 and ci elements 2 and 4
 * involve KP707106781.
 */
Chris@42 332 {
Chris@42 333 E T15, T1a, T11, T14;
Chris@42 334 T15 = T13 + T12;
Chris@42 335 T1a = T16 + T19;
Chris@42 336 cr[WS(rs, 4)] = T15 - T1a;
Chris@42 337 ci[WS(rs, 7)] = T15 + T1a;
Chris@42 338 T11 = T7 - Ti;
Chris@42 339 T14 = T12 - T13;
Chris@42 340 cr[WS(rs, 2)] = T11 - T14;
Chris@42 341 ci[WS(rs, 1)] = T11 + T14;
Chris@42 342 {
Chris@42 343 E TL, T1g, TW, T1d, TQ, TV;
Chris@42 344 TL = TH + TK;
Chris@42 345 T1g = T1e + T1f;
Chris@42 346 TQ = TM + TP;
Chris@42 347 TV = TR - TU;
Chris@42 348 TW = KP707106781 * (TQ + TV);
Chris@42 349 T1d = KP707106781 * (TV - TQ);
Chris@42 350 ci[WS(rs, 2)] = TL - TW;
Chris@42 351 ci[WS(rs, 4)] = T1d + T1g;
Chris@42 352 cr[WS(rs, 1)] = TL + TW;
Chris@42 353 cr[WS(rs, 7)] = T1d - T1g;
Chris@42 354 }
Chris@42 355 }
Chris@42 356 }
Chris@42 357 }
Chris@42 358 }
Chris@42 359
/*
 * Two-entry tw_instr table handed to the hc2hc_desc initializer that follows.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere.
 * Presumably {TW_FULL, 1, 8} asks for the full coefficient set of a size-8
 * transform (hf_8 reads 14 W entries per iteration) and the TW_NEXT entry
 * ends the list -- confirm against the tw_instr declaration.
 */
Chris@42 360 static const tw_instr twinstr[] = {
Chris@42 361 {TW_FULL, 1, 8},
Chris@42 362 {TW_NEXT, 1, 0}
Chris@42 363 };
Chris@42 364
/*
 * Descriptor passed along when hf_8 is registered.  It holds the value 8,
 * the name string "hf_8", the twinstr table, the address of GENUS and four
 * counts.  The first three counts (52, 18, 14) equal the addition,
 * multiplication and fused multiply/add totals quoted in the generator
 * comment for the non-FMA variant.
 * NOTE(review): the field names of hc2hc_desc are not visible in this file.
 */
Chris@42 365 static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@42 366
/*
 * Registration entry point for the non-FMA build: passes the planner p,
 * the hf_8 function and its descriptor to X(khc2hc_register).
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-prefixing wrapper -- confirm.
 */
Chris@42 367 void X(codelet_hf_8) (planner *p) {
Chris@42 368 X(khc2hc_register) (p, hf_8, &desc);
Chris@42 369 }
Chris@42 370 #endif /* HAVE_FMA */