annotate src/fftw-3.3.5/rdft/scalar/r2cb/hb_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:41 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include hb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 72 FP additions, 66 FP multiplications,
Chris@42 32 * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
Chris@42 33 * 67 stack variables, 6 constants, and 28 memory accesses
Chris@42 34 */
Chris@42 35 #include "hb.h"
Chris@42 36
/*
 * hb_7 (HAVE_FMA build): size-7 hc2hc codelet. According to the generator
 * line earlier in this branch it was emitted by gen_hc2hc with
 * "-fma -n 7 -dif -sign 1".
 *
 * cr, ci : arrays updated in place. Every loop pass loads cr[WS(rs, k)] and
 *          ci[WS(rs, k)] for k = 0..6 and then stores to the same 14 slots.
 * W      : read-only table; 12 entries (W[0]..W[11]) are used per pass.
 * rs     : stride handed to WS() to address the 7 elements of cr and ci.
 * mb, me : the loop runs for m = mb while m < me.
 * ms     : per-pass step; cr moves forward by ms, ci moves backward by ms.
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from the FFTW headers, which are not visible here. The remarks below assume
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
Chris@42 37 static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants; each KP name spells out the leading digits of its value. */
Chris@42 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 40 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 41 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@42 42 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@42 43 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@42 44 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@42 45 {
Chris@42 46 INT m;
/* W is first advanced by (mb - 1) * 12 entries; afterwards each pass moves
   cr forward by ms, ci backward by ms and W forward by 12. */
Chris@42 47 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
/* Declared at loop scope because they are consumed by the cr/ci[WS(rs, 6)]
   stores after the nested block below has closed. */
Chris@42 48 E T1q, T1p, T1t, T1r, T1s, T1u;
Chris@42 49 {
Chris@42 50 E T1, T4, TC, T7, TB, Tt, TD, Ta, TA, T1l, TZ, T1b, Th, Tw, Td;
Chris@42 51 E TP, Ti, Tj, Tl, Tm, T8, T9, T1a;
Chris@42 52 T1 = cr[0];
Chris@42 53 {
Chris@42 54 E T2, T3, T5, T6;
/* Load cr[k] and ci[k - 1] for k = 1..3 and form their sums (T4, T7, Ta)
   and differences (TC, TB, TA). */
Chris@42 55 T2 = cr[WS(rs, 1)];
Chris@42 56 T3 = ci[0];
Chris@42 57 T5 = cr[WS(rs, 2)];
Chris@42 58 T6 = ci[WS(rs, 1)];
Chris@42 59 T8 = cr[WS(rs, 3)];
Chris@42 60 T4 = T2 + T3;
Chris@42 61 TC = T2 - T3;
Chris@42 62 T7 = T5 + T6;
Chris@42 63 TB = T5 - T6;
Chris@42 64 T9 = ci[WS(rs, 2)];
Chris@42 65 }
Chris@42 66 Tt = ci[WS(rs, 6)];
Chris@42 67 TD = FNMS(KP554958132, TC, TB);
Chris@42 68 T1a = FNMS(KP356895867, T7, T4);
Chris@42 69 Ta = T8 + T9;
Chris@42 70 TA = T8 - T9;
Chris@42 71 {
Chris@42 72 E Tf, Tg, Tc, TO;
Chris@42 73 Tf = ci[WS(rs, 3)];
Chris@42 74 Tg = cr[WS(rs, 4)];
Chris@42 75 T1l = FMA(KP554958132, TA, TC);
Chris@42 76 TZ = FMA(KP554958132, TB, TA);
Chris@42 77 Tc = FNMS(KP356895867, Ta, T7);
Chris@42 78 TO = FNMS(KP356895867, T4, Ta);
Chris@42 79 T1b = FNMS(KP692021471, T1a, Ta);
Chris@42 80 Th = Tf + Tg;
Chris@42 81 Tw = Tf - Tg;
Chris@42 82 Td = FNMS(KP692021471, Tc, T4);
Chris@42 83 TP = FNMS(KP692021471, TO, T7);
Chris@42 84 }
/* Remaining loads: ci[4], cr[5], ci[5], cr[6]. After these four statements
   all 14 inputs of this pass have been read. */
Chris@42 85 Ti = ci[WS(rs, 4)];
Chris@42 86 Tj = cr[WS(rs, 5)];
Chris@42 87 Tl = ci[WS(rs, 5)];
Chris@42 88 Tm = cr[WS(rs, 6)];
Chris@42 89 {
Chris@42 90 E Ty, TS, TX, T1j, T1e, Tp, Tk, Tv;
/* First in-place store. cr[0] is the plain sum T1 + T4 + T7 + Ta; no W
   entry is involved. */
Chris@42 91 cr[0] = T1 + T4 + T7 + Ta;
Chris@42 92 Tk = Ti + Tj;
Chris@42 93 Tv = Ti - Tj;
Chris@42 94 {
Chris@42 95 E Tn, Tu, Tx, TR;
Chris@42 96 Tn = Tl + Tm;
Chris@42 97 Tu = Tl - Tm;
Chris@42 98 Tx = FNMS(KP356895867, Tw, Tv);
Chris@42 99 TR = FMA(KP554958132, Tk, Th);
Chris@42 100 {
Chris@42 101 E TW, T1i, T1d, To;
Chris@42 102 TW = FNMS(KP356895867, Tu, Tw);
Chris@42 103 T1i = FNMS(KP356895867, Tv, Tu);
Chris@42 104 T1d = FMA(KP554958132, Th, Tn);
Chris@42 105 To = FNMS(KP554958132, Tn, Tk);
Chris@42 106 Ty = FNMS(KP692021471, Tx, Tu);
Chris@42 107 TS = FNMS(KP801937735, TR, Tn);
Chris@42 108 TX = FNMS(KP692021471, TW, Tv);
Chris@42 109 T1j = FNMS(KP692021471, T1i, Tw);
Chris@42 110 T1e = FMA(KP801937735, T1d, Tk);
Chris@42 111 Tp = FNMS(KP801937735, To, Th);
/* ci[0] is likewise a plain sum (Tt + Tu + Tv + Tw) with no W entry. */
Chris@42 112 ci[0] = Tt + Tu + Tv + Tw;
Chris@42 113 }
Chris@42 114 }
Chris@42 115 {
Chris@42 116 E TL, TH, TK, TJ, TM, Te, Tz, TE;
Chris@42 117 Te = FNMS(KP900968867, Td, T1);
Chris@42 118 Tz = FNMS(KP900968867, Ty, Tt);
Chris@42 119 TE = FNMS(KP801937735, TD, TA);
Chris@42 120 {
Chris@42 121 E Tb, TI, Tq, TF, Ts, Tr, TG;
/* Outputs 3 and 4 share Te/Tp and Tz/TE: slot 3 uses the difference/sum
   pair (Tq, TF) with W[4], W[5]; slot 4 uses (TI, TL) with W[6], W[7]. */
Chris@42 122 Tb = W[4];
Chris@42 123 TI = FMA(KP974927912, Tp, Te);
Chris@42 124 Tq = FNMS(KP974927912, Tp, Te);
Chris@42 125 TL = FNMS(KP974927912, TE, Tz);
Chris@42 126 TF = FMA(KP974927912, TE, Tz);
Chris@42 127 Ts = W[5];
Chris@42 128 Tr = Tb * Tq;
Chris@42 129 TH = W[6];
Chris@42 130 TK = W[7];
Chris@42 131 TG = Ts * Tq;
Chris@42 132 cr[WS(rs, 3)] = FNMS(Ts, TF, Tr);
Chris@42 133 TJ = TH * TI;
Chris@42 134 TM = TK * TI;
Chris@42 135 ci[WS(rs, 3)] = FMA(Tb, TF, TG);
Chris@42 136 }
Chris@42 137 {
Chris@42 138 E T14, T13, T17, T15, T16;
Chris@42 139 {
Chris@42 140 E TY, TT, T10, TQ;
Chris@42 141 TQ = FNMS(KP900968867, TP, T1);
Chris@42 142 cr[WS(rs, 4)] = FNMS(TK, TL, TJ);
Chris@42 143 ci[WS(rs, 4)] = FMA(TH, TL, TM);
Chris@42 144 TY = FNMS(KP900968867, TX, Tt);
Chris@42 145 TT = FNMS(KP974927912, TS, TQ);
Chris@42 146 T14 = FMA(KP974927912, TS, TQ);
Chris@42 147 T10 = FNMS(KP801937735, TZ, TC);
Chris@42 148 {
Chris@42 149 E TN, TV, T11, TU, T12;
/* Outputs 2 and 5 share TQ/TS and TY/T10: slot 2 uses W[2], W[3];
   slot 5 uses W[8], W[9]. */
Chris@42 150 TN = W[2];
Chris@42 151 TV = W[3];
Chris@42 152 T13 = W[8];
Chris@42 153 T11 = FMA(KP974927912, T10, TY);
Chris@42 154 T17 = FNMS(KP974927912, T10, TY);
Chris@42 155 TU = TN * TT;
Chris@42 156 T12 = TV * TT;
Chris@42 157 T15 = T13 * T14;
Chris@42 158 T16 = W[9];
Chris@42 159 cr[WS(rs, 2)] = FNMS(TV, T11, TU);
Chris@42 160 ci[WS(rs, 2)] = FMA(TN, T11, T12);
Chris@42 161 }
Chris@42 162 }
Chris@42 163 {
Chris@42 164 E T1k, T1f, T1m, T1c, T18;
Chris@42 165 T1c = FNMS(KP900968867, T1b, T1);
Chris@42 166 cr[WS(rs, 5)] = FNMS(T16, T17, T15);
Chris@42 167 T18 = T16 * T14;
Chris@42 168 T1k = FNMS(KP900968867, T1j, Tt);
Chris@42 169 T1f = FNMS(KP974927912, T1e, T1c);
Chris@42 170 T1q = FMA(KP974927912, T1e, T1c);
Chris@42 171 ci[WS(rs, 5)] = FMA(T13, T17, T18);
Chris@42 172 T1m = FMA(KP801937735, T1l, TB);
Chris@42 173 {
Chris@42 174 E T19, T1h, T1n, T1g, T1o;
/* Outputs 1 and 6 share T1c/T1e and T1k/T1m: slot 1 uses W[0], W[1] here;
   slot 6 (W[10], W[11]) is stored after the nested block closes. */
Chris@42 175 T19 = W[0];
Chris@42 176 T1h = W[1];
Chris@42 177 T1p = W[10];
Chris@42 178 T1t = FNMS(KP974927912, T1m, T1k);
Chris@42 179 T1n = FMA(KP974927912, T1m, T1k);
Chris@42 180 T1g = T19 * T1f;
Chris@42 181 T1o = T1h * T1f;
Chris@42 182 T1r = T1p * T1q;
Chris@42 183 T1s = W[11];
Chris@42 184 cr[WS(rs, 1)] = FNMS(T1h, T1n, T1g);
Chris@42 185 ci[WS(rs, 1)] = FMA(T19, T1n, T1o);
Chris@42 186 }
Chris@42 187 }
Chris@42 188 }
Chris@42 189 }
Chris@42 190 }
Chris@42 191 }
/* Final pair of the pass: slot 6, built from T1q/T1t and W[10] (T1p),
   W[11] (T1s) computed inside the block above. */
Chris@42 192 cr[WS(rs, 6)] = FNMS(T1s, T1t, T1r);
Chris@42 193 T1u = T1s * T1q;
Chris@42 194 ci[WS(rs, 6)] = FMA(T1p, T1t, T1u);
Chris@42 195 }
Chris@42 196 }
Chris@42 197 }
Chris@42 198
/* Twiddle instruction table referenced by desc in this branch: one TW_FULL
   entry with arguments (1, 7) followed by a TW_NEXT entry with (1, 0).
   NOTE(review): the meaning of the tw_instr fields is defined in the FFTW
   headers, not here; TW_NEXT presumably terminates the list -- confirm. */
Chris@42 199 static const tw_instr twinstr[] = {
Chris@42 200 {TW_FULL, 1, 7},
Chris@42 201 {TW_NEXT, 1, 0}
Chris@42 202 };
Chris@42 203
/* Codelet descriptor for the HAVE_FMA build: the value 7 (presumably the
   codelet size -- confirm against hc2hc_desc), the name "hb_7", the twinstr
   table, &GENUS, and the counts {18, 12, 54, 0}. The first three counts
   equal the additions, multiplications and fused multiply/adds quoted in
   this branch's generator comment. */
Chris@42 204 static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, {18, 12, 54, 0} };
Chris@42 205
/* Registration entry point (HAVE_FMA build): passes hb_7 and its descriptor
   to X(khc2hc_register) for planner p.
   NOTE(review): X() is a macro from the FFTW headers, presumably a
   symbol-name wrapper -- confirm. */
Chris@42 206 void X(codelet_hb_7) (planner *p) {
Chris@42 207 X(khc2hc_register) (p, hb_7, &desc);
Chris@42 208 }
Chris@42 209 #else /* HAVE_FMA */
Chris@42 210
Chris@42 211 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 7 -dif -name hb_7 -include hb.h */
Chris@42 212
Chris@42 213 /*
Chris@42 214 * This function contains 72 FP additions, 60 FP multiplications,
Chris@42 215 * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
Chris@42 216 * 36 stack variables, 6 constants, and 28 memory accesses
Chris@42 217 */
Chris@42 218 #include "hb.h"
Chris@42 219
/*
 * hb_7 (build without HAVE_FMA): size-7 hc2hc codelet. According to the
 * generator line earlier in this branch it was emitted by gen_hc2hc with
 * "-n 7 -dif -sign 1" (no -fma option).
 *
 * cr, ci : arrays updated in place. Every loop pass loads cr[WS(rs, k)] and
 *          ci[WS(rs, k)] for k = 0..6 and then stores to the same 14 slots.
 * W      : read-only table; 12 entries (W[0]..W[11]) are used per pass.
 * rs     : stride handed to WS() to address the 7 elements of cr and ci.
 * mb, me : the loop runs for m = mb while m < me.
 * ms     : per-pass step; cr moves forward by ms, ci moves backward by ms.
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS, FNMA and MAKE_VOLATILE_STRIDE
 * come from the FFTW headers, which are not visible here; their exact
 * definitions should be checked there.
 */
Chris@42 220 static void hb_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 221 {
/* Numeric constants; each KP name spells out the leading digits of its value. */
Chris@42 222 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@42 223 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 224 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@42 225 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@42 226 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 227 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@42 228 {
Chris@42 229 INT m;
/* W is first advanced by (mb - 1) * 12 entries; afterwards each pass moves
   cr forward by ms, ci backward by ms and W forward by 12. */
Chris@42 230 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@42 231 E T1, T4, T7, Ta, Tx, TI, TV, TQ, TE, Tm, Tb, Te, Th, Tk, Tq;
Chris@42 232 E TF, TR, TU, TJ, Tt;
Chris@42 233 {
Chris@42 234 E Tu, Tw, Tv, T2, T3;
/* First half of the inputs: cr[0], plus cr[k] and ci[k - 1] for k = 1..3,
   reduced to sums (T4, T7, Ta) and differences (Tu, Tw, Tv). */
Chris@42 235 T1 = cr[0];
Chris@42 236 T2 = cr[WS(rs, 1)];
Chris@42 237 T3 = ci[0];
Chris@42 238 T4 = T2 + T3;
Chris@42 239 Tu = T2 - T3;
Chris@42 240 {
Chris@42 241 E T5, T6, T8, T9;
Chris@42 242 T5 = cr[WS(rs, 2)];
Chris@42 243 T6 = ci[WS(rs, 1)];
Chris@42 244 T7 = T5 + T6;
Chris@42 245 Tw = T5 - T6;
Chris@42 246 T8 = cr[WS(rs, 3)];
Chris@42 247 T9 = ci[WS(rs, 2)];
Chris@42 248 Ta = T8 + T9;
Chris@42 249 Tv = T8 - T9;
Chris@42 250 }
/* Tx, TI, TV combine the three differences with KP433883739, KP781831482
   and KP974927912; TQ, TE, Tm combine T1 and the three sums with
   KP623489801, KP900968867 and KP222520933. */
Chris@42 251 Tx = FMA(KP433883739, Tu, KP974927912 * Tv) - (KP781831482 * Tw);
Chris@42 252 TI = FMA(KP781831482, Tu, KP974927912 * Tw) + (KP433883739 * Tv);
Chris@42 253 TV = FNMS(KP781831482, Tv, KP974927912 * Tu) - (KP433883739 * Tw);
Chris@42 254 TQ = FMA(KP623489801, Ta, T1) + FNMA(KP900968867, T7, KP222520933 * T4);
Chris@42 255 TE = FMA(KP623489801, T4, T1) + FNMA(KP900968867, Ta, KP222520933 * T7);
Chris@42 256 Tm = FMA(KP623489801, T7, T1) + FNMA(KP222520933, Ta, KP900968867 * T4);
Chris@42 257 }
Chris@42 258 {
Chris@42 259 E Tp, Tn, To, Tc, Td;
/* Second half of the inputs: ci[6], plus ci[k] and cr[k + 1] for k = 5, 4, 3,
   reduced to differences (Te, Th, Tk) and sums (Tp, Tn, To). */
Chris@42 260 Tb = ci[WS(rs, 6)];
Chris@42 261 Tc = ci[WS(rs, 5)];
Chris@42 262 Td = cr[WS(rs, 6)];
Chris@42 263 Te = Tc - Td;
Chris@42 264 Tp = Tc + Td;
Chris@42 265 {
Chris@42 266 E Tf, Tg, Ti, Tj;
Chris@42 267 Tf = ci[WS(rs, 4)];
Chris@42 268 Tg = cr[WS(rs, 5)];
Chris@42 269 Th = Tf - Tg;
Chris@42 270 Tn = Tf + Tg;
Chris@42 271 Ti = ci[WS(rs, 3)];
Chris@42 272 Tj = cr[WS(rs, 4)];
Chris@42 273 Tk = Ti - Tj;
Chris@42 274 To = Ti + Tj;
Chris@42 275 }
/* Same constant patterns as in the first half, applied to the second-half
   sums (Tq, TF, TR) and to Tb with the differences (TU, TJ, Tt). */
Chris@42 276 Tq = FNMS(KP974927912, To, KP781831482 * Tn) - (KP433883739 * Tp);
Chris@42 277 TF = FMA(KP781831482, Tp, KP974927912 * Tn) + (KP433883739 * To);
Chris@42 278 TR = FMA(KP433883739, Tn, KP781831482 * To) - (KP974927912 * Tp);
Chris@42 279 TU = FMA(KP623489801, Tk, Tb) + FNMA(KP900968867, Th, KP222520933 * Te);
Chris@42 280 TJ = FMA(KP623489801, Te, Tb) + FNMA(KP900968867, Tk, KP222520933 * Th);
Chris@42 281 Tt = FMA(KP623489801, Th, Tb) + FNMA(KP222520933, Tk, KP900968867 * Te);
Chris@42 282 }
/* All 14 inputs have been read by now, so the in-place stores can begin.
   Slot 0 receives plain sums with no W entry. */
Chris@42 283 cr[0] = T1 + T4 + T7 + Ta;
Chris@42 284 ci[0] = Tb + Te + Th + Tk;
/* Each block below forms one value pair from the intermediates, loads two
   consecutive W entries (W[2k - 2], W[2k - 1] for slot k) and stores
   cr/ci[WS(rs, k)]. The slots are written in the order 4, 2, 3, 6, 5, 1. */
Chris@42 285 {
Chris@42 286 E Tr, Ty, Tl, Ts;
Chris@42 287 Tr = Tm - Tq;
Chris@42 288 Ty = Tt - Tx;
Chris@42 289 Tl = W[6];
Chris@42 290 Ts = W[7];
Chris@42 291 cr[WS(rs, 4)] = FNMS(Ts, Ty, Tl * Tr);
Chris@42 292 ci[WS(rs, 4)] = FMA(Tl, Ty, Ts * Tr);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 E TY, T10, TX, TZ;
Chris@42 296 TY = TQ + TR;
Chris@42 297 T10 = TV + TU;
Chris@42 298 TX = W[2];
Chris@42 299 TZ = W[3];
Chris@42 300 cr[WS(rs, 2)] = FNMS(TZ, T10, TX * TY);
Chris@42 301 ci[WS(rs, 2)] = FMA(TX, T10, TZ * TY);
Chris@42 302 }
Chris@42 303 {
Chris@42 304 E TA, TC, Tz, TB;
Chris@42 305 TA = Tm + Tq;
Chris@42 306 TC = Tx + Tt;
Chris@42 307 Tz = W[4];
Chris@42 308 TB = W[5];
Chris@42 309 cr[WS(rs, 3)] = FNMS(TB, TC, Tz * TA);
Chris@42 310 ci[WS(rs, 3)] = FMA(Tz, TC, TB * TA);
Chris@42 311 }
Chris@42 312 {
Chris@42 313 E TM, TO, TL, TN;
Chris@42 314 TM = TE + TF;
Chris@42 315 TO = TJ - TI;
Chris@42 316 TL = W[10];
Chris@42 317 TN = W[11];
Chris@42 318 cr[WS(rs, 6)] = FNMS(TN, TO, TL * TM);
Chris@42 319 ci[WS(rs, 6)] = FMA(TL, TO, TN * TM);
Chris@42 320 }
Chris@42 321 {
Chris@42 322 E TS, TW, TP, TT;
Chris@42 323 TS = TQ - TR;
Chris@42 324 TW = TU - TV;
Chris@42 325 TP = W[8];
Chris@42 326 TT = W[9];
Chris@42 327 cr[WS(rs, 5)] = FNMS(TT, TW, TP * TS);
Chris@42 328 ci[WS(rs, 5)] = FMA(TP, TW, TT * TS);
Chris@42 329 }
Chris@42 330 {
Chris@42 331 E TG, TK, TD, TH;
Chris@42 332 TG = TE - TF;
Chris@42 333 TK = TI + TJ;
Chris@42 334 TD = W[0];
Chris@42 335 TH = W[1];
Chris@42 336 cr[WS(rs, 1)] = FNMS(TH, TK, TD * TG);
Chris@42 337 ci[WS(rs, 1)] = FMA(TD, TK, TH * TG);
Chris@42 338 }
Chris@42 339 }
Chris@42 340 }
Chris@42 341 }
Chris@42 342
/* Twiddle instruction table referenced by desc in this branch: one TW_FULL
   entry with arguments (1, 7) followed by a TW_NEXT entry with (1, 0).
   NOTE(review): the meaning of the tw_instr fields is defined in the FFTW
   headers, not here; TW_NEXT presumably terminates the list -- confirm. */
Chris@42 343 static const tw_instr twinstr[] = {
Chris@42 344 {TW_FULL, 1, 7},
Chris@42 345 {TW_NEXT, 1, 0}
Chris@42 346 };
Chris@42 347
/* Codelet descriptor for the build without HAVE_FMA: the value 7 (presumably
   the codelet size -- confirm against hc2hc_desc), the name "hb_7", the
   twinstr table, &GENUS, and the counts {36, 24, 36, 0}. The first three
   counts equal the additions, multiplications and fused multiply/adds quoted
   in this branch's generator comment. */
Chris@42 348 static const hc2hc_desc desc = { 7, "hb_7", twinstr, &GENUS, {36, 24, 36, 0} };
Chris@42 349
/* Registration entry point (build without HAVE_FMA): passes hb_7 and its
   descriptor to X(khc2hc_register) for planner p.
   NOTE(review): X() is a macro from the FFTW headers, presumably a
   symbol-name wrapper -- confirm. */
Chris@42 350 void X(codelet_hb_7) (planner *p) {
Chris@42 351 X(khc2hc_register) (p, hb_7, &desc);
Chris@42 352 }
Chris@42 353 #endif /* HAVE_FMA */