annotate src/fftw-3.3.5/rdft/scalar/r2cf/hf_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:15 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include hf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 72 FP additions, 66 FP multiplications,
Chris@42 32 * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
Chris@42 33 * 62 stack variables, 6 constants, and 28 memory accesses
Chris@42 34 */
Chris@42 35 #include "hf.h"
Chris@42 36
/*
 * hf_7 (HAVE_FMA variant): size-7 halfcomplex twiddle codelet, generated by
 * gen_hc2hc with "-n 7 -dit -fma" (see the "Generated by" comment above).
 *
 * Parameters, as used by the code below:
 *   cr, ci  data pointers; elements 0..6 (addressed through WS(rs, k)) are
 *           read and then overwritten in place.  cr advances by +ms and ci
 *           by -ms after every iteration.
 *   W       twiddle table; offset by (mb - 1) * 12 before the first
 *           iteration, then 12 values (W[0]..W[11]) are consumed per
 *           iteration.
 *   rs      stride handed to WS() to locate the 7 elements.
 *   mb, me  half-open iteration range: the loop runs for m = mb .. me - 1.
 *   ms      per-iteration pointer step for cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA/FNMS/FMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm against the
 * project headers.
 */
Chris@42 37 static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* KP974927912 is numerically sin(4*pi/7) and KP900968867 is cos(pi/7).
 * The other four are derived constants of the factored FMA form
 * (presumably ratios of the size-7 sines/cosines -- TODO confirm). */
Chris@42 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 40 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@42 41 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 42 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@42 43 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@42 44 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@42 45 {
Chris@42 46 INT m;
/* One size-7 butterfly per iteration; all pointer updates happen in the
 * increment clause of the for statement. */
Chris@42 47 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
/* These temporaries outlive the nested scopes below and feed the last four
 * output stores at the end of the loop body. */
Chris@42 48 E T1, TR, T18, T10, T12, T16, T11, T13;
Chris@42 49 {
Chris@42 50 E T19, T1a, T1i, Te, Tt, Tw, T1b, TM, T1h, Tr, Tu, TS, Tz, TC, Ty;
Chris@42 51 E Tv, TB;
/* Element 0 is used as-is; no twiddle factor is applied to it. */
Chris@42 52 T1 = cr[0];
Chris@42 53 T19 = ci[0];
Chris@42 54 {
Chris@42 55 E T9, Tc, TP, Ta, Tb, TO, T7;
Chris@42 56 {
Chris@42 57 E T3, T6, T8, TN, T4, T2, T5;
/* Elements 1 and 6: load the data and their twiddle pairs (W[0],W[1]) and
 * (W[10],W[11]).  T7/TO are the twiddled pair for element 1:
 * T7 = W[0]*cr + W[1]*ci, TO = W[0]*ci - W[1]*cr (under the macro
 * assumption stated in the function header). */
Chris@42 58 T3 = cr[WS(rs, 1)];
Chris@42 59 T6 = ci[WS(rs, 1)];
Chris@42 60 T2 = W[0];
Chris@42 61 T9 = cr[WS(rs, 6)];
Chris@42 62 Tc = ci[WS(rs, 6)];
Chris@42 63 T8 = W[10];
Chris@42 64 TN = T2 * T6;
Chris@42 65 T4 = T2 * T3;
Chris@42 66 T5 = W[1];
Chris@42 67 TP = T8 * Tc;
Chris@42 68 Ta = T8 * T9;
Chris@42 69 Tb = W[11];
Chris@42 70 TO = FNMS(T5, T3, TN);
Chris@42 71 T7 = FMA(T5, T6, T4);
Chris@42 72 }
Chris@42 73 {
Chris@42 74 E Tg, Tj, Th, TI, Tm, Tp, Tl, Ti, To, TQ, Td, Tf;
Chris@42 75 Tg = cr[WS(rs, 2)];
/* Finish the twiddle multiply for element 6 (Td, TQ). */
Chris@42 76 TQ = FNMS(Tb, T9, TP);
Chris@42 77 Td = FMA(Tb, Tc, Ta);
Chris@42 78 Tj = ci[WS(rs, 2)];
Chris@42 79 Tf = W[2];
/* Sums and differences of the twiddled elements 1 and 6. */
Chris@42 80 T1a = TO + TQ;
Chris@42 81 TR = TO - TQ;
Chris@42 82 T1i = Td - T7;
Chris@42 83 Te = T7 + Td;
/* Elements 2 and 5 use twiddle pairs (W[2],W[3]) and (W[8],W[9]). */
Chris@42 84 Th = Tf * Tg;
Chris@42 85 TI = Tf * Tj;
Chris@42 86 Tm = cr[WS(rs, 5)];
Chris@42 87 Tp = ci[WS(rs, 5)];
Chris@42 88 Tl = W[8];
Chris@42 89 Ti = W[3];
Chris@42 90 To = W[9];
Chris@42 91 {
Chris@42 92 E TJ, Tk, TL, Tq, TK, Tn, Ts;
Chris@42 93 Tt = cr[WS(rs, 3)];
Chris@42 94 TK = Tl * Tp;
Chris@42 95 Tn = Tl * Tm;
Chris@42 96 TJ = FNMS(Ti, Tg, TI);
Chris@42 97 Tk = FMA(Ti, Tj, Th);
Chris@42 98 TL = FNMS(To, Tm, TK);
Chris@42 99 Tq = FMA(To, Tp, Tn);
Chris@42 100 Tw = ci[WS(rs, 3)];
Chris@42 101 Ts = W[4];
/* Sums and differences of the twiddled elements 2 and 5. */
Chris@42 102 T1b = TJ + TL;
Chris@42 103 TM = TJ - TL;
Chris@42 104 T1h = Tq - Tk;
Chris@42 105 Tr = Tk + Tq;
Chris@42 106 Tu = Ts * Tt;
Chris@42 107 TS = Ts * Tw;
Chris@42 108 }
/* Elements 3 and 4 use twiddle pairs (W[4],W[5]) and (W[6],W[7]). */
Chris@42 109 Tz = cr[WS(rs, 4)];
Chris@42 110 TC = ci[WS(rs, 4)];
Chris@42 111 Ty = W[6];
Chris@42 112 Tv = W[5];
Chris@42 113 TB = W[7];
Chris@42 114 }
Chris@42 115 }
Chris@42 116 {
Chris@42 117 E TF, TT, Tx, TV, TD, T1q, TU, TA;
Chris@42 118 TF = FNMS(KP356895867, Tr, Te);
Chris@42 119 TU = Ty * TC;
Chris@42 120 TA = Ty * Tz;
Chris@42 121 TT = FNMS(Tv, Tt, TS);
Chris@42 122 Tx = FMA(Tv, Tw, Tu);
Chris@42 123 TV = FNMS(TB, Tz, TU);
Chris@42 124 TD = FMA(TB, TC, TA);
Chris@42 125 T1q = FNMS(KP356895867, T1b, T1a);
Chris@42 126 {
Chris@42 127 E TW, TE, T1k, T1f;
Chris@42 128 {
Chris@42 129 E T1e, T1s, TY, T1p, T1u, TH, T1n, T1j, T1c, T1g;
Chris@42 130 T1j = FNMS(KP554958132, T1i, T1h);
/* Sums and differences of the twiddled elements 3 and 4. */
Chris@42 131 T1c = TT + TV;
Chris@42 132 TW = TT - TV;
Chris@42 133 T1g = TD - Tx;
Chris@42 134 TE = Tx + TD;
Chris@42 135 {
Chris@42 136 E T1d, T1l, T1r, TX;
Chris@42 137 T1d = FNMS(KP356895867, T1c, T1b);
Chris@42 138 T1l = FNMS(KP356895867, T1a, T1c);
Chris@42 139 T1r = FNMS(KP692021471, T1q, T1c);
/* Plain sum of ci[0] and the three T1a/T1b/T1c pair sums. */
Chris@42 140 ci[WS(rs, 6)] = T1a + T1b + T1c + T19;
Chris@42 141 TX = FMA(KP554958132, TW, TR);
Chris@42 142 {
Chris@42 143 E T1o, T1t, TG, T1m;
Chris@42 144 T1o = FMA(KP554958132, T1h, T1g);
Chris@42 145 T1t = FMA(KP554958132, T1g, T1i);
Chris@42 146 TG = FNMS(KP692021471, TF, TE);
/* Plain sum of cr[0] and the three Te/Tr/TE pair sums. */
Chris@42 147 cr[0] = T1 + Te + Tr + TE;
Chris@42 148 T1e = FNMS(KP692021471, T1d, T1a);
Chris@42 149 T1m = FNMS(KP692021471, T1l, T1b);
Chris@42 150 T1s = FNMS(KP900968867, T1r, T19);
Chris@42 151 TY = FMA(KP801937735, TX, TM);
Chris@42 152 T1p = FNMS(KP801937735, T1o, T1i);
Chris@42 153 T1u = FMA(KP801937735, T1t, T1h);
Chris@42 154 TH = FNMS(KP900968867, TG, T1);
Chris@42 155 T1n = FNMS(KP900968867, T1m, T19);
Chris@42 156 T1k = FNMS(KP801937735, T1j, T1g);
Chris@42 157 }
Chris@42 158 }
/* Each remaining pair of stores combines the same two temporaries with
 * opposite signs of the KP974927912 product. */
Chris@42 159 ci[WS(rs, 5)] = FMA(KP974927912, T1u, T1s);
Chris@42 160 cr[WS(rs, 6)] = FMS(KP974927912, T1u, T1s);
Chris@42 161 cr[WS(rs, 1)] = FMA(KP974927912, TY, TH);
Chris@42 162 ci[0] = FNMS(KP974927912, TY, TH);
Chris@42 163 ci[WS(rs, 4)] = FMA(KP974927912, T1p, T1n);
Chris@42 164 cr[WS(rs, 5)] = FMS(KP974927912, T1p, T1n);
Chris@42 165 T1f = FNMS(KP900968867, T1e, T19);
Chris@42 166 }
Chris@42 167 {
Chris@42 168 E T14, T17, T15, TZ;
Chris@42 169 T14 = FNMS(KP356895867, TE, Tr);
Chris@42 170 T17 = FNMS(KP554958132, TR, TM);
Chris@42 171 TZ = FNMS(KP356895867, Te, TE);
Chris@42 172 ci[WS(rs, 3)] = FMA(KP974927912, T1k, T1f);
Chris@42 173 cr[WS(rs, 4)] = FMS(KP974927912, T1k, T1f);
Chris@42 174 T15 = FNMS(KP692021471, T14, Te);
Chris@42 175 T18 = FNMS(KP801937735, T17, TW);
Chris@42 176 T10 = FNMS(KP692021471, TZ, Tr);
Chris@42 177 T12 = FMA(KP554958132, TM, TW);
Chris@42 178 T16 = FNMS(KP900968867, T15, T1);
Chris@42 179 }
Chris@42 180 }
Chris@42 181 }
Chris@42 182 }
/* Last four stores, built from the temporaries declared at loop scope. */
Chris@42 183 T11 = FNMS(KP900968867, T10, T1);
Chris@42 184 T13 = FNMS(KP801937735, T12, TR);
Chris@42 185 cr[WS(rs, 3)] = FMA(KP974927912, T18, T16);
Chris@42 186 ci[WS(rs, 2)] = FNMS(KP974927912, T18, T16);
Chris@42 187 cr[WS(rs, 2)] = FMA(KP974927912, T13, T11);
Chris@42 188 ci[WS(rs, 1)] = FNMS(KP974927912, T13, T11);
Chris@42 189 }
Chris@42 190 }
Chris@42 191 }
Chris@42 192
/* Twiddle-instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably the TW_FULL entry requests the full twiddle set for radix 7
 * (hf_7 reads W[0]..W[11] per iteration) and TW_NEXT ends the list --
 * confirm against the tw_instr definition. */
Chris@42 193 static const tw_instr twinstr[] = {
Chris@42 194 {TW_FULL, 1, 7},
Chris@42 195 {TW_NEXT, 1, 0}
Chris@42 196 };
Chris@42 197
/* Descriptor for the FMA variant: radix 7, name "hf_7", the twiddle table
 * above, and GENUS (defined elsewhere).  The trailing {18, 12, 54, 0} matches
 * the operation counts in the generator comment above (18 additions,
 * 12 multiplications, 54 fused multiply/adds); the meaning of the fourth
 * field is not visible here. */
Chris@42 198 static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {18, 12, 54, 0} };
Chris@42 199
/* Registration entry point (FMA build): hands the hf_7 kernel and its
 * descriptor to the planner p via khc2hc_register.  X() is a name-wrapping
 * macro defined elsewhere. */
Chris@42 200 void X(codelet_hf_7) (planner *p) {
Chris@42 201 X(khc2hc_register) (p, hf_7, &desc);
Chris@42 202 }
Chris@42 203 #else /* HAVE_FMA */
Chris@42 204
Chris@42 205 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include hf.h */
Chris@42 206
Chris@42 207 /*
Chris@42 208 * This function contains 72 FP additions, 60 FP multiplications,
Chris@42 209 * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
Chris@42 210 * 29 stack variables, 6 constants, and 28 memory accesses
Chris@42 211 */
Chris@42 212 #include "hf.h"
Chris@42 213
/*
 * hf_7 (non-FMA variant): size-7 halfcomplex twiddle codelet, generated by
 * gen_hc2hc with "-n 7 -dit" (see the "Generated by" comment above).
 *
 * Parameters, as used by the code below:
 *   cr, ci  data pointers; elements 0..6 (addressed through WS(rs, k)) are
 *           read and then overwritten in place.  cr advances by +ms and ci
 *           by -ms after every iteration.
 *   W       twiddle table; offset by (mb - 1) * 12 before the first
 *           iteration, then 12 values (W[0]..W[11]) are consumed per
 *           iteration.
 *   rs      stride handed to WS() to locate the 7 elements.
 *   mb, me  half-open iteration range: the loop runs for m = mb .. me - 1.
 *   ms      per-iteration pointer step for cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA/FNMS/FNMA, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The comments below assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FNMA(a,b,c) = -(a*b) - c -- confirm against the
 * project headers.
 */
Chris@42 214 static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 215 {
/* Numerically: KP781831482 = sin(2*pi/7), KP974927912 = sin(4*pi/7),
 * KP433883739 = sin(6*pi/7), KP623489801 = cos(2*pi/7),
 * KP222520933 = -cos(4*pi/7), KP900968867 = -cos(6*pi/7). */
Chris@42 216 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@42 217 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@42 218 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@42 219 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@42 220 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@42 221 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@42 222 {
Chris@42 223 INT m;
/* One size-7 butterfly per iteration; all pointer updates happen in the
 * increment clause of the for statement. */
Chris@42 224 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@42 225 E T1, TT, Tc, TV, TC, TO, Tn, TS, TI, TP, Ty, TU, TF, TQ;
/* Element 0 is used as-is; no twiddle factor is applied to it. */
Chris@42 226 T1 = cr[0];
Chris@42 227 TT = ci[0];
/* Elements 1 and 6: twiddle with (W[0],W[1]) and (W[10],W[11]), then form
 * the sums (Tc, TV) and differences (TC, TO) of the twiddled pairs. */
Chris@42 228 {
Chris@42 229 E T6, TA, Tb, TB;
Chris@42 230 {
Chris@42 231 E T3, T5, T2, T4;
Chris@42 232 T3 = cr[WS(rs, 1)];
Chris@42 233 T5 = ci[WS(rs, 1)];
Chris@42 234 T2 = W[0];
Chris@42 235 T4 = W[1];
/* T6 = W[0]*cr + W[1]*ci and TA = W[0]*ci - W[1]*cr, under the macro
 * assumption stated in the function header. */
Chris@42 236 T6 = FMA(T2, T3, T4 * T5);
Chris@42 237 TA = FNMS(T4, T3, T2 * T5);
Chris@42 238 }
Chris@42 239 {
Chris@42 240 E T8, Ta, T7, T9;
Chris@42 241 T8 = cr[WS(rs, 6)];
Chris@42 242 Ta = ci[WS(rs, 6)];
Chris@42 243 T7 = W[10];
Chris@42 244 T9 = W[11];
Chris@42 245 Tb = FMA(T7, T8, T9 * Ta);
Chris@42 246 TB = FNMS(T9, T8, T7 * Ta);
Chris@42 247 }
Chris@42 248 Tc = T6 + Tb;
Chris@42 249 TV = TA + TB;
Chris@42 250 TC = TA - TB;
Chris@42 251 TO = Tb - T6;
Chris@42 252 }
/* Elements 2 and 5: twiddle with (W[2],W[3]) and (W[8],W[9]), then form
 * the sums (Tn, TS) and differences (TI, TP). */
Chris@42 253 {
Chris@42 254 E Th, TG, Tm, TH;
Chris@42 255 {
Chris@42 256 E Te, Tg, Td, Tf;
Chris@42 257 Te = cr[WS(rs, 2)];
Chris@42 258 Tg = ci[WS(rs, 2)];
Chris@42 259 Td = W[2];
Chris@42 260 Tf = W[3];
Chris@42 261 Th = FMA(Td, Te, Tf * Tg);
Chris@42 262 TG = FNMS(Tf, Te, Td * Tg);
Chris@42 263 }
Chris@42 264 {
Chris@42 265 E Tj, Tl, Ti, Tk;
Chris@42 266 Tj = cr[WS(rs, 5)];
Chris@42 267 Tl = ci[WS(rs, 5)];
Chris@42 268 Ti = W[8];
Chris@42 269 Tk = W[9];
Chris@42 270 Tm = FMA(Ti, Tj, Tk * Tl);
Chris@42 271 TH = FNMS(Tk, Tj, Ti * Tl);
Chris@42 272 }
Chris@42 273 Tn = Th + Tm;
Chris@42 274 TS = TG + TH;
Chris@42 275 TI = TG - TH;
/* Note the operand order: Th - Tm here, whereas the other two pairs
 * subtract the lower-indexed element (TO = Tb - T6, TQ = Tx - Ts). */
Chris@42 276 TP = Th - Tm;
Chris@42 277 }
/* Elements 3 and 4: twiddle with (W[4],W[5]) and (W[6],W[7]), then form
 * the sums (Ty, TU) and differences (TF, TQ). */
Chris@42 278 {
Chris@42 279 E Ts, TD, Tx, TE;
Chris@42 280 {
Chris@42 281 E Tp, Tr, To, Tq;
Chris@42 282 Tp = cr[WS(rs, 3)];
Chris@42 283 Tr = ci[WS(rs, 3)];
Chris@42 284 To = W[4];
Chris@42 285 Tq = W[5];
Chris@42 286 Ts = FMA(To, Tp, Tq * Tr);
Chris@42 287 TD = FNMS(Tq, Tp, To * Tr);
Chris@42 288 }
Chris@42 289 {
Chris@42 290 E Tu, Tw, Tt, Tv;
Chris@42 291 Tu = cr[WS(rs, 4)];
Chris@42 292 Tw = ci[WS(rs, 4)];
Chris@42 293 Tt = W[6];
Chris@42 294 Tv = W[7];
Chris@42 295 Tx = FMA(Tt, Tu, Tv * Tw);
Chris@42 296 TE = FNMS(Tv, Tu, Tt * Tw);
Chris@42 297 }
Chris@42 298 Ty = Ts + Tx;
Chris@42 299 TU = TD + TE;
Chris@42 300 TF = TD - TE;
Chris@42 301 TQ = Tx - Ts;
Chris@42 302 }
/* Output stage.  Each output pair is (cosine-weighted combination of the
 * sums) +/- (sine-weighted combination of the differences). */
Chris@42 303 {
Chris@42 304 E TL, TK, TZ, T10;
/* Plain sum of cr[0] and the three Tc/Tn/Ty pair sums. */
Chris@42 305 cr[0] = T1 + Tc + Tn + Ty;
Chris@42 306 TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
Chris@42 307 TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
Chris@42 308 ci[0] = TK - TL;
Chris@42 309 cr[WS(rs, 1)] = TK + TL;
/* Plain sum of ci[0] and the three TV/TS/TU pair sums. */
Chris@42 310 ci[WS(rs, 6)] = TV + TS + TU + TT;
Chris@42 311 TZ = FMA(KP781831482, TO, KP433883739 * TQ) - (KP974927912 * TP);
Chris@42 312 T10 = FMA(KP623489801, TV, TT) + FNMA(KP900968867, TU, KP222520933 * TS);
Chris@42 313 cr[WS(rs, 6)] = TZ - T10;
Chris@42 314 ci[WS(rs, 5)] = TZ + T10;
Chris@42 315 }
Chris@42 316 {
Chris@42 317 E TX, TY, TR, TW;
Chris@42 318 TX = FMA(KP974927912, TO, KP433883739 * TP) - (KP781831482 * TQ);
Chris@42 319 TY = FMA(KP623489801, TU, TT) + FNMA(KP900968867, TS, KP222520933 * TV);
Chris@42 320 cr[WS(rs, 5)] = TX - TY;
Chris@42 321 ci[WS(rs, 4)] = TX + TY;
Chris@42 322 TR = FMA(KP433883739, TO, KP781831482 * TP) + (KP974927912 * TQ);
Chris@42 323 TW = FMA(KP623489801, TS, TT) + FNMA(KP222520933, TU, KP900968867 * TV);
Chris@42 324 cr[WS(rs, 4)] = TR - TW;
Chris@42 325 ci[WS(rs, 3)] = TR + TW;
Chris@42 326 }
Chris@42 327 {
Chris@42 328 E TN, TM, TJ, Tz;
Chris@42 329 TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
Chris@42 330 TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
Chris@42 331 ci[WS(rs, 2)] = TM - TN;
Chris@42 332 cr[WS(rs, 3)] = TM + TN;
Chris@42 333 TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Chris@42 334 Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
Chris@42 335 ci[WS(rs, 1)] = Tz - TJ;
Chris@42 336 cr[WS(rs, 2)] = Tz + TJ;
Chris@42 337 }
Chris@42 338 }
Chris@42 339 }
Chris@42 340 }
Chris@42 341
/* Twiddle-instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably the TW_FULL entry requests the full twiddle set for radix 7
 * (hf_7 reads W[0]..W[11] per iteration) and TW_NEXT ends the list --
 * confirm against the tw_instr definition. */
Chris@42 342 static const tw_instr twinstr[] = {
Chris@42 343 {TW_FULL, 1, 7},
Chris@42 344 {TW_NEXT, 1, 0}
Chris@42 345 };
Chris@42 346
/* Descriptor for the non-FMA variant: radix 7, name "hf_7", the twiddle
 * table above, and GENUS (defined elsewhere).  The trailing {36, 24, 36, 0}
 * matches the operation counts in the generator comment above (36 additions,
 * 24 multiplications, 36 fused multiply/adds); the meaning of the fourth
 * field is not visible here. */
Chris@42 347 static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {36, 24, 36, 0} };
Chris@42 348
/* Registration entry point (non-FMA build): hands the hf_7 kernel and its
 * descriptor to the planner p via khc2hc_register.  X() is a name-wrapping
 * macro defined elsewhere. */
Chris@42 349 void X(codelet_hf_7) (planner *p) {
Chris@42 350 X(khc2hc_register) (p, hf_7, &desc);
Chris@42 351 }
Chris@42 352 #endif /* HAVE_FMA */