annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_7.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:28 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 72 FP additions, 66 FP multiplications,
Chris@82 32 * (or, 18 additions, 12 multiplications, 54 fused multiply/add),
Chris@82 33 * 37 stack variables, 6 constants, and 28 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
/*
 * hf_7 (variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): in-place, fully unrolled 7-point twiddle codelet over the
 * cr/ci array pair.
 *
 * cr, ci  arrays that are both read and overwritten; elements are addressed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W       twiddle table; twelve values W[0..11] are consumed per iteration.
 * rs      stride handed to WS() for the element index k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms      per-iteration step: cr advances by +ms while ci advances by -ms.
 *
 * NOTE(review): FMA, FNMS, FMS, DK, E, R and WS come from the included
 * codelet headers, not from this file. The comments below presume
 * FMA(a, b, c) = a*b + c, FNMS(a, b, c) = c - a*b and FMS(a, b, c) = a*b - c
 * -- confirm against the header before relying on them.
 */
Chris@82 37 static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants used by the output stages below. */
Chris@82 39 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@82 40 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@82 41 DK(KP801937735, +0.801937735804838252472204639014890102331838324);
Chris@82 42 DK(KP554958132, +0.554958132087371191422194871006410481067288862);
Chris@82 43 DK(KP692021471, +0.692021471630095869627814897002069140197260599);
Chris@82 44 DK(KP356895867, +0.356895867892209443894399510021300583399127187);
Chris@82 45 {
Chris@82 46 INT m;
/*
 * W is pre-advanced by (mb - 1) * 12 so that the first iteration reads the
 * twiddles belonging to m = mb; each later iteration moves on by 12 values.
 * MAKE_VOLATILE_STRIDE is evaluated as part of the loop increment; its
 * effect is defined outside this file.
 */
Chris@82 47 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@82 48 E T1, T19, Te, T1i, TR, T1a, Tr, T1h, TM, T1b, TE, T1g, TW, T1c;
/* Element 0 is loaded as-is; no twiddle is applied to it. */
Chris@82 49 T1 = cr[0];
Chris@82 50 T19 = ci[0];
/*
 * Elements 1 and 6: combine each (cr, ci) pair with its twiddle pair
 * (W[0], W[1]) resp. (W[10], W[11]), then keep the sum and difference of
 * the two results (Te, T1i from the FMA terms; T1a, TR from the FNMS terms).
 */
Chris@82 51 {
Chris@82 52 E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
Chris@82 53 T3 = cr[WS(rs, 1)];
Chris@82 54 T6 = ci[WS(rs, 1)];
Chris@82 55 T2 = W[0];
Chris@82 56 T4 = T2 * T3;
Chris@82 57 TN = T2 * T6;
Chris@82 58 T9 = cr[WS(rs, 6)];
Chris@82 59 Tc = ci[WS(rs, 6)];
Chris@82 60 T8 = W[10];
Chris@82 61 Ta = T8 * T9;
Chris@82 62 TP = T8 * Tc;
Chris@82 63 {
Chris@82 64 E T7, TO, Td, TQ, T5, Tb;
Chris@82 65 T5 = W[1];
Chris@82 66 T7 = FMA(T5, T6, T4);
Chris@82 67 TO = FNMS(T5, T3, TN);
Chris@82 68 Tb = W[11];
Chris@82 69 Td = FMA(Tb, Tc, Ta);
Chris@82 70 TQ = FNMS(Tb, T9, TP);
Chris@82 71 Te = T7 + Td;
Chris@82 72 T1i = Td - T7;
Chris@82 73 TR = TO - TQ;
Chris@82 74 T1a = TO + TQ;
Chris@82 75 }
Chris@82 76 }
/* Elements 2 and 5: same pattern with twiddles (W[2], W[3]) and (W[8], W[9]). */
Chris@82 77 {
Chris@82 78 E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
Chris@82 79 Tg = cr[WS(rs, 2)];
Chris@82 80 Tj = ci[WS(rs, 2)];
Chris@82 81 Tf = W[2];
Chris@82 82 Th = Tf * Tg;
Chris@82 83 TI = Tf * Tj;
Chris@82 84 Tm = cr[WS(rs, 5)];
Chris@82 85 Tp = ci[WS(rs, 5)];
Chris@82 86 Tl = W[8];
Chris@82 87 Tn = Tl * Tm;
Chris@82 88 TK = Tl * Tp;
Chris@82 89 {
Chris@82 90 E Tk, TJ, Tq, TL, Ti, To;
Chris@82 91 Ti = W[3];
Chris@82 92 Tk = FMA(Ti, Tj, Th);
Chris@82 93 TJ = FNMS(Ti, Tg, TI);
Chris@82 94 To = W[9];
Chris@82 95 Tq = FMA(To, Tp, Tn);
Chris@82 96 TL = FNMS(To, Tm, TK);
Chris@82 97 Tr = Tk + Tq;
Chris@82 98 T1h = Tq - Tk;
Chris@82 99 TM = TJ - TL;
Chris@82 100 T1b = TJ + TL;
Chris@82 101 }
Chris@82 102 }
/* Elements 3 and 4: same pattern with twiddles (W[4], W[5]) and (W[6], W[7]). */
Chris@82 103 {
Chris@82 104 E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
Chris@82 105 Tt = cr[WS(rs, 3)];
Chris@82 106 Tw = ci[WS(rs, 3)];
Chris@82 107 Ts = W[4];
Chris@82 108 Tu = Ts * Tt;
Chris@82 109 TS = Ts * Tw;
Chris@82 110 Tz = cr[WS(rs, 4)];
Chris@82 111 TC = ci[WS(rs, 4)];
Chris@82 112 Ty = W[6];
Chris@82 113 TA = Ty * Tz;
Chris@82 114 TU = Ty * TC;
Chris@82 115 {
Chris@82 116 E Tx, TT, TD, TV, Tv, TB;
Chris@82 117 Tv = W[5];
Chris@82 118 Tx = FMA(Tv, Tw, Tu);
Chris@82 119 TT = FNMS(Tv, Tt, TS);
Chris@82 120 TB = W[7];
Chris@82 121 TD = FMA(TB, TC, TA);
Chris@82 122 TV = FNMS(TB, Tz, TU);
Chris@82 123 TE = Tx + TD;
Chris@82 124 T1g = TD - Tx;
Chris@82 125 TW = TT - TV;
Chris@82 126 T1c = TT + TV;
Chris@82 127 }
Chris@82 128 }
/*
 * Output stage. All inputs were read into locals above, so the stores below
 * can overwrite cr/ci in place. cr[0] receives the plain sum of T1 and the
 * three pair sums Te, Tr, TE.
 */
Chris@82 129 cr[0] = T1 + Te + Tr + TE;
/* Writes ci[0] and cr[WS(rs, 1)] as TH -/+ KP974927912 * TY. */
Chris@82 130 {
Chris@82 131 E TG, TY, TF, TX, TH;
Chris@82 132 TF = FNMS(KP356895867, Tr, Te);
Chris@82 133 TG = FNMS(KP692021471, TF, TE);
Chris@82 134 TX = FMA(KP554958132, TW, TR);
Chris@82 135 TY = FMA(KP801937735, TX, TM);
Chris@82 136 TH = FNMS(KP900968867, TG, T1);
Chris@82 137 ci[0] = FNMS(KP974927912, TY, TH);
Chris@82 138 cr[WS(rs, 1)] = FMA(KP974927912, TY, TH);
Chris@82 139 }
/* ci[WS(rs, 6)] receives the plain sum of T19 and the pair sums T1a, T1b, T1c. */
Chris@82 140 ci[WS(rs, 6)] = T1a + T1b + T1c + T19;
/* Writes cr[WS(rs, 6)] (via FMS) and ci[WS(rs, 5)] (via FMA). */
Chris@82 141 {
Chris@82 142 E T1r, T1u, T1q, T1t, T1s;
Chris@82 143 T1q = FNMS(KP356895867, T1b, T1a);
Chris@82 144 T1r = FNMS(KP692021471, T1q, T1c);
Chris@82 145 T1t = FMA(KP554958132, T1g, T1i);
Chris@82 146 T1u = FMA(KP801937735, T1t, T1h);
Chris@82 147 T1s = FNMS(KP900968867, T1r, T19);
Chris@82 148 cr[WS(rs, 6)] = FMS(KP974927912, T1u, T1s);
Chris@82 149 ci[WS(rs, 5)] = FMA(KP974927912, T1u, T1s);
Chris@82 150 }
/* Writes cr[WS(rs, 5)] (via FMS) and ci[WS(rs, 4)] (via FMA). */
Chris@82 151 {
Chris@82 152 E T1m, T1p, T1l, T1o, T1n;
Chris@82 153 T1l = FNMS(KP356895867, T1a, T1c);
Chris@82 154 T1m = FNMS(KP692021471, T1l, T1b);
Chris@82 155 T1o = FMA(KP554958132, T1h, T1g);
Chris@82 156 T1p = FNMS(KP801937735, T1o, T1i);
Chris@82 157 T1n = FNMS(KP900968867, T1m, T19);
Chris@82 158 cr[WS(rs, 5)] = FMS(KP974927912, T1p, T1n);
Chris@82 159 ci[WS(rs, 4)] = FMA(KP974927912, T1p, T1n);
Chris@82 160 }
/* Writes cr[WS(rs, 4)] (via FMS) and ci[WS(rs, 3)] (via FMA). */
Chris@82 161 {
Chris@82 162 E T1e, T1k, T1d, T1j, T1f;
Chris@82 163 T1d = FNMS(KP356895867, T1c, T1b);
Chris@82 164 T1e = FNMS(KP692021471, T1d, T1a);
Chris@82 165 T1j = FNMS(KP554958132, T1i, T1h);
Chris@82 166 T1k = FNMS(KP801937735, T1j, T1g);
Chris@82 167 T1f = FNMS(KP900968867, T1e, T19);
Chris@82 168 cr[WS(rs, 4)] = FMS(KP974927912, T1k, T1f);
Chris@82 169 ci[WS(rs, 3)] = FMA(KP974927912, T1k, T1f);
Chris@82 170 }
/* Writes ci[WS(rs, 2)] (via FNMS) and cr[WS(rs, 3)] (via FMA). */
Chris@82 171 {
Chris@82 172 E T15, T18, T14, T17, T16;
Chris@82 173 T14 = FNMS(KP356895867, TE, Tr);
Chris@82 174 T15 = FNMS(KP692021471, T14, Te);
Chris@82 175 T17 = FNMS(KP554958132, TR, TM);
Chris@82 176 T18 = FNMS(KP801937735, T17, TW);
Chris@82 177 T16 = FNMS(KP900968867, T15, T1);
Chris@82 178 ci[WS(rs, 2)] = FNMS(KP974927912, T18, T16);
Chris@82 179 cr[WS(rs, 3)] = FMA(KP974927912, T18, T16);
Chris@82 180 }
/* Writes ci[WS(rs, 1)] (via FNMS) and cr[WS(rs, 2)] (via FMA). */
Chris@82 181 {
Chris@82 182 E T10, T13, TZ, T12, T11;
Chris@82 183 TZ = FNMS(KP356895867, Te, TE);
Chris@82 184 T10 = FNMS(KP692021471, TZ, Tr);
Chris@82 185 T12 = FMA(KP554958132, TM, TW);
Chris@82 186 T13 = FNMS(KP801937735, T12, TR);
Chris@82 187 T11 = FNMS(KP900968867, T10, T1);
Chris@82 188 ci[WS(rs, 1)] = FNMS(KP974927912, T13, T11);
Chris@82 189 cr[WS(rs, 2)] = FMA(KP974927912, T13, T11);
Chris@82 190 }
Chris@82 191 }
Chris@82 192 }
Chris@82 193 }
Chris@82 194
/*
 * Twiddle instruction table referenced by the hc2hc_desc for hf_7: a TW_FULL
 * entry with arguments (1, 7) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set for
 * size 7 (hf_7 reads 12 values of W per iteration) and TW_NEXT ends the
 * list -- confirm against the tw_instr definition in the FFTW headers.
 */
Chris@82 195 static const tw_instr twinstr[] = {
Chris@82 196 {TW_FULL, 1, 7},
Chris@82 197 {TW_NEXT, 1, 0}
Chris@82 198 };
Chris@82 199
/*
 * Descriptor handed to the planner together with hf_7: size 7, name "hf_7",
 * the twiddle table above, the GENUS object from the included header, and
 * the counts {18, 12, 54, 0}. The first three numbers match the generator's
 * "18 additions, 12 multiplications, 54 fused multiply/add" summary for this
 * variant; the meaning of the trailing 0 is not visible in this file.
 */
Chris@82 200 static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {18, 12, 54, 0} };
Chris@82 201
/*
 * Registration entry point: passes the hf_7 function and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro defined outside this
 * file (presumably the library's symbol-name wrapper -- confirm).
 */
Chris@82 202 void X(codelet_hf_7) (planner *p) {
Chris@82 203 X(khc2hc_register) (p, hf_7, &desc);
Chris@82 204 }
Chris@82 205 #else
Chris@82 206
Chris@82 207 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
Chris@82 208
Chris@82 209 /*
Chris@82 210 * This function contains 72 FP additions, 60 FP multiplications,
Chris@82 211 * (or, 36 additions, 24 multiplications, 36 fused multiply/add),
Chris@82 212 * 29 stack variables, 6 constants, and 28 memory accesses
Chris@82 213 */
Chris@82 214 #include "rdft/scalar/hf.h"
Chris@82 215
/*
 * hf_7 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): in-place, fully unrolled 7-point
 * twiddle codelet over the cr/ci array pair.
 *
 * cr, ci  arrays that are both read and overwritten; elements are addressed
 *         as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W       twiddle table; twelve values W[0..11] are consumed per iteration.
 * rs      stride handed to WS() for the element index k.
 * mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms      per-iteration step: cr advances by +ms while ci advances by -ms.
 *
 * NOTE(review): FMA, FNMS, FNMA, DK, E, R and WS come from the included
 * codelet headers, not from this file. The comments below presume
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; FNMA is presumably
 * -(a*b) - c -- confirm all three against the header.
 */
Chris@82 216 static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 217 {
/*
 * The six constants are numerically sin(2*pi/7) = 0.7818..,
 * sin(4*pi/7) = 0.9749.., sin(6*pi/7) = 0.4338.., cos(2*pi/7) = 0.6234..,
 * -cos(4*pi/7) = 0.2225.. and -cos(6*pi/7) = 0.9009...
 */
Chris@82 218 DK(KP222520933, +0.222520933956314404288902564496794759466355569);
Chris@82 219 DK(KP900968867, +0.900968867902419126236102319507445051165919162);
Chris@82 220 DK(KP623489801, +0.623489801858733530525004884004239810632274731);
Chris@82 221 DK(KP433883739, +0.433883739117558120475768332848358754609990728);
Chris@82 222 DK(KP974927912, +0.974927912181823607018131682993931217232785801);
Chris@82 223 DK(KP781831482, +0.781831482468029808708444526674057750232334519);
Chris@82 224 {
Chris@82 225 INT m;
/*
 * W is pre-advanced by (mb - 1) * 12 so that the first iteration reads the
 * twiddles belonging to m = mb; each later iteration moves on by 12 values.
 * MAKE_VOLATILE_STRIDE is evaluated as part of the loop increment; its
 * effect is defined outside this file.
 */
Chris@82 226 for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
Chris@82 227 E T1, TT, Tc, TV, TC, TO, Tn, TS, TI, TP, Ty, TU, TF, TQ;
/* Element 0 is loaded as-is; no twiddle is applied to it. */
Chris@82 228 T1 = cr[0];
Chris@82 229 TT = ci[0];
/*
 * Elements 1 and 6: combine each (cr, ci) pair with its twiddle pair
 * (W[0], W[1]) resp. (W[10], W[11]), then keep the sums (Tc, TV) and
 * differences (TC, TO) of the two results.
 */
Chris@82 230 {
Chris@82 231 E T6, TA, Tb, TB;
Chris@82 232 {
Chris@82 233 E T3, T5, T2, T4;
Chris@82 234 T3 = cr[WS(rs, 1)];
Chris@82 235 T5 = ci[WS(rs, 1)];
Chris@82 236 T2 = W[0];
Chris@82 237 T4 = W[1];
Chris@82 238 T6 = FMA(T2, T3, T4 * T5);
Chris@82 239 TA = FNMS(T4, T3, T2 * T5);
Chris@82 240 }
Chris@82 241 {
Chris@82 242 E T8, Ta, T7, T9;
Chris@82 243 T8 = cr[WS(rs, 6)];
Chris@82 244 Ta = ci[WS(rs, 6)];
Chris@82 245 T7 = W[10];
Chris@82 246 T9 = W[11];
Chris@82 247 Tb = FMA(T7, T8, T9 * Ta);
Chris@82 248 TB = FNMS(T9, T8, T7 * Ta);
Chris@82 249 }
Chris@82 250 Tc = T6 + Tb;
Chris@82 251 TV = TA + TB;
Chris@82 252 TC = TA - TB;
Chris@82 253 TO = Tb - T6;
Chris@82 254 }
/*
 * Elements 2 and 5: same pattern with twiddles (W[2], W[3]) and
 * (W[8], W[9]). Note TP is formed as Th - Tm, i.e. with the opposite
 * operand order to TO and TQ.
 */
Chris@82 255 {
Chris@82 256 E Th, TG, Tm, TH;
Chris@82 257 {
Chris@82 258 E Te, Tg, Td, Tf;
Chris@82 259 Te = cr[WS(rs, 2)];
Chris@82 260 Tg = ci[WS(rs, 2)];
Chris@82 261 Td = W[2];
Chris@82 262 Tf = W[3];
Chris@82 263 Th = FMA(Td, Te, Tf * Tg);
Chris@82 264 TG = FNMS(Tf, Te, Td * Tg);
Chris@82 265 }
Chris@82 266 {
Chris@82 267 E Tj, Tl, Ti, Tk;
Chris@82 268 Tj = cr[WS(rs, 5)];
Chris@82 269 Tl = ci[WS(rs, 5)];
Chris@82 270 Ti = W[8];
Chris@82 271 Tk = W[9];
Chris@82 272 Tm = FMA(Ti, Tj, Tk * Tl);
Chris@82 273 TH = FNMS(Tk, Tj, Ti * Tl);
Chris@82 274 }
Chris@82 275 Tn = Th + Tm;
Chris@82 276 TS = TG + TH;
Chris@82 277 TI = TG - TH;
Chris@82 278 TP = Th - Tm;
Chris@82 279 }
/* Elements 3 and 4: same pattern with twiddles (W[4], W[5]) and (W[6], W[7]). */
Chris@82 280 {
Chris@82 281 E Ts, TD, Tx, TE;
Chris@82 282 {
Chris@82 283 E Tp, Tr, To, Tq;
Chris@82 284 Tp = cr[WS(rs, 3)];
Chris@82 285 Tr = ci[WS(rs, 3)];
Chris@82 286 To = W[4];
Chris@82 287 Tq = W[5];
Chris@82 288 Ts = FMA(To, Tp, Tq * Tr);
Chris@82 289 TD = FNMS(Tq, Tp, To * Tr);
Chris@82 290 }
Chris@82 291 {
Chris@82 292 E Tu, Tw, Tt, Tv;
Chris@82 293 Tu = cr[WS(rs, 4)];
Chris@82 294 Tw = ci[WS(rs, 4)];
Chris@82 295 Tt = W[6];
Chris@82 296 Tv = W[7];
Chris@82 297 Tx = FMA(Tt, Tu, Tv * Tw);
Chris@82 298 TE = FNMS(Tv, Tu, Tt * Tw);
Chris@82 299 }
Chris@82 300 Ty = Ts + Tx;
Chris@82 301 TU = TD + TE;
Chris@82 302 TF = TD - TE;
Chris@82 303 TQ = Tx - Ts;
Chris@82 304 }
/*
 * Output stage. All inputs were read into locals above, so the stores below
 * can overwrite cr/ci in place. This block writes cr[0], ci[0],
 * cr[WS(rs, 1)], ci[WS(rs, 6)], cr[WS(rs, 6)] and ci[WS(rs, 5)].
 */
Chris@82 305 {
Chris@82 306 E TL, TK, TZ, T10;
Chris@82 307 cr[0] = T1 + Tc + Tn + Ty;
Chris@82 308 TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
Chris@82 309 TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
Chris@82 310 ci[0] = TK - TL;
Chris@82 311 cr[WS(rs, 1)] = TK + TL;
Chris@82 312 ci[WS(rs, 6)] = TV + TS + TU + TT;
Chris@82 313 TZ = FMA(KP781831482, TO, KP433883739 * TQ) - (KP974927912 * TP);
Chris@82 314 T10 = FMA(KP623489801, TV, TT) + FNMA(KP900968867, TU, KP222520933 * TS);
Chris@82 315 cr[WS(rs, 6)] = TZ - T10;
Chris@82 316 ci[WS(rs, 5)] = TZ + T10;
Chris@82 317 }
/* Writes cr[WS(rs, 5)], ci[WS(rs, 4)], cr[WS(rs, 4)] and ci[WS(rs, 3)]. */
Chris@82 318 {
Chris@82 319 E TX, TY, TR, TW;
Chris@82 320 TX = FMA(KP974927912, TO, KP433883739 * TP) - (KP781831482 * TQ);
Chris@82 321 TY = FMA(KP623489801, TU, TT) + FNMA(KP900968867, TS, KP222520933 * TV);
Chris@82 322 cr[WS(rs, 5)] = TX - TY;
Chris@82 323 ci[WS(rs, 4)] = TX + TY;
Chris@82 324 TR = FMA(KP433883739, TO, KP781831482 * TP) + (KP974927912 * TQ);
Chris@82 325 TW = FMA(KP623489801, TS, TT) + FNMA(KP222520933, TU, KP900968867 * TV);
Chris@82 326 cr[WS(rs, 4)] = TR - TW;
Chris@82 327 ci[WS(rs, 3)] = TR + TW;
Chris@82 328 }
/* Writes ci[WS(rs, 2)], cr[WS(rs, 3)], ci[WS(rs, 1)] and cr[WS(rs, 2)]. */
Chris@82 329 {
Chris@82 330 E TN, TM, TJ, Tz;
Chris@82 331 TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
Chris@82 332 TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
Chris@82 333 ci[WS(rs, 2)] = TM - TN;
Chris@82 334 cr[WS(rs, 3)] = TM + TN;
Chris@82 335 TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Chris@82 336 Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
Chris@82 337 ci[WS(rs, 1)] = Tz - TJ;
Chris@82 338 cr[WS(rs, 2)] = Tz + TJ;
Chris@82 339 }
Chris@82 340 }
Chris@82 341 }
Chris@82 342 }
Chris@82 343
/*
 * Twiddle instruction table referenced by the hc2hc_desc for hf_7: a TW_FULL
 * entry with arguments (1, 7) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set for
 * size 7 (hf_7 reads 12 values of W per iteration) and TW_NEXT ends the
 * list -- confirm against the tw_instr definition in the FFTW headers.
 */
Chris@82 344 static const tw_instr twinstr[] = {
Chris@82 345 {TW_FULL, 1, 7},
Chris@82 346 {TW_NEXT, 1, 0}
Chris@82 347 };
Chris@82 348
/*
 * Descriptor handed to the planner together with hf_7: size 7, name "hf_7",
 * the twiddle table above, the GENUS object from the included header, and
 * the counts {36, 24, 36, 0}. The first three numbers match the generator's
 * "36 additions, 24 multiplications, 36 fused multiply/add" summary for this
 * variant; the meaning of the trailing 0 is not visible in this file.
 */
Chris@82 349 static const hc2hc_desc desc = { 7, "hf_7", twinstr, &GENUS, {36, 24, 36, 0} };
Chris@82 350
/*
 * Registration entry point: passes the hf_7 function and its descriptor to
 * X(khc2hc_register) for planner p. X() is a macro defined outside this
 * file (presumably the library's symbol-name wrapper -- confirm).
 */
Chris@82 351 void X(codelet_hf_7) (planner *p) {
Chris@82 352 X(khc2hc_register) (p, hf_7, &desc);
Chris@82 353 }
Chris@82 354 #endif