annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cfdft_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 58 FP additions, 44 FP multiplications,
Chris@82 32 * (or, 36 additions, 22 multiplications, 22 fused multiply/add),
Chris@82 33 * 27 stack variables, 2 constants, and 24 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cfdft_6, FMA-oriented variant (selected when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the kernel loads twelve values -- Rp, Ip, Rm and
 * Im at offsets 0, WS(rs, 1) and WS(rs, 2) -- combines them with the ten
 * coefficients W[0]..W[9], and stores twelve results back to those same
 * twelve locations.  Within one iteration every load precedes the first
 * store, so the update is done in place.
 *
 * Parameters:
 *   Rp, Ip - arrays that are read and overwritten; stepped by +ms per
 *            iteration.
 *   Rm, Im - arrays that are read and overwritten; stepped by -ms per
 *            iteration.
 *   W      - read-only coefficient table; iteration starts at
 *            W + (mb - 1) * 10 and 10 entries are consumed per iteration.
 *   rs     - stride handed to WS() to address offsets 1 and 2.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - per-iteration pointer step for the four data arrays.
 *
 * NOTE(review): the name and the generator flags (-n 6 -dit) suggest a
 * size-6 decimation-in-time halfcomplex-to-complex forward step; confirm
 * against the genfft/FFTW documentation.
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined by the FFTW codelet headers, which
 * are not visible here -- confirm.
 */
Chris@82 37 static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* KP866025403 is sqrt(3)/2; KP500000000 is 1/2. */
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT m;
/* Rp/Ip walk forwards while Rm/Im walk backwards; W moves on by 10 each pass. */
Chris@82 43 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates that survive from the load phase into the output phase. */
Chris@82 44 E T3, TQ, TJ, T12, Tu, TX, TB, T10, Td, TS, Tk, TV;
/* Offset 0: sums/differences of the four inputs, then the pair (TF, TI)
 * is combined with (W[0], W[1]) to give TJ and T12. */
Chris@82 45 {
Chris@82 46 E T1, T2, TI, TD, TE, TF;
Chris@82 47 T1 = Ip[0];
Chris@82 48 T2 = Im[0];
Chris@82 49 TI = T1 + T2;
Chris@82 50 TD = Rm[0];
Chris@82 51 TE = Rp[0];
Chris@82 52 TF = TD - TE;
Chris@82 53 T3 = T1 - T2;
Chris@82 54 TQ = TE + TD;
Chris@82 55 {
Chris@82 56 E TC, TG, TH, T11;
Chris@82 57 TC = W[0];
Chris@82 58 TG = TC * TF;
Chris@82 59 TH = W[1];
Chris@82 60 T11 = TH * TF;
Chris@82 61 TJ = FNMS(TH, TI, TG);
Chris@82 62 T12 = FMA(TC, TI, T11);
Chris@82 63 }
Chris@82 64 }
/* Offset WS(rs, 2): sums/differences, then products with W[8], W[9]
 * (giving Tu, TX) and with W[6], W[7] (giving TB, T10). */
Chris@82 65 {
Chris@82 66 E To, TA, Tt, Tx;
Chris@82 67 {
Chris@82 68 E Tm, Tn, Tr, Ts;
Chris@82 69 Tm = Rm[WS(rs, 2)];
Chris@82 70 Tn = Rp[WS(rs, 2)];
Chris@82 71 To = Tm - Tn;
Chris@82 72 TA = Tn + Tm;
Chris@82 73 Tr = Ip[WS(rs, 2)];
Chris@82 74 Ts = Im[WS(rs, 2)];
Chris@82 75 Tt = Tr + Ts;
Chris@82 76 Tx = Tr - Ts;
Chris@82 77 }
Chris@82 78 {
Chris@82 79 E Tp, TW, Tl, Tq;
Chris@82 80 Tl = W[8];
Chris@82 81 Tp = Tl * To;
Chris@82 82 TW = Tl * Tt;
Chris@82 83 Tq = W[9];
Chris@82 84 Tu = FNMS(Tq, Tt, Tp);
Chris@82 85 TX = FMA(Tq, To, TW);
Chris@82 86 }
Chris@82 87 {
Chris@82 88 E Tw, Ty, Tz, TZ;
Chris@82 89 Tw = W[6];
Chris@82 90 Ty = Tw * Tx;
Chris@82 91 Tz = W[7];
Chris@82 92 TZ = Tz * Tx;
Chris@82 93 TB = FNMS(Tz, TA, Ty);
Chris@82 94 T10 = FMA(Tw, TA, TZ);
Chris@82 95 }
Chris@82 96 }
/* Offset WS(rs, 1): sums/differences, then products with W[4], W[5]
 * (giving Td, TS) and with W[2], W[3] (giving Tk, TV). */
Chris@82 97 {
Chris@82 98 E T7, Tg, Tc, Tj;
Chris@82 99 {
Chris@82 100 E T5, T6, Ta, Tb;
Chris@82 101 T5 = Ip[WS(rs, 1)];
Chris@82 102 T6 = Im[WS(rs, 1)];
Chris@82 103 T7 = T5 + T6;
Chris@82 104 Tg = T5 - T6;
Chris@82 105 Ta = Rp[WS(rs, 1)];
Chris@82 106 Tb = Rm[WS(rs, 1)];
Chris@82 107 Tc = Ta - Tb;
Chris@82 108 Tj = Ta + Tb;
Chris@82 109 }
Chris@82 110 {
Chris@82 111 E T4, T8, T9, TR;
Chris@82 112 T4 = W[5];
Chris@82 113 T8 = T4 * T7;
Chris@82 114 T9 = W[4];
Chris@82 115 TR = T9 * T7;
Chris@82 116 Td = FMA(T9, Tc, T8);
Chris@82 117 TS = FNMS(T4, Tc, TR);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 E Tf, Th, Ti, TU;
Chris@82 121 Tf = W[2];
Chris@82 122 Th = Tf * Tg;
Chris@82 123 Ti = W[3];
Chris@82 124 TU = Ti * Tg;
Chris@82 125 Tk = FNMS(Ti, Tj, Th);
Chris@82 126 TV = FMA(Tf, Tj, TU);
Chris@82 127 }
Chris@82 128 }
/* First output group: Ip[0], Rp[0], Ip[WS(rs, 2)], Im[WS(rs, 1)],
 * Rp[WS(rs, 2)] and Rm[WS(rs, 1)].  Every stored value carries the
 * KP500000000 scale; the two Im stores are additionally negated. */
Chris@82 129 {
Chris@82 130 E Te, T1d, TL, T1g, T1c, T1e, T19, T1f;
Chris@82 131 Te = T3 - Td;
Chris@82 132 T1d = TQ + TS;
Chris@82 133 {
Chris@82 134 E Tv, TK, T1a, T1b;
Chris@82 135 Tv = Tk + Tu;
Chris@82 136 TK = TB + TJ;
Chris@82 137 TL = Tv + TK;
Chris@82 138 T1g = Tv - TK;
Chris@82 139 T1a = TV + TX;
Chris@82 140 T1b = T10 + T12;
Chris@82 141 T1c = T1a - T1b;
Chris@82 142 T1e = T1a + T1b;
Chris@82 143 }
Chris@82 144 Ip[0] = KP500000000 * (Te + TL);
Chris@82 145 Rp[0] = KP500000000 * (T1d + T1e);
Chris@82 146 T19 = FNMS(KP500000000, TL, Te);
Chris@82 147 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP866025403, T1c, T19));
Chris@82 148 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP866025403, T1c, T19)));
Chris@82 149 T1f = FNMS(KP500000000, T1e, T1d);
Chris@82 150 Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP866025403, T1g, T1f));
Chris@82 151 Rm[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T1g, T1f));
Chris@82 152 }
/* Second output group: Im[WS(rs, 2)], Rm[WS(rs, 2)], Rp[WS(rs, 1)],
 * Rm[0], Ip[WS(rs, 1)] and Im[0]. */
Chris@82 153 {
Chris@82 154 E TP, TT, TO, T16, T14, T18, T15, T17;
Chris@82 155 TP = Td + T3;
Chris@82 156 TT = TQ - TS;
Chris@82 157 {
Chris@82 158 E TM, TN, TY, T13;
Chris@82 159 TM = Tu - Tk;
Chris@82 160 TN = TJ - TB;
Chris@82 161 TO = TM + TN;
Chris@82 162 T16 = TN - TM;
Chris@82 163 TY = TV - TX;
Chris@82 164 T13 = T10 - T12;
Chris@82 165 T14 = TY + T13;
Chris@82 166 T18 = T13 - TY;
Chris@82 167 }
Chris@82 168 Im[WS(rs, 2)] = KP500000000 * (TO - TP);
Chris@82 169 Rm[WS(rs, 2)] = KP500000000 * (TT + T14);
Chris@82 170 T15 = FNMS(KP500000000, T14, TT);
Chris@82 171 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T16, T15));
Chris@82 172 Rm[0] = KP500000000 * (FNMS(KP866025403, T16, T15));
Chris@82 173 T17 = FMA(KP500000000, TO, TP);
Chris@82 174 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T18, T17));
Chris@82 175 Im[0] = -(KP500000000 * (FNMS(KP866025403, T18, T17)));
Chris@82 176 }
Chris@82 177 }
Chris@82 178 }
Chris@82 179 }
Chris@82 180
/*
 * Twiddle instruction table referenced by the hc2c_desc below.
 * NOTE(review): presumably TW_FULL with last field 6 requests the complete
 * set of twiddle factors for radix 6 (the kernel reads W[0]..W[9], i.e. 10
 * reals per iteration) and TW_NEXT terminates the list -- confirm against
 * the tw_instr definition in the FFTW kernel headers.
 */
Chris@82 181 static const tw_instr twinstr[] = {
Chris@82 182 {TW_FULL, 1, 6},
Chris@82 183 {TW_NEXT, 1, 0}
Chris@82 184 };
Chris@82 185
/*
 * Codelet descriptor: 6, the name string, the twiddle instruction table,
 * the genus, and the operation counts {36, 22, 22, 0}, which match the
 * "36 additions, 22 multiplications, 22 fused multiply/add" figures quoted
 * in the generator comment for this variant.
 * NOTE(review): the leading 6 is presumably the radix and the trailing 0
 * the "other operations" count -- confirm against hc2c_desc.
 */
Chris@82 186 static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, {36, 22, 22, 0} };
Chris@82 187
/*
 * Registration entry point: passes the hc2cfdft_6 kernel and its descriptor
 * to X(khc2c_register) for planner p, tagged with HC2C_VIA_DFT.
 */
Chris@82 188 void X(codelet_hc2cfdft_6) (planner *p) {
Chris@82 189 X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
Chris@82 190 }
Chris@82 191 #else
Chris@82 192
Chris@82 193 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
Chris@82 194
Chris@82 195 /*
Chris@82 196 * This function contains 58 FP additions, 36 FP multiplications,
Chris@82 197 * (or, 44 additions, 22 multiplications, 14 fused multiply/add),
Chris@82 198 * 40 stack variables, 3 constants, and 24 memory accesses
Chris@82 199 */
Chris@82 200 #include "rdft/scalar/hc2cf.h"
Chris@82 201
/*
 * hc2cfdft_6, variant used when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads twelve values -- Rp, Ip, Rm and
 * Im at offsets 0, WS(rs, 1) and WS(rs, 2) -- combines them with the ten
 * coefficients W[0]..W[9], and stores twelve results back to those same
 * twelve locations.  Within one iteration every load precedes the first
 * store, so the update is done in place.
 *
 * Parameters:
 *   Rp, Ip - arrays that are read and overwritten; stepped by +ms per
 *            iteration.
 *   Rm, Im - arrays that are read and overwritten; stepped by -ms per
 *            iteration.
 *   W      - read-only coefficient table; iteration starts at
 *            W + (mb - 1) * 10 and 10 entries are consumed per iteration.
 *   rs     - stride handed to WS() to address offsets 1 and 2.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - per-iteration pointer step for the four data arrays.
 *
 * NOTE(review): the name and the generator flags (-n 6 -dit) suggest a
 * size-6 decimation-in-time halfcomplex-to-complex forward step; confirm
 * against the genfft/FFTW documentation.
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as defined by the FFTW codelet headers, which
 * are not visible here -- confirm.
 */
Chris@82 202 static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 203 {
/* KP250000000 is 1/4, KP500000000 is 1/2, KP433012701 is sqrt(3)/4. */
Chris@82 204 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 205 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 206 DK(KP433012701, +0.433012701892219323381861585376468091735701313);
Chris@82 207 {
Chris@82 208 INT m;
/* Rp/Ip walk forwards while Rm/Im walk backwards; W moves on by 10 each pass. */
Chris@82 209 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates that survive from the load phase into the output phase. */
Chris@82 210 E T3, TM, Tc, TN, Ts, T10, TI, TR, TF, T11, TH, TU;
Chris@82 211 {
Chris@82 212 E T1, T2, TD, Tz, TA, TB, T7, Tf, Tb, Th, Tq, Tw, Tm, Tu, T4;
Chris@82 213 E T8;
/* Load all twelve inputs and form their pairwise sums and differences. */
Chris@82 214 {
Chris@82 215 E T5, T6, T9, Ta;
Chris@82 216 T1 = Ip[0];
Chris@82 217 T2 = Im[0];
Chris@82 218 TD = T1 + T2;
Chris@82 219 Tz = Rm[0];
Chris@82 220 TA = Rp[0];
Chris@82 221 TB = Tz - TA;
Chris@82 222 T5 = Ip[WS(rs, 1)];
Chris@82 223 T6 = Im[WS(rs, 1)];
Chris@82 224 T7 = T5 + T6;
Chris@82 225 Tf = T5 - T6;
Chris@82 226 T9 = Rp[WS(rs, 1)];
Chris@82 227 Ta = Rm[WS(rs, 1)];
Chris@82 228 Tb = T9 - Ta;
Chris@82 229 Th = T9 + Ta;
Chris@82 230 {
Chris@82 231 E To, Tp, Tk, Tl;
Chris@82 232 To = Rp[WS(rs, 2)];
Chris@82 233 Tp = Rm[WS(rs, 2)];
Chris@82 234 Tq = To - Tp;
Chris@82 235 Tw = To + Tp;
Chris@82 236 Tk = Ip[WS(rs, 2)];
Chris@82 237 Tl = Im[WS(rs, 2)];
Chris@82 238 Tm = Tk + Tl;
Chris@82 239 Tu = Tk - Tl;
Chris@82 240 }
Chris@82 241 }
Chris@82 242 T3 = T1 - T2;
Chris@82 243 TM = TA + Tz;
/* Offset-1 pair (T7, Tb) combined with (W[4], W[5]). */
Chris@82 244 T4 = W[5];
Chris@82 245 T8 = W[4];
Chris@82 246 Tc = FMA(T4, T7, T8 * Tb);
Chris@82 247 TN = FNMS(T4, Tb, T8 * T7);
/* Products with W[2], W[3] (offset 1) and W[8], W[9] (offset 2), then
 * their sums and differences. */
Chris@82 248 {
Chris@82 249 E Ti, TP, Tr, TQ;
Chris@82 250 {
Chris@82 251 E Te, Tg, Tj, Tn;
Chris@82 252 Te = W[2];
Chris@82 253 Tg = W[3];
Chris@82 254 Ti = FNMS(Tg, Th, Te * Tf);
Chris@82 255 TP = FMA(Tg, Tf, Te * Th);
Chris@82 256 Tj = W[9];
Chris@82 257 Tn = W[8];
Chris@82 258 Tr = FMA(Tj, Tm, Tn * Tq);
Chris@82 259 TQ = FNMS(Tj, Tq, Tn * Tm);
Chris@82 260 }
Chris@82 261 Ts = Ti - Tr;
Chris@82 262 T10 = TP + TQ;
Chris@82 263 TI = Ti + Tr;
Chris@82 264 TR = TP - TQ;
Chris@82 265 }
/* Products with W[6], W[7] (offset 2) and W[0], W[1] (offset 0), then
 * their sums and differences. */
Chris@82 266 {
Chris@82 267 E Tx, TS, TE, TT;
Chris@82 268 {
Chris@82 269 E Tt, Tv, Ty, TC;
Chris@82 270 Tt = W[6];
Chris@82 271 Tv = W[7];
Chris@82 272 Tx = FNMS(Tv, Tw, Tt * Tu);
Chris@82 273 TS = FMA(Tv, Tu, Tt * Tw);
Chris@82 274 Ty = W[0];
Chris@82 275 TC = W[1];
Chris@82 276 TE = FNMS(TC, TD, Ty * TB);
Chris@82 277 TT = FMA(TC, TB, Ty * TD);
Chris@82 278 }
Chris@82 279 TF = Tx + TE;
Chris@82 280 T11 = TS + TT;
Chris@82 281 TH = TE - Tx;
Chris@82 282 TU = TS - TT;
Chris@82 283 }
Chris@82 284 }
/* Outputs Ip[0], Im[WS(rs, 1)] and Ip[WS(rs, 2)]. */
Chris@82 285 {
Chris@82 286 E T12, Td, TG, TZ;
Chris@82 287 T12 = KP433012701 * (T10 - T11);
Chris@82 288 Td = T3 - Tc;
Chris@82 289 TG = Ts + TF;
Chris@82 290 TZ = FNMS(KP250000000, TG, KP500000000 * Td);
Chris@82 291 Ip[0] = KP500000000 * (Td + TG);
Chris@82 292 Im[WS(rs, 1)] = T12 - TZ;
Chris@82 293 Ip[WS(rs, 2)] = TZ + T12;
Chris@82 294 }
/* Outputs Rp[WS(rs, 2)], Rp[0] and Rm[WS(rs, 1)]. */
Chris@82 295 {
Chris@82 296 E T16, T13, T14, T15;
Chris@82 297 T16 = KP433012701 * (Ts - TF);
Chris@82 298 T13 = TM + TN;
Chris@82 299 T14 = T10 + T11;
Chris@82 300 T15 = FNMS(KP250000000, T14, KP500000000 * T13);
Chris@82 301 Rp[WS(rs, 2)] = T15 - T16;
Chris@82 302 Rp[0] = KP500000000 * (T13 + T14);
Chris@82 303 Rm[WS(rs, 1)] = T16 + T15;
Chris@82 304 }
/* Outputs Im[WS(rs, 2)], Im[0] and Ip[WS(rs, 1)]. */
Chris@82 305 {
Chris@82 306 E TY, TJ, TK, TX;
Chris@82 307 TY = KP433012701 * (TU - TR);
Chris@82 308 TJ = TH - TI;
Chris@82 309 TK = Tc + T3;
Chris@82 310 TX = FMA(KP500000000, TK, KP250000000 * TJ);
Chris@82 311 Im[WS(rs, 2)] = KP500000000 * (TJ - TK);
Chris@82 312 Im[0] = TY - TX;
Chris@82 313 Ip[WS(rs, 1)] = TX + TY;
Chris@82 314 }
/* Outputs Rp[WS(rs, 1)], Rm[WS(rs, 2)] and Rm[0]. */
Chris@82 315 {
Chris@82 316 E TL, TO, TV, TW;
Chris@82 317 TL = KP433012701 * (TI + TH);
Chris@82 318 TO = TM - TN;
Chris@82 319 TV = TR + TU;
Chris@82 320 TW = FNMS(KP250000000, TV, KP500000000 * TO);
Chris@82 321 Rp[WS(rs, 1)] = TL + TW;
Chris@82 322 Rm[WS(rs, 2)] = KP500000000 * (TO + TV);
Chris@82 323 Rm[0] = TW - TL;
Chris@82 324 }
Chris@82 325 }
Chris@82 326 }
Chris@82 327 }
Chris@82 328
/*
 * Twiddle instruction table referenced by the hc2c_desc below.
 * NOTE(review): presumably TW_FULL with last field 6 requests the complete
 * set of twiddle factors for radix 6 (the kernel reads W[0]..W[9], i.e. 10
 * reals per iteration) and TW_NEXT terminates the list -- confirm against
 * the tw_instr definition in the FFTW kernel headers.
 */
Chris@82 329 static const tw_instr twinstr[] = {
Chris@82 330 {TW_FULL, 1, 6},
Chris@82 331 {TW_NEXT, 1, 0}
Chris@82 332 };
Chris@82 333
/*
 * Codelet descriptor: 6, the name string, the twiddle instruction table,
 * the genus, and the operation counts {44, 22, 14, 0}, which match the
 * "44 additions, 22 multiplications, 14 fused multiply/add" figures quoted
 * in the generator comment for this variant.
 * NOTE(review): the leading 6 is presumably the radix and the trailing 0
 * the "other operations" count -- confirm against hc2c_desc.
 */
Chris@82 334 static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, {44, 22, 14, 0} };
Chris@82 335
/*
 * Registration entry point: passes the hc2cfdft_6 kernel and its descriptor
 * to X(khc2c_register) for planner p, tagged with HC2C_VIA_DFT.
 */
Chris@82 336 void X(codelet_hc2cfdft_6) (planner *p) {
Chris@82 337 X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
Chris@82 338 }
Chris@82 339 #endif