annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cf_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:05 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@42 33 * 61 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cf_8 (HAVE_FMA variant): generated size-8 codelet (genfft was invoked
 * with "-n 8 -dit -fma", see the "Generated by" comment above).
 *
 * Parameters:
 *   Rp, Ip, Rm, Im  four data arrays, each accessed at WS(rs, k), k = 0..3;
 *                   they are both the inputs and the outputs (in place).
 *   W               twiddle table; 14 consecutive entries W[0]..W[13] are
 *                   consumed per iteration.
 *   rs              stride passed to WS() to index within one iteration.
 *   mb, me          loop bounds; iterations run for m = mb .. me - 1.
 *   ms              per-iteration step: Rp and Ip advance by +ms while
 *                   Rm and Im move by -ms.
 *
 * Each iteration loads all 16 inputs before storing any of the 16 outputs.
 * The pair (Rp[0], Rm[0]) is used as loaded; every other input pair is
 * first combined with two consecutive W entries.
 *
 * NOTE(review): DK, FMA, FMS, FNMS, WS and MAKE_VOLATILE_STRIDE are macros
 * defined outside this file. In FFTW, FMA(a,b,c) is conventionally a*b+c,
 * FMS(a,b,c) is a*b-c and FNMS(a,b,c) is c-a*b -- confirm in the project
 * headers before relying on that reading.
 */
Chris@42 37 static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The only floating-point constant used; the literal is sqrt(2)/2. */
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
/* W is pre-offset by (mb - 1) * 14, so iteration m starts at the original W + (m - 1) * 14. */
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 43 E T1g, T1f, T1e, Tm, T1q, T1o, T1p, TN, T1h, T1i;
Chris@42 44 {
Chris@42 45 E T1, T1m, T1l, T7, TS, Tk, TQ, Te, To, Tr, T17, TM, T12, Tu, TW;
Chris@42 46 E Tp, Tx, Tt, Tq, Tw;
Chris@42 47 {
Chris@42 48 E T3, T6, T2, T5;
/* Load phase: the untwiddled pair (Rp[0], Rm[0]) first, then the remaining
 * inputs interleaved with their W entries and the twiddle products. */
Chris@42 49 T1 = Rp[0];
Chris@42 50 T1m = Rm[0];
Chris@42 51 T3 = Rp[WS(rs, 2)];
Chris@42 52 T6 = Rm[WS(rs, 2)];
Chris@42 53 T2 = W[6];
Chris@42 54 T5 = W[7];
Chris@42 55 {
Chris@42 56 E Ta, Td, T9, Tc;
Chris@42 57 {
Chris@42 58 E Tg, Tj, Ti, TR, Th, T1k, T4, Tf;
Chris@42 59 Tg = Rp[WS(rs, 3)];
Chris@42 60 Tj = Rm[WS(rs, 3)];
Chris@42 61 T1k = T2 * T6;
Chris@42 62 T4 = T2 * T3;
Chris@42 63 Tf = W[10];
Chris@42 64 Ti = W[11];
Chris@42 65 T1l = FNMS(T5, T3, T1k);
Chris@42 66 T7 = FMA(T5, T6, T4);
Chris@42 67 TR = Tf * Tj;
Chris@42 68 Th = Tf * Tg;
Chris@42 69 Ta = Rp[WS(rs, 1)];
Chris@42 70 Td = Rm[WS(rs, 1)];
Chris@42 71 TS = FNMS(Ti, Tg, TR);
Chris@42 72 Tk = FMA(Ti, Tj, Th);
Chris@42 73 T9 = W[2];
Chris@42 74 Tc = W[3];
Chris@42 75 }
Chris@42 76 {
Chris@42 77 E TB, TE, TH, T13, TC, TK, TG, TD, TJ, TP, Tb, TA, Tn;
Chris@42 78 TB = Ip[WS(rs, 3)];
Chris@42 79 TE = Im[WS(rs, 3)];
Chris@42 80 TP = T9 * Td;
Chris@42 81 Tb = T9 * Ta;
Chris@42 82 TA = W[12];
Chris@42 83 TH = Ip[WS(rs, 1)];
Chris@42 84 TQ = FNMS(Tc, Ta, TP);
Chris@42 85 Te = FMA(Tc, Td, Tb);
Chris@42 86 T13 = TA * TE;
Chris@42 87 TC = TA * TB;
Chris@42 88 TK = Im[WS(rs, 1)];
Chris@42 89 TG = W[4];
Chris@42 90 TD = W[13];
Chris@42 91 TJ = W[5];
Chris@42 92 {
Chris@42 93 E T14, TF, T16, TL, T15, TI;
Chris@42 94 To = Ip[0];
Chris@42 95 T15 = TG * TK;
Chris@42 96 TI = TG * TH;
Chris@42 97 T14 = FNMS(TD, TB, T13);
Chris@42 98 TF = FMA(TD, TE, TC);
Chris@42 99 T16 = FNMS(TJ, TH, T15);
Chris@42 100 TL = FMA(TJ, TK, TI);
Chris@42 101 Tr = Im[0];
Chris@42 102 Tn = W[0];
Chris@42 103 T17 = T14 - T16;
Chris@42 104 T1g = T14 + T16;
Chris@42 105 TM = TF + TL;
Chris@42 106 T12 = TF - TL;
Chris@42 107 }
Chris@42 108 Tu = Ip[WS(rs, 2)];
Chris@42 109 TW = Tn * Tr;
Chris@42 110 Tp = Tn * To;
Chris@42 111 Tx = Im[WS(rs, 2)];
Chris@42 112 Tt = W[8];
Chris@42 113 Tq = W[1];
Chris@42 114 Tw = W[9];
Chris@42 115 }
Chris@42 116 }
Chris@42 117 }
Chris@42 118 {
Chris@42 119 E T8, T1j, T1n, Tz, T1a, TU, Tl, T1b, T1c, T1v, T1t, T1w, T19, T1u, T1d;
Chris@42 120 {
Chris@42 121 E T1r, T10, TV, T1s, T11, T18;
Chris@42 122 {
Chris@42 123 E TO, TX, Ts, TZ, Ty, TT, TY, Tv;
Chris@42 124 T8 = T1 + T7;
Chris@42 125 TO = T1 - T7;
Chris@42 126 TY = Tt * Tx;
Chris@42 127 Tv = Tt * Tu;
Chris@42 128 TX = FNMS(Tq, To, TW);
Chris@42 129 Ts = FMA(Tq, Tr, Tp);
Chris@42 130 TZ = FNMS(Tw, Tu, TY);
Chris@42 131 Ty = FMA(Tw, Tx, Tv);
Chris@42 132 TT = TQ - TS;
Chris@42 133 T1j = TQ + TS;
Chris@42 134 T1n = T1l + T1m;
Chris@42 135 T1r = T1m - T1l;
Chris@42 136 T10 = TX - TZ;
Chris@42 137 T1f = TX + TZ;
Chris@42 138 Tz = Ts + Ty;
Chris@42 139 TV = Ts - Ty;
Chris@42 140 T1a = TO - TT;
Chris@42 141 TU = TO + TT;
Chris@42 142 T1s = Te - Tk;
Chris@42 143 Tl = Te + Tk;
Chris@42 144 }
Chris@42 145 T1b = T10 - TV;
Chris@42 146 T11 = TV + T10;
Chris@42 147 T18 = T12 - T17;
Chris@42 148 T1c = T12 + T17;
Chris@42 149 T1v = T1s + T1r;
Chris@42 150 T1t = T1r - T1s;
Chris@42 151 T1w = T18 - T11;
Chris@42 152 T19 = T11 + T18;
Chris@42 153 }
/* First eight stores: each output combines a term scaled by KP707106781
 * with an unscaled term. */
Chris@42 154 Ip[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
Chris@42 155 Im[0] = FMS(KP707106781, T1w, T1v);
Chris@42 156 Rp[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Chris@42 157 Rm[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
Chris@42 158 T1u = T1b + T1c;
Chris@42 159 T1d = T1b - T1c;
Chris@42 160 Ip[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
Chris@42 161 Im[WS(rs, 2)] = FMS(KP707106781, T1u, T1t);
Chris@42 162 Rp[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
Chris@42 163 Rm[0] = FNMS(KP707106781, T1d, T1a);
Chris@42 164 T1e = T8 - Tl;
Chris@42 165 Tm = T8 + Tl;
Chris@42 166 T1q = T1n - T1j;
Chris@42 167 T1o = T1j + T1n;
Chris@42 168 T1p = TM - Tz;
Chris@42 169 TN = Tz + TM;
Chris@42 170 }
Chris@42 171 }
/* Remaining eight stores are plain sums and differences (no multiplication). */
Chris@42 172 Ip[WS(rs, 2)] = T1p + T1q;
Chris@42 173 Im[WS(rs, 1)] = T1p - T1q;
Chris@42 174 Rp[0] = Tm + TN;
Chris@42 175 Rm[WS(rs, 3)] = Tm - TN;
Chris@42 176 T1h = T1f - T1g;
Chris@42 177 T1i = T1f + T1g;
Chris@42 178 Ip[0] = T1i + T1o;
Chris@42 179 Im[WS(rs, 3)] = T1i - T1o;
Chris@42 180 Rp[WS(rs, 2)] = T1e + T1h;
Chris@42 181 Rm[WS(rs, 1)] = T1e - T1h;
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
Chris@42 185
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 8} requests the full twiddle set for
 * size 8 (the loop above consumes 14 = 2 * (8 - 1) W entries per iteration)
 * and {TW_NEXT, 1, 0} terminates the list -- confirm against the tw_instr
 * definition in the project headers.
 */
Chris@42 186 static const tw_instr twinstr[] = {
Chris@42 187 {TW_FULL, 1, 8},
Chris@42 188 {TW_NEXT, 1, 0}
Chris@42 189 };
Chris@42 190
/*
 * Codelet descriptor passed to the planner at registration: size 8, name
 * "hc2cf_8", the twiddle table above and &GENUS. The trailing {44, 14, 22, 0}
 * matches the operation counts in the generated comment for this variant
 * (44 additions, 14 multiplications, 22 fused multiply/add); the meaning of
 * the final 0 is not visible here.
 */
Chris@42 191 static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@42 192
/*
 * Registration entry point: hands the hc2cf_8 kernel and its descriptor to
 * planner `p` through X(khc2c_register), tagged HC2C_VIA_RDFT. X() is a
 * name-wrapping macro defined outside this file.
 */
Chris@42 193 void X(codelet_hc2cf_8) (planner *p) {
Chris@42 194 X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
Chris@42 195 }
Chris@42 196 #else /* HAVE_FMA */
Chris@42 197
Chris@42 198 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include hc2cf.h */
Chris@42 199
Chris@42 200 /*
Chris@42 201 * This function contains 66 FP additions, 32 FP multiplications,
Chris@42 202 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@42 203 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@42 204 */
Chris@42 205 #include "hc2cf.h"
Chris@42 206
/*
 * hc2cf_8 (non-FMA variant): generated size-8 codelet (genfft was invoked
 * with "-n 8 -dit" and without "-fma", see the "Generated by" comment above).
 *
 * Parameters:
 *   Rp, Ip, Rm, Im  four data arrays, each accessed at WS(rs, k), k = 0..3;
 *                   they are both the inputs and the outputs (in place).
 *   W               twiddle table; 14 consecutive entries W[0]..W[13] are
 *                   consumed per iteration.
 *   rs              stride passed to WS() to index within one iteration.
 *   mb, me          loop bounds; iterations run for m = mb .. me - 1.
 *   ms              per-iteration step: Rp and Ip advance by +ms while
 *                   Rm and Im move by -ms.
 *
 * Each iteration loads all 16 inputs before storing any of the 16 outputs.
 * The pair (Rp[0], Rm[0]) is used as loaded; every other input pair (a, b)
 * is combined with a W pair (w0, w1) as FMA(w0, a, w1 * b) and
 * FNMS(w1, a, w0 * b).
 *
 * NOTE(review): DK, FMA, FNMS, WS and MAKE_VOLATILE_STRIDE are macros defined
 * outside this file. In FFTW, FMA(a,b,c) is conventionally a*b+c and
 * FNMS(a,b,c) is c-a*b -- confirm in the project headers before relying on
 * that reading.
 */
Chris@42 207 static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 208 {
/* The only floating-point constant used; the literal is sqrt(2)/2. */
Chris@42 209 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 210 {
Chris@42 211 INT m;
/* W is pre-offset by (mb - 1) * 14, so iteration m starts at the original W + (m - 1) * 14. */
Chris@42 212 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 213 E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
Chris@42 214 E TP;
/* Rp/Rm at index 0 (no twiddle) and index 2 (twiddle W[6], W[7]): sums and differences. */
Chris@42 215 {
Chris@42 216 E T1, T18, T6, T17;
Chris@42 217 T1 = Rp[0];
Chris@42 218 T18 = Rm[0];
Chris@42 219 {
Chris@42 220 E T3, T5, T2, T4;
Chris@42 221 T3 = Rp[WS(rs, 2)];
Chris@42 222 T5 = Rm[WS(rs, 2)];
Chris@42 223 T2 = W[6];
Chris@42 224 T4 = W[7];
Chris@42 225 T6 = FMA(T2, T3, T4 * T5);
Chris@42 226 T17 = FNMS(T4, T3, T2 * T5);
Chris@42 227 }
Chris@42 228 T7 = T1 + T6;
Chris@42 229 T1e = T18 - T17;
Chris@42 230 TH = T1 - T6;
Chris@42 231 T19 = T17 + T18;
Chris@42 232 }
/* Ip/Im at indices 3 and 1 (twiddles W[12], W[13] and W[4], W[5]): sums and differences. */
Chris@42 233 {
Chris@42 234 E Tz, TS, TE, TT;
Chris@42 235 {
Chris@42 236 E Tw, Ty, Tv, Tx;
Chris@42 237 Tw = Ip[WS(rs, 3)];
Chris@42 238 Ty = Im[WS(rs, 3)];
Chris@42 239 Tv = W[12];
Chris@42 240 Tx = W[13];
Chris@42 241 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 242 TS = FNMS(Tx, Tw, Tv * Ty);
Chris@42 243 }
Chris@42 244 {
Chris@42 245 E TB, TD, TA, TC;
Chris@42 246 TB = Ip[WS(rs, 1)];
Chris@42 247 TD = Im[WS(rs, 1)];
Chris@42 248 TA = W[4];
Chris@42 249 TC = W[5];
Chris@42 250 TE = FMA(TA, TB, TC * TD);
Chris@42 251 TT = FNMS(TC, TB, TA * TD);
Chris@42 252 }
Chris@42 253 TF = Tz + TE;
Chris@42 254 T13 = TS + TT;
Chris@42 255 TR = Tz - TE;
Chris@42 256 TU = TS - TT;
Chris@42 257 }
/* Rp/Rm at indices 1 and 3 (twiddles W[2], W[3] and W[10], W[11]): sums and differences. */
Chris@42 258 {
Chris@42 259 E Tc, TI, Th, TJ;
Chris@42 260 {
Chris@42 261 E T9, Tb, T8, Ta;
Chris@42 262 T9 = Rp[WS(rs, 1)];
Chris@42 263 Tb = Rm[WS(rs, 1)];
Chris@42 264 T8 = W[2];
Chris@42 265 Ta = W[3];
Chris@42 266 Tc = FMA(T8, T9, Ta * Tb);
Chris@42 267 TI = FNMS(Ta, T9, T8 * Tb);
Chris@42 268 }
Chris@42 269 {
Chris@42 270 E Te, Tg, Td, Tf;
Chris@42 271 Te = Rp[WS(rs, 3)];
Chris@42 272 Tg = Rm[WS(rs, 3)];
Chris@42 273 Td = W[10];
Chris@42 274 Tf = W[11];
Chris@42 275 Th = FMA(Td, Te, Tf * Tg);
Chris@42 276 TJ = FNMS(Tf, Te, Td * Tg);
Chris@42 277 }
Chris@42 278 Ti = Tc + Th;
Chris@42 279 T1f = Tc - Th;
Chris@42 280 TK = TI - TJ;
Chris@42 281 T16 = TI + TJ;
Chris@42 282 }
/* Ip/Im at indices 0 and 2 (twiddles W[0], W[1] and W[8], W[9]): sums and differences. */
Chris@42 283 {
Chris@42 284 E To, TN, Tt, TO;
Chris@42 285 {
Chris@42 286 E Tl, Tn, Tk, Tm;
Chris@42 287 Tl = Ip[0];
Chris@42 288 Tn = Im[0];
Chris@42 289 Tk = W[0];
Chris@42 290 Tm = W[1];
Chris@42 291 To = FMA(Tk, Tl, Tm * Tn);
Chris@42 292 TN = FNMS(Tm, Tl, Tk * Tn);
Chris@42 293 }
Chris@42 294 {
Chris@42 295 E Tq, Ts, Tp, Tr;
Chris@42 296 Tq = Ip[WS(rs, 2)];
Chris@42 297 Ts = Im[WS(rs, 2)];
Chris@42 298 Tp = W[8];
Chris@42 299 Tr = W[9];
Chris@42 300 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@42 301 TO = FNMS(Tr, Tq, Tp * Ts);
Chris@42 302 }
Chris@42 303 Tu = To + Tt;
Chris@42 304 T12 = TN + TO;
Chris@42 305 TM = To - Tt;
Chris@42 306 TP = TN - TO;
Chris@42 307 }
/* Output phase: combine the partial results above and store all 16 outputs. */
Chris@42 308 {
Chris@42 309 E Tj, TG, T1b, T1c;
Chris@42 310 Tj = T7 + Ti;
Chris@42 311 TG = Tu + TF;
Chris@42 312 Rm[WS(rs, 3)] = Tj - TG;
Chris@42 313 Rp[0] = Tj + TG;
Chris@42 314 {
Chris@42 315 E T15, T1a, T11, T14;
Chris@42 316 T15 = T12 + T13;
Chris@42 317 T1a = T16 + T19;
Chris@42 318 Im[WS(rs, 3)] = T15 - T1a;
Chris@42 319 Ip[0] = T15 + T1a;
Chris@42 320 T11 = T7 - Ti;
Chris@42 321 T14 = T12 - T13;
Chris@42 322 Rm[WS(rs, 1)] = T11 - T14;
Chris@42 323 Rp[WS(rs, 2)] = T11 + T14;
Chris@42 324 }
Chris@42 325 T1b = TF - Tu;
Chris@42 326 T1c = T19 - T16;
Chris@42 327 Im[WS(rs, 1)] = T1b - T1c;
Chris@42 328 Ip[WS(rs, 2)] = T1b + T1c;
/* The last two groups are the only ones that multiply by KP707106781. */
Chris@42 329 {
Chris@42 330 E TX, T1g, T10, T1d, TY, TZ;
Chris@42 331 TX = TH - TK;
Chris@42 332 T1g = T1e - T1f;
Chris@42 333 TY = TP - TM;
Chris@42 334 TZ = TR + TU;
Chris@42 335 T10 = KP707106781 * (TY - TZ);
Chris@42 336 T1d = KP707106781 * (TY + TZ);
Chris@42 337 Rm[0] = TX - T10;
Chris@42 338 Ip[WS(rs, 1)] = T1d + T1g;
Chris@42 339 Rp[WS(rs, 3)] = TX + T10;
Chris@42 340 Im[WS(rs, 2)] = T1d - T1g;
Chris@42 341 }
Chris@42 342 {
Chris@42 343 E TL, T1i, TW, T1h, TQ, TV;
Chris@42 344 TL = TH + TK;
Chris@42 345 T1i = T1f + T1e;
Chris@42 346 TQ = TM + TP;
Chris@42 347 TV = TR - TU;
Chris@42 348 TW = KP707106781 * (TQ + TV);
Chris@42 349 T1h = KP707106781 * (TV - TQ);
Chris@42 350 Rm[WS(rs, 2)] = TL - TW;
Chris@42 351 Ip[WS(rs, 3)] = T1h + T1i;
Chris@42 352 Rp[WS(rs, 1)] = TL + TW;
Chris@42 353 Im[0] = T1h - T1i;
Chris@42 354 }
Chris@42 355 }
Chris@42 356 }
Chris@42 357 }
Chris@42 358 }
Chris@42 359
/*
 * Twiddle instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 1, 8} requests the full twiddle set for
 * size 8 (the loop above consumes 14 = 2 * (8 - 1) W entries per iteration)
 * and {TW_NEXT, 1, 0} terminates the list -- confirm against the tw_instr
 * definition in the project headers.
 */
Chris@42 360 static const tw_instr twinstr[] = {
Chris@42 361 {TW_FULL, 1, 8},
Chris@42 362 {TW_NEXT, 1, 0}
Chris@42 363 };
Chris@42 364
/*
 * Codelet descriptor passed to the planner at registration: size 8, name
 * "hc2cf_8", the twiddle table above and &GENUS. The trailing {52, 18, 14, 0}
 * matches the operation counts in the generated comment for this variant
 * (52 additions, 18 multiplications, 14 fused multiply/add); the meaning of
 * the final 0 is not visible here.
 */
Chris@42 365 static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@42 366
/*
 * Registration entry point: hands the hc2cf_8 kernel and its descriptor to
 * planner `p` through X(khc2c_register), tagged HC2C_VIA_RDFT. X() is a
 * name-wrapping macro defined outside this file.
 */
Chris@42 367 void X(codelet_hc2cf_8) (planner *p) {
Chris@42 368 X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
Chris@42 369 }
Chris@42 370 #endif /* HAVE_FMA */