annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cf_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:55 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@82 33 * 34 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cf_8, FMA-preferring variant (this branch is compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generator output (gen_hc2c with -fma -n 8 -dit); do not hand-edit the
 * arithmetic.
 *
 * Each loop iteration works in place on 16 values: elements 0..3 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.
 *   - Rp[0] and Rm[0] are read as they are.
 *   - Each of the other seven input pairs, (Rp[k], Rm[k]) or
 *     (Ip[k], Im[k]), is combined with two consecutive entries of W;
 *     W[0]..W[13] are all used exactly once per iteration.
 *   - The combined values are then added/subtracted, half of the outputs
 *     also being scaled by KP707106781, and stored back to the same 16
 *     locations.
 * Within an iteration every load precedes the first store.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; advanced by +ms after each iteration.
 *   Rm, Im  arrays read and written; advanced by -ms after each iteration.
 *   W       read-only factor table; the pointer is first offset by
 *           (mb - 1) * 14 and then advanced by 14 after each iteration.
 *   rs      stride handed to WS() to locate elements 1..3.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1; m is only a
 *           counter and is not used inside the body.
 *   ms      per-iteration pointer step (see Rp/Ip/Rm/Im above).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE come
 * from the included codelet headers.
 * NOTE(review): presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c; confirm against the header definitions.
 */
Chris@82 37 static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* KP707106781 is the constant 0.7071..., i.e. 1/sqrt(2) to the printed precision. */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 43 E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
Chris@82 44 E TX, Ty, TZ, TV, T10;
/* Element 0 of Rp/Rm is taken without any W factor. */
Chris@82 45 T1 = Rp[0];
Chris@82 46 T1m = Rm[0];
/* Rp[2], Rm[2] combined with W[6], W[7]. */
Chris@82 47 {
Chris@82 48 E T3, T6, T4, T1k, T2, T5;
Chris@82 49 T3 = Rp[WS(rs, 2)];
Chris@82 50 T6 = Rm[WS(rs, 2)];
Chris@82 51 T2 = W[6];
Chris@82 52 T4 = T2 * T3;
Chris@82 53 T1k = T2 * T6;
Chris@82 54 T5 = W[7];
Chris@82 55 T7 = FMA(T5, T6, T4);
Chris@82 56 T1l = FNMS(T5, T3, T1k);
Chris@82 57 }
/* Rp[3], Rm[3] combined with W[10], W[11]. */
Chris@82 58 {
Chris@82 59 E Tg, Tj, Th, TR, Tf, Ti;
Chris@82 60 Tg = Rp[WS(rs, 3)];
Chris@82 61 Tj = Rm[WS(rs, 3)];
Chris@82 62 Tf = W[10];
Chris@82 63 Th = Tf * Tg;
Chris@82 64 TR = Tf * Tj;
Chris@82 65 Ti = W[11];
Chris@82 66 Tk = FMA(Ti, Tj, Th);
Chris@82 67 TS = FNMS(Ti, Tg, TR);
Chris@82 68 }
/* Rp[1], Rm[1] combined with W[2], W[3]. */
Chris@82 69 {
Chris@82 70 E Ta, Td, Tb, TP, T9, Tc;
Chris@82 71 Ta = Rp[WS(rs, 1)];
Chris@82 72 Td = Rm[WS(rs, 1)];
Chris@82 73 T9 = W[2];
Chris@82 74 Tb = T9 * Ta;
Chris@82 75 TP = T9 * Td;
Chris@82 76 Tc = W[3];
Chris@82 77 Te = FMA(Tc, Td, Tb);
Chris@82 78 TQ = FNMS(Tc, Ta, TP);
Chris@82 79 }
/* Ip[3], Im[3] with W[12], W[13] and Ip[1], Im[1] with W[4], W[5];
 * only the differences T12 and T17 are formed here, the sums are taken
 * later from TF, TL, T14, T16. */
Chris@82 80 {
Chris@82 81 E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
Chris@82 82 TB = Ip[WS(rs, 3)];
Chris@82 83 TE = Im[WS(rs, 3)];
Chris@82 84 TA = W[12];
Chris@82 85 TC = TA * TB;
Chris@82 86 T13 = TA * TE;
Chris@82 87 TH = Ip[WS(rs, 1)];
Chris@82 88 TK = Im[WS(rs, 1)];
Chris@82 89 TG = W[4];
Chris@82 90 TI = TG * TH;
Chris@82 91 T15 = TG * TK;
Chris@82 92 TD = W[13];
Chris@82 93 TF = FMA(TD, TE, TC);
Chris@82 94 T14 = FNMS(TD, TB, T13);
Chris@82 95 TJ = W[5];
Chris@82 96 TL = FMA(TJ, TK, TI);
Chris@82 97 T16 = FNMS(TJ, TH, T15);
Chris@82 98 T12 = TF - TL;
Chris@82 99 T17 = T14 - T16;
Chris@82 100 }
/* Ip[0], Im[0] with W[0], W[1] and Ip[2], Im[2] with W[8], W[9];
 * differences TV and T10 are formed here. */
Chris@82 101 {
Chris@82 102 E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
Chris@82 103 To = Ip[0];
Chris@82 104 Tr = Im[0];
Chris@82 105 Tn = W[0];
Chris@82 106 Tp = Tn * To;
Chris@82 107 TW = Tn * Tr;
Chris@82 108 Tu = Ip[WS(rs, 2)];
Chris@82 109 Tx = Im[WS(rs, 2)];
Chris@82 110 Tt = W[8];
Chris@82 111 Tv = Tt * Tu;
Chris@82 112 TY = Tt * Tx;
Chris@82 113 Tq = W[1];
Chris@82 114 Ts = FMA(Tq, Tr, Tp);
Chris@82 115 TX = FNMS(Tq, To, TW);
Chris@82 116 Tw = W[9];
Chris@82 117 Ty = FMA(Tw, Tx, Tv);
Chris@82 118 TZ = FNMS(Tw, Tu, TY);
Chris@82 119 TV = Ts - Ty;
Chris@82 120 T10 = TX - TZ;
Chris@82 121 }
/* First group of stores: the eight outputs that include a KP707106781
 * factor (Rm[2], Im[2], Rp[1], Ip[1], Rm[0], Im[0], Rp[3], Ip[3]).
 * All input loads are complete by this point. */
Chris@82 122 {
Chris@82 123 E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
Chris@82 124 {
Chris@82 125 E TO, TT, T1r, T1s;
Chris@82 126 TO = T1 - T7;
Chris@82 127 TT = TQ - TS;
Chris@82 128 TU = TO + TT;
Chris@82 129 T1a = TO - TT;
Chris@82 130 T1r = T1m - T1l;
Chris@82 131 T1s = Te - Tk;
Chris@82 132 T1t = T1r - T1s;
Chris@82 133 T1v = T1s + T1r;
Chris@82 134 }
Chris@82 135 {
Chris@82 136 E T11, T18, T1b, T1c;
Chris@82 137 T11 = TV + T10;
Chris@82 138 T18 = T12 - T17;
Chris@82 139 T19 = T11 + T18;
Chris@82 140 T1w = T18 - T11;
Chris@82 141 T1b = T10 - TV;
Chris@82 142 T1c = T12 + T17;
Chris@82 143 T1d = T1b - T1c;
Chris@82 144 T1u = T1b + T1c;
Chris@82 145 }
Chris@82 146 Rm[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
Chris@82 147 Im[WS(rs, 2)] = FMS(KP707106781, T1u, T1t);
Chris@82 148 Rp[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Chris@82 149 Ip[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
Chris@82 150 Rm[0] = FNMS(KP707106781, T1d, T1a);
Chris@82 151 Im[0] = FMS(KP707106781, T1w, T1v);
Chris@82 152 Rp[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
Chris@82 153 Ip[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
Chris@82 154 }
/* Second group of stores: the eight outputs built from plain sums and
 * differences (Rm[3], Im[3], Rp[0], Ip[0], Rm[1], Im[1], Rp[2], Ip[2]). */
Chris@82 155 {
Chris@82 156 E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
Chris@82 157 {
Chris@82 158 E T8, Tl, T1j, T1n;
Chris@82 159 T8 = T1 + T7;
Chris@82 160 Tl = Te + Tk;
Chris@82 161 Tm = T8 + Tl;
Chris@82 162 T1e = T8 - Tl;
Chris@82 163 T1j = TQ + TS;
Chris@82 164 T1n = T1l + T1m;
Chris@82 165 T1o = T1j + T1n;
Chris@82 166 T1q = T1n - T1j;
Chris@82 167 }
Chris@82 168 {
Chris@82 169 E Tz, TM, T1f, T1g;
Chris@82 170 Tz = Ts + Ty;
Chris@82 171 TM = TF + TL;
Chris@82 172 TN = Tz + TM;
Chris@82 173 T1p = TM - Tz;
Chris@82 174 T1f = TX + TZ;
Chris@82 175 T1g = T14 + T16;
Chris@82 176 T1h = T1f - T1g;
Chris@82 177 T1i = T1f + T1g;
Chris@82 178 }
Chris@82 179 Rm[WS(rs, 3)] = Tm - TN;
Chris@82 180 Im[WS(rs, 3)] = T1i - T1o;
Chris@82 181 Rp[0] = Tm + TN;
Chris@82 182 Ip[0] = T1i + T1o;
Chris@82 183 Rm[WS(rs, 1)] = T1e - T1h;
Chris@82 184 Im[WS(rs, 1)] = T1p - T1q;
Chris@82 185 Rp[WS(rs, 2)] = T1e + T1h;
Chris@82 186 Ip[WS(rs, 2)] = T1p + T1q;
Chris@82 187 }
Chris@82 188 }
Chris@82 189 }
Chris@82 190 }
Chris@82 191
/*
 * Instruction table referenced by the descriptor of this variant: one
 * TW_FULL entry with arguments (1, 8) followed by one TW_NEXT entry with
 * arguments (1, 0).
 * NOTE(review): presumably TW_FULL asks for the complete factor set for
 * size 8 and TW_NEXT closes the list; confirm against the tw_instr
 * definition, which is not in this file.
 */
Chris@82 192 static const tw_instr twinstr[] = {
Chris@82 193 {TW_FULL, 1, 8},
Chris@82 194 {TW_NEXT, 1, 0}
Chris@82 195 };
Chris@82 196
/*
 * Descriptor handed to the registration call for the FMA variant: a leading
 * 8 (presumably the size, matching -n 8 on the generator command line --
 * confirm against hc2c_desc), the codelet name string, the twinstr table,
 * the address of GENUS, and the tuple {44, 14, 22, 0}. The first three
 * tuple entries equal the additions / multiplications / fused multiply-add
 * counts quoted in the generated comment above this variant.
 */
Chris@82 197 static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@82 198
/*
 * Registration entry point for the FMA variant: passes the planner p, the
 * hc2cf_8 kernel, its descriptor and the HC2C_VIA_RDFT flag to
 * X(khc2c_register). X() is a macro defined outside this file
 * (presumably a name-prefixing macro -- confirm).
 */
Chris@82 199 void X(codelet_hc2cf_8) (planner *p) {
Chris@82 200 X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
Chris@82 201 }
Chris@82 202 #else
Chris@82 203
Chris@82 204 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
Chris@82 205
Chris@82 206 /*
Chris@82 207 * This function contains 66 FP additions, 32 FP multiplications,
Chris@82 208 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 209 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@82 210 */
Chris@82 211 #include "rdft/scalar/hc2cf.h"
Chris@82 212
/*
 * hc2cf_8, non-FMA variant (this branch is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Generator output (gen_hc2c with -n 8 -dit, no -fma); do not hand-edit the
 * arithmetic.
 *
 * Each loop iteration works in place on 16 values: elements 0..3 (indexed
 * through WS(rs, k)) of each of Rp, Ip, Rm and Im.
 *   - Rp[0] and Rm[0] are read as they are.
 *   - Each of the other seven input pairs, (Rp[k], Rm[k]) or
 *     (Ip[k], Im[k]), is combined with two consecutive entries of W;
 *     W[0]..W[13] are all used exactly once per iteration.
 *   - Sums and differences of those values are stored back to the same 16
 *     locations; eight of the outputs include a term multiplied explicitly
 *     by KP707106781.
 * Within an iteration every load precedes the first store.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written; advanced by +ms after each iteration.
 *   Rm, Im  arrays read and written; advanced by -ms after each iteration.
 *   W       read-only factor table; the pointer is first offset by
 *           (mb - 1) * 14 and then advanced by 14 after each iteration.
 *   rs      stride handed to WS() to locate elements 1..3.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1; m is only a
 *           counter and is not used inside the body.
 *   ms      per-iteration pointer step (see Rp/Ip/Rm/Im above).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included codelet headers.
 * NOTE(review): presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b;
 * confirm against the header definitions.
 */
Chris@82 213 static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 214 {
/* KP707106781 is the constant 0.7071..., i.e. 1/sqrt(2) to the printed precision. */
Chris@82 215 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 216 {
Chris@82 217 INT m;
Chris@82 218 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 219 E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
Chris@82 220 E TP;
/* Rp[0], Rm[0] taken as they are; Rp[2], Rm[2] combined with W[6], W[7];
 * then sums and differences of the two pairs. */
Chris@82 221 {
Chris@82 222 E T1, T18, T6, T17;
Chris@82 223 T1 = Rp[0];
Chris@82 224 T18 = Rm[0];
Chris@82 225 {
Chris@82 226 E T3, T5, T2, T4;
Chris@82 227 T3 = Rp[WS(rs, 2)];
Chris@82 228 T5 = Rm[WS(rs, 2)];
Chris@82 229 T2 = W[6];
Chris@82 230 T4 = W[7];
Chris@82 231 T6 = FMA(T2, T3, T4 * T5);
Chris@82 232 T17 = FNMS(T4, T3, T2 * T5);
Chris@82 233 }
Chris@82 234 T7 = T1 + T6;
Chris@82 235 T1e = T18 - T17;
Chris@82 236 TH = T1 - T6;
Chris@82 237 T19 = T17 + T18;
Chris@82 238 }
/* Ip[3], Im[3] with W[12], W[13] and Ip[1], Im[1] with W[4], W[5]. */
Chris@82 239 {
Chris@82 240 E Tz, TS, TE, TT;
Chris@82 241 {
Chris@82 242 E Tw, Ty, Tv, Tx;
Chris@82 243 Tw = Ip[WS(rs, 3)];
Chris@82 244 Ty = Im[WS(rs, 3)];
Chris@82 245 Tv = W[12];
Chris@82 246 Tx = W[13];
Chris@82 247 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 248 TS = FNMS(Tx, Tw, Tv * Ty);
Chris@82 249 }
Chris@82 250 {
Chris@82 251 E TB, TD, TA, TC;
Chris@82 252 TB = Ip[WS(rs, 1)];
Chris@82 253 TD = Im[WS(rs, 1)];
Chris@82 254 TA = W[4];
Chris@82 255 TC = W[5];
Chris@82 256 TE = FMA(TA, TB, TC * TD);
Chris@82 257 TT = FNMS(TC, TB, TA * TD);
Chris@82 258 }
Chris@82 259 TF = Tz + TE;
Chris@82 260 T13 = TS + TT;
Chris@82 261 TR = Tz - TE;
Chris@82 262 TU = TS - TT;
Chris@82 263 }
/* Rp[1], Rm[1] with W[2], W[3] and Rp[3], Rm[3] with W[10], W[11]. */
Chris@82 264 {
Chris@82 265 E Tc, TI, Th, TJ;
Chris@82 266 {
Chris@82 267 E T9, Tb, T8, Ta;
Chris@82 268 T9 = Rp[WS(rs, 1)];
Chris@82 269 Tb = Rm[WS(rs, 1)];
Chris@82 270 T8 = W[2];
Chris@82 271 Ta = W[3];
Chris@82 272 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 273 TI = FNMS(Ta, T9, T8 * Tb);
Chris@82 274 }
Chris@82 275 {
Chris@82 276 E Te, Tg, Td, Tf;
Chris@82 277 Te = Rp[WS(rs, 3)];
Chris@82 278 Tg = Rm[WS(rs, 3)];
Chris@82 279 Td = W[10];
Chris@82 280 Tf = W[11];
Chris@82 281 Th = FMA(Td, Te, Tf * Tg);
Chris@82 282 TJ = FNMS(Tf, Te, Td * Tg);
Chris@82 283 }
Chris@82 284 Ti = Tc + Th;
Chris@82 285 T1f = Tc - Th;
Chris@82 286 TK = TI - TJ;
Chris@82 287 T16 = TI + TJ;
Chris@82 288 }
/* Ip[0], Im[0] with W[0], W[1] and Ip[2], Im[2] with W[8], W[9].
 * These are the last loads of the iteration. */
Chris@82 289 {
Chris@82 290 E To, TN, Tt, TO;
Chris@82 291 {
Chris@82 292 E Tl, Tn, Tk, Tm;
Chris@82 293 Tl = Ip[0];
Chris@82 294 Tn = Im[0];
Chris@82 295 Tk = W[0];
Chris@82 296 Tm = W[1];
Chris@82 297 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 298 TN = FNMS(Tm, Tl, Tk * Tn);
Chris@82 299 }
Chris@82 300 {
Chris@82 301 E Tq, Ts, Tp, Tr;
Chris@82 302 Tq = Ip[WS(rs, 2)];
Chris@82 303 Ts = Im[WS(rs, 2)];
Chris@82 304 Tp = W[8];
Chris@82 305 Tr = W[9];
Chris@82 306 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 307 TO = FNMS(Tr, Tq, Tp * Ts);
Chris@82 308 }
Chris@82 309 Tu = To + Tt;
Chris@82 310 T12 = TN + TO;
Chris@82 311 TM = To - Tt;
Chris@82 312 TP = TN - TO;
Chris@82 313 }
/* Output stage: all 16 stores happen below, after every input has been
 * read. The first eight stores use plain sums and differences; the two
 * trailing blocks add or subtract a term scaled by KP707106781. */
Chris@82 314 {
Chris@82 315 E Tj, TG, T1b, T1c;
Chris@82 316 Tj = T7 + Ti;
Chris@82 317 TG = Tu + TF;
Chris@82 318 Rm[WS(rs, 3)] = Tj - TG;
Chris@82 319 Rp[0] = Tj + TG;
Chris@82 320 {
Chris@82 321 E T15, T1a, T11, T14;
Chris@82 322 T15 = T12 + T13;
Chris@82 323 T1a = T16 + T19;
Chris@82 324 Im[WS(rs, 3)] = T15 - T1a;
Chris@82 325 Ip[0] = T15 + T1a;
Chris@82 326 T11 = T7 - Ti;
Chris@82 327 T14 = T12 - T13;
Chris@82 328 Rm[WS(rs, 1)] = T11 - T14;
Chris@82 329 Rp[WS(rs, 2)] = T11 + T14;
Chris@82 330 }
Chris@82 331 T1b = TF - Tu;
Chris@82 332 T1c = T19 - T16;
Chris@82 333 Im[WS(rs, 1)] = T1b - T1c;
Chris@82 334 Ip[WS(rs, 2)] = T1b + T1c;
/* Scaled outputs Rm[0], Ip[1], Rp[3], Im[2]. */
Chris@82 335 {
Chris@82 336 E TX, T1g, T10, T1d, TY, TZ;
Chris@82 337 TX = TH - TK;
Chris@82 338 T1g = T1e - T1f;
Chris@82 339 TY = TP - TM;
Chris@82 340 TZ = TR + TU;
Chris@82 341 T10 = KP707106781 * (TY - TZ);
Chris@82 342 T1d = KP707106781 * (TY + TZ);
Chris@82 343 Rm[0] = TX - T10;
Chris@82 344 Ip[WS(rs, 1)] = T1d + T1g;
Chris@82 345 Rp[WS(rs, 3)] = TX + T10;
Chris@82 346 Im[WS(rs, 2)] = T1d - T1g;
Chris@82 347 }
/* Scaled outputs Rm[2], Ip[3], Rp[1], Im[0]. */
Chris@82 348 {
Chris@82 349 E TL, T1i, TW, T1h, TQ, TV;
Chris@82 350 TL = TH + TK;
Chris@82 351 T1i = T1f + T1e;
Chris@82 352 TQ = TM + TP;
Chris@82 353 TV = TR - TU;
Chris@82 354 TW = KP707106781 * (TQ + TV);
Chris@82 355 T1h = KP707106781 * (TV - TQ);
Chris@82 356 Rm[WS(rs, 2)] = TL - TW;
Chris@82 357 Ip[WS(rs, 3)] = T1h + T1i;
Chris@82 358 Rp[WS(rs, 1)] = TL + TW;
Chris@82 359 Im[0] = T1h - T1i;
Chris@82 360 }
Chris@82 361 }
Chris@82 362 }
Chris@82 363 }
Chris@82 364 }
Chris@82 365
/*
 * Instruction table referenced by the descriptor of this variant: one
 * TW_FULL entry with arguments (1, 8) followed by one TW_NEXT entry with
 * arguments (1, 0).
 * NOTE(review): presumably TW_FULL asks for the complete factor set for
 * size 8 and TW_NEXT closes the list; confirm against the tw_instr
 * definition, which is not in this file.
 */
Chris@82 366 static const tw_instr twinstr[] = {
Chris@82 367 {TW_FULL, 1, 8},
Chris@82 368 {TW_NEXT, 1, 0}
Chris@82 369 };
Chris@82 370
/*
 * Descriptor handed to the registration call for the non-FMA variant: a
 * leading 8 (presumably the size, matching -n 8 on the generator command
 * line -- confirm against hc2c_desc), the codelet name string, the twinstr
 * table, the address of GENUS, and the tuple {52, 18, 14, 0}. The first
 * three tuple entries equal the additions / multiplications / fused
 * multiply-add counts quoted in the generated comment above this variant.
 */
Chris@82 371 static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@82 372
/*
 * Registration entry point for the non-FMA variant: passes the planner p,
 * the hc2cf_8 kernel, its descriptor and the HC2C_VIA_RDFT flag to
 * X(khc2c_register). X() is a macro defined outside this file
 * (presumably a name-prefixing macro -- confirm).
 */
Chris@82 373 void X(codelet_hc2cf_8) (planner *p) {
Chris@82 374 X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
Chris@82 375 }
Chris@82 376 #endif