annotate src/fftw-3.3.8/rdft/scalar/r2cf/hf_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:29 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@82 33 * 34 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hf.h"
Chris@82 36
Chris@82 37 static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) /* Size-8 codelet, FMA-preferring variant: for each m in [mb, me) reads the 8 cr/ci pairs at offsets WS(rs, 0..7), multiplies pairs 1..7 by factors from W, and overwrites the same cr/ci locations with the combined results. */
Chris@82 38 {
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically 1/sqrt(2); only constant used below. NOTE(review): DK is defined outside this file - presumably declares the named constant, confirm. */
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) { /* W starts (mb - 1) * 14 reals in; per iteration cr steps forward by ms, ci steps backward by ms, and W advances 14 reals (W[0..13] are the ones read below). */
Chris@82 43 E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
Chris@82 44 E TX, Ty, TZ, TV, T10;
Chris@82 45 T1 = cr[0]; /* pair 0 is used as read; no W factor is applied to it */
Chris@82 46 T1m = ci[0];
Chris@82 47 { /* pair 4 with W[6], W[7]: T7 = W[6]*cr + W[7]*ci, T1l = W[6]*ci - W[7]*cr. NOTE(review): assumes FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b; the macros are defined outside this file - confirm. The same pattern repeats for every pair below. */
Chris@82 48 E T3, T6, T4, T1k, T2, T5;
Chris@82 49 T3 = cr[WS(rs, 4)];
Chris@82 50 T6 = ci[WS(rs, 4)];
Chris@82 51 T2 = W[6];
Chris@82 52 T4 = T2 * T3;
Chris@82 53 T1k = T2 * T6;
Chris@82 54 T5 = W[7];
Chris@82 55 T7 = FMA(T5, T6, T4);
Chris@82 56 T1l = FNMS(T5, T3, T1k);
Chris@82 57 }
Chris@82 58 { /* pair 6 with W[10], W[11] -> Tk, TS */
Chris@82 59 E Tg, Tj, Th, TR, Tf, Ti;
Chris@82 60 Tg = cr[WS(rs, 6)];
Chris@82 61 Tj = ci[WS(rs, 6)];
Chris@82 62 Tf = W[10];
Chris@82 63 Th = Tf * Tg;
Chris@82 64 TR = Tf * Tj;
Chris@82 65 Ti = W[11];
Chris@82 66 Tk = FMA(Ti, Tj, Th);
Chris@82 67 TS = FNMS(Ti, Tg, TR);
Chris@82 68 }
Chris@82 69 { /* pair 2 with W[2], W[3] -> Te, TQ */
Chris@82 70 E Ta, Td, Tb, TP, T9, Tc;
Chris@82 71 Ta = cr[WS(rs, 2)];
Chris@82 72 Td = ci[WS(rs, 2)];
Chris@82 73 T9 = W[2];
Chris@82 74 Tb = T9 * Ta;
Chris@82 75 TP = T9 * Td;
Chris@82 76 Tc = W[3];
Chris@82 77 Te = FMA(Tc, Td, Tb);
Chris@82 78 TQ = FNMS(Tc, Ta, TP);
Chris@82 79 }
Chris@82 80 { /* pair 7 with W[12], W[13] -> TF, T14; pair 3 with W[4], W[5] -> TL, T16; also keeps their differences T12 and T17 */
Chris@82 81 E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
Chris@82 82 TB = cr[WS(rs, 7)];
Chris@82 83 TE = ci[WS(rs, 7)];
Chris@82 84 TA = W[12];
Chris@82 85 TC = TA * TB;
Chris@82 86 T13 = TA * TE;
Chris@82 87 TH = cr[WS(rs, 3)];
Chris@82 88 TK = ci[WS(rs, 3)];
Chris@82 89 TG = W[4];
Chris@82 90 TI = TG * TH;
Chris@82 91 T15 = TG * TK;
Chris@82 92 TD = W[13];
Chris@82 93 TF = FMA(TD, TE, TC);
Chris@82 94 T14 = FNMS(TD, TB, T13);
Chris@82 95 TJ = W[5];
Chris@82 96 TL = FMA(TJ, TK, TI);
Chris@82 97 T16 = FNMS(TJ, TH, T15);
Chris@82 98 T12 = TF - TL;
Chris@82 99 T17 = T14 - T16;
Chris@82 100 }
Chris@82 101 { /* pair 1 with W[0], W[1] -> Ts, TX; pair 5 with W[8], W[9] -> Ty, TZ; also keeps their differences TV and T10 */
Chris@82 102 E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
Chris@82 103 To = cr[WS(rs, 1)];
Chris@82 104 Tr = ci[WS(rs, 1)];
Chris@82 105 Tn = W[0];
Chris@82 106 Tp = Tn * To;
Chris@82 107 TW = Tn * Tr;
Chris@82 108 Tu = cr[WS(rs, 5)];
Chris@82 109 Tx = ci[WS(rs, 5)];
Chris@82 110 Tt = W[8];
Chris@82 111 Tv = Tt * Tu;
Chris@82 112 TY = Tt * Tx;
Chris@82 113 Tq = W[1];
Chris@82 114 Ts = FMA(Tq, Tr, Tp);
Chris@82 115 TX = FNMS(Tq, To, TW);
Chris@82 116 Tw = W[9];
Chris@82 117 Ty = FMA(Tw, Tx, Tv);
Chris@82 118 TZ = FNMS(Tw, Tu, TY);
Chris@82 119 TV = Ts - Ty;
Chris@82 120 T10 = TX - TZ;
Chris@82 121 }
Chris@82 122 { /* First output group: built from the pairwise differences and scaled by KP707106781; stores to ci 2, cr 5, ci 6, cr 1, cr 3, cr 7, ci 4 and ci 0 (indices are WS(rs, k)). NOTE(review): FMS is presumably a*b - c - confirm. */
Chris@82 123 E TU, T1a, T1t, T1v, T19, T1u, T1d, T1w;
Chris@82 124 {
Chris@82 125 E TO, TT, T1r, T1s;
Chris@82 126 TO = T1 - T7;
Chris@82 127 TT = TQ - TS;
Chris@82 128 TU = TO + TT;
Chris@82 129 T1a = TO - TT;
Chris@82 130 T1r = Te - Tk;
Chris@82 131 T1s = T1m - T1l;
Chris@82 132 T1t = T1r + T1s;
Chris@82 133 T1v = T1s - T1r;
Chris@82 134 }
Chris@82 135 {
Chris@82 136 E T11, T18, T1b, T1c;
Chris@82 137 T11 = TV + T10;
Chris@82 138 T18 = T12 - T17;
Chris@82 139 T19 = T11 + T18;
Chris@82 140 T1u = T18 - T11;
Chris@82 141 T1b = TV - T10;
Chris@82 142 T1c = T12 + T17;
Chris@82 143 T1d = T1b + T1c;
Chris@82 144 T1w = T1c - T1b;
Chris@82 145 }
Chris@82 146 ci[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
Chris@82 147 cr[WS(rs, 5)] = FMS(KP707106781, T1w, T1v);
Chris@82 148 ci[WS(rs, 6)] = FMA(KP707106781, T1w, T1v);
Chris@82 149 cr[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Chris@82 150 cr[WS(rs, 3)] = FNMS(KP707106781, T1d, T1a);
Chris@82 151 cr[WS(rs, 7)] = FMS(KP707106781, T1u, T1t);
Chris@82 152 ci[WS(rs, 4)] = FMA(KP707106781, T1u, T1t);
Chris@82 153 ci[0] = FMA(KP707106781, T1d, T1a);
Chris@82 154 }
Chris@82 155 { /* Second output group: built from the pairwise sums using additions and subtractions only; stores to ci 3, cr 6, ci 5, cr 0, cr 2, cr 4, ci 7 and ci 1 (indices are WS(rs, k)). */
Chris@82 156 E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
Chris@82 157 {
Chris@82 158 E T8, Tl, T1j, T1n;
Chris@82 159 T8 = T1 + T7;
Chris@82 160 Tl = Te + Tk;
Chris@82 161 Tm = T8 + Tl;
Chris@82 162 T1e = T8 - Tl;
Chris@82 163 T1j = TQ + TS;
Chris@82 164 T1n = T1l + T1m;
Chris@82 165 T1o = T1j + T1n;
Chris@82 166 T1q = T1n - T1j;
Chris@82 167 }
Chris@82 168 {
Chris@82 169 E Tz, TM, T1f, T1g;
Chris@82 170 Tz = Ts + Ty;
Chris@82 171 TM = TF + TL;
Chris@82 172 TN = Tz + TM;
Chris@82 173 T1p = TM - Tz;
Chris@82 174 T1f = T14 + T16;
Chris@82 175 T1g = TX + TZ;
Chris@82 176 T1h = T1f - T1g;
Chris@82 177 T1i = T1g + T1f;
Chris@82 178 }
Chris@82 179 ci[WS(rs, 3)] = Tm - TN;
Chris@82 180 cr[WS(rs, 6)] = T1p - T1q;
Chris@82 181 ci[WS(rs, 5)] = T1p + T1q;
Chris@82 182 cr[0] = Tm + TN;
Chris@82 183 cr[WS(rs, 2)] = T1e - T1h;
Chris@82 184 cr[WS(rs, 4)] = T1i - T1o;
Chris@82 185 ci[WS(rs, 7)] = T1i + T1o;
Chris@82 186 ci[WS(rs, 1)] = T1e + T1h;
Chris@82 187 }
Chris@82 188 }
Chris@82 189 }
Chris@82 190 }
Chris@82 191
Chris@82 192 static const tw_instr twinstr[] = { /* Twiddle instruction table referenced by desc in this branch. NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the 8 matches the size stored in desc - confirm the field meanings against the tw_instr declaration. */
Chris@82 193 {TW_FULL, 1, 8},
Chris@82 194 {TW_NEXT, 1, 0}
Chris@82 195 };
Chris@82 196
Chris@82 197 static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {44, 14, 22, 0} }; /* Descriptor handed to the registration call: size 8, name "hf_8", the twinstr table, GENUS, and counts 44/14/22 that match the additions / multiplications / fused multiply-add figures quoted in the header comment of this branch. */
Chris@82 198
Chris@82 199 void X(codelet_hf_8) (planner *p) { /* Entry point of this branch: passes the FMA-variant hf_8 and its descriptor to X(khc2hc_register) for planner p. */
Chris@82 200 X(khc2hc_register) (p, hf_8, &desc);
Chris@82 201 }
Chris@82 202 #else
Chris@82 203
Chris@82 204 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
Chris@82 205
Chris@82 206 /*
Chris@82 207 * This function contains 66 FP additions, 32 FP multiplications,
Chris@82 208 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 209 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@82 210 */
Chris@82 211 #include "rdft/scalar/hf.h"
Chris@82 212
Chris@82 213 static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) /* Size-8 codelet, non-FMA variant (compiled when neither FMA preference macro is defined): for each m in [mb, me) reads the 8 cr/ci pairs at offsets WS(rs, 0..7), multiplies pairs 1..7 by factors from W, and overwrites the same cr/ci locations with the combined results. */
Chris@82 214 {
Chris@82 215 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* numerically 1/sqrt(2); only constant used below. NOTE(review): DK is defined outside this file - presumably declares the named constant, confirm. */
Chris@82 216 {
Chris@82 217 INT m;
Chris@82 218 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) { /* W starts (mb - 1) * 14 reals in; per iteration cr steps forward by ms, ci steps backward by ms, and W advances 14 reals (W[0..13] are the ones read below). */
Chris@82 219 E T7, T1f, TH, T19, TF, T12, TR, TU, Ti, T1e, TK, T16, Tu, T13, TM;
Chris@82 220 E TP;
Chris@82 221 { /* pair 0 (used as read) combined with pair 4 after its W[6], W[7] multiply: T7/TH = T1 +/- T6 and T19/T1f = T18 +/- T17 */
Chris@82 222 E T1, T18, T6, T17;
Chris@82 223 T1 = cr[0];
Chris@82 224 T18 = ci[0];
Chris@82 225 { /* T6 = W[6]*cr + W[7]*ci, T17 = W[6]*ci - W[7]*cr. NOTE(review): assumes FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b; the macros are defined outside this file - confirm. The same pattern repeats for every pair below. */
Chris@82 226 E T3, T5, T2, T4;
Chris@82 227 T3 = cr[WS(rs, 4)];
Chris@82 228 T5 = ci[WS(rs, 4)];
Chris@82 229 T2 = W[6];
Chris@82 230 T4 = W[7];
Chris@82 231 T6 = FMA(T2, T3, T4 * T5);
Chris@82 232 T17 = FNMS(T4, T3, T2 * T5);
Chris@82 233 }
Chris@82 234 T7 = T1 + T6;
Chris@82 235 T1f = T18 - T17;
Chris@82 236 TH = T1 - T6;
Chris@82 237 T19 = T17 + T18;
Chris@82 238 }
Chris@82 239 { /* pair 7 (W[12], W[13]) and pair 3 (W[4], W[5]): sums TF, T12 and differences TR, TU */
Chris@82 240 E Tz, TS, TE, TT;
Chris@82 241 {
Chris@82 242 E Tw, Ty, Tv, Tx;
Chris@82 243 Tw = cr[WS(rs, 7)];
Chris@82 244 Ty = ci[WS(rs, 7)];
Chris@82 245 Tv = W[12];
Chris@82 246 Tx = W[13];
Chris@82 247 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 248 TS = FNMS(Tx, Tw, Tv * Ty);
Chris@82 249 }
Chris@82 250 {
Chris@82 251 E TB, TD, TA, TC;
Chris@82 252 TB = cr[WS(rs, 3)];
Chris@82 253 TD = ci[WS(rs, 3)];
Chris@82 254 TA = W[4];
Chris@82 255 TC = W[5];
Chris@82 256 TE = FMA(TA, TB, TC * TD);
Chris@82 257 TT = FNMS(TC, TB, TA * TD);
Chris@82 258 }
Chris@82 259 TF = Tz + TE;
Chris@82 260 T12 = TS + TT;
Chris@82 261 TR = Tz - TE;
Chris@82 262 TU = TS - TT;
Chris@82 263 }
Chris@82 264 { /* pair 2 (W[2], W[3]) and pair 6 (W[10], W[11]): sums Ti, T16 and differences T1e, TK */
Chris@82 265 E Tc, TI, Th, TJ;
Chris@82 266 {
Chris@82 267 E T9, Tb, T8, Ta;
Chris@82 268 T9 = cr[WS(rs, 2)];
Chris@82 269 Tb = ci[WS(rs, 2)];
Chris@82 270 T8 = W[2];
Chris@82 271 Ta = W[3];
Chris@82 272 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 273 TI = FNMS(Ta, T9, T8 * Tb);
Chris@82 274 }
Chris@82 275 {
Chris@82 276 E Te, Tg, Td, Tf;
Chris@82 277 Te = cr[WS(rs, 6)];
Chris@82 278 Tg = ci[WS(rs, 6)];
Chris@82 279 Td = W[10];
Chris@82 280 Tf = W[11];
Chris@82 281 Th = FMA(Td, Te, Tf * Tg);
Chris@82 282 TJ = FNMS(Tf, Te, Td * Tg);
Chris@82 283 }
Chris@82 284 Ti = Tc + Th;
Chris@82 285 T1e = Tc - Th;
Chris@82 286 TK = TI - TJ;
Chris@82 287 T16 = TI + TJ;
Chris@82 288 }
Chris@82 289 { /* pair 1 (W[0], W[1]) and pair 5 (W[8], W[9]): sums Tu, T13 and differences TM, TP */
Chris@82 290 E To, TN, Tt, TO;
Chris@82 291 {
Chris@82 292 E Tl, Tn, Tk, Tm;
Chris@82 293 Tl = cr[WS(rs, 1)];
Chris@82 294 Tn = ci[WS(rs, 1)];
Chris@82 295 Tk = W[0];
Chris@82 296 Tm = W[1];
Chris@82 297 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 298 TN = FNMS(Tm, Tl, Tk * Tn);
Chris@82 299 }
Chris@82 300 {
Chris@82 301 E Tq, Ts, Tp, Tr;
Chris@82 302 Tq = cr[WS(rs, 5)];
Chris@82 303 Ts = ci[WS(rs, 5)];
Chris@82 304 Tp = W[8];
Chris@82 305 Tr = W[9];
Chris@82 306 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 307 TO = FNMS(Tr, Tq, Tp * Ts);
Chris@82 308 }
Chris@82 309 Tu = To + Tt;
Chris@82 310 T13 = TN + TO;
Chris@82 311 TM = To - Tt;
Chris@82 312 TP = TN - TO;
Chris@82 313 }
Chris@82 314 { /* First half of the stores: ci 3, cr 0, cr 6, ci 5 from sums alone, then cr 3, ci 6, ci 0, cr 5 using KP707106781-scaled terms (indices are WS(rs, k)). */
Chris@82 315 E Tj, TG, T1b, T1c;
Chris@82 316 Tj = T7 + Ti;
Chris@82 317 TG = Tu + TF;
Chris@82 318 ci[WS(rs, 3)] = Tj - TG;
Chris@82 319 cr[0] = Tj + TG;
Chris@82 320 T1b = TF - Tu;
Chris@82 321 T1c = T19 - T16;
Chris@82 322 cr[WS(rs, 6)] = T1b - T1c;
Chris@82 323 ci[WS(rs, 5)] = T1b + T1c;
Chris@82 324 {
Chris@82 325 E TX, T1i, T10, T1h, TY, TZ;
Chris@82 326 TX = TH - TK;
Chris@82 327 T1i = T1f - T1e;
Chris@82 328 TY = TM - TP;
Chris@82 329 TZ = TR + TU;
Chris@82 330 T10 = KP707106781 * (TY + TZ);
Chris@82 331 T1h = KP707106781 * (TZ - TY);
Chris@82 332 cr[WS(rs, 3)] = TX - T10;
Chris@82 333 ci[WS(rs, 6)] = T1h + T1i;
Chris@82 334 ci[0] = TX + T10;
Chris@82 335 cr[WS(rs, 5)] = T1h - T1i;
Chris@82 336 }
Chris@82 337 }
Chris@82 338 { /* Second half of the stores: cr 4, ci 7, cr 2, ci 1 from sums alone, then ci 2, ci 4, cr 1, cr 7 using KP707106781-scaled terms (indices are WS(rs, k)). */
Chris@82 339 E T15, T1a, T11, T14;
Chris@82 340 T15 = T13 + T12;
Chris@82 341 T1a = T16 + T19;
Chris@82 342 cr[WS(rs, 4)] = T15 - T1a;
Chris@82 343 ci[WS(rs, 7)] = T15 + T1a;
Chris@82 344 T11 = T7 - Ti;
Chris@82 345 T14 = T12 - T13;
Chris@82 346 cr[WS(rs, 2)] = T11 - T14;
Chris@82 347 ci[WS(rs, 1)] = T11 + T14;
Chris@82 348 {
Chris@82 349 E TL, T1g, TW, T1d, TQ, TV;
Chris@82 350 TL = TH + TK;
Chris@82 351 T1g = T1e + T1f;
Chris@82 352 TQ = TM + TP;
Chris@82 353 TV = TR - TU;
Chris@82 354 TW = KP707106781 * (TQ + TV);
Chris@82 355 T1d = KP707106781 * (TV - TQ);
Chris@82 356 ci[WS(rs, 2)] = TL - TW;
Chris@82 357 ci[WS(rs, 4)] = T1d + T1g;
Chris@82 358 cr[WS(rs, 1)] = TL + TW;
Chris@82 359 cr[WS(rs, 7)] = T1d - T1g;
Chris@82 360 }
Chris@82 361 }
Chris@82 362 }
Chris@82 363 }
Chris@82 364 }
Chris@82 365
Chris@82 366 static const tw_instr twinstr[] = { /* Twiddle instruction table referenced by desc in this branch; same contents as in the FMA branch. NOTE(review): TW_FULL and TW_NEXT are defined outside this file - confirm the field meanings against the tw_instr declaration. */
Chris@82 367 {TW_FULL, 1, 8},
Chris@82 368 {TW_NEXT, 1, 0}
Chris@82 369 };
Chris@82 370
Chris@82 371 static const hc2hc_desc desc = { 8, "hf_8", twinstr, &GENUS, {52, 18, 14, 0} }; /* Descriptor handed to the registration call: size 8, name "hf_8", the twinstr table, GENUS, and counts 52/18/14 that match the additions / multiplications / fused multiply-add figures quoted in the header comment of this branch. */
Chris@82 372
Chris@82 373 void X(codelet_hf_8) (planner *p) { /* Entry point of this branch: passes the non-FMA hf_8 and its descriptor to X(khc2hc_register) for planner p. */
Chris@82 374 X(khc2hc_register) (p, hf_8, &desc);
Chris@82 375 }
Chris@82 376 #endif