annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cb_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:29 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@42 33 * 52 stack variables, 1 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cb_8 (HAVE_FMA build): machine-generated codelet.  The generator
 * command line recorded earlier in this file shows it was produced by
 * gen_hc2c with "-fma -sign 1 -n 8 -dif".
 *
 * For every m in [mb, me) the loop body:
 *   1. loads 16 values: Rp, Ip, Rm and Im, each at WS(rs, 0) .. WS(rs, 3);
 *   2. combines them with additions/subtractions and the constant
 *      KP707106781 (0.7071..., i.e. sqrt(2)/2);
 *   3. stores 16 results back to the same 16 locations.  All loads are
 *      issued before the first store.
 * Rp[0] and Rm[0] are stored without using W.  Each of the remaining seven
 * output pairs is combined with one pair of consecutive W entries, so
 * W[0] .. W[13] (14 values) are read per iteration.
 *
 * Parameters:
 *   Rp, Ip - arrays read and overwritten; stepped by +ms after each
 *            iteration.
 *   Rm, Im - arrays read and overwritten; stepped by -ms after each
 *            iteration.
 *   W      - read-only table; advanced by (mb - 1) * 14 before the first
 *            iteration and by 14 after each iteration.
 *   rs     - stride handed to WS() to address the four elements of each
 *            array.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - per-iteration pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The arithmetic reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm against the project headers.
 */
Chris@42 37 static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 40 {
Chris@42 41 INT m;
Chris@42 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
/* These six temporaries outlive the nested scopes: they carry the last
 * output pair (Ip/Im at WS(rs, 2)) to the stores at the end of the body. */
Chris@42 43 E Tw, TH, Tf, Ty, Tx, TI;
Chris@42 44 {
Chris@42 45 E TV, TD, T1i, T7, T1b, T1n, TQ, Tk, Tp, TE, Te, T1o, T1e, T1j, Tu;
Chris@42 46 E TF;
Chris@42 47 {
Chris@42 48 E T4, Tg, T3, T19, TC, T5, Th, Ti;
Chris@42 49 {
/* Load phase, part 1: inputs paired as Rp[k] with Rm[3 - k] and Ip[k]
 * with Im[3 - k], for k = 0 and k = 2. */
Chris@42 50 E T1, T2, TA, TB;
Chris@42 51 T1 = Rp[0];
Chris@42 52 T2 = Rm[WS(rs, 3)];
Chris@42 53 TA = Ip[0];
Chris@42 54 TB = Im[WS(rs, 3)];
Chris@42 55 T4 = Rp[WS(rs, 2)];
Chris@42 56 Tg = T1 - T2;
Chris@42 57 T3 = T1 + T2;
Chris@42 58 T19 = TA - TB;
Chris@42 59 TC = TA + TB;
Chris@42 60 T5 = Rm[WS(rs, 1)];
Chris@42 61 Th = Ip[WS(rs, 2)];
Chris@42 62 Ti = Im[WS(rs, 1)];
Chris@42 63 }
Chris@42 64 {
Chris@42 65 E Tb, Tl, Ta, T1c, To, Tc, Tr, Ts;
Chris@42 66 {
/* Load phase, part 2: remaining inputs (k = 1 and k = 3), interleaved
 * with the sums/differences of the values loaded above. */
Chris@42 67 E T8, T9, Tm, Tn;
Chris@42 68 T8 = Rp[WS(rs, 1)];
Chris@42 69 {
Chris@42 70 E Tz, T6, T1a, Tj;
Chris@42 71 Tz = T4 - T5;
Chris@42 72 T6 = T4 + T5;
Chris@42 73 T1a = Th - Ti;
Chris@42 74 Tj = Th + Ti;
Chris@42 75 TV = TC - Tz;
Chris@42 76 TD = Tz + TC;
Chris@42 77 T1i = T3 - T6;
Chris@42 78 T7 = T3 + T6;
Chris@42 79 T1b = T19 + T1a;
Chris@42 80 T1n = T19 - T1a;
Chris@42 81 TQ = Tg + Tj;
Chris@42 82 Tk = Tg - Tj;
Chris@42 83 T9 = Rm[WS(rs, 2)];
Chris@42 84 }
Chris@42 85 Tm = Ip[WS(rs, 1)];
Chris@42 86 Tn = Im[WS(rs, 2)];
Chris@42 87 Tb = Rm[0];
Chris@42 88 Tl = T8 - T9;
Chris@42 89 Ta = T8 + T9;
Chris@42 90 T1c = Tm - Tn;
Chris@42 91 To = Tm + Tn;
Chris@42 92 Tc = Rp[WS(rs, 3)];
Chris@42 93 Tr = Ip[WS(rs, 3)];
Chris@42 94 Ts = Im[0];
Chris@42 95 }
Chris@42 96 {
Chris@42 97 E Tq, Td, T1d, Tt;
Chris@42 98 Tp = Tl - To;
Chris@42 99 TE = Tl + To;
Chris@42 100 Tq = Tb - Tc;
Chris@42 101 Td = Tb + Tc;
Chris@42 102 T1d = Tr - Ts;
Chris@42 103 Tt = Tr + Ts;
Chris@42 104 Te = Ta + Td;
Chris@42 105 T1o = Ta - Td;
Chris@42 106 T1e = T1c + T1d;
Chris@42 107 T1j = T1d - T1c;
Chris@42 108 Tu = Tq - Tt;
Chris@42 109 TF = Tq + Tt;
Chris@42 110 }
Chris@42 111 }
Chris@42 112 }
Chris@42 113 {
Chris@42 114 E TG, Tv, T10, T13, T1s, T1k, T1p, T1v, T1u, T1w, T1t, TR, TW;
/* Store phase begins: all 16 inputs have been read by this point.
 * Rp[0] and Rm[0] are the only outputs that do not involve W. */
Chris@42 115 Rp[0] = T7 + Te;
Chris@42 116 Rm[0] = T1b + T1e;
Chris@42 117 TG = TE - TF;
Chris@42 118 TR = TE + TF;
Chris@42 119 TW = Tp - Tu;
Chris@42 120 Tv = Tp + Tu;
Chris@42 121 {
/* Output pair Ip/Im at WS(rs, 1) uses W[4], W[5]; the products for the
 * Rp/Rm pair at WS(rs, 1) (W[2], W[3]) are started here as well. */
Chris@42 122 E TP, TS, TX, TU, T1r, TT, TY;
Chris@42 123 TP = W[4];
Chris@42 124 T10 = FMA(KP707106781, TR, TQ);
Chris@42 125 TS = FNMS(KP707106781, TR, TQ);
Chris@42 126 TX = FMA(KP707106781, TW, TV);
Chris@42 127 T13 = FNMS(KP707106781, TW, TV);
Chris@42 128 TU = W[5];
Chris@42 129 T1s = T1i + T1j;
Chris@42 130 T1k = T1i - T1j;
Chris@42 131 TT = TP * TS;
Chris@42 132 TY = TP * TX;
Chris@42 133 T1p = T1n - T1o;
Chris@42 134 T1v = T1o + T1n;
Chris@42 135 T1r = W[2];
Chris@42 136 Ip[WS(rs, 1)] = FNMS(TU, TX, TT);
Chris@42 137 Im[WS(rs, 1)] = FMA(TU, TS, TY);
Chris@42 138 T1u = W[3];
Chris@42 139 T1w = T1r * T1v;
Chris@42 140 T1t = T1r * T1s;
Chris@42 141 }
Chris@42 142 {
Chris@42 143 E T1f, T15, T18, T17, T1g, T1h, T1m;
Chris@42 144 {
/* Rp/Rm at WS(rs, 1) (W[2], W[3]); Ip/Im at WS(rs, 3) (W[12], W[13]);
 * products for Rp/Rm at WS(rs, 2) (W[6], W[7]). */
Chris@42 145 E TZ, T12, T16, T14, T11;
Chris@42 146 Rm[WS(rs, 1)] = FMA(T1u, T1s, T1w);
Chris@42 147 Rp[WS(rs, 1)] = FNMS(T1u, T1v, T1t);
Chris@42 148 TZ = W[12];
Chris@42 149 T12 = W[13];
Chris@42 150 T1f = T1b - T1e;
Chris@42 151 T16 = T7 - Te;
Chris@42 152 T14 = TZ * T13;
Chris@42 153 T11 = TZ * T10;
Chris@42 154 T15 = W[6];
Chris@42 155 T18 = W[7];
Chris@42 156 Im[WS(rs, 3)] = FMA(T12, T10, T14);
Chris@42 157 Ip[WS(rs, 3)] = FNMS(T12, T13, T11);
Chris@42 158 T17 = T15 * T16;
Chris@42 159 T1g = T18 * T16;
Chris@42 160 }
Chris@42 161 Rp[WS(rs, 2)] = FNMS(T18, T1f, T17);
Chris@42 162 Rm[WS(rs, 2)] = FMA(T15, T1f, T1g);
Chris@42 163 T1h = W[10];
Chris@42 164 T1m = W[11];
Chris@42 165 {
/* Rp/Rm at WS(rs, 3) (W[10], W[11]); Ip/Im at 0 (W[0], W[1]); and the
 * first half of the Ip/Im pair at WS(rs, 2) (W[8], W[9]). */
Chris@42 166 E TN, TJ, TM, TL, TO, TK, T1q, T1l;
Chris@42 167 Tw = FNMS(KP707106781, Tv, Tk);
Chris@42 168 TK = FMA(KP707106781, Tv, Tk);
Chris@42 169 T1q = T1h * T1p;
Chris@42 170 T1l = T1h * T1k;
Chris@42 171 TN = FMA(KP707106781, TG, TD);
Chris@42 172 TH = FNMS(KP707106781, TG, TD);
Chris@42 173 Rm[WS(rs, 3)] = FMA(T1m, T1k, T1q);
Chris@42 174 Rp[WS(rs, 3)] = FNMS(T1m, T1p, T1l);
Chris@42 175 TJ = W[0];
Chris@42 176 TM = W[1];
Chris@42 177 Tf = W[8];
Chris@42 178 TL = TJ * TK;
Chris@42 179 TO = TM * TK;
Chris@42 180 Ty = W[9];
Chris@42 181 Tx = Tf * Tw;
Chris@42 182 Ip[0] = FNMS(TM, TN, TL);
Chris@42 183 Im[0] = FMA(TJ, TN, TO);
Chris@42 184 }
Chris@42 185 }
Chris@42 186 }
Chris@42 187 }
/* Final output pair, finished from the loop-scope temporaries:
 * Tf = W[8], Ty = W[9], Tx = Tf * Tw. */
Chris@42 188 Ip[WS(rs, 2)] = FNMS(Ty, TH, Tx);
Chris@42 189 TI = Ty * Tw;
Chris@42 190 Im[WS(rs, 2)] = FMA(Tf, TH, TI);
Chris@42 191 }
Chris@42 192 }
Chris@42 193 }
Chris@42 194
/*
 * Table of tw_instr entries referenced by the codelet descriptor `desc`
 * in this branch.
 * NOTE(review): TW_FULL / TW_NEXT and the meaning of the numeric fields are
 * defined in project headers not visible here.  hc2cb_8 reads 14 W values
 * (W[0] .. W[13]) per iteration, which presumably corresponds to the
 * {TW_FULL, 1, 8} entry -- confirm against the tw_instr definition.
 */
Chris@42 195 static const tw_instr twinstr[] = {
Chris@42 196 {TW_FULL, 1, 8},
Chris@42 197 {TW_NEXT, 1, 0}
Chris@42 198 };
Chris@42 199
/*
 * Descriptor passed to X(khc2c_register) for the HAVE_FMA variant: the
 * value 8, the name string "hc2cb_8", the twinstr table, &GENUS, and four
 * numbers.  The first three of those, {44, 14, 22}, match the operation
 * counts quoted in the generated header comment for this variant
 * (44 additions, 14 multiplications, 22 fused multiply/add).
 * NOTE(review): the meaning of the trailing 0 and of GENUS is defined
 * outside this file (GENUS presumably comes from hc2cb.h) -- confirm.
 */
Chris@42 200 static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@42 201
/*
 * Registration entry point (HAVE_FMA variant): hands the hc2cb_8 kernel and
 * its descriptor `desc` to X(khc2c_register) for planner `p`, together
 * with the HC2C_VIA_RDFT constant.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably the
 * library's symbol-prefixing macro -- confirm.
 */
Chris@42 202 void X(codelet_hc2cb_8) (planner *p) {
Chris@42 203 X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
Chris@42 204 }
Chris@42 205 #else /* HAVE_FMA */
Chris@42 206
Chris@42 207 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include hc2cb.h */
Chris@42 208
Chris@42 209 /*
Chris@42 210 * This function contains 66 FP additions, 32 FP multiplications,
Chris@42 211 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@42 212 * 30 stack variables, 1 constants, and 32 memory accesses
Chris@42 213 */
Chris@42 214 #include "hc2cb.h"
Chris@42 215
/*
 * hc2cb_8 (build without HAVE_FMA): machine-generated codelet.  The
 * generator command line recorded just above shows it was produced by
 * gen_hc2c with "-sign 1 -n 8 -dif" (no -fma option).
 *
 * For every m in [mb, me) the loop body:
 *   1. loads 16 values: Rp, Ip, Rm and Im, each at WS(rs, 0) .. WS(rs, 3),
 *      pairing Rp[k] with Rm[3 - k] and Ip[k] with Im[3 - k];
 *   2. forms sums and differences of those pairs, and scales four
 *      combinations by KP707106781 (0.7071..., i.e. sqrt(2)/2);
 *   3. stores 16 results back to the same 16 locations.  All loads are
 *      issued before the first store.
 * Rp[0] and Rm[0] are stored without using W.  Each of the remaining seven
 * output pairs is combined with one pair of consecutive W entries, so
 * W[0] .. W[13] (14 values) are read per iteration.
 *
 * Parameters:
 *   Rp, Ip - arrays read and overwritten; stepped by +ms after each
 *            iteration.
 *   Rm, Im - arrays read and overwritten; stepped by -ms after each
 *            iteration.
 *   W      - read-only table; advanced by (mb - 1) * 14 before the first
 *            iteration and by 14 after each iteration.
 *   rs     - stride handed to WS() to address the four elements of each
 *            array.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - per-iteration pointer step for Rp/Ip (+ms) and Rm/Im (-ms).
 *
 * NOTE(review): FMA, FNMS, DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The arithmetic reads as FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm against the project headers.
 */
Chris@42 216 static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 217 {
Chris@42 218 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 219 {
Chris@42 220 INT m;
Chris@42 221 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@42 222 E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
Chris@42 223 E TD;
Chris@42 224 {
/* First half of the inputs: elements 0 and 2 of Rp/Ip paired with
 * elements 3 and 1 of Rm/Im, then combined with each other. */
Chris@42 225 E T3, TK, Tk, TX, T6, TW, Tn, TL;
Chris@42 226 {
Chris@42 227 E T1, T2, Ti, Tj;
Chris@42 228 T1 = Rp[0];
Chris@42 229 T2 = Rm[WS(rs, 3)];
Chris@42 230 T3 = T1 + T2;
Chris@42 231 TK = T1 - T2;
Chris@42 232 Ti = Ip[0];
Chris@42 233 Tj = Im[WS(rs, 3)];
Chris@42 234 Tk = Ti - Tj;
Chris@42 235 TX = Ti + Tj;
Chris@42 236 }
Chris@42 237 {
Chris@42 238 E T4, T5, Tl, Tm;
Chris@42 239 T4 = Rp[WS(rs, 2)];
Chris@42 240 T5 = Rm[WS(rs, 1)];
Chris@42 241 T6 = T4 + T5;
Chris@42 242 TW = T4 - T5;
Chris@42 243 Tl = Ip[WS(rs, 2)];
Chris@42 244 Tm = Im[WS(rs, 1)];
Chris@42 245 Tn = Tl - Tm;
Chris@42 246 TL = Tl + Tm;
Chris@42 247 }
Chris@42 248 T7 = T3 + T6;
Chris@42 249 T18 = TK + TL;
Chris@42 250 T1c = TX - TW;
Chris@42 251 To = Tk + Tn;
Chris@42 252 Ty = T3 - T6;
Chris@42 253 TM = TK - TL;
Chris@42 254 TY = TW + TX;
Chris@42 255 TC = Tk - Tn;
Chris@42 256 }
Chris@42 257 {
/* Second half of the inputs: elements 1 and 3 of Rp/Ip paired with
 * elements 2 and 0 of Rm/Im, then combined with each other. */
Chris@42 258 E Ta, TN, Tr, TO, Td, TQ, Tu, TR;
Chris@42 259 {
Chris@42 260 E T8, T9, Tp, Tq;
Chris@42 261 T8 = Rp[WS(rs, 1)];
Chris@42 262 T9 = Rm[WS(rs, 2)];
Chris@42 263 Ta = T8 + T9;
Chris@42 264 TN = T8 - T9;
Chris@42 265 Tp = Ip[WS(rs, 1)];
Chris@42 266 Tq = Im[WS(rs, 2)];
Chris@42 267 Tr = Tp - Tq;
Chris@42 268 TO = Tp + Tq;
Chris@42 269 }
Chris@42 270 {
Chris@42 271 E Tb, Tc, Ts, Tt;
Chris@42 272 Tb = Rm[0];
Chris@42 273 Tc = Rp[WS(rs, 3)];
Chris@42 274 Td = Tb + Tc;
Chris@42 275 TQ = Tb - Tc;
Chris@42 276 Ts = Ip[WS(rs, 3)];
Chris@42 277 Tt = Im[0];
Chris@42 278 Tu = Ts - Tt;
Chris@42 279 TR = Ts + Tt;
Chris@42 280 }
Chris@42 281 Te = Ta + Td;
Chris@42 282 TZ = TN + TO;
Chris@42 283 T10 = TQ + TR;
Chris@42 284 Tv = Tr + Tu;
Chris@42 285 Tz = Tu - Tr;
Chris@42 286 TP = TN - TO;
Chris@42 287 TS = TQ - TR;
Chris@42 288 TD = Ta - Td;
Chris@42 289 }
/* Store phase begins: all 16 inputs have been read by this point.
 * Rp[0] and Rm[0] are the only outputs that do not involve W. */
Chris@42 290 Rp[0] = T7 + Te;
Chris@42 291 Rm[0] = To + Tv;
Chris@42 292 {
/* Rp/Rm at WS(rs, 2), combined with W[6], W[7]. */
Chris@42 293 E Tg, Tw, Tf, Th;
Chris@42 294 Tg = T7 - Te;
Chris@42 295 Tw = To - Tv;
Chris@42 296 Tf = W[6];
Chris@42 297 Th = W[7];
Chris@42 298 Rp[WS(rs, 2)] = FNMS(Th, Tw, Tf * Tg);
Chris@42 299 Rm[WS(rs, 2)] = FMA(Th, Tg, Tf * Tw);
Chris@42 300 }
Chris@42 301 {
/* Rp/Rm at WS(rs, 1), combined with W[2], W[3]. */
Chris@42 302 E TG, TI, TF, TH;
Chris@42 303 TG = Ty + Tz;
Chris@42 304 TI = TD + TC;
Chris@42 305 TF = W[2];
Chris@42 306 TH = W[3];
Chris@42 307 Rp[WS(rs, 1)] = FNMS(TH, TI, TF * TG);
Chris@42 308 Rm[WS(rs, 1)] = FMA(TF, TI, TH * TG);
Chris@42 309 }
Chris@42 310 {
/* Rp/Rm at WS(rs, 3), combined with W[10], W[11]. */
Chris@42 311 E TA, TE, Tx, TB;
Chris@42 312 TA = Ty - Tz;
Chris@42 313 TE = TC - TD;
Chris@42 314 Tx = W[10];
Chris@42 315 TB = W[11];
Chris@42 316 Rp[WS(rs, 3)] = FNMS(TB, TE, Tx * TA);
Chris@42 317 Rm[WS(rs, 3)] = FMA(Tx, TE, TB * TA);
Chris@42 318 }
Chris@42 319 {
/* Ip/Im at WS(rs, 1) (W[4], W[5]) and at WS(rs, 3) (W[12], W[13]);
 * both use terms scaled by KP707106781. */
Chris@42 320 E T1a, T1g, T1e, T1i, T19, T1d;
Chris@42 321 T19 = KP707106781 * (TZ + T10);
Chris@42 322 T1a = T18 - T19;
Chris@42 323 T1g = T18 + T19;
Chris@42 324 T1d = KP707106781 * (TP - TS);
Chris@42 325 T1e = T1c + T1d;
Chris@42 326 T1i = T1c - T1d;
Chris@42 327 {
Chris@42 328 E T17, T1b, T1f, T1h;
Chris@42 329 T17 = W[4];
Chris@42 330 T1b = W[5];
Chris@42 331 Ip[WS(rs, 1)] = FNMS(T1b, T1e, T17 * T1a);
Chris@42 332 Im[WS(rs, 1)] = FMA(T17, T1e, T1b * T1a);
Chris@42 333 T1f = W[12];
Chris@42 334 T1h = W[13];
Chris@42 335 Ip[WS(rs, 3)] = FNMS(T1h, T1i, T1f * T1g);
Chris@42 336 Im[WS(rs, 3)] = FMA(T1f, T1i, T1h * T1g);
Chris@42 337 }
Chris@42 338 }
Chris@42 339 {
/* Ip/Im at WS(rs, 2) (W[8], W[9]) and at 0 (W[0], W[1]);
 * both use terms scaled by KP707106781. */
Chris@42 340 E TU, T14, T12, T16, TT, T11;
Chris@42 341 TT = KP707106781 * (TP + TS);
Chris@42 342 TU = TM - TT;
Chris@42 343 T14 = TM + TT;
Chris@42 344 T11 = KP707106781 * (TZ - T10);
Chris@42 345 T12 = TY - T11;
Chris@42 346 T16 = TY + T11;
Chris@42 347 {
Chris@42 348 E TJ, TV, T13, T15;
Chris@42 349 TJ = W[8];
Chris@42 350 TV = W[9];
Chris@42 351 Ip[WS(rs, 2)] = FNMS(TV, T12, TJ * TU);
Chris@42 352 Im[WS(rs, 2)] = FMA(TV, TU, TJ * T12);
Chris@42 353 T13 = W[0];
Chris@42 354 T15 = W[1];
Chris@42 355 Ip[0] = FNMS(T15, T16, T13 * T14);
Chris@42 356 Im[0] = FMA(T15, T14, T13 * T16);
Chris@42 357 }
Chris@42 358 }
Chris@42 359 }
Chris@42 360 }
Chris@42 361 }
Chris@42 362
/*
 * Table of tw_instr entries referenced by the codelet descriptor `desc`
 * in this (non-FMA) branch; same contents as in the HAVE_FMA branch.
 * NOTE(review): TW_FULL / TW_NEXT and the meaning of the numeric fields are
 * defined in project headers not visible here.  hc2cb_8 reads 14 W values
 * (W[0] .. W[13]) per iteration, which presumably corresponds to the
 * {TW_FULL, 1, 8} entry -- confirm against the tw_instr definition.
 */
Chris@42 363 static const tw_instr twinstr[] = {
Chris@42 364 {TW_FULL, 1, 8},
Chris@42 365 {TW_NEXT, 1, 0}
Chris@42 366 };
Chris@42 367
/*
 * Descriptor passed to X(khc2c_register) for the non-FMA variant: the
 * value 8, the name string "hc2cb_8", the twinstr table, &GENUS, and four
 * numbers.  The first three of those, {52, 18, 14}, match the operation
 * counts quoted in the generated header comment for this variant
 * (52 additions, 18 multiplications, 14 fused multiply/add).
 * NOTE(review): the meaning of the trailing 0 and of GENUS is defined
 * outside this file (GENUS presumably comes from hc2cb.h) -- confirm.
 */
Chris@42 368 static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@42 369
/*
 * Registration entry point (non-FMA variant): hands the hc2cb_8 kernel and
 * its descriptor `desc` to X(khc2c_register) for planner `p`, together
 * with the HC2C_VIA_RDFT constant.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably the
 * library's symbol-prefixing macro -- confirm.
 */
Chris@42 370 void X(codelet_hc2cb_8) (planner *p) {
Chris@42 371 X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
Chris@42 372 }
Chris@42 373 #endif /* HAVE_FMA */