annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cb_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:51 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@82 33 * 33 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cb_8 (FMA variant): size-8 codelet emitted by genfft's gen_hc2c with
 * the options "-fma -sign 1 -n 8 -dif" recorded in the "Generated by" line
 * of this branch.
 *
 * Rp, Ip, Rm, Im: arrays updated in place. Each loop iteration reads the
 *                 four elements at index 0 and WS(rs, 1..3) of every array
 *                 and then overwrites those same sixteen locations.
 * W:              twiddle table; W[0]..W[13] are consumed per iteration.
 * rs:             stride handed to WS() to step between the four elements.
 * mb, me:         half-open iteration range [mb, me).
 * ms:             per-iteration pointer step; Rp and Ip move forward by ms,
 *                 Rm and Im move backward by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; they are presumably a*b + c and c - a*b respectively, which is
 * consistent with the plain-arithmetic branch below -- confirm in the
 * project headers.
 */
Chris@82 37 static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* The only numeric constant used: 0.7071... (1/sqrt(2)). */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
/* W is first advanced to the 14-value block for iteration mb, i.e. by
   (mb - 1) * 14, and then by 14 after every iteration. */
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 43 E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
Chris@82 44 E Tv, TW;
/* First half of the inputs: sums and differences of the pairs
   Rp[0]/Rm[3], Ip[0]/Im[3], Rp[2]/Rm[1] and Ip[2]/Im[1]. */
Chris@82 45 {
Chris@82 46 E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
Chris@82 47 {
Chris@82 48 E T1, T2, TA, TB;
Chris@82 49 T1 = Rp[0];
Chris@82 50 T2 = Rm[WS(rs, 3)];
Chris@82 51 T3 = T1 + T2;
Chris@82 52 Tg = T1 - T2;
Chris@82 53 TA = Ip[0];
Chris@82 54 TB = Im[WS(rs, 3)];
Chris@82 55 TC = TA + TB;
Chris@82 56 T19 = TA - TB;
Chris@82 57 }
Chris@82 58 {
Chris@82 59 E T4, T5, Th, Ti;
Chris@82 60 T4 = Rp[WS(rs, 2)];
Chris@82 61 T5 = Rm[WS(rs, 1)];
Chris@82 62 T6 = T4 + T5;
Chris@82 63 Tz = T4 - T5;
Chris@82 64 Th = Ip[WS(rs, 2)];
Chris@82 65 Ti = Im[WS(rs, 1)];
Chris@82 66 Tj = Th + Ti;
Chris@82 67 T1a = Th - Ti;
Chris@82 68 }
Chris@82 69 T7 = T3 + T6;
Chris@82 70 T1i = T3 - T6;
Chris@82 71 T1n = T19 - T1a;
Chris@82 72 Tk = Tg - Tj;
Chris@82 73 TD = Tz + TC;
Chris@82 74 TV = TC - Tz;
Chris@82 75 T1b = T19 + T1a;
Chris@82 76 TQ = Tg + Tj;
Chris@82 77 }
/* Second half of the inputs: the pairs Rp[1]/Rm[2], Ip[1]/Im[2],
   Rm[0]/Rp[3] and Ip[3]/Im[0]. */
Chris@82 78 {
Chris@82 79 E Ta, Tl, To, T1c, Td, Tq, Tt, T1d, Tp, Tu;
Chris@82 80 {
Chris@82 81 E T8, T9, Tm, Tn;
Chris@82 82 T8 = Rp[WS(rs, 1)];
Chris@82 83 T9 = Rm[WS(rs, 2)];
Chris@82 84 Ta = T8 + T9;
Chris@82 85 Tl = T8 - T9;
Chris@82 86 Tm = Ip[WS(rs, 1)];
Chris@82 87 Tn = Im[WS(rs, 2)];
Chris@82 88 To = Tm + Tn;
Chris@82 89 T1c = Tm - Tn;
Chris@82 90 }
Chris@82 91 {
Chris@82 92 E Tb, Tc, Tr, Ts;
Chris@82 93 Tb = Rm[0];
Chris@82 94 Tc = Rp[WS(rs, 3)];
Chris@82 95 Td = Tb + Tc;
Chris@82 96 Tq = Tb - Tc;
Chris@82 97 Tr = Ip[WS(rs, 3)];
Chris@82 98 Ts = Im[0];
Chris@82 99 Tt = Tr + Ts;
Chris@82 100 T1d = Tr - Ts;
Chris@82 101 }
Chris@82 102 Te = Ta + Td;
Chris@82 103 T1e = T1c + T1d;
Chris@82 104 T1o = Ta - Td;
Chris@82 105 T1j = T1d - T1c;
Chris@82 106 TE = Tl + To;
Chris@82 107 TF = Tq + Tt;
Chris@82 108 TR = TE + TF;
Chris@82 109 Tp = Tl - To;
Chris@82 110 Tu = Tq - Tt;
Chris@82 111 Tv = Tp + Tu;
Chris@82 112 TW = Tp - Tu;
Chris@82 113 }
/* All inputs have been loaded into temporaries; from here on the arrays
   are only written. Rp[0] and Rm[0] are stored without any W factor. */
Chris@82 114 Rp[0] = T7 + Te;
Chris@82 115 Rm[0] = T1b + T1e;
/* Ip[1] / Im[1]: combined with W[4], W[5]. */
Chris@82 116 {
Chris@82 117 E TS, TX, TT, TY, TP, TU;
Chris@82 118 TS = FNMS(KP707106781, TR, TQ);
Chris@82 119 TX = FMA(KP707106781, TW, TV);
Chris@82 120 TP = W[4];
Chris@82 121 TT = TP * TS;
Chris@82 122 TY = TP * TX;
Chris@82 123 TU = W[5];
Chris@82 124 Ip[WS(rs, 1)] = FNMS(TU, TX, TT);
Chris@82 125 Im[WS(rs, 1)] = FMA(TU, TS, TY);
Chris@82 126 }
/* Rp[1] / Rm[1]: combined with W[2], W[3]. */
Chris@82 127 {
Chris@82 128 E T1s, T1v, T1t, T1w, T1r, T1u;
Chris@82 129 T1s = T1i + T1j;
Chris@82 130 T1v = T1o + T1n;
Chris@82 131 T1r = W[2];
Chris@82 132 T1t = T1r * T1s;
Chris@82 133 T1w = T1r * T1v;
Chris@82 134 T1u = W[3];
Chris@82 135 Rp[WS(rs, 1)] = FNMS(T1u, T1v, T1t);
Chris@82 136 Rm[WS(rs, 1)] = FMA(T1u, T1s, T1w);
Chris@82 137 }
/* Ip[3] / Im[3]: combined with W[12], W[13]. */
Chris@82 138 {
Chris@82 139 E T10, T13, T11, T14, TZ, T12;
Chris@82 140 T10 = FMA(KP707106781, TR, TQ);
Chris@82 141 T13 = FNMS(KP707106781, TW, TV);
Chris@82 142 TZ = W[12];
Chris@82 143 T11 = TZ * T10;
Chris@82 144 T14 = TZ * T13;
Chris@82 145 T12 = W[13];
Chris@82 146 Ip[WS(rs, 3)] = FNMS(T12, T13, T11);
Chris@82 147 Im[WS(rs, 3)] = FMA(T12, T10, T14);
Chris@82 148 }
/* Rp[2] / Rm[2]: combined with W[6], W[7]. */
Chris@82 149 {
Chris@82 150 E T1f, T15, T17, T18, T1g, T16;
Chris@82 151 T1f = T1b - T1e;
Chris@82 152 T16 = T7 - Te;
Chris@82 153 T15 = W[6];
Chris@82 154 T17 = T15 * T16;
Chris@82 155 T18 = W[7];
Chris@82 156 T1g = T18 * T16;
Chris@82 157 Rp[WS(rs, 2)] = FNMS(T18, T1f, T17);
Chris@82 158 Rm[WS(rs, 2)] = FMA(T15, T1f, T1g);
Chris@82 159 }
/* Rp[3] / Rm[3]: combined with W[10], W[11]. */
Chris@82 160 {
Chris@82 161 E T1k, T1p, T1l, T1q, T1h, T1m;
Chris@82 162 T1k = T1i - T1j;
Chris@82 163 T1p = T1n - T1o;
Chris@82 164 T1h = W[10];
Chris@82 165 T1l = T1h * T1k;
Chris@82 166 T1q = T1h * T1p;
Chris@82 167 T1m = W[11];
Chris@82 168 Rp[WS(rs, 3)] = FNMS(T1m, T1p, T1l);
Chris@82 169 Rm[WS(rs, 3)] = FMA(T1m, T1k, T1q);
Chris@82 170 }
/* Ip[2] / Im[2] use W[8], W[9]; Ip[0] / Im[0] use W[0], W[1]. Both pairs
   share the intermediate TG = TE - TF. */
Chris@82 171 {
Chris@82 172 E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
Chris@82 173 TG = TE - TF;
Chris@82 174 TH = FNMS(KP707106781, TG, TD);
Chris@82 175 TN = FMA(KP707106781, TG, TD);
Chris@82 176 TK = FMA(KP707106781, Tv, Tk);
Chris@82 177 TJ = W[0];
Chris@82 178 TL = TJ * TK;
Chris@82 179 TM = W[1];
Chris@82 180 TO = TM * TK;
Chris@82 181 Tw = FNMS(KP707106781, Tv, Tk);
Chris@82 182 Tf = W[8];
Chris@82 183 Tx = Tf * Tw;
Chris@82 184 Ty = W[9];
Chris@82 185 TI = Ty * Tw;
Chris@82 186 Ip[WS(rs, 2)] = FNMS(Ty, TH, Tx);
Chris@82 187 Im[WS(rs, 2)] = FMA(Tf, TH, TI);
Chris@82 188 Ip[0] = FNMS(TM, TN, TL);
Chris@82 189 Im[0] = FMA(TJ, TN, TO);
Chris@82 190 }
Chris@82 191 }
Chris@82 192 }
Chris@82 193 }
Chris@82 194
/*
 * Twiddle instruction table referenced by desc in this branch.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the
 * {TW_FULL, 1, 8} entry presumably requests the complete twiddle set for
 * size 8, which would be consistent with hc2cb_8 reading W[0]..W[13] per
 * iteration -- confirm against the project headers.
 */
Chris@82 195 static const tw_instr twinstr[] = {
Chris@82 196 {TW_FULL, 1, 8},
Chris@82 197 {TW_NEXT, 1, 0}
Chris@82 198 };
Chris@82 199
/*
 * Descriptor handed to the planner together with hc2cb_8: the value 8, the
 * name string "hc2cb_8", the twinstr table, &GENUS, and the counts
 * {44, 14, 22, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in this branch's
 * generated header comment.
 * NOTE(review): the meaning of the fourth count and of GENUS is defined
 * outside this file.
 */
Chris@82 200 static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@82 201
/*
 * Registration entry point: passes the planner p, the hc2cb_8 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 * X() is a macro defined outside this file that wraps both identifiers.
 */
Chris@82 202 void X(codelet_hc2cb_8) (planner *p) {
Chris@82 203 X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
Chris@82 204 }
Chris@82 205 #else
Chris@82 206
Chris@82 207 /* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hc2cb_8 -include rdft/scalar/hc2cb.h */
Chris@82 208
Chris@82 209 /*
Chris@82 210 * This function contains 66 FP additions, 32 FP multiplications,
Chris@82 211 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 212 * 30 stack variables, 1 constants, and 32 memory accesses
Chris@82 213 */
Chris@82 214 #include "rdft/scalar/hc2cb.h"
Chris@82 215
/*
 * hc2cb_8 (non-FMA variant): size-8 codelet emitted by genfft's gen_hc2c
 * with the options "-sign 1 -n 8 -dif" recorded in the "Generated by" line
 * of this branch. It is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Rp, Ip, Rm, Im: arrays updated in place. Each loop iteration reads the
 *                 four elements at index 0 and WS(rs, 1..3) of every array
 *                 and then overwrites those same sixteen locations.
 * W:              twiddle table; W[0]..W[13] are consumed per iteration.
 * rs:             stride handed to WS() to step between the four elements.
 * mb, me:         half-open iteration range [mb, me).
 * ms:             per-iteration pointer step; Rp and Ip move forward by ms,
 *                 Rm and Im move backward by ms.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; they are presumably a*b + c and c - a*b respectively --
 * confirm in the project headers.
 */
Chris@82 216 static void hc2cb_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 217 {
/* The only numeric constant used: 0.7071... (1/sqrt(2)). */
Chris@82 218 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 219 {
Chris@82 220 INT m;
/* W is first advanced to the 14-value block for iteration mb, i.e. by
   (mb - 1) * 14, and then by 14 after every iteration. */
Chris@82 221 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 222 E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
Chris@82 223 E TD;
/* First half of the inputs: sums and differences of the pairs
   Rp[0]/Rm[3], Ip[0]/Im[3], Rp[2]/Rm[1] and Ip[2]/Im[1]. */
Chris@82 224 {
Chris@82 225 E T3, TK, Tk, TX, T6, TW, Tn, TL;
Chris@82 226 {
Chris@82 227 E T1, T2, Ti, Tj;
Chris@82 228 T1 = Rp[0];
Chris@82 229 T2 = Rm[WS(rs, 3)];
Chris@82 230 T3 = T1 + T2;
Chris@82 231 TK = T1 - T2;
Chris@82 232 Ti = Ip[0];
Chris@82 233 Tj = Im[WS(rs, 3)];
Chris@82 234 Tk = Ti - Tj;
Chris@82 235 TX = Ti + Tj;
Chris@82 236 }
Chris@82 237 {
Chris@82 238 E T4, T5, Tl, Tm;
Chris@82 239 T4 = Rp[WS(rs, 2)];
Chris@82 240 T5 = Rm[WS(rs, 1)];
Chris@82 241 T6 = T4 + T5;
Chris@82 242 TW = T4 - T5;
Chris@82 243 Tl = Ip[WS(rs, 2)];
Chris@82 244 Tm = Im[WS(rs, 1)];
Chris@82 245 Tn = Tl - Tm;
Chris@82 246 TL = Tl + Tm;
Chris@82 247 }
Chris@82 248 T7 = T3 + T6;
Chris@82 249 T18 = TK + TL;
Chris@82 250 T1c = TX - TW;
Chris@82 251 To = Tk + Tn;
Chris@82 252 Ty = T3 - T6;
Chris@82 253 TM = TK - TL;
Chris@82 254 TY = TW + TX;
Chris@82 255 TC = Tk - Tn;
Chris@82 256 }
/* Second half of the inputs: the pairs Rp[1]/Rm[2], Ip[1]/Im[2],
   Rm[0]/Rp[3] and Ip[3]/Im[0]. */
Chris@82 257 {
Chris@82 258 E Ta, TN, Tr, TO, Td, TQ, Tu, TR;
Chris@82 259 {
Chris@82 260 E T8, T9, Tp, Tq;
Chris@82 261 T8 = Rp[WS(rs, 1)];
Chris@82 262 T9 = Rm[WS(rs, 2)];
Chris@82 263 Ta = T8 + T9;
Chris@82 264 TN = T8 - T9;
Chris@82 265 Tp = Ip[WS(rs, 1)];
Chris@82 266 Tq = Im[WS(rs, 2)];
Chris@82 267 Tr = Tp - Tq;
Chris@82 268 TO = Tp + Tq;
Chris@82 269 }
Chris@82 270 {
Chris@82 271 E Tb, Tc, Ts, Tt;
Chris@82 272 Tb = Rm[0];
Chris@82 273 Tc = Rp[WS(rs, 3)];
Chris@82 274 Td = Tb + Tc;
Chris@82 275 TQ = Tb - Tc;
Chris@82 276 Ts = Ip[WS(rs, 3)];
Chris@82 277 Tt = Im[0];
Chris@82 278 Tu = Ts - Tt;
Chris@82 279 TR = Ts + Tt;
Chris@82 280 }
Chris@82 281 Te = Ta + Td;
Chris@82 282 TZ = TN + TO;
Chris@82 283 T10 = TQ + TR;
Chris@82 284 Tv = Tr + Tu;
Chris@82 285 Tz = Tu - Tr;
Chris@82 286 TP = TN - TO;
Chris@82 287 TS = TQ - TR;
Chris@82 288 TD = Ta - Td;
Chris@82 289 }
/* All inputs have been loaded into temporaries; from here on the arrays
   are only written. Rp[0] and Rm[0] are stored without any W factor. */
Chris@82 290 Rp[0] = T7 + Te;
Chris@82 291 Rm[0] = To + Tv;
/* Rp[2] / Rm[2]: combined with W[6], W[7]. */
Chris@82 292 {
Chris@82 293 E Tg, Tw, Tf, Th;
Chris@82 294 Tg = T7 - Te;
Chris@82 295 Tw = To - Tv;
Chris@82 296 Tf = W[6];
Chris@82 297 Th = W[7];
Chris@82 298 Rp[WS(rs, 2)] = FNMS(Th, Tw, Tf * Tg);
Chris@82 299 Rm[WS(rs, 2)] = FMA(Th, Tg, Tf * Tw);
Chris@82 300 }
/* Rp[1] / Rm[1]: combined with W[2], W[3]. */
Chris@82 301 {
Chris@82 302 E TG, TI, TF, TH;
Chris@82 303 TG = Ty + Tz;
Chris@82 304 TI = TD + TC;
Chris@82 305 TF = W[2];
Chris@82 306 TH = W[3];
Chris@82 307 Rp[WS(rs, 1)] = FNMS(TH, TI, TF * TG);
Chris@82 308 Rm[WS(rs, 1)] = FMA(TF, TI, TH * TG);
Chris@82 309 }
/* Rp[3] / Rm[3]: combined with W[10], W[11]. */
Chris@82 310 {
Chris@82 311 E TA, TE, Tx, TB;
Chris@82 312 TA = Ty - Tz;
Chris@82 313 TE = TC - TD;
Chris@82 314 Tx = W[10];
Chris@82 315 TB = W[11];
Chris@82 316 Rp[WS(rs, 3)] = FNMS(TB, TE, Tx * TA);
Chris@82 317 Rm[WS(rs, 3)] = FMA(Tx, TE, TB * TA);
Chris@82 318 }
/* Ip[1] / Im[1] use W[4], W[5]; Ip[3] / Im[3] use W[12], W[13]. Both pairs
   are built from the same two scaled terms T19 and T1d. */
Chris@82 319 {
Chris@82 320 E T1a, T1g, T1e, T1i, T19, T1d;
Chris@82 321 T19 = KP707106781 * (TZ + T10);
Chris@82 322 T1a = T18 - T19;
Chris@82 323 T1g = T18 + T19;
Chris@82 324 T1d = KP707106781 * (TP - TS);
Chris@82 325 T1e = T1c + T1d;
Chris@82 326 T1i = T1c - T1d;
Chris@82 327 {
Chris@82 328 E T17, T1b, T1f, T1h;
Chris@82 329 T17 = W[4];
Chris@82 330 T1b = W[5];
Chris@82 331 Ip[WS(rs, 1)] = FNMS(T1b, T1e, T17 * T1a);
Chris@82 332 Im[WS(rs, 1)] = FMA(T17, T1e, T1b * T1a);
Chris@82 333 T1f = W[12];
Chris@82 334 T1h = W[13];
Chris@82 335 Ip[WS(rs, 3)] = FNMS(T1h, T1i, T1f * T1g);
Chris@82 336 Im[WS(rs, 3)] = FMA(T1f, T1i, T1h * T1g);
Chris@82 337 }
Chris@82 338 }
/* Ip[2] / Im[2] use W[8], W[9]; Ip[0] / Im[0] use W[0], W[1]. Both pairs
   are built from the same two scaled terms TT and T11. */
Chris@82 339 {
Chris@82 340 E TU, T14, T12, T16, TT, T11;
Chris@82 341 TT = KP707106781 * (TP + TS);
Chris@82 342 TU = TM - TT;
Chris@82 343 T14 = TM + TT;
Chris@82 344 T11 = KP707106781 * (TZ - T10);
Chris@82 345 T12 = TY - T11;
Chris@82 346 T16 = TY + T11;
Chris@82 347 {
Chris@82 348 E TJ, TV, T13, T15;
Chris@82 349 TJ = W[8];
Chris@82 350 TV = W[9];
Chris@82 351 Ip[WS(rs, 2)] = FNMS(TV, T12, TJ * TU);
Chris@82 352 Im[WS(rs, 2)] = FMA(TV, TU, TJ * T12);
Chris@82 353 T13 = W[0];
Chris@82 354 T15 = W[1];
Chris@82 355 Ip[0] = FNMS(T15, T16, T13 * T14);
Chris@82 356 Im[0] = FMA(T15, T14, T13 * T16);
Chris@82 357 }
Chris@82 358 }
Chris@82 359 }
Chris@82 360 }
Chris@82 361 }
Chris@82 362
/*
 * Twiddle instruction table referenced by desc in this branch.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the
 * {TW_FULL, 1, 8} entry presumably requests the complete twiddle set for
 * size 8, which would be consistent with hc2cb_8 reading W[0]..W[13] per
 * iteration -- confirm against the project headers.
 */
Chris@82 363 static const tw_instr twinstr[] = {
Chris@82 364 {TW_FULL, 1, 8},
Chris@82 365 {TW_NEXT, 1, 0}
Chris@82 366 };
Chris@82 367
/*
 * Descriptor handed to the planner together with hc2cb_8: the value 8, the
 * name string "hc2cb_8", the twinstr table, &GENUS, and the counts
 * {52, 18, 14, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in this branch's
 * generated header comment.
 * NOTE(review): the meaning of the fourth count and of GENUS is defined
 * outside this file.
 */
Chris@82 368 static const hc2c_desc desc = { 8, "hc2cb_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@82 369
/*
 * Registration entry point: passes the planner p, the hc2cb_8 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 * X() is a macro defined outside this file that wraps both identifiers.
 */
Chris@82 370 void X(codelet_hc2cb_8) (planner *p) {
Chris@82 371 X(khc2c_register) (p, hc2cb_8, &desc, HC2C_VIA_RDFT);
Chris@82 372 }
Chris@82 373 #endif