annotate src/fftw-3.3.8/rdft/scalar/r2cb/hc2cbdft_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:57 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include rdft/scalar/hc2cb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 58 FP additions, 32 FP multiplications,
Chris@82 32 * (or, 36 additions, 10 multiplications, 22 fused multiply/add),
Chris@82 33 * 34 stack variables, 2 constants, and 24 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cb.h"
Chris@82 36
/*
 * hc2cbdft_6 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated kernel (the genfft command line recorded above this
 * function used -fma -n 6 -sign 1 -dif).  For every m in [mb, me) it loads
 * three values from each of Rp, Ip, Rm and Im (offsets 0, WS(rs, 1) and
 * WS(rs, 2)), combines them with ten twiddle values W[0]..W[9], and stores
 * twelve results back to the same twelve locations.  All twelve loads are
 * performed before the first store of an iteration.
 *
 * Parameters:
 *   Rp, Ip  arrays read and overwritten in place; advanced by +ms per
 *           iteration.
 *   Rm, Im  arrays read and overwritten in place; advanced by -ms per
 *           iteration.
 *   W       read-only twiddle array; the loop starts at W + (mb - 1) * 10
 *           and advances by 10 per iteration.
 *   rs      stride handed to WS() to form the element offsets.
 *   mb, me  half-open iteration range [mb, me).
 *   ms      pointer step applied to Rp/Ip (forward) and Rm/Im (backward).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file.
 * NOTE(review): FMA(a, b, c) is presumably a * b + c and FNMS(a, b, c)
 * presumably c - a * b -- confirm against the FFTW headers.
 */
Chris@82 37 static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* The literal below is numerically sqrt(3)/2. */
Chris@82 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 41 {
Chris@82 42 INT m;
/* NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro here; its purpose is not visible in this file. */
Chris@82 43 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates that outlive the two load stages below and feed the three output stages. */
Chris@82 44 E Tp, TD, Tj, TV, Tq, Tr, TG, TP, T4, Ts, TQ, Tb, Tc, TA, TU;
/* Load stage 1: the six Ip/Im inputs, reduced to sums and differences. */
Chris@82 45 {
Chris@82 46 E Tf, TF, Ti, TE, Td, Te;
Chris@82 47 Td = Ip[WS(rs, 1)];
Chris@82 48 Te = Im[WS(rs, 1)];
Chris@82 49 Tf = Td - Te;
Chris@82 50 TF = Te + Td;
Chris@82 51 {
Chris@82 52 E Tn, To, Tg, Th;
Chris@82 53 Tn = Ip[0];
Chris@82 54 To = Im[WS(rs, 2)];
Chris@82 55 Tp = Tn - To;
Chris@82 56 TD = Tn + To;
Chris@82 57 Tg = Ip[WS(rs, 2)];
Chris@82 58 Th = Im[0];
Chris@82 59 Ti = Tg - Th;
Chris@82 60 TE = Tg + Th;
Chris@82 61 }
Chris@82 62 Tj = Tf - Ti;
Chris@82 63 TV = TF + TE;
Chris@82 64 Tq = Tf + Ti;
Chris@82 65 Tr = FNMS(KP500000000, Tq, Tp);
Chris@82 66 TG = TE - TF;
Chris@82 67 TP = FNMS(KP500000000, TG, TD);
Chris@82 68 }
/* Load stage 2: the six Rp/Rm inputs, reduced to sums and differences. */
Chris@82 69 {
Chris@82 70 E Tw, Ta, Ty, T7, Tx, T2, T3, Tz;
Chris@82 71 T2 = Rp[0];
Chris@82 72 T3 = Rm[WS(rs, 2)];
Chris@82 73 T4 = T2 + T3;
Chris@82 74 Tw = T2 - T3;
Chris@82 75 {
Chris@82 76 E T8, T9, T5, T6;
Chris@82 77 T8 = Rm[WS(rs, 1)];
Chris@82 78 T9 = Rp[WS(rs, 1)];
Chris@82 79 Ta = T8 + T9;
Chris@82 80 Ty = T8 - T9;
Chris@82 81 T5 = Rp[WS(rs, 2)];
Chris@82 82 T6 = Rm[0];
Chris@82 83 T7 = T5 + T6;
Chris@82 84 Tx = T5 - T6;
Chris@82 85 }
Chris@82 86 Ts = T7 - Ta;
Chris@82 87 TQ = Tx - Ty;
Chris@82 88 Tb = T7 + Ta;
Chris@82 89 Tc = FNMS(KP500000000, Tb, T4);
Chris@82 90 Tz = Tx + Ty;
Chris@82 91 TA = Tw + Tz;
Chris@82 92 TU = FNMS(KP500000000, Tz, Tw);
Chris@82 93 }
/* Output stage for offset 0: uses twiddles W[0] and W[1]; TN is the sum of all six Rp/Rm inputs. */
Chris@82 94 {
Chris@82 95 E TN, TY, TR, TW, TS, TZ, TO, TX, T10, TT;
Chris@82 96 TN = T4 + Tb;
Chris@82 97 TY = Tp + Tq;
Chris@82 98 TR = FMA(KP866025403, TQ, TP);
Chris@82 99 TW = FNMS(KP866025403, TV, TU);
Chris@82 100 TO = W[0];
Chris@82 101 TS = TO * TR;
Chris@82 102 TZ = TO * TW;
Chris@82 103 TT = W[1];
Chris@82 104 TX = FMA(TT, TW, TS);
Chris@82 105 T10 = FNMS(TT, TR, TZ);
Chris@82 106 Rp[0] = TN - TX;
Chris@82 107 Ip[0] = TY + T10;
Chris@82 108 Rm[0] = TN + TX;
Chris@82 109 Im[0] = T10 - TY;
Chris@82 110 }
/* Output stage for offset WS(rs, 1): uses twiddles W[2]..W[5]. */
Chris@82 111 {
Chris@82 112 E Tt, TH, Tv, TB, TC, TL, T1, Tl, Tm, TJ, Tk;
Chris@82 113 Tt = FNMS(KP866025403, Ts, Tr);
Chris@82 114 TH = TD + TG;
Chris@82 115 Tv = W[4];
Chris@82 116 TB = Tv * TA;
Chris@82 117 TC = W[5];
Chris@82 118 TL = TC * TA;
Chris@82 119 Tk = FNMS(KP866025403, Tj, Tc);
Chris@82 120 T1 = W[3];
Chris@82 121 Tl = T1 * Tk;
Chris@82 122 Tm = W[2];
Chris@82 123 TJ = Tm * Tk;
Chris@82 124 {
Chris@82 125 E Tu, TI, TK, TM;
Chris@82 126 Tu = FMA(Tm, Tt, Tl);
Chris@82 127 TI = FNMS(TC, TH, TB);
Chris@82 128 Ip[WS(rs, 1)] = Tu + TI;
Chris@82 129 Im[WS(rs, 1)] = TI - Tu;
Chris@82 130 TK = FNMS(T1, Tt, TJ);
Chris@82 131 TM = FMA(Tv, TH, TL);
Chris@82 132 Rp[WS(rs, 1)] = TK - TM;
Chris@82 133 Rm[WS(rs, 1)] = TK + TM;
Chris@82 134 }
Chris@82 135 }
/* Output stage for offset WS(rs, 2): uses twiddles W[6]..W[9]. */
Chris@82 136 {
Chris@82 137 E T15, T11, T13, T14, T1d, T18, T1b, T19, T1f, T12, T17;
Chris@82 138 T15 = FMA(KP866025403, Ts, Tr);
Chris@82 139 T12 = FMA(KP866025403, Tj, Tc);
Chris@82 140 T11 = W[6];
Chris@82 141 T13 = T11 * T12;
Chris@82 142 T14 = W[7];
Chris@82 143 T1d = T14 * T12;
Chris@82 144 T18 = FNMS(KP866025403, TQ, TP);
Chris@82 145 T1b = FMA(KP866025403, TV, TU);
Chris@82 146 T17 = W[8];
Chris@82 147 T19 = T17 * T18;
Chris@82 148 T1f = T17 * T1b;
Chris@82 149 {
Chris@82 150 E T16, T1e, T1c, T1g, T1a;
Chris@82 151 T16 = FNMS(T14, T15, T13);
Chris@82 152 T1e = FMA(T11, T15, T1d);
Chris@82 153 T1a = W[9];
Chris@82 154 T1c = FMA(T1a, T1b, T19);
Chris@82 155 T1g = FNMS(T1a, T18, T1f);
Chris@82 156 Rp[WS(rs, 2)] = T16 - T1c;
Chris@82 157 Ip[WS(rs, 2)] = T1e + T1g;
Chris@82 158 Rm[WS(rs, 2)] = T16 + T1c;
Chris@82 159 Im[WS(rs, 2)] = T1g - T1e;
Chris@82 160 }
Chris@82 161 }
Chris@82 162 }
Chris@82 163 }
Chris@82 164 }
Chris@82 165
Chris@82 166 static const tw_instr twinstr[] = {
Chris@82 167 {TW_FULL, 1, 6},
Chris@82 168 {TW_NEXT, 1, 0}
Chris@82 169 };
Chris@82 170
/*
 * Descriptor handed to the planner for the FMA variant: the number 6 (which
 * matches the -n 6 generator option), the codelet name, the twiddle table,
 * the GENUS object, and the counts {36, 10, 22, 0}.  The first three counts
 * match the 36 additions, 10 multiplications and 22 fused multiply/adds
 * stated in the generated comment for this variant.
 * NOTE(review): the meaning of the fourth count (0) and of GENUS is not
 * visible in this file.
 */
Chris@82 171 static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {36, 10, 22, 0} };
Chris@82 172
/*
 * Registration entry point for the FMA variant: passes the planner p, the
 * hc2cbdft_6 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * X(khc2c_register).
 * NOTE(review): X() is a name-wrapping macro defined outside this file, and
 * the behaviour of khc2c_register is not visible here.
 */
Chris@82 173 void X(codelet_hc2cbdft_6) (planner *p) {
Chris@82 174 X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
Chris@82 175 }
Chris@82 176 #else
Chris@82 177
Chris@82 178 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include rdft/scalar/hc2cb.h */
Chris@82 179
Chris@82 180 /*
Chris@82 181 * This function contains 58 FP additions, 28 FP multiplications,
Chris@82 182 * (or, 44 additions, 14 multiplications, 14 fused multiply/add),
Chris@82 183 * 29 stack variables, 2 constants, and 24 memory accesses
Chris@82 184 */
Chris@82 185 #include "rdft/scalar/hc2cb.h"
Chris@82 186
/*
 * hc2cbdft_6 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated kernel (the genfft command line recorded above this
 * function used -n 6 -sign 1 -dif, without -fma).  For every m in [mb, me)
 * it loads three values from each of Rp, Ip, Rm and Im (offsets 0,
 * WS(rs, 1) and WS(rs, 2)), combines them with ten twiddle values
 * W[0]..W[9], and stores twelve results back to the same twelve locations.
 * All twelve loads are performed before the first store of an iteration.
 *
 * Parameters:
 *   Rp, Ip  arrays read and overwritten in place; advanced by +ms per
 *           iteration.
 *   Rm, Im  arrays read and overwritten in place; advanced by -ms per
 *           iteration.
 *   W       read-only twiddle array; the loop starts at W + (mb - 1) * 10
 *           and advances by 10 per iteration.
 *   rs      stride handed to WS() to form the element offsets.
 *   mb, me  half-open iteration range [mb, me).
 *   ms      pointer step applied to Rp/Ip (forward) and Rm/Im (backward).
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file.
 * NOTE(review): FMA(a, b, c) is presumably a * b + c and FNMS(a, b, c)
 * presumably c - a * b -- confirm against the FFTW headers.
 */
Chris@82 187 static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 188 {
Chris@82 189 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
/* The literal below is numerically sqrt(3)/2. */
Chris@82 190 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 191 {
Chris@82 192 INT m;
/* NOTE(review): MAKE_VOLATILE_STRIDE is an opaque macro here; its purpose is not visible in this file. */
Chris@82 193 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates that outlive the two load stages below and feed the three output stages. */
Chris@82 194 E T4, Tv, Tr, TL, Tb, Tc, Ty, TP, To, TB, Tj, TQ, Tp, Tq, TE;
Chris@82 195 E TM;
/* Load stage 1: the six Rp/Rm inputs; Tr and TL are already scaled by KP866025403 here. */
Chris@82 196 {
Chris@82 197 E Ta, Tx, T7, Tw, T2, T3;
Chris@82 198 T2 = Rp[0];
Chris@82 199 T3 = Rm[WS(rs, 2)];
Chris@82 200 T4 = T2 + T3;
Chris@82 201 Tv = T2 - T3;
Chris@82 202 {
Chris@82 203 E T8, T9, T5, T6;
Chris@82 204 T8 = Rm[WS(rs, 1)];
Chris@82 205 T9 = Rp[WS(rs, 1)];
Chris@82 206 Ta = T8 + T9;
Chris@82 207 Tx = T8 - T9;
Chris@82 208 T5 = Rp[WS(rs, 2)];
Chris@82 209 T6 = Rm[0];
Chris@82 210 T7 = T5 + T6;
Chris@82 211 Tw = T5 - T6;
Chris@82 212 }
Chris@82 213 Tr = KP866025403 * (T7 - Ta);
Chris@82 214 TL = KP866025403 * (Tw - Tx);
Chris@82 215 Tb = T7 + Ta;
Chris@82 216 Tc = FNMS(KP500000000, Tb, T4);
Chris@82 217 Ty = Tw + Tx;
Chris@82 218 TP = FNMS(KP500000000, Ty, Tv);
Chris@82 219 }
/* Load stage 2: the six Ip/Im inputs; Tj and TQ are already scaled by KP866025403 here. */
Chris@82 220 {
Chris@82 221 E Tf, TC, Ti, TD, Td, Te;
Chris@82 222 Td = Ip[WS(rs, 1)];
Chris@82 223 Te = Im[WS(rs, 1)];
Chris@82 224 Tf = Td - Te;
Chris@82 225 TC = Te + Td;
Chris@82 226 {
Chris@82 227 E Tm, Tn, Tg, Th;
Chris@82 228 Tm = Ip[0];
Chris@82 229 Tn = Im[WS(rs, 2)];
Chris@82 230 To = Tm - Tn;
Chris@82 231 TB = Tm + Tn;
Chris@82 232 Tg = Ip[WS(rs, 2)];
Chris@82 233 Th = Im[0];
Chris@82 234 Ti = Tg - Th;
Chris@82 235 TD = Tg + Th;
Chris@82 236 }
Chris@82 237 Tj = KP866025403 * (Tf - Ti);
Chris@82 238 TQ = KP866025403 * (TC + TD);
Chris@82 239 Tp = Tf + Ti;
Chris@82 240 Tq = FNMS(KP500000000, Tp, To);
Chris@82 241 TE = TC - TD;
Chris@82 242 TM = FMA(KP500000000, TE, TB);
Chris@82 243 }
/* Output stage for offset 0: uses twiddles W[0] and W[1]; TJ is the sum of all six Rp/Rm inputs. */
Chris@82 244 {
Chris@82 245 E TJ, TT, TS, TU;
Chris@82 246 TJ = T4 + Tb;
Chris@82 247 TT = To + Tp;
Chris@82 248 {
Chris@82 249 E TN, TR, TK, TO;
Chris@82 250 TN = TL + TM;
Chris@82 251 TR = TP - TQ;
Chris@82 252 TK = W[0];
Chris@82 253 TO = W[1];
Chris@82 254 TS = FMA(TK, TN, TO * TR);
Chris@82 255 TU = FNMS(TO, TN, TK * TR);
Chris@82 256 }
Chris@82 257 Rp[0] = TJ - TS;
Chris@82 258 Ip[0] = TT + TU;
Chris@82 259 Rm[0] = TJ + TS;
Chris@82 260 Im[0] = TU - TT;
Chris@82 261 }
/* Output stage for offset WS(rs, 2): uses twiddles W[6]..W[9]. */
Chris@82 262 {
Chris@82 263 E TZ, T15, T14, T16;
Chris@82 264 {
Chris@82 265 E TW, TY, TV, TX;
Chris@82 266 TW = Tc + Tj;
Chris@82 267 TY = Tr + Tq;
Chris@82 268 TV = W[6];
Chris@82 269 TX = W[7];
Chris@82 270 TZ = FNMS(TX, TY, TV * TW);
Chris@82 271 T15 = FMA(TX, TW, TV * TY);
Chris@82 272 }
Chris@82 273 {
Chris@82 274 E T11, T13, T10, T12;
Chris@82 275 T11 = TM - TL;
Chris@82 276 T13 = TP + TQ;
Chris@82 277 T10 = W[8];
Chris@82 278 T12 = W[9];
Chris@82 279 T14 = FMA(T10, T11, T12 * T13);
Chris@82 280 T16 = FNMS(T12, T11, T10 * T13);
Chris@82 281 }
Chris@82 282 Rp[WS(rs, 2)] = TZ - T14;
Chris@82 283 Ip[WS(rs, 2)] = T15 + T16;
Chris@82 284 Rm[WS(rs, 2)] = TZ + T14;
Chris@82 285 Im[WS(rs, 2)] = T16 - T15;
Chris@82 286 }
/* Output stage for offset WS(rs, 1): uses twiddles W[2]..W[5]. */
Chris@82 287 {
Chris@82 288 E Tt, TH, TG, TI;
Chris@82 289 {
Chris@82 290 E Tk, Ts, T1, Tl;
Chris@82 291 Tk = Tc - Tj;
Chris@82 292 Ts = Tq - Tr;
Chris@82 293 T1 = W[3];
Chris@82 294 Tl = W[2];
Chris@82 295 Tt = FMA(T1, Tk, Tl * Ts);
Chris@82 296 TH = FNMS(T1, Ts, Tl * Tk);
Chris@82 297 }
Chris@82 298 {
Chris@82 299 E Tz, TF, Tu, TA;
Chris@82 300 Tz = Tv + Ty;
Chris@82 301 TF = TB - TE;
Chris@82 302 Tu = W[4];
Chris@82 303 TA = W[5];
Chris@82 304 TG = FNMS(TA, TF, Tu * Tz);
Chris@82 305 TI = FMA(TA, Tz, Tu * TF);
Chris@82 306 }
Chris@82 307 Ip[WS(rs, 1)] = Tt + TG;
Chris@82 308 Rp[WS(rs, 1)] = TH - TI;
Chris@82 309 Im[WS(rs, 1)] = TG - Tt;
Chris@82 310 Rm[WS(rs, 1)] = TH + TI;
Chris@82 311 }
Chris@82 312 }
Chris@82 313 }
Chris@82 314 }
Chris@82 315
/*
 * Twiddle instruction table for the non-FMA variant; it is referenced from
 * the desc initializer that follows and has the same two entries as the
 * table in the FMA branch.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; their
 * exact semantics should be confirmed against the FFTW headers.
 */
Chris@82 316 static const tw_instr twinstr[] = {
Chris@82 317 {TW_FULL, 1, 6},
Chris@82 318 {TW_NEXT, 1, 0}
Chris@82 319 };
Chris@82 320
/*
 * Descriptor handed to the planner for the non-FMA variant: the number 6
 * (which matches the -n 6 generator option), the codelet name, the twiddle
 * table, the GENUS object, and the counts {44, 14, 14, 0}.  The first three
 * counts match the 44 additions, 14 multiplications and 14 fused
 * multiply/adds stated in the generated comment for this variant.
 * NOTE(review): the meaning of the fourth count (0) and of GENUS is not
 * visible in this file.
 */
Chris@82 321 static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {44, 14, 14, 0} };
Chris@82 322
/*
 * Registration entry point for the non-FMA variant: passes the planner p,
 * the hc2cbdft_6 kernel, its descriptor and the HC2C_VIA_DFT constant to
 * X(khc2c_register).
 * NOTE(review): X() is a name-wrapping macro defined outside this file, and
 * the behaviour of khc2c_register is not visible here.
 */
Chris@82 323 void X(codelet_hc2cbdft_6) (planner *p) {
Chris@82 324 X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
Chris@82 325 }
Chris@82 326 #endif