annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft_6.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:51:55 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 58 FP additions, 32 FP multiplications,
Chris@42 32 * (or, 36 additions, 10 multiplications, 22 fused multiply/add),
Chris@42 33 * 52 stack variables, 2 constants, and 24 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft_6 (HAVE_FMA variant): size-6 codelet emitted by genfft's
 * gen_hc2cdft with "-n 6 -sign 1 -dif" (see the "Generated by" line).
 *
 * Each loop pass loads three values from each of Rp, Ip, Rm and Im
 * (offsets 0, WS(rs, 1), WS(rs, 2)), combines them with the ten twiddle
 * values W[0]..W[9], and stores twelve results back to those same twelve
 * locations. All twelve loads are issued before the first store.
 *
 * Parameters:
 *   Rp, Ip - data pointers, advanced by +ms after every pass
 *   Rm, Im - data pointers, advanced by -ms after every pass
 *   W      - twiddle table; pass m reads the 10 values at W + (m - 1) * 10
 *   rs     - stride handed to WS() to address elements 1 and 2
 *   mb, me - half-open range [mb, me) of the loop counter m
 *   ms     - per-pass step applied to the four data pointers
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from project headers not visible here. Presumably FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm against those headers.
 */
Chris@42 37 static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* Numeric constants: 0.866025403... equals sqrt(3)/2, and 0.5. */
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT m;
/* W is first offset to the twiddles for m == mb, then moves 10 per pass. */
Chris@42 43 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Temporaries that survive the nested scopes and feed the final stores. */
Chris@42 44 E T18, T1b, T16, T1e, T1a, T1f, T19, T1g, T1c;
Chris@42 45 {
Chris@42 46 E Tw, T4, TV, Tj, TP, TH, Tr, TY, T5, T6, Ta, Ty;
Chris@42 47 {
Chris@42 48 E Tg, TF, Tf, TD, Tp, Th;
Chris@42 49 {
/* Load the Ip/Im inputs and form their pairwise sums and differences. */
Chris@42 50 E Td, Te, Tn, To;
Chris@42 51 Td = Ip[WS(rs, 1)];
Chris@42 52 Te = Im[WS(rs, 1)];
Chris@42 53 Tn = Ip[0];
Chris@42 54 To = Im[WS(rs, 2)];
Chris@42 55 Tg = Ip[WS(rs, 2)];
Chris@42 56 TF = Te + Td;
Chris@42 57 Tf = Td - Te;
Chris@42 58 TD = Tn + To;
Chris@42 59 Tp = Tn - To;
Chris@42 60 Th = Im[0];
Chris@42 61 }
Chris@42 62 {
/* Load the Rp/Rm inputs, interleaved with arithmetic on the values above. */
Chris@42 63 E T2, T3, T8, T9;
Chris@42 64 T2 = Rp[0];
Chris@42 65 T3 = Rm[WS(rs, 2)];
Chris@42 66 {
Chris@42 67 E Tq, TE, Ti, TG;
Chris@42 68 T8 = Rm[WS(rs, 1)];
Chris@42 69 TE = Tg + Th;
Chris@42 70 Ti = Tg - Th;
Chris@42 71 Tw = T2 - T3;
Chris@42 72 T4 = T2 + T3;
Chris@42 73 TG = TE - TF;
Chris@42 74 TV = TF + TE;
Chris@42 75 Tq = Tf + Ti;
Chris@42 76 Tj = Tf - Ti;
Chris@42 77 TP = FNMS(KP500000000, TG, TD);
Chris@42 78 TH = TD + TG;
Chris@42 79 T9 = Rp[WS(rs, 1)];
Chris@42 80 Tr = FNMS(KP500000000, Tq, Tp);
Chris@42 81 TY = Tp + Tq;
Chris@42 82 }
Chris@42 83 T5 = Rp[WS(rs, 2)];
Chris@42 84 T6 = Rm[0];
Chris@42 85 Ta = T8 + T9;
Chris@42 86 Ty = T8 - T9;
Chris@42 87 }
Chris@42 88 }
Chris@42 89 {
Chris@42 90 E TO, TT, Ts, TA, TR, Tc, TN, TW, TS, Tx, T7;
Chris@42 91 Tx = T5 - T6;
Chris@42 92 T7 = T5 + T6;
/* First twiddle pair, used for the offset-0 outputs. */
Chris@42 93 TO = W[0];
Chris@42 94 TT = W[1];
Chris@42 95 {
Chris@42 96 E Tz, TQ, Tb, TU;
Chris@42 97 Tz = Tx + Ty;
Chris@42 98 TQ = Tx - Ty;
Chris@42 99 Tb = T7 + Ta;
Chris@42 100 Ts = T7 - Ta;
Chris@42 101 TU = FNMS(KP500000000, Tz, Tw);
Chris@42 102 TA = Tw + Tz;
Chris@42 103 TR = FMA(KP866025403, TQ, TP);
Chris@42 104 T18 = FNMS(KP866025403, TQ, TP);
Chris@42 105 Tc = FNMS(KP500000000, Tb, T4);
Chris@42 106 TN = T4 + Tb;
Chris@42 107 T1b = FMA(KP866025403, TV, TU);
Chris@42 108 TW = FNMS(KP866025403, TV, TU);
Chris@42 109 TS = TO * TR;
Chris@42 110 }
Chris@42 111 {
Chris@42 112 E T15, Tt, T12, T1, Tm, TI, TM, Tl, TJ;
Chris@42 113 {
Chris@42 114 E Tv, TC, TB, TL, Tk, TZ, TX, T10;
Chris@42 115 T15 = FMA(KP866025403, Ts, Tr);
Chris@42 116 Tt = FNMS(KP866025403, Ts, Tr);
Chris@42 117 TZ = TO * TW;
Chris@42 118 TX = FMA(TT, TW, TS);
Chris@42 119 Tv = W[4];
Chris@42 120 TC = W[5];
Chris@42 121 T10 = FNMS(TT, TR, TZ);
/* Offset-0 outputs; every input was already loaded, so this is safe in place. */
Chris@42 122 Rm[0] = TN + TX;
Chris@42 123 Rp[0] = TN - TX;
Chris@42 124 TB = Tv * TA;
Chris@42 125 Im[0] = T10 - TY;
Chris@42 126 Ip[0] = TY + T10;
Chris@42 127 TL = TC * TA;
Chris@42 128 Tk = FNMS(KP866025403, Tj, Tc);
Chris@42 129 T12 = FMA(KP866025403, Tj, Tc);
Chris@42 130 T1 = W[3];
Chris@42 131 Tm = W[2];
Chris@42 132 TI = FNMS(TC, TH, TB);
Chris@42 133 TM = FMA(Tv, TH, TL);
Chris@42 134 Tl = T1 * Tk;
Chris@42 135 TJ = Tm * Tk;
Chris@42 136 }
Chris@42 137 {
Chris@42 138 E T11, T14, T13, T1d, T17, Tu, TK;
Chris@42 139 Tu = FMA(Tm, Tt, Tl);
Chris@42 140 TK = FNMS(T1, Tt, TJ);
Chris@42 141 T11 = W[6];
Chris@42 142 T14 = W[7];
/* Offset-1 outputs, built from W[2]..W[5]. */
Chris@42 143 Im[WS(rs, 1)] = TI - Tu;
Chris@42 144 Ip[WS(rs, 1)] = Tu + TI;
Chris@42 145 Rm[WS(rs, 1)] = TK + TM;
Chris@42 146 Rp[WS(rs, 1)] = TK - TM;
Chris@42 147 T13 = T11 * T12;
Chris@42 148 T1d = T14 * T12;
Chris@42 149 T17 = W[8];
Chris@42 150 T16 = FNMS(T14, T15, T13);
Chris@42 151 T1e = FMA(T11, T15, T1d);
Chris@42 152 T1a = W[9];
Chris@42 153 T1f = T17 * T1b;
Chris@42 154 T19 = T17 * T18;
Chris@42 155 }
Chris@42 156 }
Chris@42 157 }
Chris@42 158 }
/* Offset-2 outputs, built from W[6]..W[9]. */
Chris@42 159 T1g = FNMS(T1a, T18, T1f);
Chris@42 160 T1c = FMA(T1a, T1b, T19);
Chris@42 161 Im[WS(rs, 2)] = T1g - T1e;
Chris@42 162 Ip[WS(rs, 2)] = T1e + T1g;
Chris@42 163 Rm[WS(rs, 2)] = T16 + T1c;
Chris@42 164 Rp[WS(rs, 2)] = T16 - T1c;
Chris@42 165 }
Chris@42 166 }
Chris@42 167 }
Chris@42 168
/*
 * Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers not visible
 * here; presumably this describes the 10 twiddle values (W[0]..W[9]) that
 * each pass of hc2cbdft_6 reads -- confirm against the tw_instr definition.
 */
Chris@42 169 static const tw_instr twinstr[] = {
Chris@42 170 {TW_FULL, 1, 6},
Chris@42 171 {TW_NEXT, 1, 0}
Chris@42 172 };
Chris@42 173
/*
 * Codelet descriptor: size 6, name string, twiddle table, genus, and the
 * counts {36, 10, 22, 0}, which match the "36 additions, 10 multiplications,
 * 22 fused multiply/add" figures in the generator's header comment.
 * NOTE(review): meaning of the fourth count (0) is not visible here.
 */
Chris@42 174 static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {36, 10, 22, 0} };
Chris@42 175
/*
 * Registration entry point: passes the hc2cbdft_6 kernel and its descriptor
 * to X(khc2c_register) for planner `p`, tagged HC2C_VIA_DFT.
 */
Chris@42 176 void X(codelet_hc2cbdft_6) (planner *p) {
Chris@42 177 X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
Chris@42 178 }
Chris@42 179 #else /* HAVE_FMA */
Chris@42 180
Chris@42 181 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 6 -dif -name hc2cbdft_6 -include hc2cb.h */
Chris@42 182
Chris@42 183 /*
Chris@42 184 * This function contains 58 FP additions, 28 FP multiplications,
Chris@42 185 * (or, 44 additions, 14 multiplications, 14 fused multiply/add),
Chris@42 186 * 29 stack variables, 2 constants, and 24 memory accesses
Chris@42 187 */
Chris@42 188 #include "hc2cb.h"
Chris@42 189
/*
 * hc2cbdft_6 (non-FMA variant): size-6 codelet emitted by genfft's
 * gen_hc2cdft with "-n 6 -sign 1 -dif" (see the "Generated by" line).
 *
 * Each loop pass loads three values from each of Rp, Ip, Rm and Im
 * (offsets 0, WS(rs, 1), WS(rs, 2)), combines them with the ten twiddle
 * values W[0]..W[9], and stores twelve results back to those same twelve
 * locations. All twelve loads are issued before the first store.
 *
 * Parameters:
 *   Rp, Ip - data pointers, advanced by +ms after every pass
 *   Rm, Im - data pointers, advanced by -ms after every pass
 *   W      - twiddle table; pass m reads the 10 values at W + (m - 1) * 10
 *   rs     - stride handed to WS() to address elements 1 and 2
 *   mb, me - half-open range [mb, me) of the loop counter m
 *   ms     - per-pass step applied to the four data pointers
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from project headers not visible here. Presumably FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm against those headers.
 */
Chris@42 190 static void hc2cbdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 191 {
/* Numeric constants: 0.5, and 0.866025403... which equals sqrt(3)/2. */
Chris@42 192 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 193 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 194 {
Chris@42 195 INT m;
/* W is first offset to the twiddles for m == mb, then moves 10 per pass. */
Chris@42 196 for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
/* Intermediates shared between the load stages and the three store stages. */
Chris@42 197 E T4, Tv, Tr, TL, Tb, Tc, Ty, TP, To, TB, Tj, TQ, Tp, Tq, TE;
Chris@42 198 E TM;
Chris@42 199 {
/* Stage 1: load the six Rp/Rm inputs and combine them. */
Chris@42 200 E Ta, Tx, T7, Tw, T2, T3;
Chris@42 201 T2 = Rp[0];
Chris@42 202 T3 = Rm[WS(rs, 2)];
Chris@42 203 T4 = T2 + T3;
Chris@42 204 Tv = T2 - T3;
Chris@42 205 {
Chris@42 206 E T8, T9, T5, T6;
Chris@42 207 T8 = Rm[WS(rs, 1)];
Chris@42 208 T9 = Rp[WS(rs, 1)];
Chris@42 209 Ta = T8 + T9;
Chris@42 210 Tx = T8 - T9;
Chris@42 211 T5 = Rp[WS(rs, 2)];
Chris@42 212 T6 = Rm[0];
Chris@42 213 T7 = T5 + T6;
Chris@42 214 Tw = T5 - T6;
Chris@42 215 }
Chris@42 216 Tr = KP866025403 * (T7 - Ta);
Chris@42 217 TL = KP866025403 * (Tw - Tx);
Chris@42 218 Tb = T7 + Ta;
Chris@42 219 Tc = FNMS(KP500000000, Tb, T4);
Chris@42 220 Ty = Tw + Tx;
Chris@42 221 TP = FNMS(KP500000000, Ty, Tv);
Chris@42 222 }
Chris@42 223 {
/* Stage 2: load the six Ip/Im inputs and combine them. */
Chris@42 224 E Tf, TC, Ti, TD, Td, Te;
Chris@42 225 Td = Ip[WS(rs, 1)];
Chris@42 226 Te = Im[WS(rs, 1)];
Chris@42 227 Tf = Td - Te;
Chris@42 228 TC = Te + Td;
Chris@42 229 {
Chris@42 230 E Tm, Tn, Tg, Th;
Chris@42 231 Tm = Ip[0];
Chris@42 232 Tn = Im[WS(rs, 2)];
Chris@42 233 To = Tm - Tn;
Chris@42 234 TB = Tm + Tn;
Chris@42 235 Tg = Ip[WS(rs, 2)];
Chris@42 236 Th = Im[0];
Chris@42 237 Ti = Tg - Th;
Chris@42 238 TD = Tg + Th;
Chris@42 239 }
Chris@42 240 Tj = KP866025403 * (Tf - Ti);
Chris@42 241 TQ = KP866025403 * (TC + TD);
Chris@42 242 Tp = Tf + Ti;
Chris@42 243 Tq = FNMS(KP500000000, Tp, To);
Chris@42 244 TE = TC - TD;
Chris@42 245 TM = FMA(KP500000000, TE, TB);
Chris@42 246 }
Chris@42 247 {
/* Offset-0 outputs, built from W[0] and W[1]. */
Chris@42 248 E TJ, TT, TS, TU;
Chris@42 249 TJ = T4 + Tb;
Chris@42 250 TT = To + Tp;
Chris@42 251 {
Chris@42 252 E TN, TR, TK, TO;
Chris@42 253 TN = TL + TM;
Chris@42 254 TR = TP - TQ;
Chris@42 255 TK = W[0];
Chris@42 256 TO = W[1];
Chris@42 257 TS = FMA(TK, TN, TO * TR);
Chris@42 258 TU = FNMS(TO, TN, TK * TR);
Chris@42 259 }
Chris@42 260 Rp[0] = TJ - TS;
Chris@42 261 Ip[0] = TT + TU;
Chris@42 262 Rm[0] = TJ + TS;
Chris@42 263 Im[0] = TU - TT;
Chris@42 264 }
Chris@42 265 {
/* Offset-2 outputs, built from W[6]..W[9]. */
Chris@42 266 E TZ, T15, T14, T16;
Chris@42 267 {
Chris@42 268 E TW, TY, TV, TX;
Chris@42 269 TW = Tc + Tj;
Chris@42 270 TY = Tr + Tq;
Chris@42 271 TV = W[6];
Chris@42 272 TX = W[7];
Chris@42 273 TZ = FNMS(TX, TY, TV * TW);
Chris@42 274 T15 = FMA(TX, TW, TV * TY);
Chris@42 275 }
Chris@42 276 {
Chris@42 277 E T11, T13, T10, T12;
Chris@42 278 T11 = TM - TL;
Chris@42 279 T13 = TP + TQ;
Chris@42 280 T10 = W[8];
Chris@42 281 T12 = W[9];
Chris@42 282 T14 = FMA(T10, T11, T12 * T13);
Chris@42 283 T16 = FNMS(T12, T11, T10 * T13);
Chris@42 284 }
Chris@42 285 Rp[WS(rs, 2)] = TZ - T14;
Chris@42 286 Ip[WS(rs, 2)] = T15 + T16;
Chris@42 287 Rm[WS(rs, 2)] = TZ + T14;
Chris@42 288 Im[WS(rs, 2)] = T16 - T15;
Chris@42 289 }
Chris@42 290 {
/* Offset-1 outputs, built from W[2]..W[5]. */
Chris@42 291 E Tt, TH, TG, TI;
Chris@42 292 {
Chris@42 293 E Tk, Ts, T1, Tl;
Chris@42 294 Tk = Tc - Tj;
Chris@42 295 Ts = Tq - Tr;
Chris@42 296 T1 = W[3];
Chris@42 297 Tl = W[2];
Chris@42 298 Tt = FMA(T1, Tk, Tl * Ts);
Chris@42 299 TH = FNMS(T1, Ts, Tl * Tk);
Chris@42 300 }
Chris@42 301 {
Chris@42 302 E Tz, TF, Tu, TA;
Chris@42 303 Tz = Tv + Ty;
Chris@42 304 TF = TB - TE;
Chris@42 305 Tu = W[4];
Chris@42 306 TA = W[5];
Chris@42 307 TG = FNMS(TA, TF, Tu * Tz);
Chris@42 308 TI = FMA(TA, Tz, Tu * TF);
Chris@42 309 }
Chris@42 310 Ip[WS(rs, 1)] = Tt + TG;
Chris@42 311 Rp[WS(rs, 1)] = TH - TI;
Chris@42 312 Im[WS(rs, 1)] = TG - Tt;
Chris@42 313 Rm[WS(rs, 1)] = TH + TI;
Chris@42 314 }
Chris@42 315 }
Chris@42 316 }
Chris@42 317 }
Chris@42 318
/*
 * Twiddle-instruction table referenced by `desc` below.
 * NOTE(review): TW_FULL / TW_NEXT are defined in project headers not visible
 * here; presumably this describes the 10 twiddle values (W[0]..W[9]) that
 * each pass of hc2cbdft_6 reads -- confirm against the tw_instr definition.
 */
Chris@42 319 static const tw_instr twinstr[] = {
Chris@42 320 {TW_FULL, 1, 6},
Chris@42 321 {TW_NEXT, 1, 0}
Chris@42 322 };
Chris@42 323
/*
 * Codelet descriptor: size 6, name string, twiddle table, genus, and the
 * counts {44, 14, 14, 0}, which match the "44 additions, 14 multiplications,
 * 14 fused multiply/add" figures in the generator's header comment.
 * NOTE(review): meaning of the fourth count (0) is not visible here.
 */
Chris@42 324 static const hc2c_desc desc = { 6, "hc2cbdft_6", twinstr, &GENUS, {44, 14, 14, 0} };
Chris@42 325
/*
 * Registration entry point: passes the hc2cbdft_6 kernel and its descriptor
 * to X(khc2c_register) for planner `p`, tagged HC2C_VIA_DFT.
 */
Chris@42 326 void X(codelet_hc2cbdft_6) (planner *p) {
Chris@42 327 X(khc2c_register) (p, hc2cbdft_6, &desc, HC2C_VIA_DFT);
Chris@42 328 }
Chris@42 329 #endif /* HAVE_FMA */