annotate src/fftw-3.3.5/rdft/scalar/r2cf/r2cfII_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:47:26 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include r2cfII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 72 FP additions, 41 FP multiplications,
Chris@42 32 * (or, 38 additions, 7 multiplications, 34 fused multiply/add),
Chris@42 33 * 57 stack variables, 12 constants, and 30 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cfII.h"
Chris@42 36
/*
 * r2cfII_15, FMA variant (this half of the file is compiled when HAVE_FMA
 * is defined).
 *
 * Each loop iteration reads 15 real inputs -- eight from R0 (element
 * indices 0..7) and seven from R1 (element indices 0..6), addressed through
 * WS(rs, k) -- and stores eight values to Cr (indices 0..7, via csr) and
 * seven values to Ci (indices 0..6, via csi).
 *
 * Parameters:
 *   R0, R1      input arrays; only read by this function
 *   Cr, Ci      output arrays; only written by this function
 *   rs          stride argument used to index R0 and R1
 *   csr, csi    stride arguments used to index Cr and Ci respectively
 *   v           loop count (the loop runs while the counter is > 0)
 *   ivs         amount added to R0 and R1 after each iteration
 *   ovs         amount added to Cr and Ci after each iteration
 *
 * NOTE(review): DK, E, R, INT, stride, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers not shown here. FMA(a, b, c) is
 * presumably a * b + c and FNMS(a, b, c) presumably c - a * b -- confirm
 * against the FFTW kernel headers.
 *
 * The body is scheduled generator output (see the "Generated by" comment
 * above), so statements should not be reordered or edited by hand.
 */
Chris@42 37 static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Constants used below; each KP<digits> name spells the leading decimal
 * digits of the value it is declared with. */
Chris@42 39 DK(KP823639103, +0.823639103546331925877420039278190003029660514);
Chris@42 40 DK(KP910592997, +0.910592997310029334643087372129977886038870291);
Chris@42 41 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 42 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 43 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 44 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 45 DK(KP690983005, +0.690983005625052575897706582817180941139845410);
Chris@42 46 DK(KP552786404, +0.552786404500042060718165266253744752911876328);
Chris@42 47 DK(KP447213595, +0.447213595499957939281834733746255247088123672);
Chris@42 48 DK(KP809016994, +0.809016994374947424102293417182819058860154590);
Chris@42 49 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 50 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 51 {
Chris@42 52 INT i;
/* One pass of the body per iteration: i counts down from v, the input
 * pointers advance by ivs and the output pointers by ovs.
 * NOTE(review): the MAKE_VOLATILE_STRIDE calls are opaque here; their
 * purpose should be checked in the header that defines them. */
Chris@42 53 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
/* T9, TQ, TV, TW and Tw are assigned inside the nested scope below and
 * consumed by the five stores that follow it; TJ is computed after the
 * scope closes. */
Chris@42 54 E T9, TQ, TV, TW, Tw, TJ;
Chris@42 55 {
Chris@42 56 E Ta, Tl, Tg, T8, T7, TF, TX, TT, Tm, Th, TM, TZ, Tr, Tn, Tj;
Chris@42 57 E Tz, To, TN, TH, Tp, TO;
Chris@42 58 Ta = R0[WS(rs, 5)];
Chris@42 59 Tl = R1[WS(rs, 2)];
Chris@42 60 {
Chris@42 61 E T1, T2, T5, T3, T4;
/* First group of five inputs: R0[0], R0[3], R1[4], R0[6], R1[1]. */
Chris@42 62 T1 = R0[0];
Chris@42 63 T2 = R0[WS(rs, 3)];
Chris@42 64 T5 = R1[WS(rs, 4)];
Chris@42 65 T3 = R0[WS(rs, 6)];
Chris@42 66 T4 = R1[WS(rs, 1)];
Chris@42 67 {
Chris@42 68 E Tb, TL, Te, TK, TR, Tf, Ti, Ty;
/* Loads of the second group (R1[0], R0[2], R1[3], R1[6]) are interleaved
 * with arithmetic on the first group. */
Chris@42 69 Tb = R1[0];
Chris@42 70 TR = T2 + T5;
Chris@42 71 Tg = R0[WS(rs, 2)];
Chris@42 72 {
Chris@42 73 E T6, TS, Tc, Td;
Chris@42 74 T6 = T2 + T3 - T4 - T5;
Chris@42 75 T8 = (T3 + T5 - T2) - T4;
Chris@42 76 TS = T3 + T4;
Chris@42 77 Tc = R1[WS(rs, 3)];
Chris@42 78 Td = R1[WS(rs, 6)];
Chris@42 79 T7 = FNMS(KP250000000, T6, T1);
Chris@42 80 TF = T1 + T6;
Chris@42 81 TX = FNMS(KP618033988, TR, TS);
Chris@42 82 TT = FMA(KP618033988, TS, TR);
Chris@42 83 TL = Tc - Td;
Chris@42 84 Te = Tc + Td;
Chris@42 85 }
Chris@42 86 TK = Tg + Tb;
Chris@42 87 Tm = R0[WS(rs, 7)];
Chris@42 88 Tf = Tb - Te;
Chris@42 89 Th = Tb + Te;
Chris@42 90 TM = FMA(KP618033988, TL, TK);
Chris@42 91 TZ = FNMS(KP618033988, TK, TL);
Chris@42 92 Ti = FMA(KP809016994, Th, Tg);
Chris@42 93 Ty = FMA(KP447213595, Th, Tf);
/* Remaining inputs (R1[5], R0[1], R0[4]; R0[7] was loaded just above). */
Chris@42 94 Tr = R1[WS(rs, 5)];
Chris@42 95 Tn = R0[WS(rs, 1)];
Chris@42 96 Tj = FNMS(KP552786404, Ti, Tf);
Chris@42 97 Tz = FNMS(KP690983005, Ty, Tg);
Chris@42 98 To = R0[WS(rs, 4)];
Chris@42 99 TN = Tr + Tm;
Chris@42 100 }
Chris@42 101 }
Chris@42 102 TH = Ta + Tg - Th;
Chris@42 103 Tp = Tn + To;
Chris@42 104 TO = To - Tn;
Chris@42 105 {
Chris@42 106 E Tx, TA, TP, T14, T11, Tu, TD;
Chris@42 107 {
Chris@42 108 E T10, TI, TC, TY;
Chris@42 109 T9 = FNMS(KP559016994, T8, T7);
Chris@42 110 Tx = FMA(KP559016994, T8, T7);
Chris@42 111 TA = FNMS(KP809016994, Tz, Ta);
Chris@42 112 TP = FMA(KP618033988, TO, TN);
Chris@42 113 TY = FNMS(KP618033988, TN, TO);
Chris@42 114 {
Chris@42 115 E Tq, Ts, TG, Tt, TB;
Chris@42 116 Tq = Tm - Tp;
Chris@42 117 Ts = Tm + Tp;
Chris@42 118 T14 = TZ - TY;
Chris@42 119 T10 = TY + TZ;
Chris@42 120 TG = Ts - Tr - Tl;
Chris@42 121 Tt = FMA(KP809016994, Ts, Tr);
Chris@42 122 TB = FMA(KP447213595, Ts, Tq);
Chris@42 123 T11 = FMA(KP500000000, T10, TX);
/* Stores begin here and are interleaved with the remaining arithmetic. */
Chris@42 124 Ci[WS(csi, 2)] = KP866025403 * (TH - TG);
Chris@42 125 TI = TG + TH;
Chris@42 126 Tu = FNMS(KP552786404, Tt, Tq);
Chris@42 127 TC = FNMS(KP690983005, TB, Tr);
Chris@42 128 }
Chris@42 129 Ci[WS(csi, 1)] = KP951056516 * (T10 - TX);
Chris@42 130 Cr[WS(csr, 7)] = TF + TI;
Chris@42 131 Cr[WS(csr, 2)] = FNMS(KP500000000, TI, TF);
Chris@42 132 TD = FNMS(KP809016994, TC, Tl);
Chris@42 133 }
Chris@42 134 {
Chris@42 135 E TU, Tk, T13, Tv, T12, TE;
Chris@42 136 TQ = TM - TP;
Chris@42 137 TU = TP + TM;
Chris@42 138 T12 = TD + TA;
Chris@42 139 TE = TA - TD;
Chris@42 140 Tk = FNMS(KP559016994, Tj, Ta);
Chris@42 141 TV = FMA(KP500000000, TU, TT);
Chris@42 142 Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP910592997, T12, T11)));
Chris@42 143 Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP910592997, T12, T11));
Chris@42 144 T13 = FNMS(KP500000000, TE, Tx);
Chris@42 145 Cr[WS(csr, 1)] = Tx + TE;
Chris@42 146 Tv = FNMS(KP559016994, Tu, Tl);
Chris@42 147 Ci[WS(csi, 4)] = KP951056516 * (TT - TU);
Chris@42 148 Cr[WS(csr, 6)] = FMA(KP823639103, T14, T13);
Chris@42 149 Cr[WS(csr, 3)] = FNMS(KP823639103, T14, T13);
Chris@42 150 TW = Tv + Tk;
Chris@42 151 Tw = Tk - Tv;
Chris@42 152 }
Chris@42 153 }
Chris@42 154 }
/* Last five stores (Ci[5], Ci[0], Cr[4], Cr[0], Cr[5]), built from the
 * values carried out of the nested scope. */
Chris@42 155 Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP910592997, TW, TV)));
Chris@42 156 Ci[0] = -(KP951056516 * (FMA(KP910592997, TW, TV)));
Chris@42 157 TJ = FNMS(KP500000000, Tw, T9);
Chris@42 158 Cr[WS(csr, 4)] = T9 + Tw;
Chris@42 159 Cr[0] = FMA(KP823639103, TQ, TJ);
Chris@42 160 Cr[WS(csr, 5)] = FNMS(KP823639103, TQ, TJ);
Chris@42 161 }
Chris@42 162 }
Chris@42 163 }
Chris@42 164
/*
 * Descriptor for the HAVE_FMA build of this codelet. The string field
 * repeats the kernel function's name, and the counts 38, 7, 34 match the
 * "38 additions, 7 multiplications, 34 fused multiply/add" figures quoted
 * in the operation-count comment ahead of the kernel.
 * NOTE(review): the leading 15 is presumably the transform size and GENUS
 * presumably comes from r2cfII.h -- confirm against the kr2c_desc
 * declaration.
 */
Chris@42 165 static const kr2c_desc desc = { 15, "r2cfII_15", {38, 7, 34, 0}, &GENUS };
Chris@42 166
/*
 * Registration hook for the HAVE_FMA build: passes planner p, the
 * r2cfII_15 kernel and the address of its descriptor to kr2c_register.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably a
 * name-mangling wrapper -- confirm.
 */
Chris@42 167 void X(codelet_r2cfII_15) (planner *p) {
Chris@42 168 X(kr2c_register) (p, r2cfII_15, &desc);
Chris@42 169 }
Chris@42 170
Chris@42 171 #else /* HAVE_FMA */
Chris@42 172
Chris@42 173 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include r2cfII.h */
Chris@42 174
Chris@42 175 /*
Chris@42 176 * This function contains 72 FP additions, 33 FP multiplications,
Chris@42 177 * (or, 54 additions, 15 multiplications, 18 fused multiply/add),
Chris@42 178 * 37 stack variables, 8 constants, and 30 memory accesses
Chris@42 179 */
Chris@42 180 #include "r2cfII.h"
Chris@42 181
/*
 * r2cfII_15, non-FMA variant (this half of the file is compiled when
 * HAVE_FMA is not defined).
 *
 * Each loop iteration reads 15 real inputs -- eight from R0 (element
 * indices 0..7) and seven from R1 (element indices 0..6), addressed through
 * WS(rs, k) -- and stores eight values to Cr (indices 0..7, via csr) and
 * seven values to Ci (indices 0..6, via csi).
 *
 * Parameters:
 *   R0, R1      input arrays; only read by this function
 *   Cr, Ci      output arrays; only written by this function
 *   rs          stride argument used to index R0 and R1
 *   csr, csi    stride arguments used to index Cr and Ci respectively
 *   v           loop count (the loop runs while the counter is > 0)
 *   ivs         amount added to R0 and R1 after each iteration
 *   ovs         amount added to Cr and Ci after each iteration
 *
 * NOTE(review): DK, E, R, INT, stride, WS, FMA, FMS, FNMA, FNMS and
 * MAKE_VOLATILE_STRIDE come from headers not shown here. Presumably
 * FMA(a, b, c) = a * b + c, FMS(a, b, c) = a * b - c,
 * FNMA(a, b, c) = -(a * b) - c and FNMS(a, b, c) = c - a * b -- confirm
 * against the FFTW kernel headers.
 *
 * The body is generator output (see the "Generated by" comment above), so
 * it should not be edited by hand.
 */
Chris@42 182 static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 183 {
/* Constants used below; each KP<digits> name spells the leading decimal
 * digits of the value it is declared with. */
Chris@42 184 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 185 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 186 DK(KP809016994, +0.809016994374947424102293417182819058860154590);
Chris@42 187 DK(KP309016994, +0.309016994374947424102293417182819058860154590);
Chris@42 188 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 189 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 190 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 191 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 192 {
Chris@42 193 INT i;
/* One pass of the body per iteration: i counts down from v, the input
 * pointers advance by ivs and the output pointers by ovs.
 * NOTE(review): the MAKE_VOLATILE_STRIDE calls are opaque here; their
 * purpose should be checked in the header that defines them. */
Chris@42 194 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
Chris@42 195 E T1, T2, Tx, TR, TE, T7, TD, Th, Tm, Tr, TQ, TA, TB, Tf, Te;
Chris@42 196 E Tu, TS, Td, TH, TO;
Chris@42 197 T1 = R0[WS(rs, 5)];
/* Input group 1: R0[2], R1[0], R1[3], R1[6] (R0[5] was loaded just above).
 * Produces Tx, TR, TE, T7 and TD for the output stages. */
Chris@42 198 {
Chris@42 199 E T3, Tv, T6, Tw, T4, T5;
Chris@42 200 T2 = R0[WS(rs, 2)];
Chris@42 201 T3 = R1[0];
Chris@42 202 Tv = T2 + T3;
Chris@42 203 T4 = R1[WS(rs, 3)];
Chris@42 204 T5 = R1[WS(rs, 6)];
Chris@42 205 T6 = T4 + T5;
Chris@42 206 Tw = T4 - T5;
Chris@42 207 Tx = FMA(KP951056516, Tv, KP587785252 * Tw);
Chris@42 208 TR = FNMS(KP587785252, Tv, KP951056516 * Tw);
Chris@42 209 TE = KP559016994 * (T3 - T6);
Chris@42 210 T7 = T3 + T6;
Chris@42 211 TD = KP250000000 * T7;
Chris@42 212 }
/* Input group 2: R0[0], R1[4], R0[6], R1[1], R0[3].
 * Produces Tm, Tr, TQ, TA and TB. */
Chris@42 213 {
Chris@42 214 E Ti, Tl, Tj, Tk, Tp, Tq;
Chris@42 215 Th = R0[0];
Chris@42 216 Ti = R1[WS(rs, 4)];
Chris@42 217 Tl = R0[WS(rs, 6)];
Chris@42 218 Tj = R1[WS(rs, 1)];
Chris@42 219 Tk = R0[WS(rs, 3)];
Chris@42 220 Tp = Tk + Ti;
Chris@42 221 Tq = Tl + Tj;
Chris@42 222 Tm = Ti + Tj - (Tk + Tl);
Chris@42 223 Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
Chris@42 224 TQ = FNMS(KP951056516, Tq, KP587785252 * Tp);
Chris@42 225 TA = FMA(KP250000000, Tm, Th);
Chris@42 226 TB = KP559016994 * (Tl + Ti - (Tk + Tj));
Chris@42 227 }
/* Input group 3: R1[2], R0[7], R1[5], R0[1], R0[4].
 * Produces Tu, TS, Td, TH and TO. */
Chris@42 228 {
Chris@42 229 E T9, Tt, Tc, Ts, Ta, Tb, TG;
Chris@42 230 Tf = R1[WS(rs, 2)];
Chris@42 231 T9 = R0[WS(rs, 7)];
Chris@42 232 Te = R1[WS(rs, 5)];
Chris@42 233 Tt = T9 + Te;
Chris@42 234 Ta = R0[WS(rs, 1)];
Chris@42 235 Tb = R0[WS(rs, 4)];
Chris@42 236 Tc = Ta + Tb;
Chris@42 237 Ts = Ta - Tb;
Chris@42 238 Tu = FNMS(KP951056516, Tt, KP587785252 * Ts);
Chris@42 239 TS = FMA(KP951056516, Ts, KP587785252 * Tt);
Chris@42 240 Td = T9 + Tc;
Chris@42 241 TG = KP559016994 * (T9 - Tc);
Chris@42 242 TH = FNMS(KP309016994, Te, TG) + FNMA(KP250000000, Td, Tf);
Chris@42 243 TO = FMS(KP809016994, Te, Tf) + FNMA(KP250000000, Td, TG);
Chris@42 244 }
/* Output group A: stores Ci[2], Cr[2] and Cr[7]. */
Chris@42 245 {
Chris@42 246 E Tn, T8, Tg, To;
Chris@42 247 Tn = Th - Tm;
Chris@42 248 T8 = T1 + T2 - T7;
Chris@42 249 Tg = Td - Te - Tf;
Chris@42 250 To = T8 + Tg;
Chris@42 251 Ci[WS(csi, 2)] = KP866025403 * (T8 - Tg);
Chris@42 252 Cr[WS(csr, 2)] = FNMS(KP500000000, To, Tn);
Chris@42 253 Cr[WS(csr, 7)] = Tn + To;
Chris@42 254 }
/* Output group B: stores Cr[1], Ci[1], Ci[6], Ci[3], Cr[3] and Cr[6]. */
Chris@42 255 {
Chris@42 256 E TM, TX, TT, TV, TP, TU, TN, TW;
Chris@42 257 TM = TB + TA;
Chris@42 258 TX = KP866025403 * (TR + TS);
Chris@42 259 TT = TR - TS;
Chris@42 260 TV = FMS(KP500000000, TT, TQ);
Chris@42 261 TN = T1 + TE + FNMS(KP809016994, T2, TD);
Chris@42 262 TP = TN + TO;
Chris@42 263 TU = KP866025403 * (TO - TN);
Chris@42 264 Cr[WS(csr, 1)] = TM + TP;
Chris@42 265 Ci[WS(csi, 1)] = TQ + TT;
Chris@42 266 Ci[WS(csi, 6)] = TU - TV;
Chris@42 267 Ci[WS(csi, 3)] = TU + TV;
Chris@42 268 TW = FNMS(KP500000000, TP, TM);
Chris@42 269 Cr[WS(csr, 3)] = TW - TX;
Chris@42 270 Cr[WS(csr, 6)] = TW + TX;
Chris@42 271 }
/* Output group C: stores Ci[4], Cr[4], Ci[5], Ci[0], Cr[0] and Cr[5]. */
Chris@42 272 {
Chris@42 273 E Tz, TC, Ty, TK, TI, TL, TF, TJ;
Chris@42 274 Tz = KP866025403 * (Tx + Tu);
Chris@42 275 TC = TA - TB;
Chris@42 276 Ty = Tu - Tx;
Chris@42 277 TK = FMS(KP500000000, Ty, Tr);
Chris@42 278 TF = FMA(KP309016994, T2, T1) + TD - TE;
Chris@42 279 TI = TF + TH;
Chris@42 280 TL = KP866025403 * (TH - TF);
Chris@42 281 Ci[WS(csi, 4)] = Tr + Ty;
Chris@42 282 Cr[WS(csr, 4)] = TC + TI;
Chris@42 283 Ci[WS(csi, 5)] = TK - TL;
Chris@42 284 Ci[0] = TK + TL;
Chris@42 285 TJ = FNMS(KP500000000, TI, TC);
Chris@42 286 Cr[0] = Tz + TJ;
Chris@42 287 Cr[WS(csr, 5)] = TJ - Tz;
Chris@42 288 }
Chris@42 289 }
Chris@42 290 }
Chris@42 291 }
Chris@42 292
/*
 * Descriptor for the non-FMA build of this codelet. The string field
 * repeats the kernel function's name, and the counts 54, 15, 18 match the
 * "54 additions, 15 multiplications, 18 fused multiply/add" figures quoted
 * in the operation-count comment ahead of the kernel.
 * NOTE(review): the leading 15 is presumably the transform size and GENUS
 * presumably comes from r2cfII.h -- confirm against the kr2c_desc
 * declaration.
 */
Chris@42 293 static const kr2c_desc desc = { 15, "r2cfII_15", {54, 15, 18, 0}, &GENUS };
Chris@42 294
/*
 * Registration hook for the non-FMA build: passes planner p, the
 * r2cfII_15 kernel and the address of its descriptor to kr2c_register.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably a
 * name-mangling wrapper -- confirm.
 */
Chris@42 295 void X(codelet_r2cfII_15) (planner *p) {
Chris@42 296 X(kr2c_register) (p, r2cfII_15, &desc);
Chris@42 297 }
Chris@42 298
Chris@42 299 #endif /* HAVE_FMA */