annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cb_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:28 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cb_15 -include r2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 64 FP additions, 43 FP multiplications,
Chris@42 32 * (or, 21 additions, 0 multiplications, 43 fused multiply/add),
Chris@42 33 * 54 stack variables, 9 constants, and 30 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cb.h"
Chris@42 36
/*
 * r2cb_15, variant compiled when HAVE_FMA is defined.
 *
 * Each of the v loop iterations reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and seven values from Ci (WS(csi, 1..7)), then writes
 * fifteen results: R0[0], R0[WS(rs, 1..7)], R1[0] and R1[WS(rs, 1..6)].
 * After every iteration R0 and R1 advance by ovs, and Cr and Ci by ivs.
 *
 * In this variant every multiplication is expressed through the FMA /
 * FNMS macros; there is no bare `*` in the body.
 *
 * NOTE(review): going by the name and the generator flags quoted above
 * (-n 15 -sign 1), this is presumably the size-15 backward real-data DFT
 * kernel with R0/R1 holding two interleaved halves of the real output --
 * confirm against codelet-rdft.h / r2cb.h, which are not visible here.
 *
 * The file header marks this code as generated (DO NOT EDIT); the
 * statement order is the generator's schedule and is left untouched.
 */
Chris@42 37 static void r2cb_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Constants declared through DK. Numerically: KP1_732050807 ~ sqrt(3),
 * KP866025403 ~ sqrt(3)/2, KP618033988 ~ (sqrt(5)-1)/2,
 * KP1_118033988 ~ sqrt(5)/2, KP559016994 ~ sqrt(5)/4,
 * KP1_902113032 ~ 2*sin(2*pi/5).
 */
Chris@42 39 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 43 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@42 44 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 45 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 46 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 47 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 48 {
Chris@42 49 INT i;
/* One pass per transform: counts i down from v, stepping the four data pointers. */
Chris@42 50 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
/* Declared at loop scope because they feed the last four stores, after the inner block has closed. */
Chris@42 51 E TL, Tz, TM, TK;
Chris@42 52 {
Chris@42 53 E T3, Th, Tt, TD, TI, TH, TY, TC, TZ, Tu, Tm, Tv, Tr, Te, TW;
Chris@42 54 E Tg, T1, T2, T12, T10, TV;
/* Input loads are interleaved with the first arithmetic steps from here on. */
Chris@42 55 Tg = Ci[WS(csi, 5)];
Chris@42 56 T1 = Cr[0];
Chris@42 57 T2 = Cr[WS(csr, 5)];
Chris@42 58 {
Chris@42 59 E T4, TA, T9, TF, T7, Tj, Tc, Tk, TG, Tq, Tf, Tl, TB;
Chris@42 60 T4 = Cr[WS(csr, 3)];
Chris@42 61 TA = Ci[WS(csi, 3)];
Chris@42 62 T9 = Cr[WS(csr, 6)];
Chris@42 63 Tf = T1 - T2;
Chris@42 64 T3 = FMA(KP2_000000000, T2, T1);
Chris@42 65 TF = Ci[WS(csi, 6)];
Chris@42 66 {
Chris@42 67 E Ta, Tb, T5, T6, To, Tp;
Chris@42 68 T5 = Cr[WS(csr, 7)];
Chris@42 69 T6 = Cr[WS(csr, 2)];
Chris@42 70 Th = FMA(KP1_732050807, Tg, Tf);
Chris@42 71 Tt = FNMS(KP1_732050807, Tg, Tf);
Chris@42 72 Ta = Cr[WS(csr, 4)];
Chris@42 73 TD = T5 - T6;
Chris@42 74 T7 = T5 + T6;
Chris@42 75 Tb = Cr[WS(csr, 1)];
Chris@42 76 To = Ci[WS(csi, 4)];
Chris@42 77 Tp = Ci[WS(csi, 1)];
Chris@42 78 Tj = Ci[WS(csi, 7)];
Chris@42 79 Tc = Ta + Tb;
Chris@42 80 TI = Ta - Tb;
Chris@42 81 Tk = Ci[WS(csi, 2)];
Chris@42 82 TG = Tp - To;
Chris@42 83 Tq = To + Tp;
Chris@42 84 }
/* All fifteen inputs have been read at this point; the rest combines them. */
Chris@42 85 Tl = Tj - Tk;
Chris@42 86 TB = Tj + Tk;
Chris@42 87 TH = FNMS(KP500000000, TG, TF);
Chris@42 88 TY = TG + TF;
Chris@42 89 TC = FMA(KP500000000, TB, TA);
Chris@42 90 TZ = TA - TB;
Chris@42 91 {
Chris@42 92 E Ti, T8, Td, Tn;
Chris@42 93 Ti = FNMS(KP2_000000000, T4, T7);
Chris@42 94 T8 = T4 + T7;
Chris@42 95 Td = T9 + Tc;
Chris@42 96 Tn = FNMS(KP2_000000000, T9, Tc);
Chris@42 97 Tu = FNMS(KP1_732050807, Tl, Ti);
Chris@42 98 Tm = FMA(KP1_732050807, Tl, Ti);
Chris@42 99 Tv = FNMS(KP1_732050807, Tq, Tn);
Chris@42 100 Tr = FMA(KP1_732050807, Tq, Tn);
Chris@42 101 Te = T8 + Td;
Chris@42 102 TW = T8 - Td;
Chris@42 103 }
Chris@42 104 }
Chris@42 105 T12 = FMA(KP618033988, TY, TZ);
Chris@42 106 T10 = FNMS(KP618033988, TZ, TY);
Chris@42 107 TV = FNMS(KP500000000, Te, T3);
/* First output store; the remaining stores are interleaved with the arithmetic below. */
Chris@42 108 R0[0] = FMA(KP2_000000000, Te, T3);
Chris@42 109 {
Chris@42 110 E TJ, TE, TT, TP, TU, TS, Ty, Tw, Tx;
Chris@42 111 {
Chris@42 112 E TO, Ts, TQ, TN, TR, T11, TX;
Chris@42 113 TO = Tr - Tm;
Chris@42 114 Ts = Tm + Tr;
Chris@42 115 T11 = FMA(KP1_118033988, TW, TV);
Chris@42 116 TX = FNMS(KP1_118033988, TW, TV);
Chris@42 117 TQ = FNMS(KP866025403, TI, TH);
Chris@42 118 TJ = FMA(KP866025403, TI, TH);
Chris@42 119 TN = FMA(KP250000000, Ts, Th);
Chris@42 120 R0[WS(rs, 3)] = FNMS(KP1_902113032, T12, T11);
Chris@42 121 R1[WS(rs, 4)] = FMA(KP1_902113032, T12, T11);
Chris@42 122 R0[WS(rs, 6)] = FMA(KP1_902113032, T10, TX);
Chris@42 123 R1[WS(rs, 1)] = FNMS(KP1_902113032, T10, TX);
Chris@42 124 TR = FNMS(KP866025403, TD, TC);
Chris@42 125 TE = FMA(KP866025403, TD, TC);
Chris@42 126 R1[WS(rs, 2)] = Th - Ts;
Chris@42 127 TT = FMA(KP559016994, TO, TN);
Chris@42 128 TP = FNMS(KP559016994, TO, TN);
Chris@42 129 TU = FMA(KP618033988, TQ, TR);
Chris@42 130 TS = FNMS(KP618033988, TR, TQ);
Chris@42 131 }
Chris@42 132 Ty = Tv - Tu;
Chris@42 133 Tw = Tu + Tv;
Chris@42 134 R0[WS(rs, 7)] = FMA(KP1_902113032, TU, TT);
Chris@42 135 R1[WS(rs, 5)] = FNMS(KP1_902113032, TU, TT);
Chris@42 136 R0[WS(rs, 1)] = FMA(KP1_902113032, TS, TP);
Chris@42 137 R0[WS(rs, 4)] = FNMS(KP1_902113032, TS, TP);
Chris@42 138 Tx = FMA(KP250000000, Tw, Tt);
Chris@42 139 R0[WS(rs, 5)] = Tt - Tw;
Chris@42 140 TL = FNMS(KP559016994, Ty, Tx);
Chris@42 141 Tz = FMA(KP559016994, Ty, Tx);
Chris@42 142 TM = FNMS(KP618033988, TE, TJ);
Chris@42 143 TK = FMA(KP618033988, TJ, TE);
Chris@42 144 }
Chris@42 145 }
/* Final four stores, built from the loop-scope temporaries TL, TM, TK and Tz. */
Chris@42 146 R1[WS(rs, 3)] = FMA(KP1_902113032, TM, TL);
Chris@42 147 R1[WS(rs, 6)] = FNMS(KP1_902113032, TM, TL);
Chris@42 148 R0[WS(rs, 2)] = FMA(KP1_902113032, TK, Tz);
Chris@42 149 R1[0] = FNMS(KP1_902113032, TK, Tz);
Chris@42 150 }
Chris@42 151 }
Chris@42 152 }
Chris@42 153
/*
 * Descriptor handed to X(kr2c_register) together with r2cb_15 (HAVE_FMA
 * variant). It is initialised with 15, the name string "r2cb_15", the
 * group {21, 0, 43, 0} and &GENUS. The first three numbers of the group
 * equal the add / mul / fused multiply-add counts quoted in this
 * variant's generated header comment.
 * NOTE(review): the field meanings come from kr2c_desc, which is declared
 * outside this file -- confirm there, including the fourth number.
 */
Chris@42 154 static const kr2c_desc desc = { 15, "r2cb_15", {21, 0, 43, 0}, &GENUS };
Chris@42 155
/*
 * Registration entry point (HAVE_FMA variant): passes the planner p, the
 * r2cb_15 kernel and its descriptor desc to X(kr2c_register).
 * The function has no other effect visible in this file.
 */
Chris@42 156 void X(codelet_r2cb_15) (planner *p) {
Chris@42 157 X(kr2c_register) (p, r2cb_15, &desc);
Chris@42 158 }
Chris@42 159
Chris@42 160 #else /* HAVE_FMA */
Chris@42 161
Chris@42 162 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cb_15 -include r2cb.h */
Chris@42 163
Chris@42 164 /*
Chris@42 165 * This function contains 64 FP additions, 31 FP multiplications,
Chris@42 166 * (or, 47 additions, 14 multiplications, 17 fused multiply/add),
Chris@42 167 * 44 stack variables, 7 constants, and 30 memory accesses
Chris@42 168 */
Chris@42 169 #include "r2cb.h"
Chris@42 170
/*
 * r2cb_15, variant compiled when HAVE_FMA is not defined.
 *
 * Each of the v loop iterations reads eight values from Cr (index 0 and
 * WS(csr, 1..7)) and seven values from Ci (WS(csi, 1..7)), then writes
 * fifteen results: R0[0], R0[WS(rs, 1..7)], R1[0] and R1[WS(rs, 1..6)].
 * After every iteration R0 and R1 advance by ovs, and Cr and Ci by ivs.
 *
 * This variant mixes explicit `KP... * x` products with FMA / FNMS macro
 * invocations.
 *
 * NOTE(review): going by the name and the generator flags quoted above
 * (-n 15 -sign 1), this is presumably the size-15 backward real-data DFT
 * kernel with R0/R1 holding two interleaved halves of the real output --
 * confirm against codelet-rdft.h / r2cb.h, which are not visible here.
 *
 * The file header marks this code as generated (DO NOT EDIT); the
 * statement order is the generator's and is left untouched.
 */
Chris@42 171 static void r2cb_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 172 {
/*
 * Constants declared through DK. Numerically: KP1_732050807 ~ sqrt(3),
 * KP866025403 ~ sqrt(3)/2, KP1_118033988 ~ sqrt(5)/2,
 * KP1_902113032 ~ 2*sin(2*pi/5), KP1_175570504 ~ 2*sin(pi/5).
 */
Chris@42 173 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@42 174 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@42 175 DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
Chris@42 176 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 177 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 178 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 179 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 180 {
Chris@42 181 INT i;
/* One pass per transform: counts i down from v, stepping the four data pointers. */
Chris@42 182 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
/* Temporaries that outlive the two load blocks below and feed the three output blocks. */
Chris@42 183 E T3, Tu, Ti, TB, TZ, T10, TE, TG, TJ, Tn, Tv, Ts, Tw, T8, Td;
Chris@42 184 E Te;
/* Block 1: combines Cr[0], Cr[WS(csr, 5)] and Ci[WS(csi, 5)] into T3, Tu and Ti. */
Chris@42 185 {
Chris@42 186 E Th, T1, T2, Tf, Tg;
Chris@42 187 Tg = Ci[WS(csi, 5)];
Chris@42 188 Th = KP1_732050807 * Tg;
Chris@42 189 T1 = Cr[0];
Chris@42 190 T2 = Cr[WS(csr, 5)];
Chris@42 191 Tf = T1 - T2;
Chris@42 192 T3 = FMA(KP2_000000000, T2, T1);
Chris@42 193 Tu = Tf - Th;
Chris@42 194 Ti = Tf + Th;
Chris@42 195 }
/* Block 2: loads the remaining twelve inputs and forms the shared intermediates. */
Chris@42 196 {
Chris@42 197 E T4, TD, T9, TI, T5, T6, T7, Ta, Tb, Tc, Tr, TH, Tm, TC, Tj;
Chris@42 198 E To;
Chris@42 199 T4 = Cr[WS(csr, 3)];
Chris@42 200 TD = Ci[WS(csi, 3)];
Chris@42 201 T9 = Cr[WS(csr, 6)];
Chris@42 202 TI = Ci[WS(csi, 6)];
Chris@42 203 T5 = Cr[WS(csr, 7)];
Chris@42 204 T6 = Cr[WS(csr, 2)];
Chris@42 205 T7 = T5 + T6;
Chris@42 206 Ta = Cr[WS(csr, 4)];
Chris@42 207 Tb = Cr[WS(csr, 1)];
Chris@42 208 Tc = Ta + Tb;
Chris@42 209 {
Chris@42 210 E Tp, Tq, Tk, Tl;
Chris@42 211 Tp = Ci[WS(csi, 4)];
Chris@42 212 Tq = Ci[WS(csi, 1)];
Chris@42 213 Tr = KP866025403 * (Tp + Tq);
Chris@42 214 TH = Tp - Tq;
Chris@42 215 Tk = Ci[WS(csi, 7)];
Chris@42 216 Tl = Ci[WS(csi, 2)];
Chris@42 217 Tm = KP866025403 * (Tk - Tl);
Chris@42 218 TC = Tk + Tl;
Chris@42 219 }
Chris@42 220 TB = KP866025403 * (T5 - T6);
Chris@42 221 TZ = TD - TC;
Chris@42 222 T10 = TI - TH;
Chris@42 223 TE = FMA(KP500000000, TC, TD);
Chris@42 224 TG = KP866025403 * (Ta - Tb);
Chris@42 225 TJ = FMA(KP500000000, TH, TI);
Chris@42 226 Tj = FNMS(KP500000000, T7, T4);
Chris@42 227 Tn = Tj - Tm;
Chris@42 228 Tv = Tj + Tm;
Chris@42 229 To = FNMS(KP500000000, Tc, T9);
Chris@42 230 Ts = To - Tr;
Chris@42 231 Tw = To + Tr;
Chris@42 232 T8 = T4 + T7;
Chris@42 233 Td = T9 + Tc;
Chris@42 234 Te = T8 + Td;
Chris@42 235 }
/* First output store, from T3 and Te. */
Chris@42 236 R0[0] = FMA(KP2_000000000, Te, T3);
/* Output group A: R0[WS(rs, 6)], R1[WS(rs, 4)], R1[WS(rs, 1)], R0[WS(rs, 3)], from T3, Te, T8, Td, TZ, T10. */
Chris@42 237 {
Chris@42 238 E T11, T13, TY, T12, TW, TX;
Chris@42 239 T11 = FNMS(KP1_902113032, T10, KP1_175570504 * TZ);
Chris@42 240 T13 = FMA(KP1_902113032, TZ, KP1_175570504 * T10);
Chris@42 241 TW = FNMS(KP500000000, Te, T3);
Chris@42 242 TX = KP1_118033988 * (T8 - Td);
Chris@42 243 TY = TW - TX;
Chris@42 244 T12 = TX + TW;
Chris@42 245 R0[WS(rs, 6)] = TY - T11;
Chris@42 246 R1[WS(rs, 4)] = T12 + T13;
Chris@42 247 R1[WS(rs, 1)] = TY + T11;
Chris@42 248 R0[WS(rs, 3)] = T12 - T13;
Chris@42 249 }
/* Output group B: R1[WS(rs, 2)], R1[WS(rs, 5)], R0[WS(rs, 7)], R0[WS(rs, 1)], R0[WS(rs, 4)], from Ti, Tn, Ts, TE, TB, TJ, TG. */
Chris@42 250 {
Chris@42 251 E TP, Tt, TO, TT, TV, TR, TS, TU, TQ;
Chris@42 252 TP = KP1_118033988 * (Tn - Ts);
Chris@42 253 Tt = Tn + Ts;
Chris@42 254 TO = FNMS(KP500000000, Tt, Ti);
Chris@42 255 TR = TE - TB;
Chris@42 256 TS = TJ - TG;
Chris@42 257 TT = FNMS(KP1_902113032, TS, KP1_175570504 * TR);
Chris@42 258 TV = FMA(KP1_902113032, TR, KP1_175570504 * TS);
Chris@42 259 R1[WS(rs, 2)] = FMA(KP2_000000000, Tt, Ti);
Chris@42 260 TU = TP + TO;
Chris@42 261 R1[WS(rs, 5)] = TU - TV;
Chris@42 262 R0[WS(rs, 7)] = TU + TV;
Chris@42 263 TQ = TO - TP;
Chris@42 264 R0[WS(rs, 1)] = TQ - TT;
Chris@42 265 R0[WS(rs, 4)] = TQ + TT;
Chris@42 266 }
/* Output group C: R0[WS(rs, 5)], R1[0], R0[WS(rs, 2)], R1[WS(rs, 3)], R1[WS(rs, 6)], from Tu, Tv, Tw, TB, TE, TG, TJ. */
Chris@42 267 {
Chris@42 268 E Tz, Tx, Ty, TL, TN, TF, TK, TM, TA;
Chris@42 269 Tz = KP1_118033988 * (Tv - Tw);
Chris@42 270 Tx = Tv + Tw;
Chris@42 271 Ty = FNMS(KP500000000, Tx, Tu);
Chris@42 272 TF = TB + TE;
Chris@42 273 TK = TG + TJ;
Chris@42 274 TL = FNMS(KP1_902113032, TK, KP1_175570504 * TF);
Chris@42 275 TN = FMA(KP1_902113032, TF, KP1_175570504 * TK);
Chris@42 276 R0[WS(rs, 5)] = FMA(KP2_000000000, Tx, Tu);
Chris@42 277 TM = Tz + Ty;
Chris@42 278 R1[0] = TM - TN;
Chris@42 279 R0[WS(rs, 2)] = TM + TN;
Chris@42 280 TA = Ty - Tz;
Chris@42 281 R1[WS(rs, 3)] = TA - TL;
Chris@42 282 R1[WS(rs, 6)] = TA + TL;
Chris@42 283 }
Chris@42 284 }
Chris@42 285 }
Chris@42 286 }
Chris@42 287
/*
 * Descriptor handed to X(kr2c_register) together with r2cb_15 (non-FMA
 * variant). It is initialised with 15, the name string "r2cb_15", the
 * group {47, 14, 17, 0} and &GENUS. The first three numbers of the group
 * equal the add / mul / fused multiply-add counts quoted in this
 * variant's generated header comment.
 * NOTE(review): the field meanings come from kr2c_desc, which is declared
 * outside this file -- confirm there, including the fourth number.
 */
Chris@42 288 static const kr2c_desc desc = { 15, "r2cb_15", {47, 14, 17, 0}, &GENUS };
Chris@42 289
/*
 * Registration entry point (non-FMA variant): passes the planner p, the
 * r2cb_15 kernel and its descriptor desc to X(kr2c_register).
 * The function has no other effect visible in this file.
 */
Chris@42 290 void X(codelet_r2cb_15) (planner *p) {
Chris@42 291 X(kr2c_register) (p, r2cb_15, &desc);
Chris@42 292 }
Chris@42 293
Chris@42 294 #endif /* HAVE_FMA */