annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cbIII_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:42 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include r2cbIII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 64 FP additions, 43 FP multiplications,
Chris@42 32 * (or, 21 additions, 0 multiplications, 43 fused multiply/add),
Chris@42 33 * 48 stack variables, 9 constants, and 30 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cbIII.h"
Chris@42 36
/*
 * r2cbIII_15 -- variant compiled when HAVE_FMA is defined.
 *
 * Straight-line size-15 kernel emitted by genfft (gen_r2cb with -n 15,
 * -dft-III, -sign 1 and -fma; see the "Generated by" comment above).
 * Each loop iteration reads eight values from Cr (indices 0..7, formed
 * with WS(csr, k)) and seven values from Ci (indices 0..6, formed with
 * WS(csi, k)), and stores eight values to R0 (indices 0..7) and seven
 * values to R1 (indices 0..6), both formed with WS(rs, k): 30 memory
 * accesses in total, as the generator's summary comment states.
 *
 * Parameters:
 *   R0, R1       output arrays; each is advanced by ovs per iteration.
 *   Cr, Ci       input arrays; each is advanced by ivs per iteration.
 *   rs, csr, csi strides handed to WS() to build element offsets.
 *   v            iteration count; the body does not run when v <= 0.
 *   ivs, ovs     per-iteration pointer increments for inputs/outputs.
 *
 * NOTE(review): judging by the name, this is presumably the backward
 * real-data ("r2cb") type-III transform with Cr/Ci holding real and
 * imaginary input parts -- confirm against the FFTW codelet docs.
 *
 * The statement order comes from the generator's scheduler
 * (-reorder-insns -schedule-for-pipeline); regenerate instead of
 * hand-editing, as the file header instructs.
 */
Chris@42 37 static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Named constants used by the arithmetic below; DK is a macro defined outside this file. */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 41 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 42 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 43 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@42 44 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@42 45 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 46 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 47 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 48 {
Chris@42 49 INT i;
/* One 15-point kernel per iteration; all four data pointers are stepped in the loop header. */
Chris@42 50 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
/* Temporaries declared here are still needed after the nested block below closes. */
Chris@42 51 E TX, Tv, To, TW, Tl, Tx, Ty, Tw;
Chris@42 52 {
Chris@42 53 E TA, Tk, T6, T5, Tz, Th, TI, Tp, Tu, TK, TR, Tn, Td, Tq;
/* Input loads from Cr/Ci interleaved with the first sums, differences and FMA combinations. */
Chris@42 54 {
Chris@42 55 E T1, T2, T3, Ti, Tj;
Chris@42 56 Ti = Ci[WS(csi, 4)];
Chris@42 57 Tj = Ci[WS(csi, 1)];
Chris@42 58 T1 = Cr[WS(csr, 7)];
Chris@42 59 T2 = Cr[WS(csr, 4)];
Chris@42 60 T3 = Cr[WS(csr, 1)];
Chris@42 61 TA = FNMS(KP618033988, Ti, Tj);
Chris@42 62 Tk = FMA(KP618033988, Tj, Ti);
Chris@42 63 {
Chris@42 64 E T7, TP, Tc, T8;
Chris@42 65 T6 = Cr[WS(csr, 2)];
Chris@42 66 {
Chris@42 67 E T4, Tg, Ta, Tb, Tf;
Chris@42 68 T4 = T2 + T3;
Chris@42 69 Tg = T2 - T3;
Chris@42 70 Ta = Cr[WS(csr, 3)];
Chris@42 71 Tb = Cr[WS(csr, 6)];
Chris@42 72 T7 = Cr[0];
Chris@42 73 Tf = FNMS(KP500000000, T4, T1);
Chris@42 74 T5 = FMA(KP2_000000000, T4, T1);
Chris@42 75 TP = Ta - Tb;
Chris@42 76 Tc = Ta + Tb;
Chris@42 77 Tz = FNMS(KP1_118033988, Tg, Tf);
Chris@42 78 Th = FMA(KP1_118033988, Tg, Tf);
Chris@42 79 T8 = Cr[WS(csr, 5)];
Chris@42 80 }
Chris@42 81 TI = Ci[WS(csi, 2)];
Chris@42 82 {
Chris@42 83 E Ts, Tt, TQ, T9;
Chris@42 84 Ts = Ci[WS(csi, 3)];
Chris@42 85 Tt = Ci[WS(csi, 6)];
Chris@42 86 TQ = T7 - T8;
Chris@42 87 T9 = T7 + T8;
Chris@42 88 Tp = Ci[0];
Chris@42 89 Tu = Ts - Tt;
Chris@42 90 TK = Ts + Tt;
Chris@42 91 TX = FMA(KP618033988, TP, TQ);
Chris@42 92 TR = FNMS(KP618033988, TQ, TP);
Chris@42 93 Tn = T9 - Tc;
Chris@42 94 Td = T9 + Tc;
Chris@42 95 Tq = Ci[WS(csi, 5)];
Chris@42 96 }
Chris@42 97 }
Chris@42 98 }
/* All 15 inputs are loaded by this point; the rest of the body combines temporaries and stores results. */
Chris@42 99 {
Chris@42 100 E TB, TF, TO, TG, TE;
Chris@42 101 {
Chris@42 102 E Tm, T11, TN, TD, TM, T12, TC;
Chris@42 103 TB = FNMS(KP1_902113032, TA, Tz);
Chris@42 104 TF = FMA(KP1_902113032, TA, Tz);
Chris@42 105 {
Chris@42 106 E Te, Tr, TJ, TL;
Chris@42 107 Tm = FNMS(KP250000000, Td, T6);
Chris@42 108 Te = T6 + Td;
Chris@42 109 Tr = Tp + Tq;
Chris@42 110 TJ = Tq - Tp;
/* First store: R0[0] depends only on the Cr-derived sums Te and T5. */
Chris@42 111 R0[0] = FMA(KP2_000000000, Te, T5);
Chris@42 112 T11 = Te - T5;
Chris@42 113 TN = TJ + TK;
Chris@42 114 TL = TJ - TK;
Chris@42 115 Tv = FMA(KP618033988, Tu, Tr);
Chris@42 116 TD = FNMS(KP618033988, Tr, Tu);
Chris@42 117 TM = FNMS(KP250000000, TL, TI);
Chris@42 118 T12 = TL + TI;
Chris@42 119 }
Chris@42 120 TC = FNMS(KP559016994, Tn, Tm);
Chris@42 121 To = FMA(KP559016994, Tn, Tm);
/* R1[2] and R0[5] share T11 and T12, differing only in the sign applied to T11. */
Chris@42 122 R1[WS(rs, 2)] = FMA(KP1_732050807, T12, T11);
Chris@42 123 R0[WS(rs, 5)] = FMS(KP1_732050807, T12, T11);
Chris@42 124 TW = FMA(KP559016994, TN, TM);
Chris@42 125 TO = FNMS(KP559016994, TN, TM);
Chris@42 126 TG = FNMS(KP951056516, TD, TC);
Chris@42 127 TE = FMA(KP951056516, TD, TC);
Chris@42 128 }
Chris@42 129 Tl = FNMS(KP1_902113032, Tk, Th);
Chris@42 130 Tx = FMA(KP1_902113032, Tk, Th);
/* Stores R1[1], R0[6], R1[6], R0[4], R1[3], R0[1] from TB/TE/TF/TG and TR/TO. */
Chris@42 131 {
Chris@42 132 E TS, TU, TT, TH;
Chris@42 133 TS = FMA(KP951056516, TR, TO);
Chris@42 134 TU = FNMS(KP951056516, TR, TO);
Chris@42 135 TT = TF - TG;
Chris@42 136 R1[WS(rs, 1)] = -(FMA(KP2_000000000, TG, TF));
Chris@42 137 TH = TB - TE;
Chris@42 138 R0[WS(rs, 6)] = FMA(KP2_000000000, TE, TB);
Chris@42 139 R1[WS(rs, 6)] = -(FMA(KP1_732050807, TU, TT));
Chris@42 140 R0[WS(rs, 4)] = FNMS(KP1_732050807, TU, TT);
Chris@42 141 R1[WS(rs, 3)] = -(FMA(KP1_732050807, TS, TH));
Chris@42 142 R0[WS(rs, 1)] = FNMS(KP1_732050807, TS, TH);
Chris@42 143 }
Chris@42 144 }
Chris@42 145 }
Chris@42 146 Ty = FNMS(KP951056516, Tv, To);
Chris@42 147 Tw = FMA(KP951056516, Tv, To);
/* Remaining stores: R0[3], R1[4], R1[5], R1[0], R0[2], R0[7]. */
Chris@42 148 {
Chris@42 149 E T10, TY, TV, TZ;
Chris@42 150 T10 = FMA(KP951056516, TX, TW);
Chris@42 151 TY = FNMS(KP951056516, TX, TW);
Chris@42 152 TV = Ty - Tx;
Chris@42 153 R0[WS(rs, 3)] = FMA(KP2_000000000, Ty, Tx);
Chris@42 154 TZ = Tl - Tw;
Chris@42 155 R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tw, Tl));
Chris@42 156 R1[WS(rs, 5)] = FMA(KP1_732050807, TY, TV);
Chris@42 157 R1[0] = FNMS(KP1_732050807, TY, TV);
Chris@42 158 R0[WS(rs, 2)] = FMA(KP1_732050807, T10, TZ);
Chris@42 159 R0[WS(rs, 7)] = FNMS(KP1_732050807, T10, TZ);
Chris@42 160 }
Chris@42 161 }
Chris@42 162 }
Chris@42 163 }
Chris@42 164
/*
 * Descriptor passed to X(kr2c_register) together with the kernel (HAVE_FMA
 * build). Fields as written: 15 (matching the generator's -n 15), the
 * codelet name, and a four-entry count whose first three values match the
 * "21 additions, 0 multiplications, 43 fused multiply/add" summary in the
 * generated comment above. The meaning of the fourth entry and of GENUS is
 * not visible in this file -- presumably defined via r2cbIII.h; TODO confirm.
 */
Chris@42 165 static const kr2c_desc desc = { 15, "r2cbIII_15", {21, 0, 43, 0}, &GENUS };
Chris@42 166
/*
 * Registration entry point (HAVE_FMA build): hands the r2cbIII_15 kernel
 * and its descriptor to X(kr2c_register) for the given planner p.
 * X() is a name-wrapping macro defined outside this file.
 */
Chris@42 167 void X(codelet_r2cbIII_15) (planner *p) {
Chris@42 168 X(kr2c_register) (p, r2cbIII_15, &desc);
Chris@42 169 }
Chris@42 170
Chris@42 171 #else /* HAVE_FMA */
Chris@42 172
Chris@42 173 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 15 -name r2cbIII_15 -dft-III -include r2cbIII.h */
Chris@42 174
Chris@42 175 /*
Chris@42 176 * This function contains 64 FP additions, 26 FP multiplications,
Chris@42 177 * (or, 49 additions, 11 multiplications, 15 fused multiply/add),
Chris@42 178 * 47 stack variables, 14 constants, and 30 memory accesses
Chris@42 179 */
Chris@42 180 #include "r2cbIII.h"
Chris@42 181
/*
 * r2cbIII_15 -- variant compiled when HAVE_FMA is not defined.
 *
 * Straight-line size-15 kernel emitted by genfft (gen_r2cb with -n 15,
 * -dft-III, -sign 1, without -fma; see the "Generated by" comment above).
 * It has the same signature and the same access pattern as the HAVE_FMA
 * variant: per iteration it reads Cr at indices 0..7 (via WS(csr, k)) and
 * Ci at indices 0..6 (via WS(csi, k)), and stores R0 at indices 0..7 and
 * R1 at indices 0..6 (via WS(rs, k)). It uses 14 named constants and
 * mostly separate multiplies and adds; the FMA/FNMS/FMS macros still
 * appear for the operations the generator counted as fused multiply/add.
 *
 * Parameters:
 *   R0, R1       output arrays; each is advanced by ovs per iteration.
 *   Cr, Ci       input arrays; each is advanced by ivs per iteration.
 *   rs, csr, csi strides handed to WS() to build element offsets.
 *   v            iteration count; the body does not run when v <= 0.
 *   ivs, ovs     per-iteration pointer increments for inputs/outputs.
 *
 * NOTE(review): judging by the name, this is presumably the backward
 * real-data ("r2cb") type-III transform with Cr/Ci holding real and
 * imaginary input parts -- confirm against the FFTW codelet docs.
 *
 * Generated code: regenerate instead of hand-editing, as the file header
 * instructs.
 */
Chris@42 182 static void r2cbIII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 183 {
/* Named constants used by the arithmetic below; DK is a macro defined outside this file. */
Chris@42 184 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 185 DK(KP433012701, +0.433012701892219323381861585376468091735701313);
Chris@42 186 DK(KP968245836, +0.968245836551854221294816349945599902708230426);
Chris@42 187 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 188 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 189 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 190 DK(KP1_647278207, +1.647278207092663851754840078556380006059321028);
Chris@42 191 DK(KP1_018073920, +1.018073920910254366901961726787815297021466329);
Chris@42 192 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 193 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 194 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 195 DK(KP1_118033988, +1.118033988749894848204586834365638117720309180);
Chris@42 196 DK(KP1_175570504, +1.175570504584946258337411909278145537195304875);
Chris@42 197 DK(KP1_902113032, +1.902113032590307144232878666758764286811397268);
Chris@42 198 {
Chris@42 199 INT i;
/* One 15-point kernel per iteration; all four data pointers are stepped in the loop header. */
Chris@42 200 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
Chris@42 201 E Tv, TD, T5, Ts, TC, T6, Tf, TW, TK, Td, Tg, TP, To, TN, TA;
Chris@42 202 E TO, TQ, Tt, Tu, T12, Te, T11;
/* Ci[4] and Ci[1] combined two ways with the same pair of constants. */
Chris@42 203 Tt = Ci[WS(csi, 4)];
Chris@42 204 Tu = Ci[WS(csi, 1)];
Chris@42 205 Tv = FMA(KP1_902113032, Tt, KP1_175570504 * Tu);
Chris@42 206 TD = FNMS(KP1_175570504, Tt, KP1_902113032 * Tu);
/* Cr[7], Cr[4], Cr[1] -> T5, Ts, TC. */
Chris@42 207 {
Chris@42 208 E T1, T4, Tq, T2, T3, Tr;
Chris@42 209 T1 = Cr[WS(csr, 7)];
Chris@42 210 T2 = Cr[WS(csr, 4)];
Chris@42 211 T3 = Cr[WS(csr, 1)];
Chris@42 212 T4 = T2 + T3;
Chris@42 213 Tq = KP1_118033988 * (T2 - T3);
Chris@42 214 T5 = FMA(KP2_000000000, T4, T1);
Chris@42 215 Tr = FNMS(KP500000000, T4, T1);
Chris@42 216 Ts = Tq + Tr;
Chris@42 217 TC = Tr - Tq;
Chris@42 218 }
/* Cr[2], Cr[3], Cr[6], Cr[0], Cr[5] -> T6, Tf, TW, TK, Td, Tg. */
Chris@42 219 {
Chris@42 220 E Tc, TJ, T9, TI;
Chris@42 221 T6 = Cr[WS(csr, 2)];
Chris@42 222 {
Chris@42 223 E Ta, Tb, T7, T8;
Chris@42 224 Ta = Cr[WS(csr, 3)];
Chris@42 225 Tb = Cr[WS(csr, 6)];
Chris@42 226 Tc = Ta + Tb;
Chris@42 227 TJ = Ta - Tb;
Chris@42 228 T7 = Cr[0];
Chris@42 229 T8 = Cr[WS(csr, 5)];
Chris@42 230 T9 = T7 + T8;
Chris@42 231 TI = T7 - T8;
Chris@42 232 }
Chris@42 233 Tf = KP559016994 * (T9 - Tc);
Chris@42 234 TW = FNMS(KP1_647278207, TJ, KP1_018073920 * TI);
Chris@42 235 TK = FMA(KP1_647278207, TI, KP1_018073920 * TJ);
Chris@42 236 Td = T9 + Tc;
Chris@42 237 Tg = FNMS(KP250000000, Td, T6);
Chris@42 238 }
/* Ci[2], Ci[3], Ci[6], Ci[0], Ci[5] -> TP, To, TN, TA, TO, TQ. */
Chris@42 239 {
Chris@42 240 E Tn, TM, Tk, TL;
Chris@42 241 TP = Ci[WS(csi, 2)];
Chris@42 242 {
Chris@42 243 E Tl, Tm, Ti, Tj;
Chris@42 244 Tl = Ci[WS(csi, 3)];
Chris@42 245 Tm = Ci[WS(csi, 6)];
Chris@42 246 Tn = Tl - Tm;
Chris@42 247 TM = Tl + Tm;
Chris@42 248 Ti = Ci[0];
Chris@42 249 Tj = Ci[WS(csi, 5)];
Chris@42 250 Tk = Ti + Tj;
Chris@42 251 TL = Ti - Tj;
Chris@42 252 }
Chris@42 253 To = FMA(KP951056516, Tk, KP587785252 * Tn);
Chris@42 254 TN = KP968245836 * (TL - TM);
Chris@42 255 TA = FNMS(KP587785252, Tk, KP951056516 * Tn);
Chris@42 256 TO = TL + TM;
Chris@42 257 TQ = FMA(KP433012701, TO, KP1_732050807 * TP);
Chris@42 258 }
/* All 15 inputs are loaded; first three stores: R0[0], R0[5], R1[2]. */
Chris@42 259 T12 = KP1_732050807 * (TP - TO);
Chris@42 260 Te = T6 + Td;
Chris@42 261 T11 = Te - T5;
Chris@42 262 R0[0] = FMA(KP2_000000000, Te, T5);
Chris@42 263 R0[WS(rs, 5)] = T12 - T11;
Chris@42 264 R1[WS(rs, 2)] = T11 + T12;
/* Stores R0[6], R1[1], R0[1], R1[3], R0[4], R1[6]. */
Chris@42 265 {
Chris@42 266 E TE, TG, TB, TF, TY, T10, Tz, TX, TV, TZ;
Chris@42 267 TE = TC - TD;
Chris@42 268 TG = TC + TD;
Chris@42 269 Tz = Tg - Tf;
Chris@42 270 TB = Tz + TA;
Chris@42 271 TF = TA - Tz;
Chris@42 272 TX = TN + TQ;
Chris@42 273 TY = TW - TX;
Chris@42 274 T10 = TW + TX;
Chris@42 275 R0[WS(rs, 6)] = FMA(KP2_000000000, TB, TE);
Chris@42 276 R1[WS(rs, 1)] = FMS(KP2_000000000, TF, TG);
Chris@42 277 TV = TE - TB;
Chris@42 278 R0[WS(rs, 1)] = TV + TY;
Chris@42 279 R1[WS(rs, 3)] = TY - TV;
Chris@42 280 TZ = TF + TG;
Chris@42 281 R0[WS(rs, 4)] = TZ - T10;
Chris@42 282 R1[WS(rs, 6)] = -(TZ + T10);
Chris@42 283 }
/* Stores R1[4], R0[3], R1[5], R1[0], R0[2], R0[7]. */
Chris@42 284 {
Chris@42 285 E Tw, Ty, Tp, Tx, TS, TU, Th, TR, TH, TT;
Chris@42 286 Tw = Ts - Tv;
Chris@42 287 Ty = Ts + Tv;
Chris@42 288 Th = Tf + Tg;
Chris@42 289 Tp = Th + To;
Chris@42 290 Tx = Th - To;
Chris@42 291 TR = TN - TQ;
Chris@42 292 TS = TK + TR;
Chris@42 293 TU = TR - TK;
Chris@42 294 R1[WS(rs, 4)] = -(FMA(KP2_000000000, Tp, Tw));
Chris@42 295 R0[WS(rs, 3)] = FMA(KP2_000000000, Tx, Ty);
Chris@42 296 TH = Tx - Ty;
Chris@42 297 R1[WS(rs, 5)] = TH - TS;
Chris@42 298 R1[0] = TH + TS;
Chris@42 299 TT = Tw - Tp;
Chris@42 300 R0[WS(rs, 2)] = TT - TU;
Chris@42 301 R0[WS(rs, 7)] = TT + TU;
Chris@42 302 }
Chris@42 303 }
Chris@42 304 }
Chris@42 305 }
Chris@42 306
/*
 * Descriptor passed to X(kr2c_register) together with the kernel (non-FMA
 * build). Fields as written: 15 (matching the generator's -n 15), the
 * codelet name, and a four-entry count whose first three values match the
 * "49 additions, 11 multiplications, 15 fused multiply/add" summary in the
 * generated comment above. The meaning of the fourth entry and of GENUS is
 * not visible in this file -- presumably defined via r2cbIII.h; TODO confirm.
 */
Chris@42 307 static const kr2c_desc desc = { 15, "r2cbIII_15", {49, 11, 15, 0}, &GENUS };
Chris@42 308
/*
 * Registration entry point (non-FMA build): hands the r2cbIII_15 kernel
 * and its descriptor to X(kr2c_register) for the given planner p.
 * X() is a name-wrapping macro defined outside this file.
 */
Chris@42 309 void X(codelet_r2cbIII_15) (planner *p) {
Chris@42 310 X(kr2c_register) (p, r2cbIII_15, &desc);
Chris@42 311 }
Chris@42 312
Chris@42 313 #endif /* HAVE_FMA */