annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cbIII_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:39 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cbIII_9 -dft-III -include r2cbIII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 32 FP additions, 24 FP multiplications,
Chris@42 32 * (or, 8 additions, 0 multiplications, 24 fused multiply/add),
Chris@42 33 * 40 stack variables, 12 constants, and 18 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cbIII.h"
Chris@42 36
/*
 * r2cbIII_9 (variant compiled when HAVE_FMA is defined).
 *
 * Generated size-9 codelet. Per loop iteration it reads nine input values
 * (Cr at index 0 and WS(csr, 1..4); Ci at index 0 and WS(csi, 1..3)) and
 * writes nine output values (R0 at index 0 and WS(rs, 1..4); R1 at index 0
 * and WS(rs, 1..3)). Cr and Ci are only read; R0 and R1 are only written.
 *
 * Parameters:
 *   R0, R1        output arrays
 *   Cr, Ci        input arrays
 *   rs, csr, csi  strides handed to WS() to index R0/R1, Cr and Ci
 *   v             number of loop iterations (transforms) to perform
 *   ivs           amount added to Cr and Ci after each iteration
 *   ovs           amount added to R0 and R1 after each iteration
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FMS, FNMS and
 * MAKE_VOLATILE_STRIDE are not defined in this file. The notes below assume
 * FMA(a, b, c) = a*b + c, FMS(a, b, c) = a*b - c and FNMS(a, b, c) = c - a*b
 * -- confirm against the included project headers.
 */
Chris@42 37 static void r2cbIII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numeric constants; each name encodes the leading digits of its value. */
/* 1.732050807... is sqrt(3) and 0.866025403... is sqrt(3)/2. */
/* NOTE(review): the others numerically match sines/cosines/tangents of */
/* multiples of 10 degrees, some scaled by 2 or sqrt(3) -- not stated by the */
/* generator, confirm before relying on it. */
Chris@42 39 DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
Chris@42 40 DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
Chris@42 41 DK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@42 42 DK(KP1_532088886, +1.532088886237956070404785301110833347871664914);
Chris@42 43 DK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 44 DK(KP1_969615506, +1.969615506024416118733486049179046027341286503);
Chris@42 45 DK(KP839099631, +0.839099631177280011763127298123181364687434283);
Chris@42 46 DK(KP176326980, +0.176326980708464973471090386868618986121633062);
Chris@42 47 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 48 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 49 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 50 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 51 {
Chris@42 52 INT i;
/* Runs v times; after each pass the output pointers advance by ovs and the */
/* input pointers by ivs. The MAKE_VOLATILE_STRIDE calls are project macros */
/* applied to each stride; their effect is not visible in this file. */
Chris@42 53 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
Chris@42 54 E T4, Td, T3, Th, Tr, Tm, T7, Tc, Tj, Tg, T1, T2;
/* Load the first five inputs: Ci[1], Cr[4], Cr[1], Cr[3], Ci[3] (in stride units). */
Chris@42 55 Tg = Ci[WS(csi, 1)];
Chris@42 56 T1 = Cr[WS(csr, 4)];
Chris@42 57 T2 = Cr[WS(csr, 1)];
Chris@42 58 T4 = Cr[WS(csr, 3)];
Chris@42 59 Td = Ci[WS(csi, 3)];
Chris@42 60 {
Chris@42 61 E T5, Tf, T6, Ta, Tb;
/* Load the remaining four inputs (Cr[0], Cr[2], Ci[2], Ci[0]) interleaved */
/* with the first sums and differences built from them. */
Chris@42 62 T5 = Cr[0];
Chris@42 63 Tf = T2 - T1;
Chris@42 64 T3 = FMA(KP2_000000000, T2, T1);
Chris@42 65 T6 = Cr[WS(csr, 2)];
Chris@42 66 Ta = Ci[WS(csi, 2)];
Chris@42 67 Tb = Ci[0];
Chris@42 68 Th = FNMS(KP1_732050807, Tg, Tf);
Chris@42 69 Tr = FMA(KP1_732050807, Tg, Tf);
Chris@42 70 Tm = T5 - T6;
Chris@42 71 T7 = T5 + T6;
Chris@42 72 Tc = Ta - Tb;
Chris@42 73 Tj = Tb + Ta;
Chris@42 74 }
Chris@42 75 {
Chris@42 76 E Tw, Tq, Tv, Tp, Ti, T8;
Chris@42 77 Ti = FNMS(KP500000000, T7, T4);
/* T8 = Cr[3] + Cr[0] + Cr[2]; combined with T3 it forms R0[0] below. */
Chris@42 78 T8 = T4 + T7;
Chris@42 79 {
Chris@42 80 E Te, Tl, Tt, Tk, T9;
Chris@42 81 Te = Tc - Td;
Chris@42 82 Tl = FMA(KP500000000, Tc, Td);
Chris@42 83 Tt = FNMS(KP866025403, Tj, Ti);
Chris@42 84 Tk = FMA(KP866025403, Tj, Ti);
Chris@42 85 T9 = T8 - T3;
/* First output; depends only on the Cr inputs (through T8 and T3). */
Chris@42 86 R0[0] = FMA(KP2_000000000, T8, T3);
Chris@42 87 {
Chris@42 88 E Ts, Tn, Tu, To;
Chris@42 89 Ts = FMA(KP866025403, Tm, Tl);
Chris@42 90 Tn = FNMS(KP866025403, Tm, Tl);
/* R0[3] and R1[1] are built from the same pair of values, Te and T9. */
Chris@42 91 R0[WS(rs, 3)] = FMS(KP1_732050807, Te, T9);
Chris@42 92 R1[WS(rs, 1)] = FMA(KP1_732050807, Te, T9);
Chris@42 93 Tu = FMA(KP176326980, Tt, Ts);
Chris@42 94 Tw = FNMS(KP176326980, Ts, Tt);
Chris@42 95 To = FMA(KP839099631, Tn, Tk);
Chris@42 96 Tq = FNMS(KP839099631, Tk, Tn);
Chris@42 97 R0[WS(rs, 1)] = FMS(KP1_969615506, Tu, Tr);
Chris@42 98 Tv = FMA(KP984807753, Tu, Tr);
Chris@42 99 R1[0] = FNMS(KP1_532088886, To, Th);
Chris@42 100 Tp = FMA(KP766044443, To, Th);
Chris@42 101 }
Chris@42 102 }
/* Last four outputs: two pairs, each pair built from the same two values */
/* (Tw, Tv) and (Tq, Tp). */
Chris@42 103 R0[WS(rs, 4)] = FMS(KP1_705737063, Tw, Tv);
Chris@42 104 R1[WS(rs, 2)] = FMA(KP1_705737063, Tw, Tv);
Chris@42 105 R0[WS(rs, 2)] = FMS(KP1_326827896, Tq, Tp);
Chris@42 106 R1[WS(rs, 3)] = FMA(KP1_326827896, Tq, Tp);
Chris@42 107 }
Chris@42 108 }
Chris@42 109 }
Chris@42 110 }
Chris@42 111
/*
 * Descriptor passed to the registration call for the HAVE_FMA variant:
 * size 9, codelet name "r2cbIII_9", then a four-element group whose first
 * three entries (8, 0, 24) equal the addition / multiplication / fused
 * multiply-add counts quoted in the generated comment above the function.
 * NOTE(review): the meaning of the fourth entry (0) and of GENUS is not
 * visible in this file; GENUS presumably comes from r2cbIII.h -- confirm.
 */
Chris@42 112 static const kr2c_desc desc = { 9, "r2cbIII_9", {8, 0, 24, 0}, &GENUS };
Chris@42 113
/*
 * Registration entry point for the HAVE_FMA variant: passes the planner p,
 * the r2cbIII_9 function and its descriptor to kr2c_register.
 * NOTE(review): X() is a project macro wrapped around both identifiers
 * (presumably a name-prefixing macro); it is not defined in this file.
 */
Chris@42 114 void X(codelet_r2cbIII_9) (planner *p) {
Chris@42 115 X(kr2c_register) (p, r2cbIII_9, &desc);
Chris@42 116 }
Chris@42 117
Chris@42 118 #else /* HAVE_FMA */
Chris@42 119
Chris@42 120 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 9 -name r2cbIII_9 -dft-III -include r2cbIII.h */
Chris@42 121
Chris@42 122 /*
Chris@42 123 * This function contains 32 FP additions, 18 FP multiplications,
Chris@42 124 * (or, 22 additions, 8 multiplications, 10 fused multiply/add),
Chris@42 125 * 35 stack variables, 12 constants, and 18 memory accesses
Chris@42 126 */
Chris@42 127 #include "r2cbIII.h"
Chris@42 128
/*
 * r2cbIII_9 (variant compiled when HAVE_FMA is not defined).
 *
 * Generated size-9 codelet. Per loop iteration it reads nine input values
 * (Cr at index 0 and WS(csr, 1..4); Ci at index 0 and WS(csi, 1..3)) and
 * writes nine output values (R0 at index 0 and WS(rs, 1..4); R1 at index 0
 * and WS(rs, 1..3)). Cr and Ci are only read; R0 and R1 are only written.
 *
 * Parameters:
 *   R0, R1        output arrays
 *   Cr, Ci        input arrays
 *   rs, csr, csi  strides handed to WS() to index R0/R1, Cr and Ci
 *   v             number of loop iterations (transforms) to perform
 *   ivs           amount added to Cr and Ci after each iteration
 *   ovs           amount added to R0 and R1 after each iteration
 *
 * This branch still uses the FMA/FMS/FNMS macros alongside plain products.
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FMS, FNMS and
 * MAKE_VOLATILE_STRIDE are not defined in this file. The notes below assume
 * FMA(a, b, c) = a*b + c, FMS(a, b, c) = a*b - c and FNMS(a, b, c) = c - a*b
 * -- confirm against the included project headers.
 */
Chris@42 129 static void r2cbIII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 130 {
/* Numeric constants; each name encodes the leading digits of its value. */
/* 1.732050807... is sqrt(3) and 0.866025403... is sqrt(3)/2. */
/* NOTE(review): the others numerically match sines/cosines of multiples of */
/* 10 degrees, some scaled by sqrt(3) -- not stated by the generator, */
/* confirm before relying on it. */
Chris@42 131 DK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@42 132 DK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@42 133 DK(KP1_326827896, +1.326827896337876792410842639271782594433726619);
Chris@42 134 DK(KP1_113340798, +1.113340798452838732905825904094046265936583811);
Chris@42 135 DK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@42 136 DK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@42 137 DK(KP1_705737063, +1.705737063904886419256501927880148143872040591);
Chris@42 138 DK(KP300767466, +0.300767466360870593278543795225003852144476517);
Chris@42 139 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 140 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 141 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 142 DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
Chris@42 143 {
Chris@42 144 INT i;
/* Runs v times; after each pass the output pointers advance by ovs and the */
/* input pointers by ivs. The MAKE_VOLATILE_STRIDE calls are project macros */
/* applied to each stride; their effect is not visible in this file. */
Chris@42 145 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
Chris@42 146 E T3, Ts, Ti, Td, Tc, T8, To, Tu, Tl, Tt, T9, Te;
Chris@42 147 {
Chris@42 148 E Th, T1, T2, Tf, Tg;
/* First group: uses Ci[1], Cr[4] and Cr[1] (in stride units). */
/* Th = KP1_732050807 * Ci[1]; Tf = Cr[1] - Cr[4]; Ts and Ti are Tf -/+ Th. */
Chris@42 149 Tg = Ci[WS(csi, 1)];
Chris@42 150 Th = KP1_732050807 * Tg;
Chris@42 151 T1 = Cr[WS(csr, 4)];
Chris@42 152 T2 = Cr[WS(csr, 1)];
Chris@42 153 Tf = T2 - T1;
Chris@42 154 T3 = FMA(KP2_000000000, T2, T1);
Chris@42 155 Ts = Tf - Th;
Chris@42 156 Ti = Tf + Th;
Chris@42 157 }
Chris@42 158 {
Chris@42 159 E T4, T7, Tm, Tk, Tn, Tj;
/* Second group: uses the remaining six inputs Cr[3], Ci[3], Cr[0], Cr[2], */
/* Ci[2] and Ci[0]. */
Chris@42 160 T4 = Cr[WS(csr, 3)];
Chris@42 161 Td = Ci[WS(csi, 3)];
Chris@42 162 {
Chris@42 163 E T5, T6, Ta, Tb;
Chris@42 164 T5 = Cr[0];
Chris@42 165 T6 = Cr[WS(csr, 2)];
Chris@42 166 T7 = T5 + T6;
Chris@42 167 Tm = KP866025403 * (T6 - T5);
Chris@42 168 Ta = Ci[WS(csi, 2)];
Chris@42 169 Tb = Ci[0];
Chris@42 170 Tc = Ta - Tb;
Chris@42 171 Tk = KP866025403 * (Tb + Ta);
Chris@42 172 }
/* T8 = Cr[3] + Cr[0] + Cr[2]; combined with T3 it forms R0[0] below. */
Chris@42 173 T8 = T4 + T7;
Chris@42 174 Tn = FMA(KP500000000, Tc, Td);
Chris@42 175 To = Tm - Tn;
Chris@42 176 Tu = Tm + Tn;
Chris@42 177 Tj = FMS(KP500000000, T7, T4);
Chris@42 178 Tl = Tj + Tk;
Chris@42 179 Tt = Tj - Tk;
Chris@42 180 }
/* First output; depends only on the Cr inputs (through T8 and T3). */
Chris@42 181 R0[0] = FMA(KP2_000000000, T8, T3);
Chris@42 182 T9 = T8 - T3;
Chris@42 183 Te = KP1_732050807 * (Tc - Td);
/* R1[1] and R0[3] are the sum and difference of the same pair (Te, T9). */
Chris@42 184 R1[WS(rs, 1)] = T9 + Te;
Chris@42 185 R0[WS(rs, 3)] = Te - T9;
Chris@42 186 {
Chris@42 187 E Tr, Tp, Tq, Tx, Tv, Tw;
/* Outputs R0[1], R0[4], R1[2] are formed from Tl, To and Ti. */
Chris@42 188 Tr = FNMS(KP1_705737063, Tl, KP300767466 * To);
Chris@42 189 Tp = FMA(KP173648177, Tl, KP984807753 * To);
Chris@42 190 Tq = Ti - Tp;
Chris@42 191 R0[WS(rs, 1)] = -(FMA(KP2_000000000, Tp, Ti));
Chris@42 192 R0[WS(rs, 4)] = Tr - Tq;
Chris@42 193 R1[WS(rs, 2)] = Tq + Tr;
/* Outputs R1[0], R1[3], R0[2] are formed from Tt, Tu and Ts. */
Chris@42 194 Tx = FMA(KP1_113340798, Tt, KP1_326827896 * Tu);
Chris@42 195 Tv = FNMS(KP642787609, Tu, KP766044443 * Tt);
Chris@42 196 Tw = Tv - Ts;
Chris@42 197 R1[0] = FMA(KP2_000000000, Tv, Ts);
Chris@42 198 R1[WS(rs, 3)] = Tx - Tw;
Chris@42 199 R0[WS(rs, 2)] = Tw + Tx;
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204
/*
 * Descriptor passed to the registration call for the non-FMA variant:
 * size 9, codelet name "r2cbIII_9", then a four-element group whose first
 * three entries (22, 8, 10) equal the addition / multiplication / fused
 * multiply-add counts quoted in the generated comment above the function.
 * NOTE(review): the meaning of the fourth entry (0) and of GENUS is not
 * visible in this file; GENUS presumably comes from r2cbIII.h -- confirm.
 */
Chris@42 205 static const kr2c_desc desc = { 9, "r2cbIII_9", {22, 8, 10, 0}, &GENUS };
Chris@42 206
/*
 * Registration entry point for the non-FMA variant: passes the planner p,
 * the r2cbIII_9 function and its descriptor to kr2c_register.
 * NOTE(review): X() is a project macro wrapped around both identifiers
 * (presumably a name-prefixing macro); it is not defined in this file.
 */
Chris@42 207 void X(codelet_r2cbIII_9) (planner *p) {
Chris@42 208 X(kr2c_register) (p, r2cbIII_9, &desc);
Chris@42 209 }
Chris@42 210
Chris@42 211 #endif /* HAVE_FMA */