annotate src/fftw-3.3.5/rdft/scalar/r2cf/r2cf_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:07 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include r2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 58 FP additions, 20 FP multiplications,
Chris@42 32 * (or, 38 additions, 0 multiplications, 20 fused multiply/add),
Chris@42 33 * 38 stack variables, 3 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cf.h"
Chris@42 36
/*
 * r2cf_16 (HAVE_FMA variant): machine-generated size-16 codelet.
 *
 * Each of the `v` loop iterations reads 16 inputs -- R0[WS(rs, k)] and
 * R1[WS(rs, k)] for k = 0..7 -- and stores 16 outputs: Cr[WS(csr, k)] for
 * k = 0..8 and Ci[WS(csi, k)] for k = 1..7.  Ci at indices 0 and 8 is never
 * written by this function.  After every iteration R0 and R1 advance by
 * `ivs`, and Cr and Ci advance by `ovs`.
 *
 * NOTE(review): going by the name and the "-n 16" generator flag this is
 * presumably the forward real-to-complex DFT of 16 points, with R0/R1
 * holding the even/odd input samples and Cr/Ci the real/imaginary output
 * parts -- confirm against r2cf.h / codelet-rdft.h.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are not defined in this file.  Comparing with the
 * non-FMA variant below suggests FMA(a, b, c) = a*b + c,
 * FNMS(a, b, c) = c - a*b and FMS(a, b, c) = a*b - c -- confirm in the
 * FFTW kernel headers.
 */
Chris@42 37 static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numerically: cos(pi/8), sqrt(2)/2, and sqrt(2) - 1 = tan(pi/8). */
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 41 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 42 {
Chris@42 43 INT i;
/* One transform per iteration; the pointer bumps live in the for-increment. */
Chris@42 44 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* TQ and TP are declared at loop scope because they are consumed by the
 * final two Ci stores after the inner blocks have closed. */
Chris@42 45 E TQ, TP;
Chris@42 46 {
Chris@42 47 E TB, TN, Tf, T7, Te, Tv, TO, TE, Tq, TJ, Tp, TI, TT, Ty, Tm;
Chris@42 48 E Tr, TK, Ts;
Chris@42 49 {
Chris@42 50 E TC, Ta, Td, TD;
Chris@42 51 {
/* Load the eight R0 inputs, interleaved with their sum/difference pairs. */
Chris@42 52 E T1, T2, T4, T5;
Chris@42 53 T1 = R0[0];
Chris@42 54 T2 = R0[WS(rs, 4)];
Chris@42 55 T4 = R0[WS(rs, 2)];
Chris@42 56 T5 = R0[WS(rs, 6)];
Chris@42 57 {
Chris@42 58 E T8, T3, T6, T9, Tb, Tc;
Chris@42 59 T8 = R0[WS(rs, 1)];
Chris@42 60 TB = T1 - T2;
Chris@42 61 T3 = T1 + T2;
Chris@42 62 TN = T4 - T5;
Chris@42 63 T6 = T4 + T5;
Chris@42 64 T9 = R0[WS(rs, 5)];
Chris@42 65 Tb = R0[WS(rs, 7)];
Chris@42 66 Tc = R0[WS(rs, 3)];
Chris@42 67 Tf = T3 - T6;
Chris@42 68 T7 = T3 + T6;
Chris@42 69 TC = T8 - T9;
Chris@42 70 Ta = T8 + T9;
Chris@42 71 Td = Tb + Tc;
Chris@42 72 TD = Tb - Tc;
Chris@42 73 }
Chris@42 74 }
Chris@42 75 {
/* Load the R1 inputs; R1[1] and R1[5] (Tq, Tr) are combined later, after
 * the first Cr store. */
Chris@42 76 E TG, Ti, Tj, Tk, Tg, Th;
Chris@42 77 Tg = R1[0];
Chris@42 78 Th = R1[WS(rs, 4)];
Chris@42 79 Te = Ta + Td;
Chris@42 80 Tv = Td - Ta;
Chris@42 81 TO = TD - TC;
Chris@42 82 TE = TC + TD;
Chris@42 83 TG = Tg - Th;
Chris@42 84 Ti = Tg + Th;
Chris@42 85 Tj = R1[WS(rs, 2)];
Chris@42 86 Tk = R1[WS(rs, 6)];
Chris@42 87 {
Chris@42 88 E Tn, To, TH, Tl;
Chris@42 89 Tn = R1[WS(rs, 7)];
Chris@42 90 To = R1[WS(rs, 3)];
Chris@42 91 Tq = R1[WS(rs, 1)];
Chris@42 92 TH = Tj - Tk;
Chris@42 93 Tl = Tj + Tk;
Chris@42 94 TJ = Tn - To;
Chris@42 95 Tp = Tn + To;
Chris@42 96 TI = FNMS(KP414213562, TH, TG);
Chris@42 97 TT = FMA(KP414213562, TG, TH);
Chris@42 98 Ty = Ti + Tl;
Chris@42 99 Tm = Ti - Tl;
Chris@42 100 Tr = R1[WS(rs, 5)];
Chris@42 101 }
Chris@42 102 }
Chris@42 103 }
/* T7 and Te are the two partial sums of the R0 inputs. */
Chris@42 104 Cr[WS(csr, 4)] = T7 - Te;
Chris@42 105 TK = Tr - Tq;
Chris@42 106 Ts = Tq + Tr;
Chris@42 107 {
Chris@42 108 E Tx, TV, TF, TS, Tz, Tt, TM, TL;
Chris@42 109 Tx = T7 + Te;
Chris@42 110 TV = FNMS(KP707106781, TE, TB);
Chris@42 111 TF = FMA(KP707106781, TE, TB);
Chris@42 112 TL = FNMS(KP414213562, TK, TJ);
Chris@42 113 TS = FMA(KP414213562, TJ, TK);
Chris@42 114 Tz = Tp + Ts;
Chris@42 115 Tt = Tp - Ts;
Chris@42 116 TM = TI + TL;
Chris@42 117 TQ = TL - TI;
Chris@42 118 {
Chris@42 119 E TR, TU, TW, TA, Tw, Tu;
Chris@42 120 TP = FMA(KP707106781, TO, TN);
Chris@42 121 TR = FNMS(KP707106781, TO, TN);
/* Ty and Tz are the two partial sums of the R1 inputs, so TA is the sum
 * of all eight. */
Chris@42 122 TA = Ty + Tz;
Chris@42 123 Ci[WS(csi, 4)] = Tz - Ty;
Chris@42 124 Tw = Tt - Tm;
Chris@42 125 Tu = Tm + Tt;
Chris@42 126 Cr[WS(csr, 1)] = FMA(KP923879532, TM, TF);
Chris@42 127 Cr[WS(csr, 7)] = FNMS(KP923879532, TM, TF);
/* Cr[0] is the sum of all 16 inputs; Cr[8] is (sum of R0) - (sum of R1). */
Chris@42 128 Cr[0] = Tx + TA;
Chris@42 129 Cr[WS(csr, 8)] = Tx - TA;
Chris@42 130 Ci[WS(csi, 6)] = FMS(KP707106781, Tw, Tv);
Chris@42 131 Ci[WS(csi, 2)] = FMA(KP707106781, Tw, Tv);
Chris@42 132 Cr[WS(csr, 2)] = FMA(KP707106781, Tu, Tf);
Chris@42 133 Cr[WS(csr, 6)] = FNMS(KP707106781, Tu, Tf);
Chris@42 134 TU = TS - TT;
Chris@42 135 TW = TT + TS;
Chris@42 136 Ci[WS(csi, 7)] = FMA(KP923879532, TU, TR);
Chris@42 137 Ci[WS(csi, 1)] = FMS(KP923879532, TU, TR);
Chris@42 138 Cr[WS(csr, 3)] = FMA(KP923879532, TW, TV);
Chris@42 139 Cr[WS(csr, 5)] = FNMS(KP923879532, TW, TV);
Chris@42 140 }
Chris@42 141 }
Chris@42 142 }
/* Last two outputs, built from the loop-scope temporaries TQ and TP. */
Chris@42 143 Ci[WS(csi, 5)] = FMS(KP923879532, TQ, TP);
Chris@42 144 Ci[WS(csi, 3)] = FMA(KP923879532, TQ, TP);
Chris@42 145 }
Chris@42 146 }
Chris@42 147 }
Chris@42 148
/* Descriptor passed to the planner at registration (HAVE_FMA variant).
 * It carries the number 16, the name string "r2cf_16", four counts and
 * &GENUS.  The counts 38 / 0 / 20 coincide with the "38 additions,
 * 0 multiplications, 20 fused multiply/add" figures quoted in the generated
 * comment above the function.
 * NOTE(review): the kr2c_desc field layout and GENUS are declared
 * elsewhere -- confirm the meaning of each field there. */
Chris@42 149 static const kr2c_desc desc = { 16, "r2cf_16", {38, 0, 20, 0}, &GENUS };
Chris@42 150
/* Registration entry point (HAVE_FMA variant): hands the static r2cf_16
 * function and its descriptor to X(kr2c_register) for planner `p`.
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-mangling prefix -- confirm. */
Chris@42 151 void X(codelet_r2cf_16) (planner *p) {
Chris@42 152 X(kr2c_register) (p, r2cf_16, &desc);
Chris@42 153 }
Chris@42 154
Chris@42 155 #else /* HAVE_FMA */
Chris@42 156
Chris@42 157 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include r2cf.h */
Chris@42 158
Chris@42 159 /*
Chris@42 160 * This function contains 58 FP additions, 12 FP multiplications,
Chris@42 161 * (or, 54 additions, 8 multiplications, 4 fused multiply/add),
Chris@42 162 * 34 stack variables, 3 constants, and 32 memory accesses
Chris@42 163 */
Chris@42 164 #include "r2cf.h"
Chris@42 165
/*
 * r2cf_16 (non-FMA variant): machine-generated size-16 codelet, compiled
 * when HAVE_FMA is not defined.
 *
 * Each of the `v` loop iterations reads 16 inputs -- R0[WS(rs, k)] and
 * R1[WS(rs, k)] for k = 0..7 -- and stores 16 outputs: Cr[WS(csr, k)] for
 * k = 0..8 and Ci[WS(csi, k)] for k = 1..7.  Ci at indices 0 and 8 is never
 * written by this function.  After every iteration R0 and R1 advance by
 * `ivs`, and Cr and Ci advance by `ovs`.
 *
 * NOTE(review): going by the name and the "-n 16" generator flag this is
 * presumably the forward real-to-complex DFT of 16 points, with R0/R1
 * holding the even/odd input samples and Cr/Ci the real/imaginary output
 * parts -- confirm against r2cf.h / codelet-rdft.h.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are not defined in this file; FMA(a, b, c) is
 * presumably a*b + c and FNMS(a, b, c) presumably c - a*b -- confirm in
 * the FFTW kernel headers.
 */
Chris@42 166 static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 167 {
/* Numerically: cos(pi/8), sin(pi/8), and sqrt(2)/2. */
Chris@42 168 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 169 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 170 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 171 {
Chris@42 172 INT i;
/* One transform per iteration; the pointer bumps live in the for-increment. */
Chris@42 173 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
Chris@42 174 E T3, T6, T7, Tz, Ti, Ta, Td, Te, TA, Th, Tq, TV, TF, TP, Tx;
Chris@42 175 E TU, TE, TM, Tg, Tf, TJ, TQ;
Chris@42 176 {
/* R0 inputs 0, 4, 2, 6: sums and differences. */
Chris@42 177 E T1, T2, T4, T5;
Chris@42 178 T1 = R0[0];
Chris@42 179 T2 = R0[WS(rs, 4)];
Chris@42 180 T3 = T1 + T2;
Chris@42 181 T4 = R0[WS(rs, 2)];
Chris@42 182 T5 = R0[WS(rs, 6)];
Chris@42 183 T6 = T4 + T5;
Chris@42 184 T7 = T3 + T6;
Chris@42 185 Tz = T1 - T2;
Chris@42 186 Ti = T4 - T5;
Chris@42 187 }
Chris@42 188 {
/* R0 inputs 1, 5, 7, 3: sums and differences. */
Chris@42 189 E T8, T9, Tb, Tc;
Chris@42 190 T8 = R0[WS(rs, 1)];
Chris@42 191 T9 = R0[WS(rs, 5)];
Chris@42 192 Ta = T8 + T9;
Chris@42 193 Tg = T8 - T9;
Chris@42 194 Tb = R0[WS(rs, 7)];
Chris@42 195 Tc = R0[WS(rs, 3)];
Chris@42 196 Td = Tb + Tc;
Chris@42 197 Tf = Tb - Tc;
Chris@42 198 }
Chris@42 199 Te = Ta + Td;
Chris@42 200 TA = KP707106781 * (Tg + Tf);
Chris@42 201 Th = KP707106781 * (Tf - Tg);
Chris@42 202 {
Chris@42 203 E Tm, TN, Tp, TO;
Chris@42 204 {
/* R1 inputs 7, 3, 1, 5: sums and differences. */
Chris@42 205 E Tk, Tl, Tn, To;
Chris@42 206 Tk = R1[WS(rs, 7)];
Chris@42 207 Tl = R1[WS(rs, 3)];
Chris@42 208 Tm = Tk - Tl;
Chris@42 209 TN = Tk + Tl;
Chris@42 210 Tn = R1[WS(rs, 1)];
Chris@42 211 To = R1[WS(rs, 5)];
Chris@42 212 Tp = Tn - To;
Chris@42 213 TO = Tn + To;
Chris@42 214 }
/* The two differences are combined with the cos(pi/8) / sin(pi/8) constants. */
Chris@42 215 Tq = FNMS(KP923879532, Tp, KP382683432 * Tm);
Chris@42 216 TV = TN + TO;
Chris@42 217 TF = FMA(KP923879532, Tm, KP382683432 * Tp);
Chris@42 218 TP = TN - TO;
Chris@42 219 }
Chris@42 220 {
Chris@42 221 E Tt, TK, Tw, TL;
Chris@42 222 {
/* R1 inputs 0, 4, 2, 6: sums and differences. */
Chris@42 223 E Tr, Ts, Tu, Tv;
Chris@42 224 Tr = R1[0];
Chris@42 225 Ts = R1[WS(rs, 4)];
Chris@42 226 Tt = Tr - Ts;
Chris@42 227 TK = Tr + Ts;
Chris@42 228 Tu = R1[WS(rs, 2)];
Chris@42 229 Tv = R1[WS(rs, 6)];
Chris@42 230 Tw = Tu - Tv;
Chris@42 231 TL = Tu + Tv;
Chris@42 232 }
Chris@42 233 Tx = FMA(KP382683432, Tt, KP923879532 * Tw);
Chris@42 234 TU = TK + TL;
Chris@42 235 TE = FNMS(KP382683432, Tw, KP923879532 * Tt);
Chris@42 236 TM = TK - TL;
Chris@42 237 }
/* All inputs are loaded by this point; the rest only combines temporaries
 * and stores outputs. */
Chris@42 238 Cr[WS(csr, 4)] = T7 - Te;
Chris@42 239 Ci[WS(csi, 4)] = TV - TU;
Chris@42 240 {
Chris@42 241 E Tj, Ty, TD, TG;
Chris@42 242 Tj = Th - Ti;
Chris@42 243 Ty = Tq - Tx;
Chris@42 244 Ci[WS(csi, 1)] = Tj + Ty;
Chris@42 245 Ci[WS(csi, 7)] = Ty - Tj;
Chris@42 246 TD = Tz + TA;
Chris@42 247 TG = TE + TF;
Chris@42 248 Cr[WS(csr, 7)] = TD - TG;
Chris@42 249 Cr[WS(csr, 1)] = TD + TG;
Chris@42 250 }
Chris@42 251 {
Chris@42 252 E TB, TC, TH, TI;
Chris@42 253 TB = Tz - TA;
Chris@42 254 TC = Tx + Tq;
Chris@42 255 Cr[WS(csr, 5)] = TB - TC;
Chris@42 256 Cr[WS(csr, 3)] = TB + TC;
Chris@42 257 TH = Ti + Th;
Chris@42 258 TI = TF - TE;
Chris@42 259 Ci[WS(csi, 3)] = TH + TI;
Chris@42 260 Ci[WS(csi, 5)] = TI - TH;
Chris@42 261 }
Chris@42 262 TJ = T3 - T6;
Chris@42 263 TQ = KP707106781 * (TM + TP);
Chris@42 264 Cr[WS(csr, 6)] = TJ - TQ;
Chris@42 265 Cr[WS(csr, 2)] = TJ + TQ;
Chris@42 266 {
Chris@42 267 E TR, TS, TT, TW;
Chris@42 268 TR = Td - Ta;
Chris@42 269 TS = KP707106781 * (TP - TM);
Chris@42 270 Ci[WS(csi, 2)] = TR + TS;
Chris@42 271 Ci[WS(csi, 6)] = TS - TR;
/* TT sums the eight R0 inputs and TW the eight R1 inputs, so Cr[0] is the
 * sum of all 16 inputs and Cr[8] is their difference TT - TW. */
Chris@42 272 TT = T7 + Te;
Chris@42 273 TW = TU + TV;
Chris@42 274 Cr[WS(csr, 8)] = TT - TW;
Chris@42 275 Cr[0] = TT + TW;
Chris@42 276 }
Chris@42 277 }
Chris@42 278 }
Chris@42 279 }
Chris@42 280
/* Descriptor passed to the planner at registration (non-FMA variant).
 * It carries the number 16, the name string "r2cf_16", four counts and
 * &GENUS.  The counts 54 / 8 / 4 coincide with the "54 additions,
 * 8 multiplications, 4 fused multiply/add" figures quoted in the generated
 * comment above the function.
 * NOTE(review): the kr2c_desc field layout and GENUS are declared
 * elsewhere -- confirm the meaning of each field there. */
Chris@42 281 static const kr2c_desc desc = { 16, "r2cf_16", {54, 8, 4, 0}, &GENUS };
Chris@42 282
/* Registration entry point (non-FMA variant): hands the static r2cf_16
 * function and its descriptor to X(kr2c_register) for planner `p`.
 * NOTE(review): X() is a macro defined elsewhere, presumably a
 * name-mangling prefix -- confirm. */
Chris@42 283 void X(codelet_r2cf_16) (planner *p) {
Chris@42 284 X(kr2c_register) (p, r2cf_16, &desc);
Chris@42 285 }
Chris@42 286
Chris@42 287 #endif /* HAVE_FMA */