annotate src/fftw-3.3.5/rdft/scalar/r2cf/r2cfII_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:47:27 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include r2cfII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 66 FP additions, 48 FP multiplications,
Chris@42 32 * (or, 18 additions, 0 multiplications, 48 fused multiply/add),
Chris@42 33 * 54 stack variables, 7 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cfII.h"
Chris@42 36
/*
 * r2cfII_16, fused-multiply-add variant (compiled only when HAVE_FMA is
 * defined).
 *
 * Each pass of the loop reads 16 real inputs -- R0[0], R0[WS(rs, 1..7)],
 * R1[0], R1[WS(rs, 1..7)] -- and writes 16 outputs -- Cr[0],
 * Cr[WS(csr, 1..7)], Ci[0], Ci[WS(csi, 1..7)].  The loop runs v times;
 * after each pass R0 and R1 advance by ivs and Cr and Ci advance by ovs.
 *
 * Parameters:
 *   R0, R1     input arrays, indexed through stride rs
 *   Cr, Ci     output arrays, indexed through strides csr and csi
 *   v          number of passes (nothing is read or written when v <= 0)
 *   ivs, ovs   per-pass pointer increments for the inputs and the outputs
 *
 * NOTE(review): R0/R1 presumably carry the even/odd input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against r2cfII.h and
 * the FFTW codelet documentation.
 * NOTE(review): FMA, FMS and FNMS are macros from the included headers;
 * in upstream FFTW they expand to a*b+c, a*b-c and c-a*b respectively --
 * confirm against the headers in this tree.
 *
 * The body is machine-generated (see the "DO NOT EDIT" notice and the
 * "Generated by" command line earlier in this file); the statement order
 * is the generator's instruction schedule, so regenerate rather than
 * hand-edit.
 */
Chris@42 37 static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Numerically these constants equal: KP980785280 = cos(pi/16),
 * KP198912367 = tan(pi/16), KP831469612 = cos(3*pi/16),
 * KP923879532 = cos(pi/8), KP668178637 = tan(3*pi/16),
 * KP414213562 = tan(pi/8), KP707106781 = cos(pi/4).
 */
Chris@42 39 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 40 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 41 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 42 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 43 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 46 {
Chris@42 47 INT i;
/* Counts i down from v; the increment clause also steps all four data pointers. */
Chris@42 48 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* Temporaries that stay live until the final output stores at the end of the pass. */
Chris@42 49 E TN, TF, TX, TV, TO, TP, TY, TM, TQ, TW;
Chris@42 50 {
Chris@42 51 E TT, TZ, TB, T5, Tu, TK, TJ, Tr, T9, TC, T8, Tl, TH, TG, Ti;
Chris@42 52 E Ta;
/* Load all 16 inputs, interleaved with the first combinations of them. */
Chris@42 53 {
Chris@42 54 E T1, TR, Tn, Ts, To, TS, T4, Tp, T2, T3;
Chris@42 55 T1 = R0[0];
Chris@42 56 TR = R0[WS(rs, 4)];
Chris@42 57 T2 = R0[WS(rs, 2)];
Chris@42 58 T3 = R0[WS(rs, 6)];
Chris@42 59 Tn = R1[WS(rs, 7)];
Chris@42 60 Ts = R1[WS(rs, 3)];
Chris@42 61 To = R1[WS(rs, 1)];
Chris@42 62 TS = T2 + T3;
Chris@42 63 T4 = T2 - T3;
Chris@42 64 Tp = R1[WS(rs, 5)];
Chris@42 65 {
Chris@42 66 E Te, Tj, Tf, Tg, Tt, Tq;
Chris@42 67 Te = R1[0];
Chris@42 68 TT = FMA(KP707106781, TS, TR);
Chris@42 69 TZ = FNMS(KP707106781, TS, TR);
Chris@42 70 TB = FMA(KP707106781, T4, T1);
Chris@42 71 T5 = FNMS(KP707106781, T4, T1);
Chris@42 72 Tt = To + Tp;
Chris@42 73 Tq = To - Tp;
Chris@42 74 Tj = R1[WS(rs, 4)];
Chris@42 75 Tf = R1[WS(rs, 2)];
Chris@42 76 Tu = FNMS(KP707106781, Tt, Ts);
Chris@42 77 TK = FMA(KP707106781, Tt, Ts);
Chris@42 78 TJ = FMS(KP707106781, Tq, Tn);
Chris@42 79 Tr = FMA(KP707106781, Tq, Tn);
Chris@42 80 Tg = R1[WS(rs, 6)];
Chris@42 81 {
Chris@42 82 E T6, T7, Tk, Th;
Chris@42 83 T6 = R0[WS(rs, 5)];
Chris@42 84 T7 = R0[WS(rs, 1)];
Chris@42 85 T9 = R0[WS(rs, 3)];
Chris@42 86 Tk = Tf + Tg;
Chris@42 87 Th = Tf - Tg;
Chris@42 88 TC = FNMS(KP414213562, T6, T7);
Chris@42 89 T8 = FMA(KP414213562, T7, T6);
Chris@42 90 Tl = FNMS(KP707106781, Tk, Tj);
Chris@42 91 TH = FMA(KP707106781, Tk, Tj);
Chris@42 92 TG = FMA(KP707106781, Th, Te);
Chris@42 93 Ti = FNMS(KP707106781, Th, Te);
Chris@42 94 Ta = R0[WS(rs, 7)];
Chris@42 95 }
Chris@42 96 }
Chris@42 97 }
Chris@42 98 {
Chris@42 99 E TE, TU, Ty, Tv, TI, TL;
Chris@42 100 Ty = FNMS(KP668178637, Tr, Tu);
Chris@42 101 Tv = FMA(KP668178637, Tu, Tr);
Chris@42 102 {
Chris@42 103 E Tw, T14, T12, TA, T11, T13, Tx, Td;
Chris@42 104 {
Chris@42 105 E Tz, Tm, TD, Tb, T10, Tc;
Chris@42 106 Tz = FNMS(KP668178637, Ti, Tl);
Chris@42 107 Tm = FMA(KP668178637, Tl, Ti);
Chris@42 108 TD = FMS(KP414213562, T9, Ta);
Chris@42 109 Tb = FMA(KP414213562, Ta, T9);
Chris@42 110 Tw = Tm - Tv;
Chris@42 111 T14 = Tm + Tv;
Chris@42 112 T10 = TD - TC;
Chris@42 113 TE = TC + TD;
Chris@42 114 Tc = T8 - Tb;
Chris@42 115 TU = T8 + Tb;
Chris@42 116 T12 = Tz + Ty;
Chris@42 117 TA = Ty - Tz;
Chris@42 118 T11 = FMA(KP923879532, T10, TZ);
Chris@42 119 T13 = FNMS(KP923879532, T10, TZ);
Chris@42 120 Tx = FNMS(KP923879532, Tc, T5);
Chris@42 121 Td = FMA(KP923879532, Tc, T5);
Chris@42 122 }
/* Store the outputs with indices 1, 2, 5 and 6 (both Cr and Ci). */
Chris@42 123 Ci[WS(csi, 2)] = -(FMA(KP831469612, T14, T13));
Chris@42 124 Ci[WS(csi, 5)] = FNMS(KP831469612, T14, T13);
Chris@42 125 Cr[WS(csr, 1)] = FMA(KP831469612, Tw, Td);
Chris@42 126 Cr[WS(csr, 6)] = FNMS(KP831469612, Tw, Td);
Chris@42 127 Cr[WS(csr, 5)] = FNMS(KP831469612, TA, Tx);
Chris@42 128 Ci[WS(csi, 1)] = FMA(KP831469612, T12, T11);
Chris@42 129 Cr[WS(csr, 2)] = FMA(KP831469612, TA, Tx);
Chris@42 130 Ci[WS(csi, 6)] = FMS(KP831469612, T12, T11);
Chris@42 131 }
Chris@42 132 TN = FNMS(KP923879532, TE, TB);
Chris@42 133 TF = FMA(KP923879532, TE, TB);
Chris@42 134 TX = FNMS(KP923879532, TU, TT);
Chris@42 135 TV = FMA(KP923879532, TU, TT);
Chris@42 136 TO = FMA(KP198912367, TG, TH);
Chris@42 137 TI = FNMS(KP198912367, TH, TG);
Chris@42 138 TL = FMA(KP198912367, TK, TJ);
Chris@42 139 TP = FNMS(KP198912367, TJ, TK);
Chris@42 140 TY = TL - TI;
Chris@42 141 TM = TI + TL;
Chris@42 142 }
Chris@42 143 }
/* Store the remaining outputs: indices 0, 3, 4 and 7 (both Cr and Ci). */
Chris@42 144 Ci[WS(csi, 4)] = FMS(KP980785280, TY, TX);
Chris@42 145 Ci[WS(csi, 3)] = FMA(KP980785280, TY, TX);
Chris@42 146 Cr[0] = FMA(KP980785280, TM, TF);
Chris@42 147 Cr[WS(csr, 7)] = FNMS(KP980785280, TM, TF);
Chris@42 148 TQ = TO - TP;
Chris@42 149 TW = TO + TP;
Chris@42 150 Ci[0] = -(FMA(KP980785280, TW, TV));
Chris@42 151 Ci[WS(csi, 7)] = FNMS(KP980785280, TW, TV);
Chris@42 152 Cr[WS(csr, 3)] = FMA(KP980785280, TQ, TN);
Chris@42 153 Cr[WS(csr, 4)] = FNMS(KP980785280, TQ, TN);
Chris@42 154 }
Chris@42 155 }
Chris@42 156 }
Chris@42 157
/*
 * Descriptor handed to the planner for the HAVE_FMA build of this codelet:
 * size 16, name "r2cfII_16", and operation counts whose first three
 * entries (18, 0, 48) match the additions / multiplications / fused
 * multiply-adds quoted in the generated comment for this variant.
 * NOTE(review): the meaning of the fourth count (0) and of GENUS is
 * defined outside this file -- confirm against the kr2c_desc declaration.
 */
Chris@42 158 static const kr2c_desc desc = { 16, "r2cfII_16", {18, 0, 48, 0}, &GENUS };
Chris@42 159
/*
 * Registration hook (HAVE_FMA build): passes the r2cfII_16 kernel and its
 * descriptor to kr2c_register for planner p.  X() is a macro defined in the
 * included headers that wraps both identifiers.
 */
Chris@42 160 void X(codelet_r2cfII_16) (planner *p) {
Chris@42 161 X(kr2c_register) (p, r2cfII_16, &desc);
Chris@42 162 }
Chris@42 163
Chris@42 164 #else /* HAVE_FMA */
Chris@42 165
Chris@42 166 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include r2cfII.h */
Chris@42 167
Chris@42 168 /*
Chris@42 169 * This function contains 66 FP additions, 30 FP multiplications,
Chris@42 170 * (or, 54 additions, 18 multiplications, 12 fused multiply/add),
Chris@42 171 * 32 stack variables, 7 constants, and 32 memory accesses
Chris@42 172 */
Chris@42 173 #include "r2cfII.h"
Chris@42 174
/*
 * r2cfII_16, variant compiled when HAVE_FMA is not defined.
 *
 * Each pass of the loop reads 16 real inputs -- R0[0], R0[WS(rs, 1..7)],
 * R1[0], R1[WS(rs, 1..7)] -- and writes 16 outputs -- Cr[0],
 * Cr[WS(csr, 1..7)], Ci[0], Ci[WS(csi, 1..7)].  The loop runs v times;
 * after each pass R0 and R1 advance by ivs and Cr and Ci advance by ovs.
 *
 * Parameters:
 *   R0, R1     input arrays, indexed through stride rs
 *   Cr, Ci     output arrays, indexed through strides csr and csi
 *   v          number of passes (nothing is read or written when v <= 0)
 *   ivs, ovs   per-pass pointer increments for the inputs and the outputs
 *
 * NOTE(review): R0/R1 presumably carry the even/odd input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against r2cfII.h and
 * the FFTW codelet documentation.
 * NOTE(review): FMA and FNMS are macros from the included headers; in
 * upstream FFTW they expand to a*b+c and c-a*b respectively -- confirm
 * against the headers in this tree.
 *
 * The body is machine-generated (see the "DO NOT EDIT" notice and the
 * "Generated by" command line earlier in this file); regenerate rather
 * than hand-edit.
 */
Chris@42 175 static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 176 {
/*
 * Numerically these constants equal: KP555570233 = sin(3*pi/16),
 * KP831469612 = cos(3*pi/16), KP980785280 = cos(pi/16),
 * KP195090322 = sin(pi/16), KP382683432 = sin(pi/8),
 * KP923879532 = cos(pi/8), KP707106781 = cos(pi/4).
 */
Chris@42 177 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@42 178 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@42 179 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@42 180 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@42 181 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 182 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 183 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 184 {
Chris@42 185 INT i;
/* Counts i down from v; the increment clause also steps all four data pointers. */
Chris@42 186 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* Intermediates produced by the four load stages and consumed by the four store stages. */
Chris@42 187 E T5, T11, TB, TV, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
Chris@42 188 E TS;
/* Load stage: R0 at indices 0, 4, 2, 6. */
Chris@42 189 {
Chris@42 190 E T1, TU, T4, TT, T2, T3;
Chris@42 191 T1 = R0[0];
Chris@42 192 TU = R0[WS(rs, 4)];
Chris@42 193 T2 = R0[WS(rs, 2)];
Chris@42 194 T3 = R0[WS(rs, 6)];
Chris@42 195 T4 = KP707106781 * (T2 - T3);
Chris@42 196 TT = KP707106781 * (T2 + T3);
Chris@42 197 T5 = T1 + T4;
Chris@42 198 T11 = TU - TT;
Chris@42 199 TB = T1 - T4;
Chris@42 200 TV = TT + TU;
Chris@42 201 }
/* Load stage: R1 at indices 7, 3, 1, 5. */
Chris@42 202 {
Chris@42 203 E Tq, Tt, Tp, Ts, Tn, To;
Chris@42 204 Tq = R1[WS(rs, 7)];
Chris@42 205 Tt = R1[WS(rs, 3)];
Chris@42 206 Tn = R1[WS(rs, 1)];
Chris@42 207 To = R1[WS(rs, 5)];
Chris@42 208 Tp = KP707106781 * (Tn - To);
Chris@42 209 Ts = KP707106781 * (Tn + To);
Chris@42 210 Tr = Tp - Tq;
Chris@42 211 TK = Tt - Ts;
Chris@42 212 Tu = Ts + Tt;
Chris@42 213 TJ = Tp + Tq;
Chris@42 214 }
/* Load stage: R1 at indices 0, 4, 2, 6. */
Chris@42 215 {
Chris@42 216 E Te, Tk, Th, Tj, Tf, Tg;
Chris@42 217 Te = R1[0];
Chris@42 218 Tk = R1[WS(rs, 4)];
Chris@42 219 Tf = R1[WS(rs, 2)];
Chris@42 220 Tg = R1[WS(rs, 6)];
Chris@42 221 Th = KP707106781 * (Tf - Tg);
Chris@42 222 Tj = KP707106781 * (Tf + Tg);
Chris@42 223 Ti = Te + Th;
Chris@42 224 TH = Tk - Tj;
Chris@42 225 Tl = Tj + Tk;
Chris@42 226 TG = Te - Th;
Chris@42 227 }
/* Load stage: R0 at indices 1, 5, 3, 7. */
Chris@42 228 {
Chris@42 229 E T8, TC, Tb, TD;
Chris@42 230 {
Chris@42 231 E T6, T7, T9, Ta;
Chris@42 232 T6 = R0[WS(rs, 1)];
Chris@42 233 T7 = R0[WS(rs, 5)];
Chris@42 234 T8 = FNMS(KP382683432, T7, KP923879532 * T6);
Chris@42 235 TC = FMA(KP382683432, T6, KP923879532 * T7);
Chris@42 236 T9 = R0[WS(rs, 3)];
Chris@42 237 Ta = R0[WS(rs, 7)];
Chris@42 238 Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
Chris@42 239 TD = FMA(KP923879532, T9, KP382683432 * Ta);
Chris@42 240 }
Chris@42 241 Tc = T8 + Tb;
Chris@42 242 T10 = Tb - T8;
Chris@42 243 TE = TC - TD;
Chris@42 244 TS = TC + TD;
Chris@42 245 }
/* Store stage: Cr[4], Cr[3], Ci[7], Ci[0]. */
Chris@42 246 {
Chris@42 247 E Td, TW, Tw, TR, Tm, Tv;
Chris@42 248 Td = T5 - Tc;
Chris@42 249 TW = TS + TV;
Chris@42 250 Tm = FMA(KP195090322, Ti, KP980785280 * Tl);
Chris@42 251 Tv = FNMS(KP980785280, Tu, KP195090322 * Tr);
Chris@42 252 Tw = Tm + Tv;
Chris@42 253 TR = Tv - Tm;
Chris@42 254 Cr[WS(csr, 4)] = Td - Tw;
Chris@42 255 Ci[WS(csi, 7)] = TR + TW;
Chris@42 256 Cr[WS(csr, 3)] = Td + Tw;
Chris@42 257 Ci[0] = TR - TW;
Chris@42 258 }
/* Store stage: Cr[7], Cr[0], Ci[3], Ci[4]. */
Chris@42 259 {
Chris@42 260 E Tx, TY, TA, TX, Ty, Tz;
Chris@42 261 Tx = T5 + Tc;
Chris@42 262 TY = TV - TS;
Chris@42 263 Ty = FNMS(KP195090322, Tl, KP980785280 * Ti);
Chris@42 264 Tz = FMA(KP980785280, Tr, KP195090322 * Tu);
Chris@42 265 TA = Ty + Tz;
Chris@42 266 TX = Tz - Ty;
Chris@42 267 Cr[WS(csr, 7)] = Tx - TA;
Chris@42 268 Ci[WS(csi, 3)] = TX + TY;
Chris@42 269 Cr[0] = Tx + TA;
Chris@42 270 Ci[WS(csi, 4)] = TX - TY;
Chris@42 271 }
/* Store stage: Cr[6], Cr[1], Ci[2], Ci[5]. */
Chris@42 272 {
Chris@42 273 E TF, T12, TM, TZ, TI, TL;
Chris@42 274 TF = TB + TE;
Chris@42 275 T12 = T10 - T11;
Chris@42 276 TI = FMA(KP831469612, TG, KP555570233 * TH);
Chris@42 277 TL = FMA(KP831469612, TJ, KP555570233 * TK);
Chris@42 278 TM = TI - TL;
Chris@42 279 TZ = TI + TL;
Chris@42 280 Cr[WS(csr, 6)] = TF - TM;
Chris@42 281 Ci[WS(csi, 2)] = T12 - TZ;
Chris@42 282 Cr[WS(csr, 1)] = TF + TM;
Chris@42 283 Ci[WS(csi, 5)] = -(TZ + T12);
Chris@42 284 }
/* Store stage: Cr[5], Cr[2], Ci[1], Ci[6]. */
Chris@42 285 {
Chris@42 286 E TN, T14, TQ, T13, TO, TP;
Chris@42 287 TN = TB - TE;
Chris@42 288 T14 = T10 + T11;
Chris@42 289 TO = FNMS(KP555570233, TJ, KP831469612 * TK);
Chris@42 290 TP = FNMS(KP555570233, TG, KP831469612 * TH);
Chris@42 291 TQ = TO - TP;
Chris@42 292 T13 = TP + TO;
Chris@42 293 Cr[WS(csr, 5)] = TN - TQ;
Chris@42 294 Ci[WS(csi, 1)] = T13 + T14;
Chris@42 295 Cr[WS(csr, 2)] = TN + TQ;
Chris@42 296 Ci[WS(csi, 6)] = T13 - T14;
Chris@42 297 }
Chris@42 298 }
Chris@42 299 }
Chris@42 300 }
Chris@42 301
/*
 * Descriptor handed to the planner for the non-FMA build of this codelet:
 * size 16, name "r2cfII_16", and operation counts whose first three
 * entries (54, 18, 12) match the additions / multiplications / fused
 * multiply-adds quoted in the generated comment for this variant.
 * NOTE(review): the meaning of the fourth count (0) and of GENUS is
 * defined outside this file -- confirm against the kr2c_desc declaration.
 */
Chris@42 302 static const kr2c_desc desc = { 16, "r2cfII_16", {54, 18, 12, 0}, &GENUS };
Chris@42 303
/*
 * Registration hook (non-FMA build): passes the r2cfII_16 kernel and its
 * descriptor to kr2c_register for planner p.  X() is a macro defined in the
 * included headers that wraps both identifiers.
 */
Chris@42 304 void X(codelet_r2cfII_16) (planner *p) {
Chris@42 305 X(kr2c_register) (p, r2cfII_16, &desc);
Chris@42 306 }
Chris@42 307
Chris@42 308 #endif /* HAVE_FMA */