annotate src/fftw-3.3.8/rdft/scalar/r2cf/r2cfII_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:43 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 48 FP multiplications,
Chris@82 32 * (or, 18 additions, 0 multiplications, 48 fused multiply/add),
Chris@82 33 * 32 stack variables, 7 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/r2cfII.h"
Chris@82 36
/*
 * r2cfII_16 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated kernel; per the generator command line above
 * (-n 16 -dft-II) this is the size-16 "DFT-II" real-to-complex forward
 * codelet.
 *
 * Each loop iteration reads 16 values, R0[WS(rs, k)] and R1[WS(rs, k)]
 * for k = 0..7, and writes 16 values, Cr[WS(csr, k)] and Ci[WS(csi, k)]
 * for k = 0..7.
 *
 * Parameters:
 *   R0, R1    input arrays (presumably the even- and odd-indexed input
 *             samples -- TODO confirm against the r2c codelet contract)
 *   Cr, Ci    output arrays (presumably real and imaginary parts --
 *             TODO confirm)
 *   rs        stride used to index R0 and R1
 *   csr, csi  strides used to index Cr and Ci respectively
 *   v         number of loop iterations; nothing is done when v <= 0
 *   ivs       amount added to R0 and R1 after each iteration
 *   ovs       amount added to Cr and Ci after each iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FMS, FNMS and
 * MAKE_VOLATILE_STRIDE are all defined outside this file.  FMA(a, b, c),
 * FMS(a, b, c) and FNMS(a, b, c) are presumably a*b + c, a*b - c and
 * c - a*b -- confirm against the FFTW headers before relying on it.
 */
Chris@82 37 static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 38 {
/*
 * Constants.  The literals are numerically, in order:
 * cos(pi/16), tan(pi/16), cos(3*pi/16), tan(3*pi/16),
 * cos(pi/8), tan(pi/8) (= sqrt(2) - 1), and sqrt(1/2).
 */
Chris@82 39 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 41 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 42 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 45 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 46 {
Chris@82 47 INT i;
/*
 * One transform per iteration: i counts down from v, the input pointers
 * step by ivs and the output pointers by ovs.  The MAKE_VOLATILE_STRIDE
 * invocations are evaluated as part of the loop increment; their effect
 * is defined outside this file.
 */
Chris@82 48 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* Intermediates shared between the load stages and the output stages. */
Chris@82 49 E T5, TZ, TB, TT, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
Chris@82 50 E TU;
/*
 * Stage 1: R0 elements 0, 4, 2, 6.  The difference and sum of elements
 * 2 and 6 are scaled by sqrt(1/2) and combined with elements 0 and 4,
 * giving T5, TB (from element 0) and TZ, TT (from element 4).
 */
Chris@82 51 {
Chris@82 52 E T1, TR, T4, TS, T2, T3;
Chris@82 53 T1 = R0[0];
Chris@82 54 TR = R0[WS(rs, 4)];
Chris@82 55 T2 = R0[WS(rs, 2)];
Chris@82 56 T3 = R0[WS(rs, 6)];
Chris@82 57 T4 = T2 - T3;
Chris@82 58 TS = T2 + T3;
Chris@82 59 T5 = FNMS(KP707106781, T4, T1);
Chris@82 60 TZ = FNMS(KP707106781, TS, TR);
Chris@82 61 TB = FMA(KP707106781, T4, T1);
Chris@82 62 TT = FMA(KP707106781, TS, TR);
Chris@82 63 }
/*
 * Stage 2: R1 elements 7, 3, 1, 5.  Same pattern as stage 1, with the
 * difference/sum of elements 1 and 5 combined with elements 7 and 3,
 * giving Tr, TJ, TK, Tu.
 */
Chris@82 64 {
Chris@82 65 E Tn, Ts, Tq, Tt, To, Tp;
Chris@82 66 Tn = R1[WS(rs, 7)];
Chris@82 67 Ts = R1[WS(rs, 3)];
Chris@82 68 To = R1[WS(rs, 1)];
Chris@82 69 Tp = R1[WS(rs, 5)];
Chris@82 70 Tq = To - Tp;
Chris@82 71 Tt = To + Tp;
Chris@82 72 Tr = FMA(KP707106781, Tq, Tn);
Chris@82 73 TK = FMA(KP707106781, Tt, Ts);
Chris@82 74 Tu = FNMS(KP707106781, Tt, Ts);
Chris@82 75 TJ = FMS(KP707106781, Tq, Tn);
Chris@82 76 }
/*
 * Stage 3: R1 elements 0, 4, 2, 6.  Difference/sum of elements 2 and 6
 * combined with elements 0 and 4, giving Ti, TG, TH, Tl.
 */
Chris@82 77 {
Chris@82 78 E Te, Tj, Th, Tk, Tf, Tg;
Chris@82 79 Te = R1[0];
Chris@82 80 Tj = R1[WS(rs, 4)];
Chris@82 81 Tf = R1[WS(rs, 2)];
Chris@82 82 Tg = R1[WS(rs, 6)];
Chris@82 83 Th = Tf - Tg;
Chris@82 84 Tk = Tf + Tg;
Chris@82 85 Ti = FNMS(KP707106781, Th, Te);
Chris@82 86 TH = FMA(KP707106781, Tk, Tj);
Chris@82 87 Tl = FNMS(KP707106781, Tk, Tj);
Chris@82 88 TG = FMA(KP707106781, Th, Te);
Chris@82 89 }
/*
 * Stage 4: R0 elements 5, 1, 3, 7.  Pairs (1, 5) and (3, 7) are mixed
 * using the tan(pi/8) constant, then the pair results are summed and
 * differenced into Tc, T10, TE, TU.
 */
Chris@82 90 {
Chris@82 91 E T8, TC, Tb, TD;
Chris@82 92 {
Chris@82 93 E T6, T7, T9, Ta;
Chris@82 94 T6 = R0[WS(rs, 5)];
Chris@82 95 T7 = R0[WS(rs, 1)];
Chris@82 96 T8 = FMA(KP414213562, T7, T6);
Chris@82 97 TC = FNMS(KP414213562, T6, T7);
Chris@82 98 T9 = R0[WS(rs, 3)];
Chris@82 99 Ta = R0[WS(rs, 7)];
Chris@82 100 Tb = FMA(KP414213562, Ta, T9);
Chris@82 101 TD = FMS(KP414213562, T9, Ta);
Chris@82 102 }
Chris@82 103 Tc = T8 - Tb;
Chris@82 104 T10 = TD - TC;
Chris@82 105 TE = TC + TD;
Chris@82 106 TU = T8 + Tb;
Chris@82 107 }
/*
 * Output group A: writes Cr elements 6 and 1 and Ci elements 5 and 2,
 * from T5/Tc, TZ/T10 and the R1-derived terms Ti, Tl, Tr, Tu.
 */
Chris@82 108 {
Chris@82 109 E Td, T13, Tw, T14, Tm, Tv;
Chris@82 110 Td = FMA(KP923879532, Tc, T5);
Chris@82 111 T13 = FNMS(KP923879532, T10, TZ);
Chris@82 112 Tm = FMA(KP668178637, Tl, Ti);
Chris@82 113 Tv = FMA(KP668178637, Tu, Tr);
Chris@82 114 Tw = Tm - Tv;
Chris@82 115 T14 = Tm + Tv;
Chris@82 116 Cr[WS(csr, 6)] = FNMS(KP831469612, Tw, Td);
Chris@82 117 Ci[WS(csi, 5)] = FNMS(KP831469612, T14, T13);
Chris@82 118 Cr[WS(csr, 1)] = FMA(KP831469612, Tw, Td);
Chris@82 119 Ci[WS(csi, 2)] = -(FMA(KP831469612, T14, T13));
Chris@82 120 }
/*
 * Output group B: writes Cr elements 5 and 2 and Ci elements 1 and 6,
 * from the same inputs as group A combined with the opposite signs.
 */
Chris@82 121 {
Chris@82 122 E Tx, T11, TA, T12, Ty, Tz;
Chris@82 123 Tx = FNMS(KP923879532, Tc, T5);
Chris@82 124 T11 = FMA(KP923879532, T10, TZ);
Chris@82 125 Ty = FNMS(KP668178637, Tr, Tu);
Chris@82 126 Tz = FNMS(KP668178637, Ti, Tl);
Chris@82 127 TA = Ty - Tz;
Chris@82 128 T12 = Tz + Ty;
Chris@82 129 Cr[WS(csr, 5)] = FNMS(KP831469612, TA, Tx);
Chris@82 130 Ci[WS(csi, 1)] = FMA(KP831469612, T12, T11);
Chris@82 131 Cr[WS(csr, 2)] = FMA(KP831469612, TA, Tx);
Chris@82 132 Ci[WS(csi, 6)] = FMS(KP831469612, T12, T11);
Chris@82 133 }
/*
 * Output group C: writes Cr elements 7 and 0 and Ci elements 3 and 4,
 * from TB/TE, TT/TU and the R1-derived terms TG, TH, TJ, TK.
 */
Chris@82 134 {
Chris@82 135 E TF, TX, TM, TY, TI, TL;
Chris@82 136 TF = FMA(KP923879532, TE, TB);
Chris@82 137 TX = FNMS(KP923879532, TU, TT);
Chris@82 138 TI = FNMS(KP198912367, TH, TG);
Chris@82 139 TL = FMA(KP198912367, TK, TJ);
Chris@82 140 TM = TI + TL;
Chris@82 141 TY = TL - TI;
Chris@82 142 Cr[WS(csr, 7)] = FNMS(KP980785280, TM, TF);
Chris@82 143 Ci[WS(csi, 3)] = FMA(KP980785280, TY, TX);
Chris@82 144 Cr[0] = FMA(KP980785280, TM, TF);
Chris@82 145 Ci[WS(csi, 4)] = FMS(KP980785280, TY, TX);
Chris@82 146 }
/*
 * Output group D: writes Cr elements 4 and 3 and Ci elements 7 and 0,
 * from the same inputs as group C combined with the opposite signs.
 */
Chris@82 147 {
Chris@82 148 E TN, TV, TQ, TW, TO, TP;
Chris@82 149 TN = FNMS(KP923879532, TE, TB);
Chris@82 150 TV = FMA(KP923879532, TU, TT);
Chris@82 151 TO = FMA(KP198912367, TG, TH);
Chris@82 152 TP = FNMS(KP198912367, TJ, TK);
Chris@82 153 TQ = TO - TP;
Chris@82 154 TW = TO + TP;
Chris@82 155 Cr[WS(csr, 4)] = FNMS(KP980785280, TQ, TN);
Chris@82 156 Ci[WS(csi, 7)] = FNMS(KP980785280, TW, TV);
Chris@82 157 Cr[WS(csr, 3)] = FMA(KP980785280, TQ, TN);
Chris@82 158 Ci[0] = -(FMA(KP980785280, TW, TV));
Chris@82 159 }
Chris@82 160 }
Chris@82 161 }
Chris@82 162 }
Chris@82 163
/*
 * Descriptor for the FMA variant of the codelet.  Fields as written:
 * 16, the name string "r2cfII_16", the operation counts {18, 0, 48, 0},
 * and the address of GENUS (defined outside this file).  The first three
 * counts match the "18 additions, 0 multiplications, 48 fused
 * multiply/add" figure in the generator comment for this variant.
 * NOTE(review): the leading 16 is presumably the transform size and the
 * fourth count is unexplained here -- confirm against the kr2c_desc
 * declaration.
 */
Chris@82 164 static const kr2c_desc desc = { 16, "r2cfII_16", {18, 0, 48, 0}, &GENUS };
Chris@82 165
/*
 * Registration entry point: passes the planner p, the r2cfII_16 kernel
 * and its descriptor to X(kr2c_register).  X() is a macro defined
 * outside this file (presumably a symbol-name prefix -- confirm).
 */
Chris@82 166 void X(codelet_r2cfII_16) (planner *p) {
Chris@82 167 X(kr2c_register) (p, r2cfII_16, &desc);
Chris@82 168 }
Chris@82 169
Chris@82 170 #else
Chris@82 171
Chris@82 172 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
Chris@82 173
Chris@82 174 /*
Chris@82 175 * This function contains 66 FP additions, 30 FP multiplications,
Chris@82 176 * (or, 54 additions, 18 multiplications, 12 fused multiply/add),
Chris@82 177 * 32 stack variables, 7 constants, and 32 memory accesses
Chris@82 178 */
Chris@82 179 #include "rdft/scalar/r2cfII.h"
Chris@82 180
/*
 * r2cfII_16 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Machine-generated kernel; per the generator command line above
 * (-n 16 -dft-II, without -fma) this is the size-16 "DFT-II"
 * real-to-complex forward codelet.
 *
 * Each loop iteration reads 16 values, R0[WS(rs, k)] and R1[WS(rs, k)]
 * for k = 0..7, and writes 16 values, Cr[WS(csr, k)] and Ci[WS(csi, k)]
 * for k = 0..7.
 *
 * Parameters:
 *   R0, R1    input arrays (presumably the even- and odd-indexed input
 *             samples -- TODO confirm against the r2c codelet contract)
 *   Cr, Ci    output arrays (presumably real and imaginary parts --
 *             TODO confirm)
 *   rs        stride used to index R0 and R1
 *   csr, csi  strides used to index Cr and Ci respectively
 *   v         number of loop iterations; nothing is done when v <= 0
 *   ivs       amount added to R0 and R1 after each iteration
 *   ovs       amount added to Cr and Ci after each iteration
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are all defined outside this file.  FMA(a, b, c)
 * and FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm
 * against the FFTW headers before relying on it.
 */
Chris@82 181 static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@82 182 {
/*
 * Constants.  The literals are numerically, in order:
 * sin(3*pi/16), cos(3*pi/16), cos(pi/16), sin(pi/16),
 * sin(pi/8), cos(pi/8), and sqrt(1/2).
 */
Chris@82 183 DK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 184 DK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 185 DK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 186 DK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 187 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 188 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 189 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 190 {
Chris@82 191 INT i;
/*
 * One transform per iteration: i counts down from v, the input pointers
 * step by ivs and the output pointers by ovs.  The MAKE_VOLATILE_STRIDE
 * invocations are evaluated as part of the loop increment; their effect
 * is defined outside this file.
 */
Chris@82 192 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
/* Intermediates shared between the load stages and the output stages. */
Chris@82 193 E T5, T11, TB, TV, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
Chris@82 194 E TS;
/*
 * Stage 1: R0 elements 0, 4, 2, 6.  The difference and sum of elements
 * 2 and 6 are scaled by sqrt(1/2) and added to / subtracted from
 * elements 0 and 4, giving T5, TB, T11, TV.
 */
Chris@82 195 {
Chris@82 196 E T1, TU, T4, TT, T2, T3;
Chris@82 197 T1 = R0[0];
Chris@82 198 TU = R0[WS(rs, 4)];
Chris@82 199 T2 = R0[WS(rs, 2)];
Chris@82 200 T3 = R0[WS(rs, 6)];
Chris@82 201 T4 = KP707106781 * (T2 - T3);
Chris@82 202 TT = KP707106781 * (T2 + T3);
Chris@82 203 T5 = T1 + T4;
Chris@82 204 T11 = TU - TT;
Chris@82 205 TB = T1 - T4;
Chris@82 206 TV = TT + TU;
Chris@82 207 }
/*
 * Stage 2: R1 elements 7, 3, 1, 5.  Same pattern as stage 1, with the
 * scaled difference/sum of elements 1 and 5 combined with elements 7
 * and 3, giving Tr, TJ, TK, Tu.
 */
Chris@82 208 {
Chris@82 209 E Tq, Tt, Tp, Ts, Tn, To;
Chris@82 210 Tq = R1[WS(rs, 7)];
Chris@82 211 Tt = R1[WS(rs, 3)];
Chris@82 212 Tn = R1[WS(rs, 1)];
Chris@82 213 To = R1[WS(rs, 5)];
Chris@82 214 Tp = KP707106781 * (Tn - To);
Chris@82 215 Ts = KP707106781 * (Tn + To);
Chris@82 216 Tr = Tp - Tq;
Chris@82 217 TK = Tt - Ts;
Chris@82 218 Tu = Ts + Tt;
Chris@82 219 TJ = Tp + Tq;
Chris@82 220 }
/*
 * Stage 3: R1 elements 0, 4, 2, 6.  Scaled difference/sum of elements
 * 2 and 6 combined with elements 0 and 4, giving Ti, TG, TH, Tl.
 */
Chris@82 221 {
Chris@82 222 E Te, Tk, Th, Tj, Tf, Tg;
Chris@82 223 Te = R1[0];
Chris@82 224 Tk = R1[WS(rs, 4)];
Chris@82 225 Tf = R1[WS(rs, 2)];
Chris@82 226 Tg = R1[WS(rs, 6)];
Chris@82 227 Th = KP707106781 * (Tf - Tg);
Chris@82 228 Tj = KP707106781 * (Tf + Tg);
Chris@82 229 Ti = Te + Th;
Chris@82 230 TH = Tk - Tj;
Chris@82 231 Tl = Tj + Tk;
Chris@82 232 TG = Te - Th;
Chris@82 233 }
/*
 * Stage 4: R0 elements 1, 5, 3, 7.  Pairs (1, 5) and (3, 7) are mixed
 * using the cos(pi/8) / sin(pi/8) constants, then the pair results are
 * summed and differenced into Tc, T10, TE, TS.
 */
Chris@82 234 {
Chris@82 235 E T8, TC, Tb, TD;
Chris@82 236 {
Chris@82 237 E T6, T7, T9, Ta;
Chris@82 238 T6 = R0[WS(rs, 1)];
Chris@82 239 T7 = R0[WS(rs, 5)];
Chris@82 240 T8 = FNMS(KP382683432, T7, KP923879532 * T6);
Chris@82 241 TC = FMA(KP382683432, T6, KP923879532 * T7);
Chris@82 242 T9 = R0[WS(rs, 3)];
Chris@82 243 Ta = R0[WS(rs, 7)];
Chris@82 244 Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
Chris@82 245 TD = FMA(KP923879532, T9, KP382683432 * Ta);
Chris@82 246 }
Chris@82 247 Tc = T8 + Tb;
Chris@82 248 T10 = Tb - T8;
Chris@82 249 TE = TC - TD;
Chris@82 250 TS = TC + TD;
Chris@82 251 }
/*
 * Output group A: writes Cr elements 4 and 3 and Ci elements 7 and 0,
 * from T5/Tc, TS/TV and the R1-derived terms Ti, Tl, Tr, Tu.
 */
Chris@82 252 {
Chris@82 253 E Td, TW, Tw, TR, Tm, Tv;
Chris@82 254 Td = T5 - Tc;
Chris@82 255 TW = TS + TV;
Chris@82 256 Tm = FMA(KP195090322, Ti, KP980785280 * Tl);
Chris@82 257 Tv = FNMS(KP980785280, Tu, KP195090322 * Tr);
Chris@82 258 Tw = Tm + Tv;
Chris@82 259 TR = Tv - Tm;
Chris@82 260 Cr[WS(csr, 4)] = Td - Tw;
Chris@82 261 Ci[WS(csi, 7)] = TR + TW;
Chris@82 262 Cr[WS(csr, 3)] = Td + Tw;
Chris@82 263 Ci[0] = TR - TW;
Chris@82 264 }
/*
 * Output group B: writes Cr elements 7 and 0 and Ci elements 3 and 4,
 * from the same inputs as group A with the complementary combination.
 */
Chris@82 265 {
Chris@82 266 E Tx, TY, TA, TX, Ty, Tz;
Chris@82 267 Tx = T5 + Tc;
Chris@82 268 TY = TV - TS;
Chris@82 269 Ty = FNMS(KP195090322, Tl, KP980785280 * Ti);
Chris@82 270 Tz = FMA(KP980785280, Tr, KP195090322 * Tu);
Chris@82 271 TA = Ty + Tz;
Chris@82 272 TX = Tz - Ty;
Chris@82 273 Cr[WS(csr, 7)] = Tx - TA;
Chris@82 274 Ci[WS(csi, 3)] = TX + TY;
Chris@82 275 Cr[0] = Tx + TA;
Chris@82 276 Ci[WS(csi, 4)] = TX - TY;
Chris@82 277 }
/*
 * Output group C: writes Cr elements 6 and 1 and Ci elements 2 and 5,
 * from TB/TE, T10/T11 and the R1-derived terms TG, TH, TJ, TK.
 */
Chris@82 278 {
Chris@82 279 E TF, T12, TM, TZ, TI, TL;
Chris@82 280 TF = TB + TE;
Chris@82 281 T12 = T10 - T11;
Chris@82 282 TI = FMA(KP831469612, TG, KP555570233 * TH);
Chris@82 283 TL = FMA(KP831469612, TJ, KP555570233 * TK);
Chris@82 284 TM = TI - TL;
Chris@82 285 TZ = TI + TL;
Chris@82 286 Cr[WS(csr, 6)] = TF - TM;
Chris@82 287 Ci[WS(csi, 2)] = T12 - TZ;
Chris@82 288 Cr[WS(csr, 1)] = TF + TM;
Chris@82 289 Ci[WS(csi, 5)] = -(TZ + T12);
Chris@82 290 }
/*
 * Output group D: writes Cr elements 5 and 2 and Ci elements 1 and 6,
 * from the same inputs as group C with the complementary combination.
 */
Chris@82 291 {
Chris@82 292 E TN, T14, TQ, T13, TO, TP;
Chris@82 293 TN = TB - TE;
Chris@82 294 T14 = T10 + T11;
Chris@82 295 TO = FNMS(KP555570233, TJ, KP831469612 * TK);
Chris@82 296 TP = FNMS(KP555570233, TG, KP831469612 * TH);
Chris@82 297 TQ = TO - TP;
Chris@82 298 T13 = TP + TO;
Chris@82 299 Cr[WS(csr, 5)] = TN - TQ;
Chris@82 300 Ci[WS(csi, 1)] = T13 + T14;
Chris@82 301 Cr[WS(csr, 2)] = TN + TQ;
Chris@82 302 Ci[WS(csi, 6)] = T13 - T14;
Chris@82 303 }
Chris@82 304 }
Chris@82 305 }
Chris@82 306 }
Chris@82 307
/*
 * Descriptor for the non-FMA variant of the codelet.  Fields as written:
 * 16, the name string "r2cfII_16", the operation counts {54, 18, 12, 0},
 * and the address of GENUS (defined outside this file).  The first three
 * counts match the "54 additions, 18 multiplications, 12 fused
 * multiply/add" figure in the generator comment for this variant.
 * NOTE(review): the leading 16 is presumably the transform size and the
 * fourth count is unexplained here -- confirm against the kr2c_desc
 * declaration.
 */
Chris@82 308 static const kr2c_desc desc = { 16, "r2cfII_16", {54, 18, 12, 0}, &GENUS };
Chris@82 309
/*
 * Registration entry point: passes the planner p, the r2cfII_16 kernel
 * and its descriptor to X(kr2c_register).  X() is a macro defined
 * outside this file (presumably a symbol-name prefix -- confirm).
 */
Chris@82 310 void X(codelet_r2cfII_16) (planner *p) {
Chris@82 311 X(kr2c_register) (p, r2cfII_16, &desc);
Chris@82 312 }
Chris@82 313
Chris@82 314 #endif