annotate src/fftw-3.3.5/rdft/scalar/r2cf/r2cf_20.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:46:10 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cf.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 86 FP additions, 32 FP multiplications,
Chris@42 32 * (or, 58 additions, 4 multiplications, 28 fused multiply/add),
Chris@42 33 * 70 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cf.h"
Chris@42 36
/*
 * r2cf_20, HAVE_FMA build: straight-line kernel emitted by genfft ("-n 20").
 *
 * Every pass of the loop loads ten values from R0 and ten from R1 (element k
 * of either array is addressed as WS(rs, k), k = 0..9) and stores eleven
 * values into Cr (WS(csr, 0..10)) and nine into Ci (WS(csi, 1..9)).
 * Ci entries 0 and 10 are never written by this function.
 *
 * Parameters:
 *   R0, R1    input arrays; this function only reads from them
 *   Cr, Ci    output arrays; this function only writes to them
 *   rs        stride handed to WS() when indexing R0 and R1
 *   csr, csi  strides handed to WS() when indexing Cr and Ci respectively
 *   v         loop trip count; the body is skipped entirely when v <= 0
 *   ivs       amount added to R0 and R1 after each pass
 *   ovs       amount added to Cr and Ci after each pass
 *
 * NOTE(review): judging by the name and generator options this is presumably
 * the size-20 real-to-complex forward transform, with R0/R1 holding the
 * even/odd input samples and Cr/Ci the real/imaginary outputs -- confirm
 * against the FFTW codelet documentation.
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are defined outside this file. FMA(a,b,c), FNMS(a,b,c)
 * and FMS(a,b,c) are presumably a*b+c, c-a*b and a*b-c -- verify in the
 * included headers before relying on that.
 */
Chris@42 37 static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Named constants used as multiplicands below. Their literals are numerically
 * sqrt(5)/4, 1/4, (sqrt(5)-1)/2 and sin(2*pi/5), in that order.
 */
Chris@42 39 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 40 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 41 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 42 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 43 {
Chris@42 44 INT i;
/* Count i down from v; all pointer advances live in the for-increment clause. */
Chris@42 45 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
/* Declared at loop scope because they are consumed after the inner block closes. */
Chris@42 46 E T1i, T1c, T1a, T1o, T1m, T1h, T1b, T13, T1j, T1n;
Chris@42 47 {
Chris@42 48 E T3, T1d, TJ, TV, T1k, T16, T19, T1l, Ty, Ti, T12, TD, T1g, TR, TX;
Chris@42 49 E TK, Tt, TU, TW, TL, TE;
Chris@42 50 {
Chris@42 51 E T1, T2, TG, TH;
/* Input loads are interleaved with the first layer of pairwise sums and differences. */
Chris@42 52 T1 = R0[0];
Chris@42 53 T2 = R0[WS(rs, 5)];
Chris@42 54 TG = R1[WS(rs, 2)];
Chris@42 55 TH = R1[WS(rs, 7)];
Chris@42 56 {
Chris@42 57 E T6, To, T17, Tx, T18, TC, Tj, T9, Tp, Tu, Td, T15, Tm, Tq, Te;
Chris@42 58 E Tf;
Chris@42 59 {
Chris@42 60 E TA, TB, T7, T8;
Chris@42 61 {
Chris@42 62 E T4, TF, TI, T5, Tv, Tw;
Chris@42 63 T4 = R0[WS(rs, 2)];
Chris@42 64 T3 = T1 - T2;
Chris@42 65 TF = T1 + T2;
Chris@42 66 T1d = TG - TH;
Chris@42 67 TI = TG + TH;
Chris@42 68 T5 = R0[WS(rs, 7)];
Chris@42 69 Tv = R1[WS(rs, 6)];
Chris@42 70 Tw = R1[WS(rs, 1)];
Chris@42 71 TJ = TF - TI;
Chris@42 72 TV = TF + TI;
Chris@42 73 T6 = T4 - T5;
Chris@42 74 To = T4 + T5;
Chris@42 75 T17 = Tw - Tv;
Chris@42 76 Tx = Tv + Tw;
Chris@42 77 }
Chris@42 78 TA = R1[WS(rs, 8)];
Chris@42 79 TB = R1[WS(rs, 3)];
Chris@42 80 T7 = R0[WS(rs, 8)];
Chris@42 81 T8 = R0[WS(rs, 3)];
Chris@42 82 {
Chris@42 83 E Tb, Tc, Tk, Tl;
Chris@42 84 Tb = R0[WS(rs, 4)];
Chris@42 85 T18 = TB - TA;
Chris@42 86 TC = TA + TB;
Chris@42 87 Tj = T7 + T8;
Chris@42 88 T9 = T7 - T8;
Chris@42 89 Tc = R0[WS(rs, 9)];
Chris@42 90 Tk = R1[0];
Chris@42 91 Tl = R1[WS(rs, 5)];
Chris@42 92 Tp = R1[WS(rs, 4)];
Chris@42 93 Tu = Tb + Tc;
Chris@42 94 Td = Tb - Tc;
Chris@42 95 T15 = Tl - Tk;
Chris@42 96 Tm = Tk + Tl;
Chris@42 97 Tq = R1[WS(rs, 9)];
Chris@42 98 Te = R0[WS(rs, 6)];
Chris@42 99 Tf = R0[WS(rs, 1)];
Chris@42 100 }
Chris@42 101 }
Chris@42 102 {
/* All twenty inputs are loaded by this point; what follows only combines temporaries. */
Chris@42 103 E Ta, Tr, Tz, T1e, T1f, Th, T14, Tg, TP, TQ;
Chris@42 104 Ta = T6 + T9;
Chris@42 105 T1k = T6 - T9;
Chris@42 106 T14 = Tq - Tp;
Chris@42 107 Tr = Tp + Tq;
Chris@42 108 Tz = Te + Tf;
Chris@42 109 Tg = Te - Tf;
Chris@42 110 T16 = T14 - T15;
Chris@42 111 T1e = T14 + T15;
Chris@42 112 T1f = T17 + T18;
Chris@42 113 T19 = T17 - T18;
Chris@42 114 Th = Td + Tg;
Chris@42 115 T1l = Td - Tg;
Chris@42 116 Ty = Tu - Tx;
Chris@42 117 TP = Tu + Tx;
Chris@42 118 Ti = Ta + Th;
Chris@42 119 T12 = Ta - Th;
Chris@42 120 TD = Tz - TC;
Chris@42 121 TQ = Tz + TC;
Chris@42 122 T1g = T1e + T1f;
Chris@42 123 T1i = T1e - T1f;
Chris@42 124 {
Chris@42 125 E TT, Tn, Ts, TS;
Chris@42 126 TT = Tj + Tm;
Chris@42 127 Tn = Tj - Tm;
Chris@42 128 Ts = To - Tr;
Chris@42 129 TS = To + Tr;
Chris@42 130 TR = TP - TQ;
Chris@42 131 TX = TP + TQ;
Chris@42 132 TK = Ts + Tn;
Chris@42 133 Tt = Tn - Ts;
Chris@42 134 TU = TS - TT;
Chris@42 135 TW = TS + TT;
Chris@42 136 }
Chris@42 137 }
Chris@42 138 }
Chris@42 139 }
/* Index-5 outputs need no constant multiplications: plain sum / difference. */
Chris@42 140 Cr[WS(csr, 5)] = T3 + Ti;
Chris@42 141 Ci[WS(csi, 5)] = T1g - T1d;
Chris@42 142 TL = Ty + TD;
Chris@42 143 TE = Ty - TD;
Chris@42 144 {
Chris@42 145 E TY, T10, TM, TO, T11, TZ, TN;
Chris@42 146 TY = TW + TX;
Chris@42 147 T10 = TW - TX;
/* Ci at indices 2, 6, 4, 8: KP951056516 scales an FMA/FNMS weighted by KP618033988; index 8 is negated. */
Chris@42 148 Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, Tt, TE));
Chris@42 149 Ci[WS(csi, 6)] = KP951056516 * (FNMS(KP618033988, TE, Tt));
Chris@42 150 Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, TR, TU));
Chris@42 151 Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP618033988, TU, TR)));
Chris@42 152 TM = TK + TL;
Chris@42 153 TO = TK - TL;
/* T1c and T1a are kept for the odd-index Cr stores after this block. */
Chris@42 154 T1c = FNMS(KP618033988, T16, T19);
Chris@42 155 T1a = FMA(KP618033988, T19, T16);
/* Cr at indices 0, 10, 8, 4, 6, 2. */
Chris@42 156 Cr[0] = TV + TY;
Chris@42 157 TZ = FNMS(KP250000000, TY, TV);
Chris@42 158 Cr[WS(csr, 10)] = TJ + TM;
Chris@42 159 TN = FNMS(KP250000000, TM, TJ);
Chris@42 160 Cr[WS(csr, 8)] = FNMS(KP559016994, T10, TZ);
Chris@42 161 Cr[WS(csr, 4)] = FMA(KP559016994, T10, TZ);
Chris@42 162 Cr[WS(csr, 6)] = FMA(KP559016994, TO, TN);
Chris@42 163 Cr[WS(csr, 2)] = FNMS(KP559016994, TO, TN);
/* Remaining temporaries feed the odd-index stores issued once the block closes. */
Chris@42 164 T11 = FNMS(KP250000000, Ti, T3);
Chris@42 165 T1o = FNMS(KP618033988, T1k, T1l);
Chris@42 166 T1m = FMA(KP618033988, T1l, T1k);
Chris@42 167 T1h = FMA(KP250000000, T1g, T1d);
Chris@42 168 T1b = FNMS(KP559016994, T12, T11);
Chris@42 169 T13 = FMA(KP559016994, T12, T11);
Chris@42 170 }
Chris@42 171 }
/* Cr at indices 3, 7, 1, 9. */
Chris@42 172 Cr[WS(csr, 3)] = FNMS(KP951056516, T1c, T1b);
Chris@42 173 Cr[WS(csr, 7)] = FMA(KP951056516, T1c, T1b);
Chris@42 174 Cr[WS(csr, 1)] = FMA(KP951056516, T1a, T13);
Chris@42 175 Cr[WS(csr, 9)] = FNMS(KP951056516, T1a, T13);
Chris@42 176 T1j = FNMS(KP559016994, T1i, T1h);
Chris@42 177 T1n = FMA(KP559016994, T1i, T1h);
/* Ci at indices 3, 7, 9, 1; index 9 uses FMS and index 1 is an explicitly negated FMA. */
Chris@42 178 Ci[WS(csi, 3)] = FNMS(KP951056516, T1o, T1n);
Chris@42 179 Ci[WS(csi, 7)] = FMA(KP951056516, T1o, T1n);
Chris@42 180 Ci[WS(csi, 9)] = FMS(KP951056516, T1m, T1j);
Chris@42 181 Ci[WS(csi, 1)] = -(FMA(KP951056516, T1m, T1j));
Chris@42 182 }
Chris@42 183 }
Chris@42 184 }
Chris@42 185
/*
 * Descriptor handed to X(kr2c_register) together with r2cf_20. It carries the
 * value 20, the name string "r2cf_20", the tuple {58, 4, 28, 0} and &GENUS.
 * The first three tuple entries equal the add / multiply / fused-multiply-add
 * counts quoted in the generator comment for the HAVE_FMA kernel.
 * NOTE(review): the kr2c_desc field names and GENUS are declared outside this
 * file; the meaning of the fourth tuple entry is not visible here -- confirm.
 */
Chris@42 186 static const kr2c_desc desc = { 20, "r2cf_20", {58, 4, 28, 0}, &GENUS };
Chris@42 187
/*
 * Registration entry point for the HAVE_FMA build: forwards planner p, the
 * r2cf_20 kernel and its descriptor to X(kr2c_register). Returns nothing.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-name prefixing helper -- confirm in the FFTW headers.
 */
Chris@42 188 void X(codelet_r2cf_20) (planner *p) {
Chris@42 189 X(kr2c_register) (p, r2cf_20, &desc);
Chris@42 190 }
Chris@42 191
Chris@42 192 #else /* HAVE_FMA */
Chris@42 193
Chris@42 194 /* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include r2cf.h */
Chris@42 195
Chris@42 196 /*
Chris@42 197 * This function contains 86 FP additions, 24 FP multiplications,
Chris@42 198 * (or, 74 additions, 12 multiplications, 12 fused multiply/add),
Chris@42 199 * 51 stack variables, 4 constants, and 40 memory accesses
Chris@42 200 */
Chris@42 201 #include "r2cf.h"
Chris@42 202
/*
 * r2cf_20, build without HAVE_FMA: straight-line kernel emitted by genfft
 * ("-n 20", no -fma option).
 *
 * Every pass of the loop loads ten values from R0 and ten from R1 (element k
 * of either array is addressed as WS(rs, k), k = 0..9) and stores eleven
 * values into Cr (WS(csr, 0..10)) and nine into Ci (WS(csi, 1..9)).
 * Ci entries 0 and 10 are never written by this function.
 *
 * Parameters:
 *   R0, R1    input arrays; this function only reads from them
 *   Cr, Ci    output arrays; this function only writes to them
 *   rs        stride handed to WS() when indexing R0 and R1
 *   csr, csi  strides handed to WS() when indexing Cr and Ci respectively
 *   v         loop trip count; the body is skipped entirely when v <= 0
 *   ivs       amount added to R0 and R1 after each pass
 *   ovs       amount added to Cr and Ci after each pass
 *
 * NOTE(review): judging by the name and generator options this is presumably
 * the size-20 real-to-complex forward transform, with R0/R1 holding the
 * even/odd input samples and Cr/Ci the real/imaginary outputs -- confirm
 * against the FFTW codelet documentation.
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file. The FMA/FNMS macros are still used in this
 * branch; presumably FMA(a,b,c) is a*b+c and FNMS(a,b,c) is c-a*b -- verify
 * in the included headers.
 */
Chris@42 203 static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 204 {
/*
 * Named constants used as multiplicands below. Their literals are numerically
 * 1/4, sqrt(5)/4, sin(pi/5) and sin(2*pi/5), in that order.
 */
Chris@42 205 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 206 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 207 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 208 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 209 {
Chris@42 210 INT i;
/* Count i down from v; all pointer advances live in the for-increment clause. */
Chris@42 211 for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
Chris@42 212 E T3, T1m, TF, T17, Ts, TM, TN, Tz, Ta, Th, Ti, T1g, T1h, T1k, T10;
Chris@42 213 E T13, T19, TG, TH, TI, T1d, T1e, T1j, TT, TW, T18;
Chris@42 214 {
/* Inputs R0[0], R0[5], R1[7], R1[2] and their sums / differences. */
Chris@42 215 E T1, T2, T15, TD, TE, T16;
Chris@42 216 T1 = R0[0];
Chris@42 217 T2 = R0[WS(rs, 5)];
Chris@42 218 T15 = T1 + T2;
Chris@42 219 TD = R1[WS(rs, 7)];
Chris@42 220 TE = R1[WS(rs, 2)];
Chris@42 221 T16 = TE + TD;
Chris@42 222 T3 = T1 - T2;
Chris@42 223 T1m = T15 + T16;
Chris@42 224 TF = TD - TE;
Chris@42 225 T17 = T15 - T16;
Chris@42 226 }
Chris@42 227 {
Chris@42 228 E T6, TU, Tv, T12, Ty, TZ, T9, TR, Td, TY, To, TS, Tr, TV, Tg;
Chris@42 229 E T11;
/* The four small blocks below each load two input pairs and keep the difference and sum of each pair. */
Chris@42 230 {
Chris@42 231 E T4, T5, Tt, Tu;
Chris@42 232 T4 = R0[WS(rs, 2)];
Chris@42 233 T5 = R0[WS(rs, 7)];
Chris@42 234 T6 = T4 - T5;
Chris@42 235 TU = T4 + T5;
Chris@42 236 Tt = R1[WS(rs, 8)];
Chris@42 237 Tu = R1[WS(rs, 3)];
Chris@42 238 Tv = Tt - Tu;
Chris@42 239 T12 = Tt + Tu;
Chris@42 240 }
Chris@42 241 {
Chris@42 242 E Tw, Tx, T7, T8;
Chris@42 243 Tw = R1[WS(rs, 6)];
Chris@42 244 Tx = R1[WS(rs, 1)];
Chris@42 245 Ty = Tw - Tx;
Chris@42 246 TZ = Tw + Tx;
Chris@42 247 T7 = R0[WS(rs, 8)];
Chris@42 248 T8 = R0[WS(rs, 3)];
Chris@42 249 T9 = T7 - T8;
Chris@42 250 TR = T7 + T8;
Chris@42 251 }
Chris@42 252 {
Chris@42 253 E Tb, Tc, Tm, Tn;
Chris@42 254 Tb = R0[WS(rs, 4)];
Chris@42 255 Tc = R0[WS(rs, 9)];
Chris@42 256 Td = Tb - Tc;
Chris@42 257 TY = Tb + Tc;
Chris@42 258 Tm = R1[0];
Chris@42 259 Tn = R1[WS(rs, 5)];
Chris@42 260 To = Tm - Tn;
Chris@42 261 TS = Tm + Tn;
Chris@42 262 }
Chris@42 263 {
Chris@42 264 E Tp, Tq, Te, Tf;
Chris@42 265 Tp = R1[WS(rs, 4)];
Chris@42 266 Tq = R1[WS(rs, 9)];
Chris@42 267 Tr = Tp - Tq;
Chris@42 268 TV = Tp + Tq;
Chris@42 269 Te = R0[WS(rs, 6)];
Chris@42 270 Tf = R0[WS(rs, 1)];
Chris@42 271 Tg = Te - Tf;
Chris@42 272 T11 = Te + Tf;
Chris@42 273 }
/* All twenty inputs are loaded by this point; second layer of sums and differences. */
Chris@42 274 Ts = To - Tr;
Chris@42 275 TM = T6 - T9;
Chris@42 276 TN = Td - Tg;
Chris@42 277 Tz = Tv - Ty;
Chris@42 278 Ta = T6 + T9;
Chris@42 279 Th = Td + Tg;
Chris@42 280 Ti = Ta + Th;
Chris@42 281 T1g = TY + TZ;
Chris@42 282 T1h = T11 + T12;
Chris@42 283 T1k = T1g + T1h;
Chris@42 284 T10 = TY - TZ;
Chris@42 285 T13 = T11 - T12;
Chris@42 286 T19 = T10 + T13;
Chris@42 287 TG = Tr + To;
Chris@42 288 TH = Ty + Tv;
Chris@42 289 TI = TG + TH;
Chris@42 290 T1d = TU + TV;
Chris@42 291 T1e = TR + TS;
Chris@42 292 T1j = T1d + T1e;
Chris@42 293 TT = TR - TS;
Chris@42 294 TW = TU - TV;
Chris@42 295 T18 = TW + TT;
Chris@42 296 }
/* Index-5 outputs need no constant multiplications: plain sum / difference. */
Chris@42 297 Cr[WS(csr, 5)] = T3 + Ti;
Chris@42 298 Ci[WS(csi, 5)] = TF - TI;
Chris@42 299 {
/* Ci at indices 6, 2, 8, 4: two-term combinations weighted by KP587785252 and KP951056516. */
Chris@42 300 E TX, T14, T1f, T1i;
Chris@42 301 TX = TT - TW;
Chris@42 302 T14 = T10 - T13;
Chris@42 303 Ci[WS(csi, 6)] = FNMS(KP587785252, T14, KP951056516 * TX);
Chris@42 304 Ci[WS(csi, 2)] = FMA(KP587785252, TX, KP951056516 * T14);
Chris@42 305 T1f = T1d - T1e;
Chris@42 306 T1i = T1g - T1h;
Chris@42 307 Ci[WS(csi, 8)] = FNMS(KP951056516, T1i, KP587785252 * T1f);
Chris@42 308 Ci[WS(csi, 4)] = FMA(KP951056516, T1f, KP587785252 * T1i);
Chris@42 309 }
Chris@42 310 {
/* Cr at indices 4, 0, 8 and then 2, 10, 6: each triple shares one KP559016994 term and one KP250000000 term. */
Chris@42 311 E T1l, T1n, T1o, T1c, T1a, T1b;
Chris@42 312 T1l = KP559016994 * (T1j - T1k);
Chris@42 313 T1n = T1j + T1k;
Chris@42 314 T1o = FNMS(KP250000000, T1n, T1m);
Chris@42 315 Cr[WS(csr, 4)] = T1l + T1o;
Chris@42 316 Cr[0] = T1m + T1n;
Chris@42 317 Cr[WS(csr, 8)] = T1o - T1l;
Chris@42 318 T1c = KP559016994 * (T18 - T19);
Chris@42 319 T1a = T18 + T19;
Chris@42 320 T1b = FNMS(KP250000000, T1a, T17);
Chris@42 321 Cr[WS(csr, 2)] = T1b - T1c;
Chris@42 322 Cr[WS(csr, 10)] = T17 + T1a;
Chris@42 323 Cr[WS(csr, 6)] = T1c + T1b;
Chris@42 324 }
Chris@42 325 {
/* Cr at indices 9, 7, 1, 3, built from Ts, Tz, Ta, Th, Ti and T3. */
Chris@42 326 E TA, TC, Tl, TB, Tj, Tk;
Chris@42 327 TA = FMA(KP951056516, Ts, KP587785252 * Tz);
Chris@42 328 TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
Chris@42 329 Tj = KP559016994 * (Ta - Th);
Chris@42 330 Tk = FNMS(KP250000000, Ti, T3);
Chris@42 331 Tl = Tj + Tk;
Chris@42 332 TB = Tk - Tj;
Chris@42 333 Cr[WS(csr, 9)] = Tl - TA;
Chris@42 334 Cr[WS(csr, 7)] = TB + TC;
Chris@42 335 Cr[WS(csr, 1)] = Tl + TA;
Chris@42 336 Cr[WS(csr, 3)] = TB - TC;
Chris@42 337 }
Chris@42 338 {
/* Ci at indices 1, 7, 9, 3, built from TM, TN, TI, TF, TH and TG. */
Chris@42 339 E TO, TQ, TL, TP, TJ, TK;
Chris@42 340 TO = FMA(KP951056516, TM, KP587785252 * TN);
Chris@42 341 TQ = FNMS(KP587785252, TM, KP951056516 * TN);
Chris@42 342 TJ = FMA(KP250000000, TI, TF);
Chris@42 343 TK = KP559016994 * (TH - TG);
Chris@42 344 TL = TJ + TK;
Chris@42 345 TP = TK - TJ;
Chris@42 346 Ci[WS(csi, 1)] = TL - TO;
Chris@42 347 Ci[WS(csi, 7)] = TQ + TP;
Chris@42 348 Ci[WS(csi, 9)] = TO + TL;
Chris@42 349 Ci[WS(csi, 3)] = TP - TQ;
Chris@42 350 }
Chris@42 351 }
Chris@42 352 }
Chris@42 353 }
Chris@42 354
/*
 * Descriptor handed to X(kr2c_register) together with r2cf_20. It carries the
 * value 20, the name string "r2cf_20", the tuple {74, 12, 12, 0} and &GENUS.
 * The first three tuple entries equal the add / multiply / fused-multiply-add
 * counts quoted in the generator comment for the non-FMA kernel.
 * NOTE(review): the kr2c_desc field names and GENUS are declared outside this
 * file; the meaning of the fourth tuple entry is not visible here -- confirm.
 */
Chris@42 355 static const kr2c_desc desc = { 20, "r2cf_20", {74, 12, 12, 0}, &GENUS };
Chris@42 356
/*
 * Registration entry point for the build without HAVE_FMA: forwards planner
 * p, the r2cf_20 kernel and its descriptor to X(kr2c_register). Returns
 * nothing.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-name prefixing helper -- confirm in the FFTW headers.
 */
Chris@42 357 void X(codelet_r2cf_20) (planner *p) {
Chris@42 358 X(kr2c_register) (p, r2cf_20, &desc);
Chris@42 359 }
Chris@42 360
Chris@42 361 #endif /* HAVE_FMA */