annotate src/fftw-3.3.5/dft/scalar/codelets/n1_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:35:51 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include n.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 84 FP additions, 36 FP multiplications,
Chris@42 32 * (or, 48 additions, 0 multiplications, 36 fused multiply/add),
Chris@42 33 * 59 stack variables, 4 constants, and 40 memory accesses
Chris@42 34 */
Chris@42 35 #include "n.h"
Chris@42 36
/*
 * n1_10, HAVE_FMA variant.
 *
 * Per the generator command line in this file (gen_notw ... -n 10), this is
 * the size-10 "notw" DFT codelet. Each loop iteration reads the ten real
 * inputs ri[WS(is, k)] and the ten imaginary inputs ii[WS(is, k)], k = 0..9,
 * and stores ten real outputs ro[WS(os, k)] and ten imaginary outputs
 * io[WS(os, k)], k = 0..9.
 *
 * Parameters:
 *   ri, ii   - real / imaginary input arrays (read only).
 *   ro, io   - real / imaginary output arrays (written).
 *   is, os   - input / output strides, applied through WS().
 *   v        - number of loop iterations.
 *   ivs, ovs - amount added to ri/ii and to ro/io after each iteration.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file.
 * NOTE(review): presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm against the header that defines them.
 */
Chris@42 37 static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Numeric constants: 0.9510565... equals sin(2*pi/5), 0.5590169... equals
 * sqrt(5)/4, and 0.6180339... equals (sqrt(5) - 1)/2.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 {
Chris@42 44 INT i;
/* i counts down from v; the four data pointers advance once per iteration. */
Chris@42 45 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Declared at loop-body scope because the stores after the inner block use them. */
Chris@42 46 E T1g, T1a, T18, T1m, T1k, T1f, T19, T11, T1h, T1l;
Chris@42 47 {
Chris@42 48 E Tj, T3, T1b, TN, T1j, TU, T1i, TV, Tq, T10, Ti, Ts, Tw, T15, Tx;
Chris@42 49 E T13, TG, Ty, TB, TC;
Chris@42 50 {
Chris@42 51 E T1, T2, TL, TM;
/* Inputs 0 and 5, real and imaginary parts. */
Chris@42 52 T1 = ri[0];
Chris@42 53 T2 = ri[WS(is, 5)];
Chris@42 54 TL = ii[0];
Chris@42 55 TM = ii[WS(is, 5)];
Chris@42 56 {
Chris@42 57 E T7, Tk, T6, To, Tg, T8, Tb, Tc;
Chris@42 58 {
Chris@42 59 E T4, T5, Te, Tf;
/* Remaining real inputs, loaded in pairs (2,7), (6,1), (8,3), (4,9); the loads are interleaved with sums and differences. */
Chris@42 60 T4 = ri[WS(is, 2)];
Chris@42 61 Tj = T1 + T2;
Chris@42 62 T3 = T1 - T2;
Chris@42 63 T1b = TL + TM;
Chris@42 64 TN = TL - TM;
Chris@42 65 T5 = ri[WS(is, 7)];
Chris@42 66 Te = ri[WS(is, 6)];
Chris@42 67 Tf = ri[WS(is, 1)];
Chris@42 68 T7 = ri[WS(is, 8)];
Chris@42 69 Tk = T4 + T5;
Chris@42 70 T6 = T4 - T5;
Chris@42 71 To = Te + Tf;
Chris@42 72 Tg = Te - Tf;
Chris@42 73 T8 = ri[WS(is, 3)];
Chris@42 74 Tb = ri[WS(is, 4)];
Chris@42 75 Tc = ri[WS(is, 9)];
Chris@42 76 }
Chris@42 77 {
Chris@42 78 E TE, TF, Tu, Tv;
Chris@42 79 {
Chris@42 80 E Ta, Th, Tl, T9;
/* Imaginary inputs start loading here with the same (2,7), (6,1), (8,3), (4,9) pairing. */
Chris@42 81 Tu = ii[WS(is, 2)];
Chris@42 82 Tl = T7 + T8;
Chris@42 83 T9 = T7 - T8;
Chris@42 84 {
Chris@42 85 E Tn, Td, Tm, Tp;
Chris@42 86 Tn = Tb + Tc;
Chris@42 87 Td = Tb - Tc;
Chris@42 88 Tm = Tk + Tl;
Chris@42 89 T1j = Tk - Tl;
Chris@42 90 Ta = T6 + T9;
Chris@42 91 TU = T6 - T9;
Chris@42 92 Tp = Tn + To;
Chris@42 93 T1i = Tn - To;
Chris@42 94 Th = Td + Tg;
Chris@42 95 TV = Td - Tg;
Chris@42 96 Tq = Tm + Tp;
Chris@42 97 T10 = Tm - Tp;
Chris@42 98 Tv = ii[WS(is, 7)];
Chris@42 99 }
Chris@42 100 Ti = Ta + Th;
Chris@42 101 Ts = Ta - Th;
Chris@42 102 }
Chris@42 103 TE = ii[WS(is, 6)];
Chris@42 104 TF = ii[WS(is, 1)];
Chris@42 105 Tw = Tu - Tv;
Chris@42 106 T15 = Tu + Tv;
Chris@42 107 Tx = ii[WS(is, 8)];
Chris@42 108 T13 = TE + TF;
Chris@42 109 TG = TE - TF;
Chris@42 110 Ty = ii[WS(is, 3)];
Chris@42 111 TB = ii[WS(is, 4)];
Chris@42 112 TC = ii[WS(is, 9)];
Chris@42 113 }
Chris@42 114 }
Chris@42 115 }
Chris@42 116 {
Chris@42 117 E T17, TA, T14, TH, T1e, TQ, TS;
Chris@42 118 {
Chris@42 119 E TO, TP, T16, Tz;
/* Real part of output 5 is a plain sum and is stored as soon as T3 and Ti are available. */
Chris@42 120 ro[WS(os, 5)] = T3 + Ti;
Chris@42 121 T16 = Tx + Ty;
Chris@42 122 Tz = Tx - Ty;
Chris@42 123 {
Chris@42 124 E T12, TD, T1c, T1d;
Chris@42 125 T12 = TB + TC;
Chris@42 126 TD = TB - TC;
Chris@42 127 T1c = T15 + T16;
Chris@42 128 T17 = T15 - T16;
Chris@42 129 TO = Tw + Tz;
Chris@42 130 TA = Tw - Tz;
Chris@42 131 T1d = T12 + T13;
Chris@42 132 T14 = T12 - T13;
Chris@42 133 TP = TD + TG;
Chris@42 134 TH = TD - TG;
Chris@42 135 T1e = T1c + T1d;
Chris@42 136 T1g = T1c - T1d;
Chris@42 137 }
/* Real part of output 0: sum of all ten real inputs (Tj + Tq). */
Chris@42 138 ro[0] = Tj + Tq;
Chris@42 139 TQ = TO + TP;
Chris@42 140 TS = TO - TP;
Chris@42 141 }
Chris@42 142 {
Chris@42 143 E TK, TI, TY, TW, TR, TJ, Tt, Tr, TZ, TX, TT;
/*
 * This scope stores the imaginary parts of outputs 0 and 5 and both parts
 * of outputs 1, 3, 7 and 9. It also prepares T1a, T18, T1m, T1k, T1f, T19
 * and T11 for the stores that follow the enclosing block.
 */
Chris@42 144 TK = FNMS(KP618033988, TA, TH);
Chris@42 145 TI = FMA(KP618033988, TH, TA);
Chris@42 146 io[0] = T1b + T1e;
Chris@42 147 io[WS(os, 5)] = TN + TQ;
Chris@42 148 Tr = FNMS(KP250000000, Ti, T3);
Chris@42 149 TY = FNMS(KP618033988, TU, TV);
Chris@42 150 TW = FMA(KP618033988, TV, TU);
Chris@42 151 TR = FNMS(KP250000000, TQ, TN);
Chris@42 152 TJ = FNMS(KP559016994, Ts, Tr);
Chris@42 153 Tt = FMA(KP559016994, Ts, Tr);
Chris@42 154 T1a = FMA(KP618033988, T14, T17);
Chris@42 155 T18 = FNMS(KP618033988, T17, T14);
Chris@42 156 ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
Chris@42 157 ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
Chris@42 158 ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
Chris@42 159 ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
Chris@42 160 TX = FNMS(KP559016994, TS, TR);
Chris@42 161 TT = FMA(KP559016994, TS, TR);
Chris@42 162 TZ = FNMS(KP250000000, Tq, Tj);
Chris@42 163 io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
Chris@42 164 io[WS(os, 7)] = FMA(KP951056516, TY, TX);
Chris@42 165 io[WS(os, 9)] = FMA(KP951056516, TW, TT);
Chris@42 166 io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
Chris@42 167 T1m = FMA(KP618033988, T1i, T1j);
Chris@42 168 T1k = FNMS(KP618033988, T1j, T1i);
Chris@42 169 T1f = FNMS(KP250000000, T1e, T1b);
Chris@42 170 T19 = FMA(KP559016994, T10, TZ);
Chris@42 171 T11 = FNMS(KP559016994, T10, TZ);
Chris@42 172 }
Chris@42 173 }
Chris@42 174 }
/* Real parts of outputs 4, 6, 8 and 2. */
Chris@42 175 ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
Chris@42 176 ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
Chris@42 177 ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
Chris@42 178 ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
Chris@42 179 T1h = FNMS(KP559016994, T1g, T1f);
Chris@42 180 T1l = FMA(KP559016994, T1g, T1f);
/* Imaginary parts of outputs 4, 6, 8 and 2. */
Chris@42 181 io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
Chris@42 182 io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
Chris@42 183 io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
Chris@42 184 io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
Chris@42 185 }
Chris@42 186 }
Chris@42 187 }
Chris@42 188
/*
 * Descriptor handed to X(kdft_register) together with n1_10 (HAVE_FMA build).
 * The leading 10 and the name string match the generator's "-n 10 -name n1_10";
 * the values 48, 0, 36 match the "48 additions, 0 multiplications, 36 fused
 * multiply/add" counts stated in this variant's generated comment.
 * NOTE(review): the meaning of the remaining fields and of GENUS is set by
 * kdft_desc, which is declared outside this file -- confirm there.
 */
Chris@42 189 static const kdft_desc desc = { 10, "n1_10", {48, 0, 36, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 190
/*
 * Registration entry point (HAVE_FMA build): passes the planner p, the n1_10
 * kernel and its descriptor desc to X(kdft_register).
 * NOTE(review): X() is a macro defined outside this file, presumably adding
 * the library's name prefix -- confirm.
 */
Chris@42 191 void X(codelet_n1_10) (planner *p) {
Chris@42 192 X(kdft_register) (p, n1_10, &desc);
Chris@42 193 }
Chris@42 194
Chris@42 195 #else /* HAVE_FMA */
Chris@42 196
Chris@42 197 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include n.h */
Chris@42 198
Chris@42 199 /*
Chris@42 200 * This function contains 84 FP additions, 24 FP multiplications,
Chris@42 201 * (or, 72 additions, 12 multiplications, 12 fused multiply/add),
Chris@42 202 * 41 stack variables, 4 constants, and 40 memory accesses
Chris@42 203 */
Chris@42 204 #include "n.h"
Chris@42 205
/*
 * n1_10, variant compiled when HAVE_FMA is not defined.
 *
 * Per the generator command line in this file (gen_notw ... -n 10), this is
 * the size-10 "notw" DFT codelet. Each loop iteration reads the ten real
 * inputs ri[WS(is, k)] and the ten imaginary inputs ii[WS(is, k)], k = 0..9,
 * and stores ten real outputs ro[WS(os, k)] and ten imaginary outputs
 * io[WS(os, k)], k = 0..9.
 *
 * Parameters:
 *   ri, ii   - real / imaginary input arrays (read only).
 *   ro, io   - real / imaginary output arrays (written).
 *   is, os   - input / output strides, applied through WS().
 *   v        - number of loop iterations.
 *   ivs, ovs - amount added to ri/ii and to ro/io after each iteration.
 *
 * R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are not
 * defined in this file.
 * NOTE(review): presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm against the header that defines them.
 */
Chris@42 206 static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 207 {
/*
 * Numeric constants: 0.5590169... equals sqrt(5)/4, 0.5877852... equals
 * sin(pi/5), and 0.9510565... equals sin(2*pi/5).
 */
Chris@42 208 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 209 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 210 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 211 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 212 {
Chris@42 213 INT i;
/* i counts down from v; the four data pointers advance once per iteration. */
Chris@42 214 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
Chris@42 215 E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
Chris@42 216 E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
Chris@42 217 {
Chris@42 218 E T1, T2, TO, TP;
/* Inputs 0 and 5: differences go to T3 / TQ, sums to Tj / T1e. */
Chris@42 219 T1 = ri[0];
Chris@42 220 T2 = ri[WS(is, 5)];
Chris@42 221 T3 = T1 - T2;
Chris@42 222 Tj = T1 + T2;
Chris@42 223 TO = ii[0];
Chris@42 224 TP = ii[WS(is, 5)];
Chris@42 225 TQ = TO - TP;
Chris@42 226 T1e = TO + TP;
Chris@42 227 }
Chris@42 228 {
Chris@42 229 E T6, Tk, Tg, To, T9, Tl, Td, Tn;
/* Real parts of the other eight inputs, paired as (2,7), (6,1), (8,3), (4,9); each pair yields a difference and a sum. */
Chris@42 230 {
Chris@42 231 E T4, T5, Te, Tf;
Chris@42 232 T4 = ri[WS(is, 2)];
Chris@42 233 T5 = ri[WS(is, 7)];
Chris@42 234 T6 = T4 - T5;
Chris@42 235 Tk = T4 + T5;
Chris@42 236 Te = ri[WS(is, 6)];
Chris@42 237 Tf = ri[WS(is, 1)];
Chris@42 238 Tg = Te - Tf;
Chris@42 239 To = Te + Tf;
Chris@42 240 }
Chris@42 241 {
Chris@42 242 E T7, T8, Tb, Tc;
Chris@42 243 T7 = ri[WS(is, 8)];
Chris@42 244 T8 = ri[WS(is, 3)];
Chris@42 245 T9 = T7 - T8;
Chris@42 246 Tl = T7 + T8;
Chris@42 247 Tb = ri[WS(is, 4)];
Chris@42 248 Tc = ri[WS(is, 9)];
Chris@42 249 Td = Tb - Tc;
Chris@42 250 Tn = Tb + Tc;
Chris@42 251 }
Chris@42 252 TU = T6 - T9;
Chris@42 253 TV = Td - Tg;
Chris@42 254 T1c = Tk - Tl;
Chris@42 255 T1b = Tn - To;
Chris@42 256 Tm = Tk + Tl;
Chris@42 257 Tp = Tn + To;
Chris@42 258 Tq = Tm + Tp;
Chris@42 259 Ta = T6 + T9;
Chris@42 260 Th = Td + Tg;
Chris@42 261 Ti = Ta + Th;
Chris@42 262 }
Chris@42 263 {
Chris@42 264 E Tw, T15, TG, T13, Tz, T16, TD, T12;
/* Imaginary parts of the same eight inputs, with the same pairing as the real parts. */
Chris@42 265 {
Chris@42 266 E Tu, Tv, TE, TF;
Chris@42 267 Tu = ii[WS(is, 2)];
Chris@42 268 Tv = ii[WS(is, 7)];
Chris@42 269 Tw = Tu - Tv;
Chris@42 270 T15 = Tu + Tv;
Chris@42 271 TE = ii[WS(is, 6)];
Chris@42 272 TF = ii[WS(is, 1)];
Chris@42 273 TG = TE - TF;
Chris@42 274 T13 = TE + TF;
Chris@42 275 }
Chris@42 276 {
Chris@42 277 E Tx, Ty, TB, TC;
Chris@42 278 Tx = ii[WS(is, 8)];
Chris@42 279 Ty = ii[WS(is, 3)];
Chris@42 280 Tz = Tx - Ty;
Chris@42 281 T16 = Tx + Ty;
Chris@42 282 TB = ii[WS(is, 4)];
Chris@42 283 TC = ii[WS(is, 9)];
Chris@42 284 TD = TB - TC;
Chris@42 285 T12 = TB + TC;
Chris@42 286 }
Chris@42 287 TA = Tw - Tz;
Chris@42 288 TH = TD - TG;
Chris@42 289 T17 = T15 - T16;
Chris@42 290 T14 = T12 - T13;
Chris@42 291 T1f = T15 + T16;
Chris@42 292 T1g = T12 + T13;
Chris@42 293 T1h = T1f + T1g;
Chris@42 294 TL = Tw + Tz;
Chris@42 295 TM = TD + TG;
Chris@42 296 TR = TL + TM;
Chris@42 297 }
/* Outputs 5 and 0 are plain sums of the values computed so far. */
Chris@42 298 ro[WS(os, 5)] = T3 + Ti;
Chris@42 299 io[WS(os, 5)] = TQ + TR;
Chris@42 300 ro[0] = Tj + Tq;
Chris@42 301 io[0] = T1e + T1h;
/* Real parts of outputs 9, 3, 1 and 7. */
Chris@42 302 {
Chris@42 303 E TI, TK, Tt, TJ, Tr, Ts;
Chris@42 304 TI = FMA(KP951056516, TA, KP587785252 * TH);
Chris@42 305 TK = FNMS(KP587785252, TA, KP951056516 * TH);
Chris@42 306 Tr = KP559016994 * (Ta - Th);
Chris@42 307 Ts = FNMS(KP250000000, Ti, T3);
Chris@42 308 Tt = Tr + Ts;
Chris@42 309 TJ = Ts - Tr;
Chris@42 310 ro[WS(os, 9)] = Tt - TI;
Chris@42 311 ro[WS(os, 3)] = TJ + TK;
Chris@42 312 ro[WS(os, 1)] = Tt + TI;
Chris@42 313 ro[WS(os, 7)] = TJ - TK;
Chris@42 314 }
/* Imaginary parts of outputs 1, 7, 9 and 3. */
Chris@42 315 {
Chris@42 316 E TW, TY, TT, TX, TN, TS;
Chris@42 317 TW = FMA(KP951056516, TU, KP587785252 * TV);
Chris@42 318 TY = FNMS(KP587785252, TU, KP951056516 * TV);
Chris@42 319 TN = KP559016994 * (TL - TM);
Chris@42 320 TS = FNMS(KP250000000, TR, TQ);
Chris@42 321 TT = TN + TS;
Chris@42 322 TX = TS - TN;
Chris@42 323 io[WS(os, 1)] = TT - TW;
Chris@42 324 io[WS(os, 7)] = TY + TX;
Chris@42 325 io[WS(os, 9)] = TW + TT;
Chris@42 326 io[WS(os, 3)] = TX - TY;
Chris@42 327 }
/* Real parts of outputs 2, 6, 8 and 4. */
Chris@42 328 {
Chris@42 329 E T18, T1a, T11, T19, TZ, T10;
Chris@42 330 T18 = FNMS(KP587785252, T17, KP951056516 * T14);
Chris@42 331 T1a = FMA(KP951056516, T17, KP587785252 * T14);
Chris@42 332 TZ = FNMS(KP250000000, Tq, Tj);
Chris@42 333 T10 = KP559016994 * (Tm - Tp);
Chris@42 334 T11 = TZ - T10;
Chris@42 335 T19 = T10 + TZ;
Chris@42 336 ro[WS(os, 2)] = T11 - T18;
Chris@42 337 ro[WS(os, 6)] = T19 + T1a;
Chris@42 338 ro[WS(os, 8)] = T11 + T18;
Chris@42 339 ro[WS(os, 4)] = T19 - T1a;
Chris@42 340 }
/* Imaginary parts of outputs 2, 6, 8 and 4. */
Chris@42 341 {
Chris@42 342 E T1d, T1l, T1k, T1m, T1i, T1j;
Chris@42 343 T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
Chris@42 344 T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
Chris@42 345 T1i = FNMS(KP250000000, T1h, T1e);
Chris@42 346 T1j = KP559016994 * (T1f - T1g);
Chris@42 347 T1k = T1i - T1j;
Chris@42 348 T1m = T1j + T1i;
Chris@42 349 io[WS(os, 2)] = T1d + T1k;
Chris@42 350 io[WS(os, 6)] = T1m - T1l;
Chris@42 351 io[WS(os, 8)] = T1k - T1d;
Chris@42 352 io[WS(os, 4)] = T1l + T1m;
Chris@42 353 }
Chris@42 354 }
Chris@42 355 }
Chris@42 356 }
Chris@42 357
/*
 * Descriptor handed to X(kdft_register) together with n1_10 (build without
 * HAVE_FMA). The leading 10 and the name string match the generator's
 * "-n 10 -name n1_10"; the values 72, 12, 12 match the "72 additions,
 * 12 multiplications, 12 fused multiply/add" counts stated in this variant's
 * generated comment.
 * NOTE(review): the meaning of the remaining fields and of GENUS is set by
 * kdft_desc, which is declared outside this file -- confirm there.
 */
Chris@42 358 static const kdft_desc desc = { 10, "n1_10", {72, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 359
/*
 * Registration entry point (build without HAVE_FMA): passes the planner p,
 * the n1_10 kernel and its descriptor desc to X(kdft_register).
 * NOTE(review): X() is a macro defined outside this file, presumably adding
 * the library's name prefix -- confirm.
 */
Chris@42 360 void X(codelet_n1_10) (planner *p) {
Chris@42 361 X(kdft_register) (p, n1_10, &desc);
Chris@42 362 }
Chris@42 363
Chris@42 364 #endif /* HAVE_FMA */