annotate src/fftw-3.3.8/dft/scalar/codelets/n1_10.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 84 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 48 additions, 0 multiplications, 36 fused multiply/add),
Chris@82 33 * 41 stack variables, 4 constants, and 40 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/n.h"
Chris@82 36
/*
 * n1_10 (FMA variant): genfft-generated kernel. Per the generator command
 * line above (gen_notw, -n 10) this is a size-10 "notw" codelet.
 *
 * For each of the v iterations it reads ten values from ri and ten from ii
 * (offsets 0 and WS(is, 1..9)) and writes ten values to ro and ten to io
 * (offsets 0 and WS(os, 1..9)). After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * NOTE(review): ri/ii and ro/io are presumably the real/imaginary parts of
 * the complex input and output -- confirm against dft/codelet-dft.h.
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm in the included headers.
 */
Chris@82 37 static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/* Constants, numerically: sin(2*pi/5), sqrt(5)/4, 1/4, (sqrt(5)-1)/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 {
Chris@82 44 INT i;
/* One pass per transform: v passes in total, pointers stepped by ivs/ovs. */
Chris@82 45 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Intermediates produced by the load stages and consumed by the output stages. */
Chris@82 46 E T3, Tj, TN, T1b, TU, TV, T1j, T1i, Tm, Tp, Tq, Ta, Th, Ti, TA;
Chris@82 47 E TH, T17, T14, T1c, T1d, T1e, TO, TP, TQ;
/* Inputs 0 and 5: difference (T3 from ri, TN from ii) and sum (Tj, T1b). */
Chris@82 48 {
Chris@82 49 E T1, T2, TL, TM;
Chris@82 50 T1 = ri[0];
Chris@82 51 T2 = ri[WS(is, 5)];
Chris@82 52 T3 = T1 - T2;
Chris@82 53 Tj = T1 + T2;
Chris@82 54 TL = ii[0];
Chris@82 55 TM = ii[WS(is, 5)];
Chris@82 56 TN = TL - TM;
Chris@82 57 T1b = TL + TM;
Chris@82 58 }
/* ri inputs paired as 2/7, 6/1, 8/3, 4/9: pairwise differences and sums, then combined. */
Chris@82 59 {
Chris@82 60 E T6, Tk, Tg, To, T9, Tl, Td, Tn;
Chris@82 61 {
Chris@82 62 E T4, T5, Te, Tf;
Chris@82 63 T4 = ri[WS(is, 2)];
Chris@82 64 T5 = ri[WS(is, 7)];
Chris@82 65 T6 = T4 - T5;
Chris@82 66 Tk = T4 + T5;
Chris@82 67 Te = ri[WS(is, 6)];
Chris@82 68 Tf = ri[WS(is, 1)];
Chris@82 69 Tg = Te - Tf;
Chris@82 70 To = Te + Tf;
Chris@82 71 }
Chris@82 72 {
Chris@82 73 E T7, T8, Tb, Tc;
Chris@82 74 T7 = ri[WS(is, 8)];
Chris@82 75 T8 = ri[WS(is, 3)];
Chris@82 76 T9 = T7 - T8;
Chris@82 77 Tl = T7 + T8;
Chris@82 78 Tb = ri[WS(is, 4)];
Chris@82 79 Tc = ri[WS(is, 9)];
Chris@82 80 Td = Tb - Tc;
Chris@82 81 Tn = Tb + Tc;
Chris@82 82 }
Chris@82 83 TU = T6 - T9;
Chris@82 84 TV = Td - Tg;
Chris@82 85 T1j = Tk - Tl;
Chris@82 86 T1i = Tn - To;
Chris@82 87 Tm = Tk + Tl;
Chris@82 88 Tp = Tn + To;
Chris@82 89 Tq = Tm + Tp;
Chris@82 90 Ta = T6 + T9;
Chris@82 91 Th = Td + Tg;
Chris@82 92 Ti = Ta + Th;
Chris@82 93 }
/* The same pairing and combination applied to the ii inputs. */
Chris@82 94 {
Chris@82 95 E Tw, T15, TG, T13, Tz, T16, TD, T12;
Chris@82 96 {
Chris@82 97 E Tu, Tv, TE, TF;
Chris@82 98 Tu = ii[WS(is, 2)];
Chris@82 99 Tv = ii[WS(is, 7)];
Chris@82 100 Tw = Tu - Tv;
Chris@82 101 T15 = Tu + Tv;
Chris@82 102 TE = ii[WS(is, 6)];
Chris@82 103 TF = ii[WS(is, 1)];
Chris@82 104 TG = TE - TF;
Chris@82 105 T13 = TE + TF;
Chris@82 106 }
Chris@82 107 {
Chris@82 108 E Tx, Ty, TB, TC;
Chris@82 109 Tx = ii[WS(is, 8)];
Chris@82 110 Ty = ii[WS(is, 3)];
Chris@82 111 Tz = Tx - Ty;
Chris@82 112 T16 = Tx + Ty;
Chris@82 113 TB = ii[WS(is, 4)];
Chris@82 114 TC = ii[WS(is, 9)];
Chris@82 115 TD = TB - TC;
Chris@82 116 T12 = TB + TC;
Chris@82 117 }
Chris@82 118 TA = Tw - Tz;
Chris@82 119 TH = TD - TG;
Chris@82 120 T17 = T15 - T16;
Chris@82 121 T14 = T12 - T13;
Chris@82 122 T1c = T15 + T16;
Chris@82 123 T1d = T12 + T13;
Chris@82 124 T1e = T1c + T1d;
Chris@82 125 TO = Tw + Tz;
Chris@82 126 TP = TD + TG;
Chris@82 127 TQ = TO + TP;
Chris@82 128 }
/* Outputs 0 and 5 are plain sums of the combined terms; no constants involved. */
Chris@82 129 ro[WS(os, 5)] = T3 + Ti;
Chris@82 130 io[WS(os, 5)] = TN + TQ;
Chris@82 131 ro[0] = Tj + Tq;
Chris@82 132 io[0] = T1b + T1e;
/* ro outputs 9, 3, 1, 7: built from the ri differences (T3, Ta, Th) and the ii terms TA, TH. */
Chris@82 133 {
Chris@82 134 E TI, TK, Tt, TJ, Tr, Ts;
Chris@82 135 TI = FMA(KP618033988, TH, TA);
Chris@82 136 TK = FNMS(KP618033988, TA, TH);
Chris@82 137 Tr = FNMS(KP250000000, Ti, T3);
Chris@82 138 Ts = Ta - Th;
Chris@82 139 Tt = FMA(KP559016994, Ts, Tr);
Chris@82 140 TJ = FNMS(KP559016994, Ts, Tr);
Chris@82 141 ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
Chris@82 142 ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
Chris@82 143 ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
Chris@82 144 ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
Chris@82 145 }
/* io outputs 1, 7, 9, 3: built from the ii differences (TN, TO, TP) and the ri terms TU, TV. */
Chris@82 146 {
Chris@82 147 E TW, TY, TT, TX, TR, TS;
Chris@82 148 TW = FMA(KP618033988, TV, TU);
Chris@82 149 TY = FNMS(KP618033988, TU, TV);
Chris@82 150 TR = FNMS(KP250000000, TQ, TN);
Chris@82 151 TS = TO - TP;
Chris@82 152 TT = FMA(KP559016994, TS, TR);
Chris@82 153 TX = FNMS(KP559016994, TS, TR);
Chris@82 154 io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
Chris@82 155 io[WS(os, 7)] = FMA(KP951056516, TY, TX);
Chris@82 156 io[WS(os, 9)] = FMA(KP951056516, TW, TT);
Chris@82 157 io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
Chris@82 158 }
/* ro outputs 2, 6, 8, 4: built from the ri sums (Tj, Tm, Tp, Tq) and the ii terms T14, T17. */
Chris@82 159 {
Chris@82 160 E T18, T1a, T11, T19, TZ, T10;
Chris@82 161 T18 = FNMS(KP618033988, T17, T14);
Chris@82 162 T1a = FMA(KP618033988, T14, T17);
Chris@82 163 TZ = FNMS(KP250000000, Tq, Tj);
Chris@82 164 T10 = Tm - Tp;
Chris@82 165 T11 = FNMS(KP559016994, T10, TZ);
Chris@82 166 T19 = FMA(KP559016994, T10, TZ);
Chris@82 167 ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
Chris@82 168 ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
Chris@82 169 ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
Chris@82 170 ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
Chris@82 171 }
/* io outputs 2, 6, 8, 4: built from the ii sums (T1b, T1c, T1d, T1e) and the ri terms T1i, T1j. */
Chris@82 172 {
Chris@82 173 E T1k, T1m, T1h, T1l, T1f, T1g;
Chris@82 174 T1k = FNMS(KP618033988, T1j, T1i);
Chris@82 175 T1m = FMA(KP618033988, T1i, T1j);
Chris@82 176 T1f = FNMS(KP250000000, T1e, T1b);
Chris@82 177 T1g = T1c - T1d;
Chris@82 178 T1h = FNMS(KP559016994, T1g, T1f);
Chris@82 179 T1l = FMA(KP559016994, T1g, T1f);
Chris@82 180 io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
Chris@82 181 io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
Chris@82 182 io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
Chris@82 183 io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
Chris@82 184 }
Chris@82 185 }
Chris@82 186 }
Chris@82 187 }
Chris@82 188
/*
 * Descriptor handed to X(kdft_register) together with n1_10: size 10, name
 * "n1_10", and the tuple {48, 0, 36, 0}, whose first three entries match the
 * "48 additions, 0 multiplications, 36 fused multiply/add" count quoted in
 * the generated header comment of this branch.
 * NOTE(review): GENUS and the meaning of the remaining zero fields are
 * defined outside this file -- see the kdft_desc declaration.
 */
Chris@82 189 static const kdft_desc desc = { 10, "n1_10", {48, 0, 36, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 190
/*
 * Registration entry point for the FMA build of this codelet: passes the
 * n1_10 kernel and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file (presumably the
 * library's symbol-prefixing macro) -- confirm in the included headers.
 */
Chris@82 191 void X(codelet_n1_10) (planner *p) {
Chris@82 192 X(kdft_register) (p, n1_10, &desc);
Chris@82 193 }
Chris@82 194
Chris@82 195 #else
Chris@82 196
Chris@82 197 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
Chris@82 198
Chris@82 199 /*
Chris@82 200 * This function contains 84 FP additions, 24 FP multiplications,
Chris@82 201 * (or, 72 additions, 12 multiplications, 12 fused multiply/add),
Chris@82 202 * 41 stack variables, 4 constants, and 40 memory accesses
Chris@82 203 */
Chris@82 204 #include "dft/scalar/n.h"
Chris@82 205
/*
 * n1_10 (non-FMA variant): genfft-generated kernel, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined. Per the
 * generator command line above (gen_notw, -n 10, no -fma) this is a size-10
 * "notw" codelet.
 *
 * For each of the v iterations it reads ten values from ri and ten from ii
 * (offsets 0 and WS(is, 1..9)) and writes ten values to ro and ten to io
 * (offsets 0 and WS(os, 1..9)). After every iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * NOTE(review): ri/ii and ro/io are presumably the real/imaginary parts of
 * the complex input and output -- confirm against dft/codelet-dft.h.
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm in the included headers.
 */
Chris@82 206 static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 207 {
/* Constants, numerically: 1/4, sqrt(5)/4, sin(pi/5), sin(2*pi/5). */
Chris@82 208 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 209 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 210 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 211 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 212 {
Chris@82 213 INT i;
/* One pass per transform: v passes in total, pointers stepped by ivs/ovs. */
Chris@82 214 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
/* Intermediates produced by the load stages and consumed by the output stages. */
Chris@82 215 E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
Chris@82 216 E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
/* Inputs 0 and 5: difference (T3 from ri, TQ from ii) and sum (Tj, T1e). */
Chris@82 217 {
Chris@82 218 E T1, T2, TO, TP;
Chris@82 219 T1 = ri[0];
Chris@82 220 T2 = ri[WS(is, 5)];
Chris@82 221 T3 = T1 - T2;
Chris@82 222 Tj = T1 + T2;
Chris@82 223 TO = ii[0];
Chris@82 224 TP = ii[WS(is, 5)];
Chris@82 225 TQ = TO - TP;
Chris@82 226 T1e = TO + TP;
Chris@82 227 }
/* ri inputs paired as 2/7, 6/1, 8/3, 4/9: pairwise differences and sums, then combined. */
Chris@82 228 {
Chris@82 229 E T6, Tk, Tg, To, T9, Tl, Td, Tn;
Chris@82 230 {
Chris@82 231 E T4, T5, Te, Tf;
Chris@82 232 T4 = ri[WS(is, 2)];
Chris@82 233 T5 = ri[WS(is, 7)];
Chris@82 234 T6 = T4 - T5;
Chris@82 235 Tk = T4 + T5;
Chris@82 236 Te = ri[WS(is, 6)];
Chris@82 237 Tf = ri[WS(is, 1)];
Chris@82 238 Tg = Te - Tf;
Chris@82 239 To = Te + Tf;
Chris@82 240 }
Chris@82 241 {
Chris@82 242 E T7, T8, Tb, Tc;
Chris@82 243 T7 = ri[WS(is, 8)];
Chris@82 244 T8 = ri[WS(is, 3)];
Chris@82 245 T9 = T7 - T8;
Chris@82 246 Tl = T7 + T8;
Chris@82 247 Tb = ri[WS(is, 4)];
Chris@82 248 Tc = ri[WS(is, 9)];
Chris@82 249 Td = Tb - Tc;
Chris@82 250 Tn = Tb + Tc;
Chris@82 251 }
Chris@82 252 TU = T6 - T9;
Chris@82 253 TV = Td - Tg;
Chris@82 254 T1c = Tk - Tl;
Chris@82 255 T1b = Tn - To;
Chris@82 256 Tm = Tk + Tl;
Chris@82 257 Tp = Tn + To;
Chris@82 258 Tq = Tm + Tp;
Chris@82 259 Ta = T6 + T9;
Chris@82 260 Th = Td + Tg;
Chris@82 261 Ti = Ta + Th;
Chris@82 262 }
/* The same pairing and combination applied to the ii inputs. */
Chris@82 263 {
Chris@82 264 E Tw, T15, TG, T13, Tz, T16, TD, T12;
Chris@82 265 {
Chris@82 266 E Tu, Tv, TE, TF;
Chris@82 267 Tu = ii[WS(is, 2)];
Chris@82 268 Tv = ii[WS(is, 7)];
Chris@82 269 Tw = Tu - Tv;
Chris@82 270 T15 = Tu + Tv;
Chris@82 271 TE = ii[WS(is, 6)];
Chris@82 272 TF = ii[WS(is, 1)];
Chris@82 273 TG = TE - TF;
Chris@82 274 T13 = TE + TF;
Chris@82 275 }
Chris@82 276 {
Chris@82 277 E Tx, Ty, TB, TC;
Chris@82 278 Tx = ii[WS(is, 8)];
Chris@82 279 Ty = ii[WS(is, 3)];
Chris@82 280 Tz = Tx - Ty;
Chris@82 281 T16 = Tx + Ty;
Chris@82 282 TB = ii[WS(is, 4)];
Chris@82 283 TC = ii[WS(is, 9)];
Chris@82 284 TD = TB - TC;
Chris@82 285 T12 = TB + TC;
Chris@82 286 }
Chris@82 287 TA = Tw - Tz;
Chris@82 288 TH = TD - TG;
Chris@82 289 T17 = T15 - T16;
Chris@82 290 T14 = T12 - T13;
Chris@82 291 T1f = T15 + T16;
Chris@82 292 T1g = T12 + T13;
Chris@82 293 T1h = T1f + T1g;
Chris@82 294 TL = Tw + Tz;
Chris@82 295 TM = TD + TG;
Chris@82 296 TR = TL + TM;
Chris@82 297 }
/* Outputs 0 and 5 are plain sums of the combined terms; no constants involved. */
Chris@82 298 ro[WS(os, 5)] = T3 + Ti;
Chris@82 299 io[WS(os, 5)] = TQ + TR;
Chris@82 300 ro[0] = Tj + Tq;
Chris@82 301 io[0] = T1e + T1h;
/* ro outputs 9, 3, 1, 7: built from the ri differences (T3, Ta, Th) and the ii terms TA, TH. */
Chris@82 302 {
Chris@82 303 E TI, TK, Tt, TJ, Tr, Ts;
Chris@82 304 TI = FMA(KP951056516, TA, KP587785252 * TH);
Chris@82 305 TK = FNMS(KP587785252, TA, KP951056516 * TH);
Chris@82 306 Tr = KP559016994 * (Ta - Th);
Chris@82 307 Ts = FNMS(KP250000000, Ti, T3);
Chris@82 308 Tt = Tr + Ts;
Chris@82 309 TJ = Ts - Tr;
Chris@82 310 ro[WS(os, 9)] = Tt - TI;
Chris@82 311 ro[WS(os, 3)] = TJ + TK;
Chris@82 312 ro[WS(os, 1)] = Tt + TI;
Chris@82 313 ro[WS(os, 7)] = TJ - TK;
Chris@82 314 }
/* io outputs 1, 7, 9, 3: built from the ii differences (TQ, TL, TM) and the ri terms TU, TV. */
Chris@82 315 {
Chris@82 316 E TW, TY, TT, TX, TN, TS;
Chris@82 317 TW = FMA(KP951056516, TU, KP587785252 * TV);
Chris@82 318 TY = FNMS(KP587785252, TU, KP951056516 * TV);
Chris@82 319 TN = KP559016994 * (TL - TM);
Chris@82 320 TS = FNMS(KP250000000, TR, TQ);
Chris@82 321 TT = TN + TS;
Chris@82 322 TX = TS - TN;
Chris@82 323 io[WS(os, 1)] = TT - TW;
Chris@82 324 io[WS(os, 7)] = TY + TX;
Chris@82 325 io[WS(os, 9)] = TW + TT;
Chris@82 326 io[WS(os, 3)] = TX - TY;
Chris@82 327 }
/* ro outputs 2, 6, 8, 4: built from the ri sums (Tj, Tm, Tp, Tq) and the ii terms T14, T17. */
Chris@82 328 {
Chris@82 329 E T18, T1a, T11, T19, TZ, T10;
Chris@82 330 T18 = FNMS(KP587785252, T17, KP951056516 * T14);
Chris@82 331 T1a = FMA(KP951056516, T17, KP587785252 * T14);
Chris@82 332 TZ = FNMS(KP250000000, Tq, Tj);
Chris@82 333 T10 = KP559016994 * (Tm - Tp);
Chris@82 334 T11 = TZ - T10;
Chris@82 335 T19 = T10 + TZ;
Chris@82 336 ro[WS(os, 2)] = T11 - T18;
Chris@82 337 ro[WS(os, 6)] = T19 + T1a;
Chris@82 338 ro[WS(os, 8)] = T11 + T18;
Chris@82 339 ro[WS(os, 4)] = T19 - T1a;
Chris@82 340 }
/* io outputs 2, 6, 8, 4: built from the ii sums (T1e, T1f, T1g, T1h) and the ri terms T1b, T1c. */
Chris@82 341 {
Chris@82 342 E T1d, T1l, T1k, T1m, T1i, T1j;
Chris@82 343 T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
Chris@82 344 T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
Chris@82 345 T1i = FNMS(KP250000000, T1h, T1e);
Chris@82 346 T1j = KP559016994 * (T1f - T1g);
Chris@82 347 T1k = T1i - T1j;
Chris@82 348 T1m = T1j + T1i;
Chris@82 349 io[WS(os, 2)] = T1d + T1k;
Chris@82 350 io[WS(os, 6)] = T1m - T1l;
Chris@82 351 io[WS(os, 8)] = T1k - T1d;
Chris@82 352 io[WS(os, 4)] = T1l + T1m;
Chris@82 353 }
Chris@82 354 }
Chris@82 355 }
Chris@82 356 }
Chris@82 357
/*
 * Descriptor handed to X(kdft_register) together with n1_10: size 10, name
 * "n1_10", and the tuple {72, 12, 12, 0}, whose first three entries match the
 * "72 additions, 12 multiplications, 12 fused multiply/add" count quoted in
 * the generated header comment of this branch.
 * NOTE(review): GENUS and the meaning of the remaining zero fields are
 * defined outside this file -- see the kdft_desc declaration.
 */
Chris@82 358 static const kdft_desc desc = { 10, "n1_10", {72, 12, 12, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 359
/*
 * Registration entry point for the non-FMA build of this codelet: passes the
 * n1_10 kernel and its descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file (presumably the
 * library's symbol-prefixing macro) -- confirm in the included headers.
 */
Chris@82 360 void X(codelet_n1_10) (planner *p) {
Chris@82 361 X(kdft_register) (p, n1_10, &desc);
Chris@82 362 }
Chris@82 363
Chris@82 364 #endif