annotate src/fftw-3.3.8/dft/scalar/codelets/n1_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 80 FP additions, 56 FP multiplications,
Chris@82 32 * (or, 24 additions, 0 multiplications, 56 fused multiply/add),
Chris@82 33 * 41 stack variables, 10 constants, and 36 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/n.h"
Chris@82 36
/*
 * n1_9 (variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): generated size-9 DFT kernel (generator flags: -fma -n 9)
 * working on separate real and imaginary arrays.
 *
 * ri, ii   real / imaginary input arrays; elements 0..8 are read at
 *          index WS(is, k)
 * ro, io   real / imaginary output arrays; elements 0..8 are written at
 *          index WS(os, k)
 * is, os   input / output strides, only ever used through WS()
 * v        number of loop iterations; nothing is done when v <= 0
 * ivs, ovs amounts added to ri/ii and to ro/io after each iteration
 *
 * Within one iteration all 18 loads are performed before the first store.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from project headers not
 * shown here; presumably a*b + c and c - a*b respectively -- confirm.
 */
Chris@82 37 static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
/*
 * Named constants (DK is a project macro, not shown here). Numerically:
 * KP500000000 = 1/2, KP866025403 approx. sqrt(3)/2,
 * KP176326980 approx. tan(10 deg), KP363970234 approx. tan(20 deg),
 * KP839099631 approx. tan(40 deg), KP984807753 approx. cos(10 deg),
 * KP492403876 approx. cos(10 deg)/2,
 * KP852868531 approx. cos(10 deg)*sqrt(3)/2,
 * KP777861913 approx. cos(40 deg)/cos(10 deg),
 * KP954188894 approx. cos(20 deg)/cos(10 deg).
 */
Chris@82 39 DK(KP954188894, +0.954188894138671133499268364187245676532219158);
Chris@82 40 DK(KP363970234, +0.363970234266202361351047882776834043890471784);
Chris@82 41 DK(KP852868531, +0.852868531952443209628250963940074071936020296);
Chris@82 42 DK(KP492403876, +0.492403876506104029683371512294761506835321626);
Chris@82 43 DK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 44 DK(KP777861913, +0.777861913430206160028177977318626690410586096);
Chris@82 45 DK(KP839099631, +0.839099631177280011763127298123181364687434283);
Chris@82 46 DK(KP176326980, +0.176326980708464973471090386868618986121633062);
Chris@82 47 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 48 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 49 {
Chris@82 50 INT i;
/*
 * One transform per iteration, counting i down from v to 1.
 * NOTE(review): MAKE_VOLATILE_STRIDE is a project macro not defined in
 * this file; its effect cannot be determined from here.
 */
Chris@82 51 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
/* Intermediates shared between the load stages and the output stages. */
Chris@82 52 E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
Chris@82 53 E TZ, Tx, T10;
/* Real parts of inputs 0, 3, 6: T5 is their sum, Tm the difference of inputs 6 and 3. */
Chris@82 54 {
Chris@82 55 E T1, T2, T3, T4;
Chris@82 56 T1 = ri[0];
Chris@82 57 T2 = ri[WS(is, 3)];
Chris@82 58 T3 = ri[WS(is, 6)];
Chris@82 59 T4 = T2 + T3;
Chris@82 60 T5 = T1 + T4;
Chris@82 61 TL = FNMS(KP500000000, T4, T1);
Chris@82 62 Tm = T3 - T2;
Chris@82 63 }
/* Imaginary parts of inputs 0, 3, 6: T1f is their sum, TM the difference of inputs 3 and 6. */
Chris@82 64 {
Chris@82 65 E Th, Ti, Tj, Tk;
Chris@82 66 Th = ii[0];
Chris@82 67 Ti = ii[WS(is, 3)];
Chris@82 68 Tj = ii[WS(is, 6)];
Chris@82 69 Tk = Ti + Tj;
Chris@82 70 Tl = FNMS(KP500000000, Tk, Th);
Chris@82 71 T1f = Th + Tk;
Chris@82 72 TM = Ti - Tj;
Chris@82 73 }
/* Inputs 1, 4, 7 (real and imaginary): Ta / T1c are the real / imaginary sums. */
Chris@82 74 {
Chris@82 75 E T6, Tz, T9, TE, TC, TH, TD, TG;
Chris@82 76 T6 = ri[WS(is, 1)];
Chris@82 77 Tz = ii[WS(is, 1)];
Chris@82 78 {
Chris@82 79 E T7, T8, TA, TB;
Chris@82 80 T7 = ri[WS(is, 4)];
Chris@82 81 T8 = ri[WS(is, 7)];
Chris@82 82 T9 = T7 + T8;
Chris@82 83 TE = T7 - T8;
Chris@82 84 TA = ii[WS(is, 4)];
Chris@82 85 TB = ii[WS(is, 7)];
Chris@82 86 TC = TA + TB;
Chris@82 87 TH = TB - TA;
Chris@82 88 }
Chris@82 89 Ta = T6 + T9;
Chris@82 90 T1c = Tz + TC;
Chris@82 91 TD = FNMS(KP500000000, TC, Tz);
Chris@82 92 TF = FNMS(KP866025403, TE, TD);
Chris@82 93 TW = FMA(KP866025403, TE, TD);
Chris@82 94 TG = FNMS(KP500000000, T9, T6);
Chris@82 95 TI = FNMS(KP866025403, TH, TG);
Chris@82 96 TX = FMA(KP866025403, TH, TG);
Chris@82 97 }
/* Inputs 2, 5, 8 (real and imaginary): Tf / T1d are the real / imaginary sums. */
Chris@82 98 {
Chris@82 99 E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
Chris@82 100 Tb = ri[WS(is, 2)];
Chris@82 101 Tt = ii[WS(is, 2)];
Chris@82 102 {
Chris@82 103 E Tc, Td, Tp, Tq;
Chris@82 104 Tc = ri[WS(is, 5)];
Chris@82 105 Td = ri[WS(is, 8)];
Chris@82 106 Te = Tc + Td;
Chris@82 107 Tw = Td - Tc;
Chris@82 108 Tp = ii[WS(is, 5)];
Chris@82 109 Tq = ii[WS(is, 8)];
Chris@82 110 Tr = Tp - Tq;
Chris@82 111 Tu = Tp + Tq;
Chris@82 112 }
Chris@82 113 Tf = Tb + Te;
Chris@82 114 T1d = Tt + Tu;
Chris@82 115 To = FNMS(KP500000000, Te, Tb);
Chris@82 116 Ts = FMA(KP866025403, Tr, To);
Chris@82 117 TZ = FNMS(KP866025403, Tr, To);
Chris@82 118 Tv = FNMS(KP500000000, Tu, Tt);
Chris@82 119 Tx = FMA(KP866025403, Tw, Tv);
Chris@82 120 T10 = FNMS(KP866025403, Tw, Tv);
Chris@82 121 }
/* Outputs 0, 3, 6, built only from the per-group sums T5, Ta, Tf, T1f, T1c, T1d. */
Chris@82 122 {
Chris@82 123 E T1e, Tg, T1b, T1i, T1g, T1h;
Chris@82 124 T1e = T1c - T1d;
Chris@82 125 Tg = Ta + Tf;
Chris@82 126 T1b = FNMS(KP500000000, Tg, T5);
Chris@82 127 ro[0] = T5 + Tg;
Chris@82 128 ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
Chris@82 129 ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
Chris@82 130 T1i = Tf - Ta;
Chris@82 131 T1g = T1c + T1d;
Chris@82 132 T1h = FNMS(KP500000000, T1g, T1f);
Chris@82 133 io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
Chris@82 134 io[0] = T1f + T1g;
Chris@82 135 io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
Chris@82 136 }
/* Outputs 1, 4, 7, built from Tl, Tm, TL, TM, Ts, Tx, TF, TI. */
Chris@82 137 {
Chris@82 138 E Tn, TN, TK, TS, TQ, TU, TR, TT;
Chris@82 139 Tn = FMA(KP866025403, Tm, Tl);
Chris@82 140 TN = FMA(KP866025403, TM, TL);
Chris@82 141 {
Chris@82 142 E Ty, TJ, TO, TP;
Chris@82 143 Ty = FNMS(KP176326980, Tx, Ts);
Chris@82 144 TJ = FNMS(KP839099631, TI, TF);
Chris@82 145 TK = FNMS(KP777861913, TJ, Ty);
Chris@82 146 TS = FMA(KP777861913, TJ, Ty);
Chris@82 147 TO = FMA(KP176326980, Ts, Tx);
Chris@82 148 TP = FMA(KP839099631, TF, TI);
Chris@82 149 TQ = FMA(KP777861913, TP, TO);
Chris@82 150 TU = FNMS(KP777861913, TP, TO);
Chris@82 151 }
Chris@82 152 io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
Chris@82 153 ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
Chris@82 154 TR = FNMS(KP492403876, TQ, TN);
Chris@82 155 ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
Chris@82 156 ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
Chris@82 157 TT = FMA(KP492403876, TK, Tn);
Chris@82 158 io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
Chris@82 159 io[WS(os, 4)] = FMA(KP852868531, TU, TT);
Chris@82 160 }
/* Outputs 2, 5, 8, built from Tl, Tm, TL, TM, TW, TX, TZ, T10. */
Chris@82 161 {
Chris@82 162 E TV, T17, T12, T1a, T16, T18, T13, T19;
Chris@82 163 TV = FNMS(KP866025403, TM, TL);
Chris@82 164 T17 = FNMS(KP866025403, Tm, Tl);
Chris@82 165 {
Chris@82 166 E TY, T11, T14, T15;
Chris@82 167 TY = FMA(KP176326980, TX, TW);
Chris@82 168 T11 = FNMS(KP363970234, T10, TZ);
Chris@82 169 T12 = FNMS(KP954188894, T11, TY);
Chris@82 170 T1a = FMA(KP954188894, T11, TY);
Chris@82 171 T14 = FNMS(KP176326980, TW, TX);
Chris@82 172 T15 = FMA(KP363970234, TZ, T10);
Chris@82 173 T16 = FNMS(KP954188894, T15, T14);
Chris@82 174 T18 = FMA(KP954188894, T15, T14);
Chris@82 175 }
Chris@82 176 ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
Chris@82 177 io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
Chris@82 178 T13 = FNMS(KP492403876, T12, TV);
Chris@82 179 ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
Chris@82 180 ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
Chris@82 181 T19 = FMA(KP492403876, T18, T17);
Chris@82 182 io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
Chris@82 183 io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
Chris@82 184 }
Chris@82 185 }
Chris@82 186 }
Chris@82 187 }
Chris@82 188
/*
 * Descriptor passed to the planner together with n1_9 (FMA variant).
 * It carries the number 9 and the name "n1_9"; the group {24, 0, 56, 0}
 * agrees with the 24 additions / 0 multiplications / 56 fused multiply-adds
 * quoted in the generated operation-count comment for this variant.
 * NOTE(review): the meaning of GENUS and of the four trailing zero fields is
 * defined by kdft_desc in project headers not shown here -- confirm there.
 */
Chris@82 189 static const kdft_desc desc = { 9, "n1_9", {24, 0, 56, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 190
/*
 * Registration entry point (FMA variant): hands the n1_9 kernel and its
 * descriptor to planner p through X(kdft_register).
 * X() is a project macro applied to both identifiers; its expansion is not
 * visible in this file.
 */
Chris@82 191 void X(codelet_n1_9) (planner *p) {
Chris@82 192 X(kdft_register) (p, n1_9, &desc);
Chris@82 193 }
Chris@82 194
Chris@82 195 #else
Chris@82 196
Chris@82 197 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
Chris@82 198
Chris@82 199 /*
Chris@82 200 * This function contains 80 FP additions, 40 FP multiplications,
Chris@82 201 * (or, 60 additions, 20 multiplications, 20 fused multiply/add),
Chris@82 202 * 39 stack variables, 8 constants, and 36 memory accesses
Chris@82 203 */
Chris@82 204 #include "dft/scalar/n.h"
Chris@82 205
/*
 * n1_9 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): generated size-9 DFT kernel
 * (generator flag: -n 9, without -fma) working on separate real and
 * imaginary arrays. Same signature and same set of loads and stores as the
 * FMA-preferring variant, but with explicit multiplications by the constants
 * alongside the FMA/FNMS macros.
 *
 * ri, ii   real / imaginary input arrays; elements 0..8 are read at
 *          index WS(is, k)
 * ro, io   real / imaginary output arrays; elements 0..8 are written at
 *          index WS(os, k)
 * is, os   input / output strides, only ever used through WS()
 * v        number of loop iterations; nothing is done when v <= 0
 * ivs, ovs amounts added to ri/ii and to ro/io after each iteration
 *
 * Within one iteration all 18 loads are performed before the first store.
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) come from project headers not
 * shown here; presumably a*b + c and c - a*b respectively -- confirm.
 */
Chris@82 206 static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 207 {
/*
 * Named constants (DK is a project macro, not shown here). Numerically:
 * KP500000000 = 1/2, KP866025403 approx. sqrt(3)/2,
 * KP984807753 approx. cos(10 deg), KP173648177 approx. sin(10 deg),
 * KP939692620 approx. cos(20 deg), KP342020143 approx. sin(20 deg),
 * KP766044443 approx. cos(40 deg), KP642787609 approx. sin(40 deg).
 */
Chris@82 208 DK(KP939692620, +0.939692620785908384054109277324731469936208134);
Chris@82 209 DK(KP342020143, +0.342020143325668733044099614682259580763083368);
Chris@82 210 DK(KP984807753, +0.984807753012208059366743024589523013670643252);
Chris@82 211 DK(KP173648177, +0.173648177666930348851716626769314796000375677);
Chris@82 212 DK(KP642787609, +0.642787609686539326322643409907263432907559884);
Chris@82 213 DK(KP766044443, +0.766044443118978035202392650555416673935832457);
Chris@82 214 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 215 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 216 {
Chris@82 217 INT i;
/*
 * One transform per iteration, counting i down from v to 1.
 * NOTE(review): MAKE_VOLATILE_STRIDE is a project macro not defined in
 * this file; its effect cannot be determined from here.
 */
Chris@82 218 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
/* Intermediates shared between the load stages and the output stages. */
Chris@82 219 E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
Chris@82 220 E T10, TG, TZ;
/* Real parts of inputs 0, 3, 6: T5 is their sum, TO the scaled difference of inputs 6 and 3. */
Chris@82 221 {
Chris@82 222 E T1, T2, T3, T4;
Chris@82 223 T1 = ri[0];
Chris@82 224 T2 = ri[WS(is, 3)];
Chris@82 225 T3 = ri[WS(is, 6)];
Chris@82 226 T4 = T2 + T3;
Chris@82 227 T5 = T1 + T4;
Chris@82 228 TO = KP866025403 * (T3 - T2);
Chris@82 229 Th = FNMS(KP500000000, T4, T1);
Chris@82 230 }
/* Imaginary parts of inputs 0, 3, 6: T1g is their sum, Tk the scaled difference of inputs 3 and 6. */
Chris@82 231 {
Chris@82 232 E TP, Ti, Tj, TQ;
Chris@82 233 TP = ii[0];
Chris@82 234 Ti = ii[WS(is, 3)];
Chris@82 235 Tj = ii[WS(is, 6)];
Chris@82 236 TQ = Ti + Tj;
Chris@82 237 Tk = KP866025403 * (Ti - Tj);
Chris@82 238 T1g = TP + TQ;
Chris@82 239 TR = FNMS(KP500000000, TQ, TP);
Chris@82 240 }
/* Inputs 1, 4, 7 (real and imaginary): Ta / T1c are the real / imaginary sums. */
Chris@82 241 {
Chris@82 242 E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
Chris@82 243 T6 = ri[WS(is, 1)];
Chris@82 244 Ts = ii[WS(is, 1)];
Chris@82 245 {
Chris@82 246 E T7, T8, Tn, To;
Chris@82 247 T7 = ri[WS(is, 4)];
Chris@82 248 T8 = ri[WS(is, 7)];
Chris@82 249 T9 = T7 + T8;
Chris@82 250 Tr = KP866025403 * (T8 - T7);
Chris@82 251 Tn = ii[WS(is, 4)];
Chris@82 252 To = ii[WS(is, 7)];
Chris@82 253 Tp = KP866025403 * (Tn - To);
Chris@82 254 Tt = Tn + To;
Chris@82 255 }
Chris@82 256 Ta = T6 + T9;
Chris@82 257 T1c = Ts + Tt;
Chris@82 258 Tm = FNMS(KP500000000, T9, T6);
Chris@82 259 Tq = Tm + Tp;
Chris@82 260 TW = Tm - Tp;
Chris@82 261 Tu = FNMS(KP500000000, Tt, Ts);
Chris@82 262 Tv = Tr + Tu;
Chris@82 263 TX = Tu - Tr;
Chris@82 264 }
/* Inputs 2, 5, 8 (real and imaginary): Tf / T1d are the real / imaginary sums. */
Chris@82 265 {
Chris@82 266 E Tb, TD, Te, TC, TA, TE, Tx, TF;
Chris@82 267 Tb = ri[WS(is, 2)];
Chris@82 268 TD = ii[WS(is, 2)];
Chris@82 269 {
Chris@82 270 E Tc, Td, Ty, Tz;
Chris@82 271 Tc = ri[WS(is, 5)];
Chris@82 272 Td = ri[WS(is, 8)];
Chris@82 273 Te = Tc + Td;
Chris@82 274 TC = KP866025403 * (Td - Tc);
Chris@82 275 Ty = ii[WS(is, 5)];
Chris@82 276 Tz = ii[WS(is, 8)];
Chris@82 277 TA = KP866025403 * (Ty - Tz);
Chris@82 278 TE = Ty + Tz;
Chris@82 279 }
Chris@82 280 Tf = Tb + Te;
Chris@82 281 T1d = TD + TE;
Chris@82 282 Tx = FNMS(KP500000000, Te, Tb);
Chris@82 283 TB = Tx + TA;
Chris@82 284 T10 = Tx - TA;
Chris@82 285 TF = FNMS(KP500000000, TE, TD);
Chris@82 286 TG = TC + TF;
Chris@82 287 TZ = TF - TC;
Chris@82 288 }
/* Outputs 0, 3, 6, built only from the per-group sums T5, Ta, Tf, T1g, T1c, T1d. */
Chris@82 289 {
Chris@82 290 E T1e, Tg, T1b, T1f, T1h, T1i;
Chris@82 291 T1e = KP866025403 * (T1c - T1d);
Chris@82 292 Tg = Ta + Tf;
Chris@82 293 T1b = FNMS(KP500000000, Tg, T5);
Chris@82 294 ro[0] = T5 + Tg;
Chris@82 295 ro[WS(os, 3)] = T1b + T1e;
Chris@82 296 ro[WS(os, 6)] = T1b - T1e;
Chris@82 297 T1f = KP866025403 * (Tf - Ta);
Chris@82 298 T1h = T1c + T1d;
Chris@82 299 T1i = FNMS(KP500000000, T1h, T1g);
Chris@82 300 io[WS(os, 3)] = T1f + T1i;
Chris@82 301 io[0] = T1g + T1h;
Chris@82 302 io[WS(os, 6)] = T1i - T1f;
Chris@82 303 }
/* Outputs 1, 4, 7, built from Th, Tk, TO, TR, Tq, Tv, TB, TG. */
Chris@82 304 {
Chris@82 305 E Tl, TS, TI, TN, TM, TT, TJ, TU;
Chris@82 306 Tl = Th + Tk;
Chris@82 307 TS = TO + TR;
Chris@82 308 {
Chris@82 309 E Tw, TH, TK, TL;
Chris@82 310 Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
Chris@82 311 TH = FMA(KP173648177, TB, KP984807753 * TG);
Chris@82 312 TI = Tw + TH;
Chris@82 313 TN = KP866025403 * (TH - Tw);
Chris@82 314 TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
Chris@82 315 TL = FNMS(KP984807753, TB, KP173648177 * TG);
Chris@82 316 TM = KP866025403 * (TK - TL);
Chris@82 317 TT = TK + TL;
Chris@82 318 }
Chris@82 319 ro[WS(os, 1)] = Tl + TI;
Chris@82 320 io[WS(os, 1)] = TS + TT;
Chris@82 321 TJ = FNMS(KP500000000, TI, Tl);
Chris@82 322 ro[WS(os, 7)] = TJ - TM;
Chris@82 323 ro[WS(os, 4)] = TJ + TM;
Chris@82 324 TU = FNMS(KP500000000, TT, TS);
Chris@82 325 io[WS(os, 4)] = TN + TU;
Chris@82 326 io[WS(os, 7)] = TU - TN;
Chris@82 327 }
/* Outputs 2, 5, 8, built from Th, Tk, TO, TR, TW, TX, TZ, T10. */
Chris@82 328 {
Chris@82 329 E TV, T14, T12, T13, T17, T1a, T18, T19;
Chris@82 330 TV = Th - Tk;
Chris@82 331 T14 = TR - TO;
Chris@82 332 {
Chris@82 333 E TY, T11, T15, T16;
Chris@82 334 TY = FMA(KP173648177, TW, KP984807753 * TX);
Chris@82 335 T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
Chris@82 336 T12 = TY + T11;
Chris@82 337 T13 = KP866025403 * (T11 - TY);
Chris@82 338 T15 = FNMS(KP984807753, TW, KP173648177 * TX);
Chris@82 339 T16 = FMA(KP342020143, T10, KP939692620 * TZ);
Chris@82 340 T17 = T15 - T16;
Chris@82 341 T1a = KP866025403 * (T15 + T16);
Chris@82 342 }
Chris@82 343 ro[WS(os, 2)] = TV + T12;
Chris@82 344 io[WS(os, 2)] = T14 + T17;
Chris@82 345 T18 = FNMS(KP500000000, T17, T14);
Chris@82 346 io[WS(os, 5)] = T13 + T18;
Chris@82 347 io[WS(os, 8)] = T18 - T13;
Chris@82 348 T19 = FNMS(KP500000000, T12, TV);
Chris@82 349 ro[WS(os, 8)] = T19 - T1a;
Chris@82 350 ro[WS(os, 5)] = T19 + T1a;
Chris@82 351 }
Chris@82 352 }
Chris@82 353 }
Chris@82 354 }
Chris@82 355
/*
 * Descriptor passed to the planner together with n1_9 (non-FMA variant).
 * It carries the number 9 and the name "n1_9"; the group {60, 20, 20, 0}
 * agrees with the 60 additions / 20 multiplications / 20 fused multiply-adds
 * quoted in the generated operation-count comment for this variant.
 * NOTE(review): the meaning of GENUS and of the four trailing zero fields is
 * defined by kdft_desc in project headers not shown here -- confirm there.
 */
Chris@82 356 static const kdft_desc desc = { 9, "n1_9", {60, 20, 20, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 357
/*
 * Registration entry point (non-FMA variant): hands the n1_9 kernel and its
 * descriptor to planner p through X(kdft_register).
 * X() is a project macro applied to both identifiers; its expansion is not
 * visible in this file.
 */
Chris@82 358 void X(codelet_n1_9) (planner *p) {
Chris@82 359 X(kdft_register) (p, n1_9, &desc);
Chris@82 360 }
Chris@82 361
Chris@82 362 #endif