annotate src/fftw-3.3.5/dft/scalar/codelets/n1_9.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:35:51 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include n.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 80 FP additions, 56 FP multiplications,
Chris@42 32 * (or, 24 additions, 0 multiplications, 56 fused multiply/add),
Chris@42 33 * 59 stack variables, 10 constants, and 36 memory accesses
Chris@42 34 */
Chris@42 35 #include "n.h"
Chris@42 36
/*
 * n1_9 (HAVE_FMA variant): genfft-generated codelet.  Per the generator
 * command line above it was produced by gen_notw with "-n 9" and "-fma".
 *
 * Parameters, as used by the code below:
 *   ri, ii   - input arrays, read at offset 0 and at WS(is, 1) .. WS(is, 8).
 *              Presumably the real and imaginary input parts -- TODO confirm
 *              against codelet-dft.h.
 *   ro, io   - output arrays, written at offset 0 and at WS(os, 1) .. WS(os, 8).
 *   is, os   - input / output strides; only used through the WS() macro.
 *   v        - number of loop iterations to run (the loop counts down from v).
 *   ivs, ovs - added to the input / output pointers after every iteration.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are all defined outside this file.  Presumably FMA(a, b, c) is a*b + c and
 * FNMS(a, b, c) is c - a*b -- confirm before relying on the comments below.
 */
Chris@42 37 static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numeric constants used by the kernel (notes give their numeric identity). */
Chris@42 39 DK(KP954188894, +0.954188894138671133499268364187245676532219158);
Chris@42 40 DK(KP363970234, +0.363970234266202361351047882776834043890471784); /* numerically tan(20 deg) */
Chris@42 41 DK(KP852868531, +0.852868531952443209628250963940074071936020296); /* numerically KP866025403 * KP984807753 */
Chris@42 42 DK(KP984807753, +0.984807753012208059366743024589523013670643252); /* numerically cos(10 deg) */
Chris@42 43 DK(KP492403876, +0.492403876506104029683371512294761506835321626); /* numerically KP984807753 / 2 */
Chris@42 44 DK(KP777861913, +0.777861913430206160028177977318626690410586096);
Chris@42 45 DK(KP839099631, +0.839099631177280011763127298123181364687434283); /* numerically tan(40 deg) */
Chris@42 46 DK(KP176326980, +0.176326980708464973471090386868618986121633062); /* numerically tan(10 deg) */
Chris@42 47 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@42 48 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@42 49 {
Chris@42 50 INT i;
/* One pass per iteration: i counts down from v while the four pointers advance by ivs / ovs. */
Chris@42 51 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
/* These six temporaries are set in the first inner block and consumed by the last block (outputs 2, 5, 8). */
Chris@42 52 E T17, TV, T14, TY, T11, T15;
Chris@42 53 {
Chris@42 54 E Tm, TM, TL, T5, Tl, T1f, Tb, Tt, Ta, T1c, TI, TX, TF, TW, Tc;
Chris@42 55 E Td, Tp, Tq;
Chris@42 56 {
Chris@42 57 E T1, Th, Ti, Tj, T4, T2, T3;
/* Load inputs 0, 3 and 6 from both input arrays. */
Chris@42 58 T1 = ri[0];
Chris@42 59 T2 = ri[WS(is, 3)];
Chris@42 60 T3 = ri[WS(is, 6)];
Chris@42 61 Th = ii[0];
Chris@42 62 Ti = ii[WS(is, 3)];
Chris@42 63 Tj = ii[WS(is, 6)];
Chris@42 64 T4 = T2 + T3;
Chris@42 65 Tm = T3 - T2;
Chris@42 66 {
Chris@42 67 E T6, Tz, T7, T8, TA, TB, Tk;
/* Loads of inputs 1, 4 and 7 are interleaved with arithmetic on inputs 0, 3, 6. */
Chris@42 68 T6 = ri[WS(is, 1)];
Chris@42 69 TM = Ti - Tj;
Chris@42 70 Tk = Ti + Tj;
Chris@42 71 TL = FNMS(KP500000000, T4, T1);
Chris@42 72 T5 = T1 + T4;
Chris@42 73 Tz = ii[WS(is, 1)];
Chris@42 74 Tl = FNMS(KP500000000, Tk, Th);
Chris@42 75 T1f = Th + Tk;
Chris@42 76 T7 = ri[WS(is, 4)];
Chris@42 77 T8 = ri[WS(is, 7)];
Chris@42 78 TA = ii[WS(is, 4)];
Chris@42 79 TB = ii[WS(is, 7)];
Chris@42 80 {
Chris@42 81 E TE, T9, TH, TC, TG, TD;
/* Loads of inputs 2, 5 and 8 are interleaved with arithmetic on inputs 1, 4, 7. */
Chris@42 82 Tb = ri[WS(is, 2)];
Chris@42 83 TE = T7 - T8;
Chris@42 84 T9 = T7 + T8;
Chris@42 85 TH = TB - TA;
Chris@42 86 TC = TA + TB;
Chris@42 87 Tt = ii[WS(is, 2)];
Chris@42 88 Ta = T6 + T9;
Chris@42 89 TG = FNMS(KP500000000, T9, T6);
Chris@42 90 T1c = Tz + TC;
Chris@42 91 TD = FNMS(KP500000000, TC, Tz);
Chris@42 92 TI = FNMS(KP866025403, TH, TG);
Chris@42 93 TX = FMA(KP866025403, TH, TG);
Chris@42 94 TF = FNMS(KP866025403, TE, TD);
Chris@42 95 TW = FMA(KP866025403, TE, TD);
Chris@42 96 Tc = ri[WS(is, 5)];
Chris@42 97 Td = ri[WS(is, 8)];
Chris@42 98 Tp = ii[WS(is, 5)];
Chris@42 99 Tq = ii[WS(is, 8)];
Chris@42 100 }
Chris@42 101 }
Chris@42 102 }
Chris@42 103 {
Chris@42 104 E Tn, TN, TZ, T10, TO, Ty, TJ, TP;
Chris@42 105 {
Chris@42 106 E Tw, Te, Tu, Tr;
Chris@42 107 T17 = FNMS(KP866025403, Tm, Tl);
Chris@42 108 Tn = FMA(KP866025403, Tm, Tl);
Chris@42 109 Tw = Td - Tc;
Chris@42 110 Te = Tc + Td;
Chris@42 111 Tu = Tp + Tq;
Chris@42 112 Tr = Tp - Tq;
Chris@42 113 TN = FMA(KP866025403, TM, TL);
Chris@42 114 TV = FNMS(KP866025403, TM, TL);
Chris@42 115 {
Chris@42 116 E Tf, To, T1d, Tv;
Chris@42 117 Tf = Tb + Te;
Chris@42 118 To = FNMS(KP500000000, Te, Tb);
Chris@42 119 T1d = Tt + Tu;
Chris@42 120 Tv = FNMS(KP500000000, Tu, Tt);
Chris@42 121 {
Chris@42 122 E Ts, Tg, T1i, Tx;
Chris@42 123 Ts = FMA(KP866025403, Tr, To);
Chris@42 124 TZ = FNMS(KP866025403, Tr, To);
Chris@42 125 Tg = Ta + Tf;
Chris@42 126 T1i = Tf - Ta;
Chris@42 127 Tx = FMA(KP866025403, Tw, Tv);
Chris@42 128 T10 = FNMS(KP866025403, Tw, Tv);
Chris@42 129 {
Chris@42 130 E T1e, T1g, T1b, T1h;
/* Outputs 0, 3 and 6 of both output arrays are stored in this block. */
Chris@42 131 T1e = T1c - T1d;
Chris@42 132 T1g = T1c + T1d;
Chris@42 133 ro[0] = T5 + Tg;
Chris@42 134 T1b = FNMS(KP500000000, Tg, T5);
Chris@42 135 io[0] = T1f + T1g;
Chris@42 136 T1h = FNMS(KP500000000, T1g, T1f);
Chris@42 137 TO = FMA(KP176326980, Ts, Tx);
Chris@42 138 Ty = FNMS(KP176326980, Tx, Ts);
Chris@42 139 ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
Chris@42 140 ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
Chris@42 141 io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
Chris@42 142 io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
Chris@42 143 TJ = FNMS(KP839099631, TI, TF);
Chris@42 144 TP = FMA(KP839099631, TF, TI);
Chris@42 145 }
Chris@42 146 }
Chris@42 147 }
Chris@42 148 }
Chris@42 149 {
Chris@42 150 E TS, TK, TU, TQ, TT, TR;
/* Outputs 1, 4 and 7 are stored here; the last four statements prepare T14, TY, T11, T15 for the final block. */
Chris@42 151 TS = FMA(KP777861913, TJ, Ty);
Chris@42 152 TK = FNMS(KP777861913, TJ, Ty);
Chris@42 153 TU = FNMS(KP777861913, TP, TO);
Chris@42 154 TQ = FMA(KP777861913, TP, TO);
Chris@42 155 TT = FMA(KP492403876, TK, Tn);
Chris@42 156 io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
Chris@42 157 TR = FNMS(KP492403876, TQ, TN);
Chris@42 158 ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
Chris@42 159 io[WS(os, 4)] = FMA(KP852868531, TU, TT);
Chris@42 160 io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
Chris@42 161 ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
Chris@42 162 ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
Chris@42 163 T14 = FNMS(KP176326980, TW, TX);
Chris@42 164 TY = FMA(KP176326980, TX, TW);
Chris@42 165 T11 = FNMS(KP363970234, T10, TZ);
Chris@42 166 T15 = FMA(KP363970234, TZ, T10);
Chris@42 167 }
Chris@42 168 }
Chris@42 169 }
Chris@42 170 {
Chris@42 171 E T12, T1a, T16, T18, T13, T19;
/* Outputs 2, 5 and 8 of both output arrays are stored in this block. */
Chris@42 172 T12 = FNMS(KP954188894, T11, TY);
Chris@42 173 T1a = FMA(KP954188894, T11, TY);
Chris@42 174 T16 = FNMS(KP954188894, T15, T14);
Chris@42 175 T18 = FMA(KP954188894, T15, T14);
Chris@42 176 T13 = FNMS(KP492403876, T12, TV);
Chris@42 177 ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
Chris@42 178 T19 = FMA(KP492403876, T18, T17);
Chris@42 179 io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
Chris@42 180 ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
Chris@42 181 ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
Chris@42 182 io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
Chris@42 183 io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
Chris@42 184 }
Chris@42 185 }
Chris@42 186 }
Chris@42 187 }
Chris@42 188
/*
 * Descriptor handed to X(kdft_register) together with n1_9 (HAVE_FMA variant).
 * It starts with 9 and the name "n1_9"; the first three entries of
 * {24, 0, 56, 0} equal the addition / multiplication / fused multiply-add
 * counts quoted in this variant's generated operation-count comment.
 * NOTE(review): the meaning of the remaining fields is fixed by kdft_desc and
 * GENUS, both declared outside this file -- confirm there.
 */
Chris@42 189 static const kdft_desc desc = { 9, "n1_9", {24, 0, 56, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 190
/*
 * Registration entry point (HAVE_FMA variant): passes the n1_9 kernel and its
 * descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * applies the library's symbol prefix -- confirm.
 */
Chris@42 191 void X(codelet_n1_9) (planner *p) {
Chris@42 192 X(kdft_register) (p, n1_9, &desc);
Chris@42 193 }
Chris@42 194
Chris@42 195 #else /* HAVE_FMA */
Chris@42 196
Chris@42 197 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include n.h */
Chris@42 198
Chris@42 199 /*
Chris@42 200 * This function contains 80 FP additions, 40 FP multiplications,
Chris@42 201 * (or, 60 additions, 20 multiplications, 20 fused multiply/add),
Chris@42 202 * 39 stack variables, 8 constants, and 36 memory accesses
Chris@42 203 */
Chris@42 204 #include "n.h"
Chris@42 205
/*
 * n1_9 (variant compiled when HAVE_FMA is not defined): genfft-generated
 * codelet.  Per the generator command line above it was produced by gen_notw
 * with "-n 9" (no "-fma" option).
 *
 * Parameters, as used by the code below:
 *   ri, ii   - input arrays, read at offset 0 and at WS(is, 1) .. WS(is, 8).
 *              Presumably the real and imaginary input parts -- TODO confirm
 *              against codelet-dft.h.
 *   ro, io   - output arrays, written at offset 0 and at WS(os, 1) .. WS(os, 8).
 *   is, os   - input / output strides; only used through the WS() macro.
 *   v        - number of loop iterations to run (the loop counts down from v).
 *   ivs, ovs - added to the input / output pointers after every iteration.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are all defined outside this file.  Presumably FMA(a, b, c) is a*b + c and
 * FNMS(a, b, c) is c - a*b -- confirm before relying on the comments below.
 */
Chris@42 206 static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 207 {
/* Numeric constants used by the kernel (notes give their numeric identity). */
Chris@42 208 DK(KP939692620, +0.939692620785908384054109277324731469936208134); /* numerically cos(20 deg) */
Chris@42 209 DK(KP342020143, +0.342020143325668733044099614682259580763083368); /* numerically sin(20 deg) */
Chris@42 210 DK(KP984807753, +0.984807753012208059366743024589523013670643252); /* numerically cos(10 deg) */
Chris@42 211 DK(KP173648177, +0.173648177666930348851716626769314796000375677); /* numerically sin(10 deg) */
Chris@42 212 DK(KP642787609, +0.642787609686539326322643409907263432907559884); /* numerically sin(40 deg) */
Chris@42 213 DK(KP766044443, +0.766044443118978035202392650555416673935832457); /* numerically cos(40 deg) */
Chris@42 214 DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
Chris@42 215 DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* numerically sqrt(3)/2 */
Chris@42 216 {
Chris@42 217 INT i;
/* One pass per iteration: i counts down from v while the four pointers advance by ivs / ovs. */
Chris@42 218 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
/* Intermediates produced by the four input blocks and consumed by the three output blocks. */
Chris@42 219 E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
Chris@42 220 E T10, TG, TZ;
Chris@42 221 {
/* Inputs 0, 3 and 6 of ri: their sum (T5) plus two derived terms (TO, Th). */
Chris@42 222 E T1, T2, T3, T4;
Chris@42 223 T1 = ri[0];
Chris@42 224 T2 = ri[WS(is, 3)];
Chris@42 225 T3 = ri[WS(is, 6)];
Chris@42 226 T4 = T2 + T3;
Chris@42 227 T5 = T1 + T4;
Chris@42 228 TO = KP866025403 * (T3 - T2);
Chris@42 229 Th = FNMS(KP500000000, T4, T1);
Chris@42 230 }
Chris@42 231 {
/* Inputs 0, 3 and 6 of ii: their sum (T1g) plus two derived terms (Tk, TR). */
Chris@42 232 E TP, Ti, Tj, TQ;
Chris@42 233 TP = ii[0];
Chris@42 234 Ti = ii[WS(is, 3)];
Chris@42 235 Tj = ii[WS(is, 6)];
Chris@42 236 TQ = Ti + Tj;
Chris@42 237 Tk = KP866025403 * (Ti - Tj);
Chris@42 238 T1g = TP + TQ;
Chris@42 239 TR = FNMS(KP500000000, TQ, TP);
Chris@42 240 }
Chris@42 241 {
/* Inputs 1, 4 and 7 of both arrays: sums Ta / T1c and the combinations Tq, TW, Tv, TX. */
Chris@42 242 E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
Chris@42 243 T6 = ri[WS(is, 1)];
Chris@42 244 Ts = ii[WS(is, 1)];
Chris@42 245 {
Chris@42 246 E T7, T8, Tn, To;
Chris@42 247 T7 = ri[WS(is, 4)];
Chris@42 248 T8 = ri[WS(is, 7)];
Chris@42 249 T9 = T7 + T8;
Chris@42 250 Tr = KP866025403 * (T8 - T7);
Chris@42 251 Tn = ii[WS(is, 4)];
Chris@42 252 To = ii[WS(is, 7)];
Chris@42 253 Tp = KP866025403 * (Tn - To);
Chris@42 254 Tt = Tn + To;
Chris@42 255 }
Chris@42 256 Ta = T6 + T9;
Chris@42 257 T1c = Ts + Tt;
Chris@42 258 Tm = FNMS(KP500000000, T9, T6);
Chris@42 259 Tq = Tm + Tp;
Chris@42 260 TW = Tm - Tp;
Chris@42 261 Tu = FNMS(KP500000000, Tt, Ts);
Chris@42 262 Tv = Tr + Tu;
Chris@42 263 TX = Tu - Tr;
Chris@42 264 }
Chris@42 265 {
/* Inputs 2, 5 and 8 of both arrays: sums Tf / T1d and the combinations TB, T10, TG, TZ. */
Chris@42 266 E Tb, TD, Te, TC, TA, TE, Tx, TF;
Chris@42 267 Tb = ri[WS(is, 2)];
Chris@42 268 TD = ii[WS(is, 2)];
Chris@42 269 {
Chris@42 270 E Tc, Td, Ty, Tz;
Chris@42 271 Tc = ri[WS(is, 5)];
Chris@42 272 Td = ri[WS(is, 8)];
Chris@42 273 Te = Tc + Td;
Chris@42 274 TC = KP866025403 * (Td - Tc);
Chris@42 275 Ty = ii[WS(is, 5)];
Chris@42 276 Tz = ii[WS(is, 8)];
Chris@42 277 TA = KP866025403 * (Ty - Tz);
Chris@42 278 TE = Ty + Tz;
Chris@42 279 }
Chris@42 280 Tf = Tb + Te;
Chris@42 281 T1d = TD + TE;
Chris@42 282 Tx = FNMS(KP500000000, Te, Tb);
Chris@42 283 TB = Tx + TA;
Chris@42 284 T10 = Tx - TA;
Chris@42 285 TF = FNMS(KP500000000, TE, TD);
Chris@42 286 TG = TC + TF;
Chris@42 287 TZ = TF - TC;
Chris@42 288 }
Chris@42 289 {
/* Outputs 0, 3 and 6, built from the group sums T5, Ta, Tf (ro) and T1g, T1c, T1d (io). */
Chris@42 290 E T1e, Tg, T1b, T1f, T1h, T1i;
Chris@42 291 T1e = KP866025403 * (T1c - T1d);
Chris@42 292 Tg = Ta + Tf;
Chris@42 293 T1b = FNMS(KP500000000, Tg, T5);
Chris@42 294 ro[0] = T5 + Tg;
Chris@42 295 ro[WS(os, 3)] = T1b + T1e;
Chris@42 296 ro[WS(os, 6)] = T1b - T1e;
Chris@42 297 T1f = KP866025403 * (Tf - Ta);
Chris@42 298 T1h = T1c + T1d;
Chris@42 299 T1i = FNMS(KP500000000, T1h, T1g);
Chris@42 300 io[WS(os, 3)] = T1f + T1i;
Chris@42 301 io[0] = T1g + T1h;
Chris@42 302 io[WS(os, 6)] = T1i - T1f;
Chris@42 303 }
Chris@42 304 {
/* Outputs 1, 4 and 7, built from Th + Tk, TO + TR and scaled mixes of Tq, Tv, TB, TG. */
Chris@42 305 E Tl, TS, TI, TN, TM, TT, TJ, TU;
Chris@42 306 Tl = Th + Tk;
Chris@42 307 TS = TO + TR;
Chris@42 308 {
Chris@42 309 E Tw, TH, TK, TL;
Chris@42 310 Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
Chris@42 311 TH = FMA(KP173648177, TB, KP984807753 * TG);
Chris@42 312 TI = Tw + TH;
Chris@42 313 TN = KP866025403 * (TH - Tw);
Chris@42 314 TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
Chris@42 315 TL = FNMS(KP984807753, TB, KP173648177 * TG);
Chris@42 316 TM = KP866025403 * (TK - TL);
Chris@42 317 TT = TK + TL;
Chris@42 318 }
Chris@42 319 ro[WS(os, 1)] = Tl + TI;
Chris@42 320 io[WS(os, 1)] = TS + TT;
Chris@42 321 TJ = FNMS(KP500000000, TI, Tl);
Chris@42 322 ro[WS(os, 7)] = TJ - TM;
Chris@42 323 ro[WS(os, 4)] = TJ + TM;
Chris@42 324 TU = FNMS(KP500000000, TT, TS);
Chris@42 325 io[WS(os, 4)] = TN + TU;
Chris@42 326 io[WS(os, 7)] = TU - TN;
Chris@42 327 }
Chris@42 328 {
/* Outputs 2, 5 and 8, built from Th - Tk, TR - TO and scaled mixes of TW, TX, T10, TZ. */
Chris@42 329 E TV, T14, T12, T13, T17, T1a, T18, T19;
Chris@42 330 TV = Th - Tk;
Chris@42 331 T14 = TR - TO;
Chris@42 332 {
Chris@42 333 E TY, T11, T15, T16;
Chris@42 334 TY = FMA(KP173648177, TW, KP984807753 * TX);
Chris@42 335 T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
Chris@42 336 T12 = TY + T11;
Chris@42 337 T13 = KP866025403 * (T11 - TY);
Chris@42 338 T15 = FNMS(KP984807753, TW, KP173648177 * TX);
Chris@42 339 T16 = FMA(KP342020143, T10, KP939692620 * TZ);
Chris@42 340 T17 = T15 - T16;
Chris@42 341 T1a = KP866025403 * (T15 + T16);
Chris@42 342 }
Chris@42 343 ro[WS(os, 2)] = TV + T12;
Chris@42 344 io[WS(os, 2)] = T14 + T17;
Chris@42 345 T18 = FNMS(KP500000000, T17, T14);
Chris@42 346 io[WS(os, 5)] = T13 + T18;
Chris@42 347 io[WS(os, 8)] = T18 - T13;
Chris@42 348 T19 = FNMS(KP500000000, T12, TV);
Chris@42 349 ro[WS(os, 8)] = T19 - T1a;
Chris@42 350 ro[WS(os, 5)] = T19 + T1a;
Chris@42 351 }
Chris@42 352 }
Chris@42 353 }
Chris@42 354 }
Chris@42 355
/*
 * Descriptor handed to X(kdft_register) together with n1_9 (non-FMA variant).
 * It starts with 9 and the name "n1_9"; the first three entries of
 * {60, 20, 20, 0} equal the addition / multiplication / fused multiply-add
 * counts quoted in this variant's generated operation-count comment.
 * NOTE(review): the meaning of the remaining fields is fixed by kdft_desc and
 * GENUS, both declared outside this file -- confirm there.
 */
Chris@42 356 static const kdft_desc desc = { 9, "n1_9", {60, 20, 20, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 357
/*
 * Registration entry point (non-FMA variant): passes the n1_9 kernel and its
 * descriptor to X(kdft_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * applies the library's symbol prefix -- confirm.
 */
Chris@42 358 void X(codelet_n1_9) (planner *p) {
Chris@42 359 X(kdft_register) (p, n1_9, &desc);
Chris@42 360 }
Chris@42 361
Chris@42 362 #endif /* HAVE_FMA */