annotate fft/fftw/fftw-3.3.4/dft/scalar/codelets/q1_3.c @ 40:223f770b5341 kissfft-double tip

Try a double-precision kissfft
author Chris Cannam
date Wed, 07 Sep 2016 10:40:32 +0100
parents 26056e866c29
children
rev   line source
Chris@19 1 /*
Chris@19 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@19 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@19 4 *
Chris@19 5 * This program is free software; you can redistribute it and/or modify
Chris@19 6 * it under the terms of the GNU General Public License as published by
Chris@19 7 * the Free Software Foundation; either version 2 of the License, or
Chris@19 8 * (at your option) any later version.
Chris@19 9 *
Chris@19 10 * This program is distributed in the hope that it will be useful,
Chris@19 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@19 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@19 13 * GNU General Public License for more details.
Chris@19 14 *
Chris@19 15 * You should have received a copy of the GNU General Public License
Chris@19 16 * along with this program; if not, write to the Free Software
Chris@19 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@19 18 *
Chris@19 19 */
Chris@19 20
Chris@19 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@19 22 /* Generated on Tue Mar 4 13:46:00 EST 2014 */
Chris@19 23
Chris@19 24 #include "codelet-dft.h"
Chris@19 25
Chris@19 26 #ifdef HAVE_FMA
Chris@19 27
Chris@19 28 /* Generated by: ../../../genfft/gen_twidsq.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include q.h */
Chris@19 29
Chris@19 30 /*
Chris@19 31 * This function contains 48 FP additions, 42 FP multiplications,
Chris@19 32 * (or, 18 additions, 12 multiplications, 30 fused multiply/add),
Chris@19 33 * 56 stack variables, 2 constants, and 36 memory accesses
Chris@19 34 */
Chris@19 35 #include "q.h"
Chris@19 36
/*
 * q1_3 (HAVE_FMA variant): in-place kernel over a 3x3 block of complex
 * values, real parts in rio and imaginary parts in iio.
 *
 * Addressing: element (v, r) lives at offset WS(vs, v) + WS(rs, r),
 * for v, r in 0..2.
 *
 * Per loop iteration:
 *   - all 18 inputs (9 from rio, 9 from iio) are loaded into locals
 *     before anything is stored;
 *   - for each row v, the three values r = 0..2 are combined with the
 *     constants 0.5 and 0.866025403... (sqrt(3)/2), i.e. a 3-point
 *     butterfly (the generator was run with "-dif -n 3");
 *   - output k of row v is stored at offset WS(vs, k) + WS(rs, v), so
 *     the result is transposed relative to where the inputs were read;
 *   - k = 0 outputs are plain sums; k = 1 outputs are combined with
 *     (W[0], W[1]) and k = 2 outputs with (W[2], W[3]).
 *
 * NOTE(review): FMA, FNMS, WS, DK, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The comments below assume
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm against
 * the project headers.  Under that assumption each twiddle step computes
 *   re' = w_re*re + w_im*im,   im' = w_re*im - w_im*re.
 *
 * Parameters (as used by the visible code):
 *   rio, iio - base pointers read and written in place; each is advanced
 *              by ms per iteration.
 *   W        - table of factors; starts at W + mb*4 and advances by 4
 *              per iteration.
 *   rs, vs   - strides passed to WS() to form element offsets.
 *   mb, me   - the loop runs for m = mb .. me - 1.
 *   ms       - per-iteration increment applied to rio and iio.
 */
Chris@19 37 static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@19 38 {
Chris@19 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@19 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@19 41 {
Chris@19 42 INT m;
Chris@19 43 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
/* Declared at loop scope because they are set inside the nested blocks
   below and consumed by the final two stores of the iteration. */
Chris@19 44 E Tk, Tn, Tm, To, Tl;
Chris@19 45 {
Chris@19 46 E T1, Td, T4, Tg, Tp, T9, Te, T6, Tf, TB, TE, Ts, TZ, Tu, Tx;
Chris@19 47 E TC, TN, TO, TD, TV, T10, TP, Tq, Tr;
Chris@19 48 {
Chris@19 49 E T2, T3, T7, T8;
/* Load row v = 0 (real and imaginary) and form its sums/differences;
   the real loads for row v = 1 are interleaved here. */
Chris@19 50 T1 = rio[0];
Chris@19 51 T2 = rio[WS(rs, 1)];
Chris@19 52 T3 = rio[WS(rs, 2)];
Chris@19 53 Td = iio[0];
Chris@19 54 T7 = iio[WS(rs, 1)];
Chris@19 55 T8 = iio[WS(rs, 2)];
Chris@19 56 T4 = T2 + T3;
Chris@19 57 Tg = T3 - T2;
Chris@19 58 Tp = rio[WS(vs, 1)];
Chris@19 59 T9 = T7 - T8;
Chris@19 60 Te = T7 + T8;
Chris@19 61 T6 = FNMS(KP500000000, T4, T1);
Chris@19 62 Tq = rio[WS(vs, 1) + WS(rs, 1)];
Chris@19 63 Tr = rio[WS(vs, 1) + WS(rs, 2)];
Chris@19 64 Tf = FNMS(KP500000000, Te, Td);
Chris@19 65 }
Chris@19 66 {
Chris@19 67 E Tv, Tw, TT, TU;
/* Remaining loads for rows v = 1 and v = 2, with the same
   sum/difference pattern applied to each row. */
Chris@19 68 TB = iio[WS(vs, 1)];
Chris@19 69 Tv = iio[WS(vs, 1) + WS(rs, 1)];
Chris@19 70 TE = Tr - Tq;
Chris@19 71 Ts = Tq + Tr;
Chris@19 72 Tw = iio[WS(vs, 1) + WS(rs, 2)];
Chris@19 73 TZ = iio[WS(vs, 2)];
Chris@19 74 TT = iio[WS(vs, 2) + WS(rs, 1)];
Chris@19 75 Tu = FNMS(KP500000000, Ts, Tp);
Chris@19 76 Tx = Tv - Tw;
Chris@19 77 TC = Tv + Tw;
Chris@19 78 TU = iio[WS(vs, 2) + WS(rs, 2)];
Chris@19 79 TN = rio[WS(vs, 2)];
Chris@19 80 TO = rio[WS(vs, 2) + WS(rs, 1)];
Chris@19 81 TD = FNMS(KP500000000, TC, TB);
Chris@19 82 TV = TT - TU;
Chris@19 83 T10 = TT + TU;
Chris@19 84 TP = rio[WS(vs, 2) + WS(rs, 2)];
Chris@19 85 }
Chris@19 86 {
Chris@19 87 E T11, T12, TS, TQ;
/* Every input is now held in a local, so stores may begin.  k = 0
   outputs are the plain sums; the sum for row v goes to offset
   WS(rs, v). */
Chris@19 88 rio[0] = T1 + T4;
Chris@19 89 iio[0] = Td + Te;
Chris@19 90 T11 = FNMS(KP500000000, T10, TZ);
Chris@19 91 T12 = TP - TO;
Chris@19 92 TQ = TO + TP;
Chris@19 93 rio[WS(rs, 1)] = Tp + Ts;
Chris@19 94 iio[WS(rs, 1)] = TB + TC;
Chris@19 95 iio[WS(rs, 2)] = TZ + T10;
Chris@19 96 TS = FNMS(KP500000000, TQ, TN);
Chris@19 97 rio[WS(rs, 2)] = TN + TQ;
Chris@19 98 {
Chris@19 99 E TW, T13, Ty, TI, TL, TF, TH, TK;
Chris@19 100 {
Chris@19 101 E Ta, Th, T5, Tc;
/* Row 0: (Ta, Th) is the k = 1 pair, (Tk, Tn) the k = 2 pair, both
   before the W multiply. */
Chris@19 102 Tk = FNMS(KP866025403, T9, T6);
Chris@19 103 Ta = FMA(KP866025403, T9, T6);
Chris@19 104 Th = FMA(KP866025403, Tg, Tf);
Chris@19 105 Tn = FNMS(KP866025403, Tg, Tf);
Chris@19 106 T5 = W[0];
Chris@19 107 Tc = W[1];
Chris@19 108 {
Chris@19 109 E T16, T19, T18, T1a, T17, Ti, Tb, T15;
/* Row 2: (TW, T13) is the k = 1 pair, (T16, T19) the k = 2 pair. */
Chris@19 110 TW = FMA(KP866025403, TV, TS);
Chris@19 111 T16 = FNMS(KP866025403, TV, TS);
Chris@19 112 T19 = FNMS(KP866025403, T12, T11);
Chris@19 113 T13 = FMA(KP866025403, T12, T11);
Chris@19 114 Ti = T5 * Th;
Chris@19 115 Tb = T5 * Ta;
Chris@19 116 T15 = W[2];
Chris@19 117 T18 = W[3];
/* Row 0, k = 1, combined with (W[0], W[1]) -> offset WS(vs, 1). */
Chris@19 118 iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
Chris@19 119 rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
Chris@19 120 T1a = T15 * T19;
Chris@19 121 T17 = T15 * T16;
/* Row 1: (Ty, TF) is the k = 1 pair, (TI, TL) the k = 2 pair. */
Chris@19 122 Ty = FMA(KP866025403, Tx, Tu);
Chris@19 123 TI = FNMS(KP866025403, Tx, Tu);
Chris@19 124 TL = FNMS(KP866025403, TE, TD);
Chris@19 125 TF = FMA(KP866025403, TE, TD);
/* Row 2, k = 2, combined with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 2). */
Chris@19 126 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
Chris@19 127 rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
Chris@19 128 TH = W[2];
Chris@19 129 TK = W[3];
Chris@19 130 }
Chris@19 131 }
Chris@19 132 {
Chris@19 133 E TA, TG, Tz, TM, TJ, Tt;
Chris@19 134 TM = TH * TL;
Chris@19 135 TJ = TH * TI;
Chris@19 136 Tt = W[0];
Chris@19 137 TA = W[1];
/* Row 1, k = 2, combined with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 1). */
Chris@19 138 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
Chris@19 139 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
Chris@19 140 TG = Tt * TF;
Chris@19 141 Tz = Tt * Ty;
Chris@19 142 {
Chris@19 143 E TR, TY, T14, TX, Tj;
/* Row 1, k = 1, combined with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 1). */
Chris@19 144 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
Chris@19 145 rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
Chris@19 146 TR = W[0];
Chris@19 147 TY = W[1];
Chris@19 148 T14 = TR * T13;
Chris@19 149 TX = TR * TW;
Chris@19 150 Tj = W[2];
Chris@19 151 Tm = W[3];
/* Row 2, k = 1, combined with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 2). */
Chris@19 152 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
Chris@19 153 rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
Chris@19 154 To = Tj * Tn;
Chris@19 155 Tl = Tj * Tk;
Chris@19 156 }
Chris@19 157 }
Chris@19 158 }
Chris@19 159 }
Chris@19 160 }
/* Row 0, k = 2, combined with (W[2], W[3]) -> offset WS(vs, 2). */
Chris@19 161 iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
Chris@19 162 rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
Chris@19 163 }
Chris@19 164 }
Chris@19 165 }
Chris@19 166
/*
 * Twiddle-instruction table referenced by desc below (HAVE_FMA branch).
 * The field layout is given by tw_instr, which is declared outside this
 * file.
 * NOTE(review): presumably {TW_FULL, 0, 3} requests the full twiddle set
 * for size 3 and {TW_NEXT, 1, 0} terminates the list -- confirm against
 * the tw_instr definition.  q1_3 reads W[0..3] and advances W by 4 per
 * iteration.
 */
Chris@19 167 static const tw_instr twinstr[] = {
Chris@19 168 {TW_FULL, 0, 3},
Chris@19 169 {TW_NEXT, 1, 0}
Chris@19 170 };
Chris@19 171
/*
 * Descriptor handed to X(kdft_difsq_register) together with q1_3
 * (HAVE_FMA branch).  Fields are positional; their meaning comes from
 * ct_desc, declared outside this file.  The values given are: 3, the name
 * string, the twinstr table, &GENUS, the tuple {18, 12, 30, 0}, and three
 * trailing zeros.  The first three numbers of the tuple equal the
 * add / multiply / fused multiply-add counts stated in the generated
 * comment for this variant.
 */
Chris@19 172 static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {18, 12, 30, 0}, 0, 0, 0 };
Chris@19 173
/*
 * Registration entry point (HAVE_FMA branch): passes the planner p, the
 * q1_3 kernel and its descriptor desc to X(kdft_difsq_register).
 * X() is a macro defined outside this file that wraps both identifiers.
 */
Chris@19 174 void X(codelet_q1_3) (planner *p) {
Chris@19 175 X(kdft_difsq_register) (p, q1_3, &desc);
Chris@19 176 }
Chris@19 177 #else /* HAVE_FMA */
Chris@19 178
Chris@19 179 /* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include q.h */
Chris@19 180
Chris@19 181 /*
Chris@19 182 * This function contains 48 FP additions, 36 FP multiplications,
Chris@19 183 * (or, 30 additions, 18 multiplications, 18 fused multiply/add),
Chris@19 184 * 35 stack variables, 2 constants, and 36 memory accesses
Chris@19 185 */
Chris@19 186 #include "q.h"
Chris@19 187
/*
 * q1_3 (non-FMA variant): in-place kernel over a 3x3 block of complex
 * values, real parts in rio and imaginary parts in iio.
 *
 * Addressing: element (v, r) lives at offset WS(vs, v) + WS(rs, r),
 * for v, r in 0..2.
 *
 * Per loop iteration:
 *   - all 18 inputs are loaded and reduced to per-row intermediates
 *     before the first store;
 *   - each row v is combined with the constants 0.5 and 0.866025403...
 *     (sqrt(3)/2), i.e. a 3-point butterfly (the generator was run with
 *     "-dif -n 3");
 *   - output k of row v is stored at offset WS(vs, k) + WS(rs, v), so
 *     the result is transposed relative to where the inputs were read;
 *   - k = 0 outputs are plain sums; k = 1 outputs are combined with
 *     (W[0], W[1]) and k = 2 outputs with (W[2], W[3]).
 *
 * NOTE(review): FMA, FNMS, WS, DK, E, R, INT and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  The comments below assume
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm against
 * the project headers.  Under that assumption each twiddle step computes
 *   re' = w_re*re + w_im*im,   im' = w_re*im - w_im*re.
 *
 * Parameters (as used by the visible code):
 *   rio, iio - base pointers read and written in place; each is advanced
 *              by ms per iteration.
 *   W        - table of factors; starts at W + mb*4 and advances by 4
 *              per iteration.
 *   rs, vs   - strides passed to WS() to form element offsets.
 *   mb, me   - the loop runs for m = mb .. me - 1.
 *   ms       - per-iteration increment applied to rio and iio.
 */
Chris@19 188 static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@19 189 {
Chris@19 190 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@19 191 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@19 192 {
Chris@19 193 INT m;
Chris@19 194 for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
Chris@19 195 E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
Chris@19 196 E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;
Chris@19 197 {
Chris@19 198 E T2, T3, Tr, Ts;
/* Row v = 0, real parts: sum T4, half-sum-subtracted T6, scaled
   difference Tc. */
Chris@19 199 T1 = rio[0];
Chris@19 200 T2 = rio[WS(rs, 1)];
Chris@19 201 T3 = rio[WS(rs, 2)];
Chris@19 202 T4 = T2 + T3;
Chris@19 203 T6 = FNMS(KP500000000, T4, T1);
Chris@19 204 Tc = KP866025403 * (T3 - T2);
Chris@19 205 {
Chris@19 206 E T7, T8, Tm, Tn;
/* Row v = 0, imaginary parts, followed by row v = 1 real parts. */
Chris@19 207 Td = iio[0];
Chris@19 208 T7 = iio[WS(rs, 1)];
Chris@19 209 T8 = iio[WS(rs, 2)];
Chris@19 210 Te = T7 + T8;
Chris@19 211 T9 = KP866025403 * (T7 - T8);
Chris@19 212 Tf = FNMS(KP500000000, Te, Td);
Chris@19 213 Tl = rio[WS(vs, 1)];
Chris@19 214 Tm = rio[WS(vs, 1) + WS(rs, 1)];
Chris@19 215 Tn = rio[WS(vs, 1) + WS(rs, 2)];
Chris@19 216 To = Tm + Tn;
Chris@19 217 Tq = FNMS(KP500000000, To, Tl);
Chris@19 218 Tw = KP866025403 * (Tn - Tm);
Chris@19 219 }
/* Row v = 1, imaginary parts. */
Chris@19 220 Tx = iio[WS(vs, 1)];
Chris@19 221 Tr = iio[WS(vs, 1) + WS(rs, 1)];
Chris@19 222 Ts = iio[WS(vs, 1) + WS(rs, 2)];
Chris@19 223 Ty = Tr + Ts;
Chris@19 224 Tt = KP866025403 * (Tr - Ts);
Chris@19 225 Tz = FNMS(KP500000000, Ty, Tx);
Chris@19 226 {
Chris@19 227 E TL, TM, TG, TH;
/* Row v = 2, imaginary parts then real parts. */
Chris@19 228 TR = iio[WS(vs, 2)];
Chris@19 229 TL = iio[WS(vs, 2) + WS(rs, 1)];
Chris@19 230 TM = iio[WS(vs, 2) + WS(rs, 2)];
Chris@19 231 TS = TL + TM;
Chris@19 232 TN = KP866025403 * (TL - TM);
Chris@19 233 TT = FNMS(KP500000000, TS, TR);
Chris@19 234 TF = rio[WS(vs, 2)];
Chris@19 235 TG = rio[WS(vs, 2) + WS(rs, 1)];
Chris@19 236 TH = rio[WS(vs, 2) + WS(rs, 2)];
Chris@19 237 TI = TG + TH;
Chris@19 238 TK = FNMS(KP500000000, TI, TF);
Chris@19 239 TQ = KP866025403 * (TH - TG);
Chris@19 240 }
Chris@19 241 }
/* Every input is now held in a local, so stores may begin.  k = 0
   outputs are the plain sums; the sum for row v goes to offset
   WS(rs, v). */
Chris@19 242 rio[0] = T1 + T4;
Chris@19 243 iio[0] = Td + Te;
Chris@19 244 rio[WS(rs, 1)] = Tl + To;
Chris@19 245 iio[WS(rs, 1)] = Tx + Ty;
Chris@19 246 iio[WS(rs, 2)] = TR + TS;
Chris@19 247 rio[WS(rs, 2)] = TF + TI;
Chris@19 248 {
Chris@19 249 E Ta, Tg, T5, Tb;
/* Row 0, k = 1, combined with (W[0], W[1]) -> offset WS(vs, 1). */
Chris@19 250 Ta = T6 + T9;
Chris@19 251 Tg = Tc + Tf;
Chris@19 252 T5 = W[0];
Chris@19 253 Tb = W[1];
Chris@19 254 rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
Chris@19 255 iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
Chris@19 256 }
Chris@19 257 {
Chris@19 258 E TW, TY, TV, TX;
/* Row 2, k = 2, combined with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 2). */
Chris@19 259 TW = TK - TN;
Chris@19 260 TY = TT - TQ;
Chris@19 261 TV = W[2];
Chris@19 262 TX = W[3];
Chris@19 263 rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
Chris@19 264 iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
Chris@19 265 }
Chris@19 266 {
Chris@19 267 E TC, TE, TB, TD;
/* Row 1, k = 2, combined with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 1). */
Chris@19 268 TC = Tq - Tt;
Chris@19 269 TE = Tz - Tw;
Chris@19 270 TB = W[2];
Chris@19 271 TD = W[3];
Chris@19 272 rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
Chris@19 273 iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
Chris@19 274 }
Chris@19 275 {
Chris@19 276 E Tu, TA, Tp, Tv;
/* Row 1, k = 1, combined with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 1). */
Chris@19 277 Tu = Tq + Tt;
Chris@19 278 TA = Tw + Tz;
Chris@19 279 Tp = W[0];
Chris@19 280 Tv = W[1];
Chris@19 281 rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
Chris@19 282 iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
Chris@19 283 }
Chris@19 284 {
Chris@19 285 E TO, TU, TJ, TP;
/* Row 2, k = 1, combined with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 2). */
Chris@19 286 TO = TK + TN;
Chris@19 287 TU = TQ + TT;
Chris@19 288 TJ = W[0];
Chris@19 289 TP = W[1];
Chris@19 290 rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
Chris@19 291 iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
Chris@19 292 }
Chris@19 293 {
Chris@19 294 E Ti, Tk, Th, Tj;
/* Row 0, k = 2, combined with (W[2], W[3]) -> offset WS(vs, 2). */
Chris@19 295 Ti = T6 - T9;
Chris@19 296 Tk = Tf - Tc;
Chris@19 297 Th = W[2];
Chris@19 298 Tj = W[3];
Chris@19 299 rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
Chris@19 300 iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
Chris@19 301 }
Chris@19 302 }
Chris@19 303 }
Chris@19 304 }
Chris@19 305
/*
 * Twiddle-instruction table referenced by desc below (non-FMA branch).
 * The field layout is given by tw_instr, which is declared outside this
 * file.
 * NOTE(review): presumably {TW_FULL, 0, 3} requests the full twiddle set
 * for size 3 and {TW_NEXT, 1, 0} terminates the list -- confirm against
 * the tw_instr definition.  q1_3 reads W[0..3] and advances W by 4 per
 * iteration.
 */
Chris@19 306 static const tw_instr twinstr[] = {
Chris@19 307 {TW_FULL, 0, 3},
Chris@19 308 {TW_NEXT, 1, 0}
Chris@19 309 };
Chris@19 310
/*
 * Descriptor handed to X(kdft_difsq_register) together with q1_3
 * (non-FMA branch).  Fields are positional; their meaning comes from
 * ct_desc, declared outside this file.  The values given are: 3, the name
 * string, the twinstr table, &GENUS, the tuple {30, 18, 18, 0}, and three
 * trailing zeros.  The first three numbers of the tuple equal the
 * add / multiply / fused multiply-add counts stated in the generated
 * comment for this variant.
 */
Chris@19 311 static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, {30, 18, 18, 0}, 0, 0, 0 };
Chris@19 312
/*
 * Registration entry point (non-FMA branch): passes the planner p, the
 * q1_3 kernel and its descriptor desc to X(kdft_difsq_register).
 * X() is a macro defined outside this file that wraps both identifiers.
 */
Chris@19 313 void X(codelet_q1_3) (planner *p) {
Chris@19 314 X(kdft_difsq_register) (p, q1_3, &desc);
Chris@19 315 }
Chris@19 316 #endif /* HAVE_FMA */