annotate src/fftw-3.3.3/dft/scalar/codelets/n1_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:42 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_notw.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 52 FP additions, 8 FP multiplications,
Chris@10 32 * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
Chris@10 33 * 36 stack variables, 1 constants, and 32 memory accesses
Chris@10 34 */
Chris@10 35 #include "n.h"
Chris@10 36
/*
 * n1_8 (HAVE_FMA variant): generated kernel for a size-8 transform on
 * split real/imaginary data (the generator was invoked with "-n 8" and the
 * descriptor below records size 8).
 *
 *   ri, ii   - input real / imaginary arrays (read only)
 *   ro, io   - output real / imaginary arrays
 *   is, os   - input / output strides, applied through WS(stride, k)
 *   v        - number of transforms to perform (loop count)
 *   ivs, ovs - pointer increments applied to the input / output pointers
 *              after each transform
 *
 * Each iteration loads all 16 input values (indices 0..7 of ri and ii)
 * into locals before the first store, then writes 16 outputs
 * (indices 0..7 of ro and io).
 *
 * NOTE(review): WS, DK, E, R, INT, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * defined outside this file; the descriptions of them below are inferred
 * from usage and from the non-FMA branch -- confirm against the headers.
 */
Chris@10 37 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 38 {
/* The only constant used: 0.7071... = 1/sqrt(2). */
Chris@10 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 40 {
Chris@10 41 INT i;
/* Count v down to zero, advancing all four data pointers per transform. */
Chris@10 42 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
/* Declared at loop scope because they are consumed after the inner block
 * closes, by the stores to outputs 3 and 7 at the end of the iteration. */
Chris@10 43 E TF, TE, TD, TI;
Chris@10 44 {
Chris@10 45 E Tn, T3, TC, Ti, TB, T6, To, Tl, Td, TN, Tz, TH, Ta, Tq, Tt;
Chris@10 46 E TM;
/* Load stage: read the inputs and form the sum and difference of each
 * pair of elements four apart: (0,4), (2,6), (7,3) and (1,5). */
Chris@10 47 {
Chris@10 48 E T4, T5, Tj, Tk;
Chris@10 49 {
Chris@10 50 E T1, T2, Tg, Th;
Chris@10 51 T1 = ri[0];
Chris@10 52 T2 = ri[WS(is, 4)];
Chris@10 53 Tg = ii[0];
Chris@10 54 Th = ii[WS(is, 4)];
Chris@10 55 T4 = ri[WS(is, 2)];
Chris@10 56 Tn = T1 - T2;
Chris@10 57 T3 = T1 + T2;
Chris@10 58 TC = Tg - Th;
Chris@10 59 Ti = Tg + Th;
Chris@10 60 T5 = ri[WS(is, 6)];
Chris@10 61 }
Chris@10 62 Tj = ii[WS(is, 2)];
Chris@10 63 Tk = ii[WS(is, 6)];
Chris@10 64 {
Chris@10 65 E Tb, Tc, Tw, Tx;
Chris@10 66 Tb = ri[WS(is, 7)];
Chris@10 67 TB = T4 - T5;
Chris@10 68 T6 = T4 + T5;
Chris@10 69 To = Tj - Tk;
Chris@10 70 Tl = Tj + Tk;
Chris@10 71 Tc = ri[WS(is, 3)];
Chris@10 72 Tw = ii[WS(is, 7)];
Chris@10 73 Tx = ii[WS(is, 3)];
Chris@10 74 {
Chris@10 75 E T8, Tv, Ty, T9, Tr, Ts;
Chris@10 76 T8 = ri[WS(is, 1)];
Chris@10 77 Td = Tb + Tc;
Chris@10 78 Tv = Tb - Tc;
Chris@10 79 TN = Tw + Tx;
Chris@10 80 Ty = Tw - Tx;
Chris@10 81 T9 = ri[WS(is, 5)];
Chris@10 82 Tr = ii[WS(is, 1)];
Chris@10 83 Ts = ii[WS(is, 5)];
Chris@10 84 Tz = Tv - Ty;
Chris@10 85 TH = Tv + Ty;
Chris@10 86 Ta = T8 + T9;
Chris@10 87 Tq = T8 - T9;
Chris@10 88 Tt = Tr - Ts;
Chris@10 89 TM = Tr + Ts;
Chris@10 90 }
Chris@10 91 }
Chris@10 92 }
/* Combine stage. Outputs 0, 4, 2 and 6 need only additions and
 * subtractions of the pair sums. */
Chris@10 93 {
Chris@10 94 E TL, TG, Tu, Tf, Tm, TO;
Chris@10 95 {
Chris@10 96 E T7, Te, TP, TQ;
Chris@10 97 TL = T3 - T6;
Chris@10 98 T7 = T3 + T6;
Chris@10 99 TG = Tt - Tq;
Chris@10 100 Tu = Tq + Tt;
Chris@10 101 Te = Ta + Td;
Chris@10 102 Tf = Td - Ta;
Chris@10 103 Tm = Ti - Tl;
Chris@10 104 TP = Ti + Tl;
Chris@10 105 TQ = TM + TN;
Chris@10 106 TO = TM - TN;
Chris@10 107 ro[0] = T7 + Te;
Chris@10 108 ro[WS(os, 4)] = T7 - Te;
Chris@10 109 io[0] = TP + TQ;
Chris@10 110 io[WS(os, 4)] = TP - TQ;
Chris@10 111 }
Chris@10 112 {
Chris@10 113 E Tp, TA, TJ, TK;
Chris@10 114 TF = Tn - To;
Chris@10 115 Tp = Tn + To;
Chris@10 116 io[WS(os, 6)] = Tm - Tf;
Chris@10 117 io[WS(os, 2)] = Tf + Tm;
Chris@10 118 ro[WS(os, 2)] = TL + TO;
Chris@10 119 ro[WS(os, 6)] = TL - TO;
Chris@10 120 TA = Tu + Tz;
Chris@10 121 TE = Tz - Tu;
Chris@10 122 TD = TB + TC;
Chris@10 123 TJ = TC - TB;
Chris@10 124 TK = TG + TH;
Chris@10 125 TI = TG - TH;
/* Odd outputs scale a combined term by KP707106781. By comparison with
 * the non-FMA branch, FMA(a, b, c) presumably yields c + a*b and
 * FNMS(a, b, c) yields c - a*b -- confirm against the macro definitions. */
Chris@10 126 ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
Chris@10 127 ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
Chris@10 128 io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
Chris@10 129 io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
Chris@10 130 }
Chris@10 131 }
Chris@10 132 }
/* Outputs 3 and 7, built from the loop-scope temporaries set above. */
Chris@10 133 io[WS(os, 3)] = FMA(KP707106781, TE, TD);
Chris@10 134 io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
Chris@10 135 ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
Chris@10 136 ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
Chris@10 137 }
Chris@10 138 }
Chris@10 139 }
Chris@10 140
/*
 * Descriptor handed to X(kdft_register) for the FMA variant: size 8, name
 * "n1_8", and an operation-count tuple {44, 0, 8, 0} whose first three
 * entries match the header comment above (44 additions, 0 multiplications,
 * 8 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count, of GENUS, and of the four
 * trailing zeros is defined by kdft_desc elsewhere -- not visible here.
 */
Chris@10 141 static const kdft_desc desc = { 8, "n1_8", {44, 0, 8, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 142
/*
 * Registration entry point (FMA build): passes the n1_8 kernel and its
 * descriptor to X(kdft_register) for planner p.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably a
 * name-prefixing macro -- confirm.
 */
Chris@10 143 void X(codelet_n1_8) (planner *p) {
Chris@10 144 X(kdft_register) (p, n1_8, &desc);
Chris@10 145 }
Chris@10 146
Chris@10 147 #else /* HAVE_FMA */
Chris@10 148
Chris@10 149 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include n.h */
Chris@10 150
Chris@10 151 /*
Chris@10 152 * This function contains 52 FP additions, 4 FP multiplications,
Chris@10 153 * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
Chris@10 154 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@10 155 */
Chris@10 156 #include "n.h"
Chris@10 157
/*
 * n1_8 (non-FMA variant): same interface as the HAVE_FMA version, for a
 * size-8 transform on split real/imaginary data, but using explicit
 * multiplications by KP707106781 followed by add/subtract.
 *
 *   ri, ii   - input real / imaginary arrays (read only)
 *   ro, io   - output real / imaginary arrays
 *   is, os   - input / output strides, applied through WS(stride, k)
 *   v        - number of transforms to perform (loop count)
 *   ivs, ovs - pointer increments applied to the input / output pointers
 *              after each transform
 *
 * Each iteration loads all 16 input values before the first store, then
 * writes 16 outputs (indices 0..7 of ro and io).
 *
 * NOTE(review): WS, DK, E, R, INT and MAKE_VOLATILE_STRIDE are defined
 * outside this file; their exact semantics are not visible here.
 */
Chris@10 158 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@10 159 {
/* The only constant used: 0.7071... = 1/sqrt(2). */
Chris@10 160 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 161 {
Chris@10 162 INT i;
/* Count v down to zero, advancing all four data pointers per transform. */
Chris@10 163 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
Chris@10 164 E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
Chris@10 165 E TG;
/* Load stage: read the inputs and form the sum and difference of each
 * pair of elements four apart: (0,4), (2,6), (7,3) and (1,5). */
Chris@10 166 {
Chris@10 167 E T1, T2, Tj, Tk;
Chris@10 168 T1 = ri[0];
Chris@10 169 T2 = ri[WS(is, 4)];
Chris@10 170 T3 = T1 + T2;
Chris@10 171 Tn = T1 - T2;
Chris@10 172 {
Chris@10 173 E Tg, Th, T4, T5;
Chris@10 174 Tg = ii[0];
Chris@10 175 Th = ii[WS(is, 4)];
Chris@10 176 Ti = Tg + Th;
Chris@10 177 TC = Tg - Th;
Chris@10 178 T4 = ri[WS(is, 2)];
Chris@10 179 T5 = ri[WS(is, 6)];
Chris@10 180 T6 = T4 + T5;
Chris@10 181 TB = T4 - T5;
Chris@10 182 }
Chris@10 183 Tj = ii[WS(is, 2)];
Chris@10 184 Tk = ii[WS(is, 6)];
Chris@10 185 Tl = Tj + Tk;
Chris@10 186 To = Tj - Tk;
Chris@10 187 {
Chris@10 188 E Tb, Tc, Tv, Tw, Tx, Ty;
Chris@10 189 Tb = ri[WS(is, 7)];
Chris@10 190 Tc = ri[WS(is, 3)];
Chris@10 191 Tv = Tb - Tc;
Chris@10 192 Tw = ii[WS(is, 7)];
Chris@10 193 Tx = ii[WS(is, 3)];
Chris@10 194 Ty = Tw - Tx;
Chris@10 195 Td = Tb + Tc;
Chris@10 196 TN = Tw + Tx;
Chris@10 197 Tz = Tv - Ty;
Chris@10 198 TH = Tv + Ty;
Chris@10 199 }
Chris@10 200 {
Chris@10 201 E T8, T9, Tq, Tr, Ts, Tt;
Chris@10 202 T8 = ri[WS(is, 1)];
Chris@10 203 T9 = ri[WS(is, 5)];
Chris@10 204 Tq = T8 - T9;
Chris@10 205 Tr = ii[WS(is, 1)];
Chris@10 206 Ts = ii[WS(is, 5)];
Chris@10 207 Tt = Tr - Ts;
Chris@10 208 Ta = T8 + T9;
Chris@10 209 TM = Tr + Ts;
Chris@10 210 Tu = Tq + Tt;
Chris@10 211 TG = Tt - Tq;
Chris@10 212 }
Chris@10 213 }
/* Outputs 0 and 4: sums and differences of the pair sums. */
Chris@10 214 {
Chris@10 215 E T7, Te, TP, TQ;
Chris@10 216 T7 = T3 + T6;
Chris@10 217 Te = Ta + Td;
Chris@10 218 ro[WS(os, 4)] = T7 - Te;
Chris@10 219 ro[0] = T7 + Te;
Chris@10 220 TP = Ti + Tl;
Chris@10 221 TQ = TM + TN;
Chris@10 222 io[WS(os, 4)] = TP - TQ;
Chris@10 223 io[0] = TP + TQ;
Chris@10 224 }
/* Outputs 2 and 6: still multiplication-free. */
Chris@10 225 {
Chris@10 226 E Tf, Tm, TL, TO;
Chris@10 227 Tf = Td - Ta;
Chris@10 228 Tm = Ti - Tl;
Chris@10 229 io[WS(os, 2)] = Tf + Tm;
Chris@10 230 io[WS(os, 6)] = Tm - Tf;
Chris@10 231 TL = T3 - T6;
Chris@10 232 TO = TM - TN;
Chris@10 233 ro[WS(os, 6)] = TL - TO;
Chris@10 234 ro[WS(os, 2)] = TL + TO;
Chris@10 235 }
/* Outputs 1 and 5: one KP707106781-scaled term each for real and
 * imaginary parts, added for index 1 and subtracted for index 5. */
Chris@10 236 {
Chris@10 237 E Tp, TA, TJ, TK;
Chris@10 238 Tp = Tn + To;
Chris@10 239 TA = KP707106781 * (Tu + Tz);
Chris@10 240 ro[WS(os, 5)] = Tp - TA;
Chris@10 241 ro[WS(os, 1)] = Tp + TA;
Chris@10 242 TJ = TC - TB;
Chris@10 243 TK = KP707106781 * (TG + TH);
Chris@10 244 io[WS(os, 5)] = TJ - TK;
Chris@10 245 io[WS(os, 1)] = TJ + TK;
Chris@10 246 }
/* Outputs 3 and 7: same pattern with the complementary combinations. */
Chris@10 247 {
Chris@10 248 E TD, TE, TF, TI;
Chris@10 249 TD = TB + TC;
Chris@10 250 TE = KP707106781 * (Tz - Tu);
Chris@10 251 io[WS(os, 7)] = TD - TE;
Chris@10 252 io[WS(os, 3)] = TD + TE;
Chris@10 253 TF = Tn - To;
Chris@10 254 TI = KP707106781 * (TG - TH);
Chris@10 255 ro[WS(os, 7)] = TF - TI;
Chris@10 256 ro[WS(os, 3)] = TF + TI;
Chris@10 257 }
Chris@10 258 }
Chris@10 259 }
Chris@10 260 }
Chris@10 261
/*
 * Descriptor handed to X(kdft_register) for the non-FMA variant: size 8,
 * name "n1_8", and an operation-count tuple {52, 4, 0, 0} whose first three
 * entries match the header comment above (52 additions, 4 multiplications,
 * 0 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count, of GENUS, and of the four
 * trailing zeros is defined by kdft_desc elsewhere -- not visible here.
 */
Chris@10 262 static const kdft_desc desc = { 8, "n1_8", {52, 4, 0, 0}, &GENUS, 0, 0, 0, 0 };
Chris@10 263
/*
 * Registration entry point (non-FMA build): passes the n1_8 kernel and its
 * descriptor to X(kdft_register) for planner p.
 * NOTE(review): X(...) is a macro defined elsewhere, presumably a
 * name-prefixing macro -- confirm.
 */
Chris@10 264 void X(codelet_n1_8) (planner *p) {
Chris@10 265 X(kdft_register) (p, n1_8, &desc);
Chris@10 266 }
Chris@10 267
Chris@10 268 #endif /* HAVE_FMA */