annotate src/fftw-3.3.8/dft/scalar/codelets/n1_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:10 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 52 FP additions, 8 FP multiplications,
Chris@82 32 * (or, 44 additions, 0 multiplications, 8 fused multiply/add),
Chris@82 33 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/n.h"
Chris@82 36
/*
 * n1_8 (FMA variant): size-8 complex DFT codelet without twiddle factors,
 * as produced by the gen_notw command line recorded above.
 *
 * Data is in split format: ri/ii hold the real/imaginary parts of the
 * input, ro/io receive the real/imaginary parts of the output.  For each
 * of the v transforms the code evaluates
 *     X[k] = sum_{j=0..7} x[j] * exp(-2*pi*i*j*k/8),   k = 0..7,
 * loading x[j] from ri/ii[WS(is, j)] and storing X[k] to ro/io[WS(os, k)].
 *
 * is, os   : input/output strides; only used through WS(), which is
 *            defined outside this file (presumably the offset of the
 *            j-th / k-th element -- confirm in the codelet headers).
 * v        : number of transforms; the loop body never runs when v <= 0.
 * ivs, ovs : amounts added to the input/output pointers after each
 *            transform.
 *
 * Within one iteration all 16 loads are issued before the first store.
 */
Chris@82 37 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(1/2), the only constant of this codelet */
Chris@82 40 {
Chris@82 41 INT i;
Chris@82 42 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* one transform per pass; MAKE_VOLATILE_STRIDE is defined outside this file */
Chris@82 43 E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu; /* intermediates shared between the load stage and the output stages */
Chris@82 44 E TG;
Chris@82 45 {
/* Load stage: read all eight inputs and form sums/differences of the pairs (j, j+4). */
Chris@82 46 E T1, T2, Tj, Tk;
Chris@82 47 T1 = ri[0];
Chris@82 48 T2 = ri[WS(is, 4)];
Chris@82 49 T3 = T1 + T2; /* re(x0) + re(x4) */
Chris@82 50 Tn = T1 - T2; /* re(x0) - re(x4) */
Chris@82 51 {
Chris@82 52 E Tg, Th, T4, T5;
Chris@82 53 Tg = ii[0];
Chris@82 54 Th = ii[WS(is, 4)];
Chris@82 55 Ti = Tg + Th; /* im(x0) + im(x4) */
Chris@82 56 TC = Tg - Th; /* im(x0) - im(x4) */
Chris@82 57 T4 = ri[WS(is, 2)];
Chris@82 58 T5 = ri[WS(is, 6)];
Chris@82 59 T6 = T4 + T5; /* re(x2) + re(x6) */
Chris@82 60 TB = T4 - T5; /* re(x2) - re(x6) */
Chris@82 61 }
Chris@82 62 Tj = ii[WS(is, 2)];
Chris@82 63 Tk = ii[WS(is, 6)];
Chris@82 64 Tl = Tj + Tk; /* im(x2) + im(x6) */
Chris@82 65 To = Tj - Tk; /* im(x2) - im(x6) */
Chris@82 66 {
/* Inputs 7 and 3. */
Chris@82 67 E Tb, Tc, Tv, Tw, Tx, Ty;
Chris@82 68 Tb = ri[WS(is, 7)];
Chris@82 69 Tc = ri[WS(is, 3)];
Chris@82 70 Tv = Tb - Tc;
Chris@82 71 Tw = ii[WS(is, 7)];
Chris@82 72 Tx = ii[WS(is, 3)];
Chris@82 73 Ty = Tw - Tx;
Chris@82 74 Td = Tb + Tc; /* re(x7) + re(x3) */
Chris@82 75 TN = Tw + Tx; /* im(x7) + im(x3) */
Chris@82 76 Tz = Tv - Ty; /* combinations later scaled by sqrt(1/2) */
Chris@82 77 TH = Tv + Ty;
Chris@82 78 }
Chris@82 79 {
/* Inputs 1 and 5. */
Chris@82 80 E T8, T9, Tq, Tr, Ts, Tt;
Chris@82 81 T8 = ri[WS(is, 1)];
Chris@82 82 T9 = ri[WS(is, 5)];
Chris@82 83 Tq = T8 - T9;
Chris@82 84 Tr = ii[WS(is, 1)];
Chris@82 85 Ts = ii[WS(is, 5)];
Chris@82 86 Tt = Tr - Ts;
Chris@82 87 Ta = T8 + T9; /* re(x1) + re(x5) */
Chris@82 88 TM = Tr + Ts; /* im(x1) + im(x5) */
Chris@82 89 Tu = Tq + Tt; /* combinations later scaled by sqrt(1/2) */
Chris@82 90 TG = Tt - Tq;
Chris@82 91 }
Chris@82 92 }
Chris@82 93 {
/* Outputs 0 and 4: sum and difference of the even-index and odd-index input sums. */
Chris@82 94 E T7, Te, TP, TQ;
Chris@82 95 T7 = T3 + T6;
Chris@82 96 Te = Ta + Td;
Chris@82 97 ro[WS(os, 4)] = T7 - Te;
Chris@82 98 ro[0] = T7 + Te;
Chris@82 99 TP = Ti + Tl;
Chris@82 100 TQ = TM + TN;
Chris@82 101 io[WS(os, 4)] = TP - TQ;
Chris@82 102 io[0] = TP + TQ;
Chris@82 103 }
Chris@82 104 {
/* Outputs 2 and 6: only additions and subtractions are needed. */
Chris@82 105 E Tf, Tm, TL, TO;
Chris@82 106 Tf = Td - Ta;
Chris@82 107 Tm = Ti - Tl;
Chris@82 108 io[WS(os, 2)] = Tf + Tm;
Chris@82 109 io[WS(os, 6)] = Tm - Tf;
Chris@82 110 TL = T3 - T6;
Chris@82 111 TO = TM - TN;
Chris@82 112 ro[WS(os, 6)] = TL - TO;
Chris@82 113 ro[WS(os, 2)] = TL + TO;
Chris@82 114 }
Chris@82 115 {
/* Outputs 1 and 5.  FMA/FNMS come from the included headers; presumably
   FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b, which is what the
   non-FMA variant of this codelet computes at the same place -- confirm. */
Chris@82 116 E Tp, TA, TJ, TK;
Chris@82 117 Tp = Tn + To;
Chris@82 118 TA = Tu + Tz;
Chris@82 119 ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
Chris@82 120 ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
Chris@82 121 TJ = TC - TB;
Chris@82 122 TK = TG + TH;
Chris@82 123 io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
Chris@82 124 io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
Chris@82 125 }
Chris@82 126 {
/* Outputs 3 and 7, built from the same intermediates with opposite signs. */
Chris@82 127 E TD, TE, TF, TI;
Chris@82 128 TD = TB + TC;
Chris@82 129 TE = Tz - Tu;
Chris@82 130 io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
Chris@82 131 io[WS(os, 3)] = FMA(KP707106781, TE, TD);
Chris@82 132 TF = Tn - To;
Chris@82 133 TI = TG - TH;
Chris@82 134 ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
Chris@82 135 ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
Chris@82 136 }
Chris@82 137 }
Chris@82 138 }
Chris@82 139 }
Chris@82 140
/* Descriptor passed to the planner together with n1_8: 8 (presumably the
   transform size), the codelet name "n1_8", and {44, 0, 8, 0}, whose first
   three entries match the 44 additions, 0 multiplications and 8 fused
   multiply/adds quoted in the operation-count comment of this variant.
   kdft_desc, GENUS and the meaning of the trailing zeros are defined
   outside this file. */
Chris@82 141 static const kdft_desc desc = { 8, "n1_8", {44, 0, 8, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 142
/* Registration entry point for the FMA variant: hands the n1_8 kernel and
   its descriptor to planner p through X(kdft_register).  X() is a macro
   defined outside this file (presumably a symbol-name prefix -- confirm). */
Chris@82 143 void X(codelet_n1_8) (planner *p) {
Chris@82 144 X(kdft_register) (p, n1_8, &desc);
Chris@82 145 }
Chris@82 146
Chris@82 147 #else
Chris@82 148
Chris@82 149 /* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
Chris@82 150
Chris@82 151 /*
Chris@82 152 * This function contains 52 FP additions, 4 FP multiplications,
Chris@82 153 * (or, 52 additions, 4 multiplications, 0 fused multiply/add),
Chris@82 154 * 28 stack variables, 1 constants, and 32 memory accesses
Chris@82 155 */
Chris@82 156 #include "dft/scalar/n.h"
Chris@82 157
/*
 * n1_8 (non-FMA variant): size-8 complex DFT codelet without twiddle
 * factors, as produced by the gen_notw command line recorded above.
 *
 * Data is in split format: ri/ii hold the real/imaginary parts of the
 * input, ro/io receive the real/imaginary parts of the output.  For each
 * of the v transforms the code evaluates
 *     X[k] = sum_{j=0..7} x[j] * exp(-2*pi*i*j*k/8),   k = 0..7,
 * loading x[j] from ri/ii[WS(is, j)] and storing X[k] to ro/io[WS(os, k)].
 *
 * is, os   : input/output strides; only used through WS(), which is
 *            defined outside this file (presumably the offset of the
 *            j-th / k-th element -- confirm in the codelet headers).
 * v        : number of transforms; the loop body never runs when v <= 0.
 * ivs, ovs : amounts added to the input/output pointers after each
 *            transform.
 *
 * Within one iteration all 16 loads are issued before the first store.
 */
Chris@82 158 static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 159 {
Chris@82 160 DK(KP707106781, +0.707106781186547524400844362104849039284835938); /* sqrt(1/2), the only constant of this codelet */
Chris@82 161 {
Chris@82 162 INT i;
Chris@82 163 for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) { /* one transform per pass; MAKE_VOLATILE_STRIDE is defined outside this file */
Chris@82 164 E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu; /* intermediates shared between the load stage and the output stages */
Chris@82 165 E TG;
Chris@82 166 {
/* Load stage: read all eight inputs and form sums/differences of the pairs (j, j+4). */
Chris@82 167 E T1, T2, Tj, Tk;
Chris@82 168 T1 = ri[0];
Chris@82 169 T2 = ri[WS(is, 4)];
Chris@82 170 T3 = T1 + T2; /* re(x0) + re(x4) */
Chris@82 171 Tn = T1 - T2; /* re(x0) - re(x4) */
Chris@82 172 {
Chris@82 173 E Tg, Th, T4, T5;
Chris@82 174 Tg = ii[0];
Chris@82 175 Th = ii[WS(is, 4)];
Chris@82 176 Ti = Tg + Th; /* im(x0) + im(x4) */
Chris@82 177 TC = Tg - Th; /* im(x0) - im(x4) */
Chris@82 178 T4 = ri[WS(is, 2)];
Chris@82 179 T5 = ri[WS(is, 6)];
Chris@82 180 T6 = T4 + T5; /* re(x2) + re(x6) */
Chris@82 181 TB = T4 - T5; /* re(x2) - re(x6) */
Chris@82 182 }
Chris@82 183 Tj = ii[WS(is, 2)];
Chris@82 184 Tk = ii[WS(is, 6)];
Chris@82 185 Tl = Tj + Tk; /* im(x2) + im(x6) */
Chris@82 186 To = Tj - Tk; /* im(x2) - im(x6) */
Chris@82 187 {
/* Inputs 7 and 3. */
Chris@82 188 E Tb, Tc, Tv, Tw, Tx, Ty;
Chris@82 189 Tb = ri[WS(is, 7)];
Chris@82 190 Tc = ri[WS(is, 3)];
Chris@82 191 Tv = Tb - Tc;
Chris@82 192 Tw = ii[WS(is, 7)];
Chris@82 193 Tx = ii[WS(is, 3)];
Chris@82 194 Ty = Tw - Tx;
Chris@82 195 Td = Tb + Tc; /* re(x7) + re(x3) */
Chris@82 196 TN = Tw + Tx; /* im(x7) + im(x3) */
Chris@82 197 Tz = Tv - Ty; /* combinations later scaled by sqrt(1/2) */
Chris@82 198 TH = Tv + Ty;
Chris@82 199 }
Chris@82 200 {
/* Inputs 1 and 5. */
Chris@82 201 E T8, T9, Tq, Tr, Ts, Tt;
Chris@82 202 T8 = ri[WS(is, 1)];
Chris@82 203 T9 = ri[WS(is, 5)];
Chris@82 204 Tq = T8 - T9;
Chris@82 205 Tr = ii[WS(is, 1)];
Chris@82 206 Ts = ii[WS(is, 5)];
Chris@82 207 Tt = Tr - Ts;
Chris@82 208 Ta = T8 + T9; /* re(x1) + re(x5) */
Chris@82 209 TM = Tr + Ts; /* im(x1) + im(x5) */
Chris@82 210 Tu = Tq + Tt; /* combinations later scaled by sqrt(1/2) */
Chris@82 211 TG = Tt - Tq;
Chris@82 212 }
Chris@82 213 }
Chris@82 214 {
/* Outputs 0 and 4: sum and difference of the even-index and odd-index input sums. */
Chris@82 215 E T7, Te, TP, TQ;
Chris@82 216 T7 = T3 + T6;
Chris@82 217 Te = Ta + Td;
Chris@82 218 ro[WS(os, 4)] = T7 - Te;
Chris@82 219 ro[0] = T7 + Te;
Chris@82 220 TP = Ti + Tl;
Chris@82 221 TQ = TM + TN;
Chris@82 222 io[WS(os, 4)] = TP - TQ;
Chris@82 223 io[0] = TP + TQ;
Chris@82 224 }
Chris@82 225 {
/* Outputs 2 and 6: only additions and subtractions are needed. */
Chris@82 226 E Tf, Tm, TL, TO;
Chris@82 227 Tf = Td - Ta;
Chris@82 228 Tm = Ti - Tl;
Chris@82 229 io[WS(os, 2)] = Tf + Tm;
Chris@82 230 io[WS(os, 6)] = Tm - Tf;
Chris@82 231 TL = T3 - T6;
Chris@82 232 TO = TM - TN;
Chris@82 233 ro[WS(os, 6)] = TL - TO;
Chris@82 234 ro[WS(os, 2)] = TL + TO;
Chris@82 235 }
Chris@82 236 {
/* Outputs 1 and 5: each sqrt(1/2) product is formed once and then both
   added to and subtracted from the unscaled term. */
Chris@82 237 E Tp, TA, TJ, TK;
Chris@82 238 Tp = Tn + To;
Chris@82 239 TA = KP707106781 * (Tu + Tz);
Chris@82 240 ro[WS(os, 5)] = Tp - TA;
Chris@82 241 ro[WS(os, 1)] = Tp + TA;
Chris@82 242 TJ = TC - TB;
Chris@82 243 TK = KP707106781 * (TG + TH);
Chris@82 244 io[WS(os, 5)] = TJ - TK;
Chris@82 245 io[WS(os, 1)] = TJ + TK;
Chris@82 246 }
Chris@82 247 {
/* Outputs 3 and 7, built from the same intermediates with opposite signs. */
Chris@82 248 E TD, TE, TF, TI;
Chris@82 249 TD = TB + TC;
Chris@82 250 TE = KP707106781 * (Tz - Tu);
Chris@82 251 io[WS(os, 7)] = TD - TE;
Chris@82 252 io[WS(os, 3)] = TD + TE;
Chris@82 253 TF = Tn - To;
Chris@82 254 TI = KP707106781 * (TG - TH);
Chris@82 255 ro[WS(os, 7)] = TF - TI;
Chris@82 256 ro[WS(os, 3)] = TF + TI;
Chris@82 257 }
Chris@82 258 }
Chris@82 259 }
Chris@82 260 }
Chris@82 261
/* Descriptor passed to the planner together with n1_8: 8 (presumably the
   transform size), the codelet name "n1_8", and {52, 4, 0, 0}, whose first
   three entries match the 52 additions, 4 multiplications and 0 fused
   multiply/adds quoted in the operation-count comment of this variant.
   kdft_desc, GENUS and the meaning of the trailing zeros are defined
   outside this file. */
Chris@82 262 static const kdft_desc desc = { 8, "n1_8", {52, 4, 0, 0}, &GENUS, 0, 0, 0, 0 };
Chris@82 263
/* Registration entry point for the non-FMA variant: hands the n1_8 kernel
   and its descriptor to planner p through X(kdft_register).  X() is a macro
   defined outside this file (presumably a symbol-name prefix -- confirm). */
Chris@82 264 void X(codelet_n1_8) (planner *p) {
Chris@82 265 X(kdft_register) (p, n1_8, &desc);
Chris@82 266 }
Chris@82 267
Chris@82 268 #endif