annotate src/fftw-3.3.5/dft/simd/common/n1fv_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:38:45 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_notw_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include n1f.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 72 FP additions, 34 FP multiplications,
Chris@42 32 * (or, 38 additions, 0 multiplications, 34 fused multiply/add),
Chris@42 33 * 54 stack variables, 3 constants, and 32 memory accesses
Chris@42 34 */
Chris@42 35 #include "n1f.h"
Chris@42 36
/*
 * n1fv_16 (HAVE_FMA variant): genfft-generated SIMD codelet for n = 16
 * (see the generator command line above).
 *
 * Each loop pass loads the 16 vectors xi[WS(is, 0..15)], combines them with
 * vector add/sub and fused multiply-add style operations, and stores 16
 * vectors to xo[WS(os, 0..15)].
 *
 * Parameters, as used by the visible body:
 *   ri, ro   - base pointers; copied into xi (input) and xo (output).
 *   ii, io   - not referenced in the visible body.
 *   is, os   - strides passed to WS() to index the 16 inputs / outputs.
 *   v        - loop count; reduced by VL per pass, loop runs while i > 0.
 *   ivs, ovs - xi / xo advance by VL * ivs / VL * ovs per pass; also passed
 *              as the second argument of LD() / ST().
 *
 * NOTE(review): V, LD, ST, VADD, VSUB, VFMA, VFNMS, VFMAI, VFNMSI, DVK, LDK,
 * WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file
 * (presumably via n1f.h / codelet-dft.h -- confirm); their exact semantics
 * are not visible here.
 */
Chris@42 37 static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Constants; by value these are cos(pi/8), tan(pi/8) and sqrt(1/2). */
Chris@42 39 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 {
Chris@42 43 INT i;
Chris@42 44 const R *xi;
Chris@42 45 R *xo;
Chris@42 46 xi = ri;
Chris@42 47 xo = ro;
/* Step through the batch VL entries at a time, advancing both pointers. */
Chris@42 48 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
/* Intermediates that survive past the load/first-combine scope below. */
Chris@42 49 V T7, Tu, TF, TB, T13, TL, TO, TX, TC, Te, TP, Th, TQ, Tk, TW;
Chris@42 50 V T16;
Chris@42 51 {
Chris@42 52 V TH, TU, Tz, Tf, TK, TV, TA, TM, Ta, TN, Td, Tg, Ti, Tj;
Chris@42 53 {
Chris@42 54 V T1, T2, T4, T5, To, Tp, Tr, Ts;
/* Even-indexed inputs (0, 8, 4, 12, 14, 6, 2, 10); LD's third argument is &xi[0]. */
Chris@42 55 T1 = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 56 T2 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 57 T4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 58 T5 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 59 To = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 60 Tp = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 61 Tr = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 62 Ts = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 63 {
Chris@42 64 V T8, TJ, Tq, TI, Tt, T9, Tb, Tc, T3, T6;
/* Odd-indexed loads (third LD argument &xi[WS(is, 1)]) are interleaved with sums/differences of the even inputs. */
Chris@42 65 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 66 TH = VSUB(T1, T2);
Chris@42 67 T3 = VADD(T1, T2);
Chris@42 68 TU = VSUB(T4, T5);
Chris@42 69 T6 = VADD(T4, T5);
Chris@42 70 TJ = VSUB(To, Tp);
Chris@42 71 Tq = VADD(To, Tp);
Chris@42 72 TI = VSUB(Tr, Ts);
Chris@42 73 Tt = VADD(Tr, Ts);
Chris@42 74 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 75 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 76 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 77 T7 = VSUB(T3, T6);
Chris@42 78 Tz = VADD(T3, T6);
Chris@42 79 Tf = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 80 TK = VADD(TI, TJ);
Chris@42 81 TV = VSUB(TJ, TI);
Chris@42 82 TA = VADD(Tt, Tq);
Chris@42 83 Tu = VSUB(Tq, Tt);
Chris@42 84 TM = VSUB(T8, T9);
Chris@42 85 Ta = VADD(T8, T9);
Chris@42 86 TN = VSUB(Tb, Tc);
Chris@42 87 Td = VADD(Tb, Tc);
Chris@42 88 Tg = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 89 Ti = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 90 Tj = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 91 }
Chris@42 92 }
/* All 16 inputs are loaded; combine them using the sqrt(1/2) and tan(pi/8) constants. */
Chris@42 93 TF = VSUB(Tz, TA);
Chris@42 94 TB = VADD(Tz, TA);
Chris@42 95 T13 = VFNMS(LDK(KP707106781), TK, TH);
Chris@42 96 TL = VFMA(LDK(KP707106781), TK, TH);
Chris@42 97 TO = VFNMS(LDK(KP414213562), TN, TM);
Chris@42 98 TX = VFMA(LDK(KP414213562), TM, TN);
Chris@42 99 TC = VADD(Ta, Td);
Chris@42 100 Te = VSUB(Ta, Td);
Chris@42 101 TP = VSUB(Tf, Tg);
Chris@42 102 Th = VADD(Tf, Tg);
Chris@42 103 TQ = VSUB(Tj, Ti);
Chris@42 104 Tk = VADD(Ti, Tj);
Chris@42 105 TW = VFNMS(LDK(KP707106781), TV, TU);
Chris@42 106 T16 = VFMA(LDK(KP707106781), TV, TU);
Chris@42 107 }
Chris@42 108 {
Chris@42 109 V TY, TR, Tl, TD;
Chris@42 110 TY = VFMA(LDK(KP414213562), TP, TQ);
Chris@42 111 TR = VFNMS(LDK(KP414213562), TQ, TP);
Chris@42 112 Tl = VSUB(Th, Tk);
Chris@42 113 TD = VADD(Th, Tk);
Chris@42 114 {
Chris@42 115 V TS, T17, TZ, T14;
Chris@42 116 TS = VADD(TO, TR);
Chris@42 117 T17 = VSUB(TR, TO);
Chris@42 118 TZ = VSUB(TX, TY);
Chris@42 119 T14 = VADD(TX, TY);
Chris@42 120 {
Chris@42 121 V TE, TG, Tm, Tv;
Chris@42 122 TE = VADD(TC, TD);
Chris@42 123 TG = VSUB(TD, TC);
Chris@42 124 Tm = VADD(Te, Tl);
Chris@42 125 Tv = VSUB(Tl, Te);
Chris@42 126 {
Chris@42 127 V T18, T1a, TT, T11;
/* Final combinations scaled by the cos(pi/8) constant; these feed the odd-indexed outputs stored below. */
Chris@42 128 T18 = VFNMS(LDK(KP923879532), T17, T16);
Chris@42 129 T1a = VFMA(LDK(KP923879532), T17, T16);
Chris@42 130 TT = VFNMS(LDK(KP923879532), TS, TL);
Chris@42 131 T11 = VFMA(LDK(KP923879532), TS, TL);
Chris@42 132 {
Chris@42 133 V T15, T19, T10, T12;
Chris@42 134 T15 = VFNMS(LDK(KP923879532), T14, T13);
Chris@42 135 T19 = VFMA(LDK(KP923879532), T14, T13);
Chris@42 136 T10 = VFNMS(LDK(KP923879532), TZ, TW);
Chris@42 137 T12 = VFMA(LDK(KP923879532), TZ, TW);
/* Outputs 4, 12, 0, 8. */
Chris@42 138 ST(&(xo[WS(os, 4)]), VFMAI(TG, TF), ovs, &(xo[0]));
Chris@42 139 ST(&(xo[WS(os, 12)]), VFNMSI(TG, TF), ovs, &(xo[0]));
Chris@42 140 ST(&(xo[0]), VADD(TB, TE), ovs, &(xo[0]));
Chris@42 141 ST(&(xo[WS(os, 8)]), VSUB(TB, TE), ovs, &(xo[0]));
Chris@42 142 {
Chris@42 143 V Tw, Ty, Tn, Tx;
Chris@42 144 Tw = VFNMS(LDK(KP707106781), Tv, Tu);
Chris@42 145 Ty = VFMA(LDK(KP707106781), Tv, Tu);
Chris@42 146 Tn = VFNMS(LDK(KP707106781), Tm, T7);
Chris@42 147 Tx = VFMA(LDK(KP707106781), Tm, T7);
/* Odd outputs 3, 13, 11, 5, 1, 15, 7, 9, then even outputs 14, 2, 10, 6. */
Chris@42 148 ST(&(xo[WS(os, 3)]), VFMAI(T1a, T19), ovs, &(xo[WS(os, 1)]));
Chris@42 149 ST(&(xo[WS(os, 13)]), VFNMSI(T1a, T19), ovs, &(xo[WS(os, 1)]));
Chris@42 150 ST(&(xo[WS(os, 11)]), VFMAI(T18, T15), ovs, &(xo[WS(os, 1)]));
Chris@42 151 ST(&(xo[WS(os, 5)]), VFNMSI(T18, T15), ovs, &(xo[WS(os, 1)]));
Chris@42 152 ST(&(xo[WS(os, 1)]), VFNMSI(T12, T11), ovs, &(xo[WS(os, 1)]));
Chris@42 153 ST(&(xo[WS(os, 15)]), VFMAI(T12, T11), ovs, &(xo[WS(os, 1)]));
Chris@42 154 ST(&(xo[WS(os, 7)]), VFMAI(T10, TT), ovs, &(xo[WS(os, 1)]));
Chris@42 155 ST(&(xo[WS(os, 9)]), VFNMSI(T10, TT), ovs, &(xo[WS(os, 1)]));
Chris@42 156 ST(&(xo[WS(os, 14)]), VFNMSI(Ty, Tx), ovs, &(xo[0]));
Chris@42 157 ST(&(xo[WS(os, 2)]), VFMAI(Ty, Tx), ovs, &(xo[0]));
Chris@42 158 ST(&(xo[WS(os, 10)]), VFMAI(Tw, Tn), ovs, &(xo[0]));
Chris@42 159 ST(&(xo[WS(os, 6)]), VFNMSI(Tw, Tn), ovs, &(xo[0]));
Chris@42 160 }
Chris@42 161 }
Chris@42 162 }
Chris@42 163 }
Chris@42 164 }
Chris@42 165 }
Chris@42 166 }
Chris@42 167 }
/* NOTE(review): VLEAVE() is invoked once after the loop; its purpose is defined by the SIMD headers -- confirm there. */
Chris@42 168 VLEAVE();
Chris@42 169 }
Chris@42 170
/*
 * Descriptor passed to X(kdft_register) for the HAVE_FMA variant.
 * The inner initializer's first three values {38, 0, 34} match the
 * additions / multiplications / fused multiply-add counts quoted in the
 * generator comment for this variant.
 * NOTE(review): the leading 16 presumably is the transform size (matches
 * "-n 16"); the meaning of the remaining fields depends on the kdft_desc
 * declaration, which is not visible here -- confirm.
 */
Chris@42 171 static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {38, 0, 34, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 172
/*
 * Registration entry point (HAVE_FMA variant): hands the n1fv_16 function
 * and its descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@42 173 void XSIMD(codelet_n1fv_16) (planner *p) {
Chris@42 174 X(kdft_register) (p, n1fv_16, &desc);
Chris@42 175 }
Chris@42 176
Chris@42 177 #else /* HAVE_FMA */
Chris@42 178
Chris@42 179 /* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name n1fv_16 -include n1f.h */
Chris@42 180
Chris@42 181 /*
Chris@42 182 * This function contains 72 FP additions, 12 FP multiplications,
Chris@42 183 * (or, 68 additions, 8 multiplications, 4 fused multiply/add),
Chris@42 184 * 30 stack variables, 3 constants, and 32 memory accesses
Chris@42 185 */
Chris@42 186 #include "n1f.h"
Chris@42 187
/*
 * n1fv_16 (non-FMA variant, compiled when HAVE_FMA is not defined):
 * genfft-generated SIMD codelet for n = 16 (see the generator command line
 * above).
 *
 * Each loop pass loads the 16 vectors xi[WS(is, 0..15)] in four groups of
 * four, reduces each group to a few intermediates, then stores 16 vectors
 * to xo[WS(os, 0..15)] in four groups of four.
 *
 * Parameters, as used by the visible body:
 *   ri, ro   - base pointers; copied into xi (input) and xo (output).
 *   ii, io   - not referenced in the visible body.
 *   is, os   - strides passed to WS() to index the 16 inputs / outputs.
 *   v        - loop count; reduced by VL per pass, loop runs while i > 0.
 *   ivs, ovs - xi / xo advance by VL * ivs / VL * ovs per pass; also passed
 *              as the second argument of LD() / ST().
 *
 * NOTE(review): V, LD, ST, VADD, VSUB, VMUL, VFMA, VFNMS, VBYI, DVK, LDK,
 * WS, MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file
 * (presumably via n1f.h / codelet-dft.h -- confirm); their exact semantics
 * are not visible here.
 */
Chris@42 188 static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@42 189 {
/* Constants; by value these are cos(pi/8), sin(pi/8) and sqrt(1/2). */
Chris@42 190 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 191 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 192 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 193 {
Chris@42 194 INT i;
Chris@42 195 const R *xi;
Chris@42 196 R *xo;
Chris@42 197 xi = ri;
Chris@42 198 xo = ro;
/* Step through the batch VL entries at a time, advancing both pointers. */
Chris@42 199 for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
/* Intermediates shared between the four load groups and the four store groups. */
Chris@42 200 V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx;
Chris@42 201 V TQ;
/* Group 1: inputs 4, 12, 0, 8. */
Chris@42 202 {
Chris@42 203 V Tn, To, TM, Ts, Tt, TL;
Chris@42 204 Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
Chris@42 205 To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
Chris@42 206 TM = VADD(Tn, To);
Chris@42 207 Ts = LD(&(xi[0]), ivs, &(xi[0]));
Chris@42 208 Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
Chris@42 209 TL = VADD(Ts, Tt);
Chris@42 210 Tp = VSUB(Tn, To);
Chris@42 211 T13 = VADD(TL, TM);
Chris@42 212 Tu = VSUB(Ts, Tt);
Chris@42 213 TN = VSUB(TL, TM);
Chris@42 214 }
/* Group 2: inputs 14, 6, 2, 10; differences are scaled by sqrt(1/2). */
Chris@42 215 {
Chris@42 216 V Ti, TW, Tl, TX;
Chris@42 217 {
Chris@42 218 V Tg, Th, Tj, Tk;
Chris@42 219 Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
Chris@42 220 Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
Chris@42 221 Ti = VSUB(Tg, Th);
Chris@42 222 TW = VADD(Tg, Th);
Chris@42 223 Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
Chris@42 224 Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
Chris@42 225 Tl = VSUB(Tj, Tk);
Chris@42 226 TX = VADD(Tj, Tk);
Chris@42 227 }
Chris@42 228 Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
Chris@42 229 T14 = VADD(TX, TW);
Chris@42 230 Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
Chris@42 231 TY = VSUB(TW, TX);
Chris@42 232 }
/* Group 3: inputs 15, 7, 3, 11; differences are mixed using the cos(pi/8) / sin(pi/8) constants. */
Chris@42 233 {
Chris@42 234 V T3, TR, T6, TS;
Chris@42 235 {
Chris@42 236 V T1, T2, T4, T5;
Chris@42 237 T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
Chris@42 238 T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
Chris@42 239 T3 = VSUB(T1, T2);
Chris@42 240 TR = VADD(T1, T2);
Chris@42 241 T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
Chris@42 242 T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
Chris@42 243 T6 = VSUB(T4, T5);
Chris@42 244 TS = VADD(T4, T5);
Chris@42 245 }
Chris@42 246 T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
Chris@42 247 T17 = VADD(TR, TS);
Chris@42 248 Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
Chris@42 249 TT = VSUB(TR, TS);
Chris@42 250 }
/* Group 4: inputs 1, 9, 5, 13; same pattern as group 3 with the constants' roles swapped. */
Chris@42 251 {
Chris@42 252 V Ta, TO, Td, TP;
Chris@42 253 {
Chris@42 254 V T8, T9, Tb, Tc;
Chris@42 255 T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
Chris@42 256 T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
Chris@42 257 Ta = VSUB(T8, T9);
Chris@42 258 TO = VADD(T8, T9);
Chris@42 259 Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
Chris@42 260 Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
Chris@42 261 Td = VSUB(Tb, Tc);
Chris@42 262 TP = VADD(Tb, Tc);
Chris@42 263 }
Chris@42 264 Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
Chris@42 265 T16 = VADD(TO, TP);
Chris@42 266 Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
Chris@42 267 TQ = VSUB(TO, TP);
Chris@42 268 }
/* Outputs 8, 0, 12, 4: built from the pairwise sums T13, T14, T16, T17. */
Chris@42 269 {
Chris@42 270 V T15, T18, T19, T1a;
Chris@42 271 T15 = VADD(T13, T14);
Chris@42 272 T18 = VADD(T16, T17);
Chris@42 273 ST(&(xo[WS(os, 8)]), VSUB(T15, T18), ovs, &(xo[0]));
Chris@42 274 ST(&(xo[0]), VADD(T15, T18), ovs, &(xo[0]));
Chris@42 275 T19 = VSUB(T13, T14);
Chris@42 276 T1a = VBYI(VSUB(T17, T16));
Chris@42 277 ST(&(xo[WS(os, 12)]), VSUB(T19, T1a), ovs, &(xo[0]));
Chris@42 278 ST(&(xo[WS(os, 4)]), VADD(T19, T1a), ovs, &(xo[0]));
Chris@42 279 }
/* Outputs 14, 6, 2, 10. */
Chris@42 280 {
Chris@42 281 V TV, T11, T10, T12, TU, TZ;
Chris@42 282 TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
Chris@42 283 TV = VADD(TN, TU);
Chris@42 284 T11 = VSUB(TN, TU);
Chris@42 285 TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
Chris@42 286 T10 = VBYI(VADD(TY, TZ));
Chris@42 287 T12 = VBYI(VSUB(TZ, TY));
Chris@42 288 ST(&(xo[WS(os, 14)]), VSUB(TV, T10), ovs, &(xo[0]));
Chris@42 289 ST(&(xo[WS(os, 6)]), VADD(T11, T12), ovs, &(xo[0]));
Chris@42 290 ST(&(xo[WS(os, 2)]), VADD(TV, T10), ovs, &(xo[0]));
Chris@42 291 ST(&(xo[WS(os, 10)]), VSUB(T11, T12), ovs, &(xo[0]));
Chris@42 292 }
/* Outputs 7, 15, 9, 1. */
Chris@42 293 {
Chris@42 294 V Tr, TB, TA, TC;
Chris@42 295 {
Chris@42 296 V Tf, Tq, Tw, Tz;
Chris@42 297 Tf = VSUB(T7, Te);
Chris@42 298 Tq = VSUB(Tm, Tp);
Chris@42 299 Tr = VBYI(VSUB(Tf, Tq));
Chris@42 300 TB = VBYI(VADD(Tq, Tf));
Chris@42 301 Tw = VADD(Tu, Tv);
Chris@42 302 Tz = VADD(Tx, Ty);
Chris@42 303 TA = VSUB(Tw, Tz);
Chris@42 304 TC = VADD(Tw, Tz);
Chris@42 305 }
Chris@42 306 ST(&(xo[WS(os, 7)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
Chris@42 307 ST(&(xo[WS(os, 15)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
Chris@42 308 ST(&(xo[WS(os, 9)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
Chris@42 309 ST(&(xo[WS(os, 1)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
Chris@42 310 }
/* Outputs 13, 5, 3, 11. */
Chris@42 311 {
Chris@42 312 V TF, TJ, TI, TK;
Chris@42 313 {
Chris@42 314 V TD, TE, TG, TH;
Chris@42 315 TD = VSUB(Tu, Tv);
Chris@42 316 TE = VADD(Te, T7);
Chris@42 317 TF = VADD(TD, TE);
Chris@42 318 TJ = VSUB(TD, TE);
Chris@42 319 TG = VADD(Tp, Tm);
Chris@42 320 TH = VSUB(Ty, Tx);
Chris@42 321 TI = VBYI(VADD(TG, TH));
Chris@42 322 TK = VBYI(VSUB(TH, TG));
Chris@42 323 }
Chris@42 324 ST(&(xo[WS(os, 13)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
Chris@42 325 ST(&(xo[WS(os, 5)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
Chris@42 326 ST(&(xo[WS(os, 3)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
Chris@42 327 ST(&(xo[WS(os, 11)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
Chris@42 328 }
Chris@42 329 }
Chris@42 330 }
/* NOTE(review): VLEAVE() is invoked once after the loop; its purpose is defined by the SIMD headers -- confirm there. */
Chris@42 331 VLEAVE();
Chris@42 332 }
Chris@42 333
/*
 * Descriptor passed to X(kdft_register) for the non-FMA variant.
 * The inner initializer's first three values {68, 8, 4} match the
 * additions / multiplications / fused multiply-add counts quoted in the
 * generator comment for this variant.
 * NOTE(review): the leading 16 presumably is the transform size (matches
 * "-n 16"); the meaning of the remaining fields depends on the kdft_desc
 * declaration, which is not visible here -- confirm.
 */
Chris@42 334 static const kdft_desc desc = { 16, XSIMD_STRING("n1fv_16"), {68, 8, 4, 0}, &GENUS, 0, 0, 0, 0 };
Chris@42 335
/*
 * Registration entry point (non-FMA variant): hands the n1fv_16 function
 * and its descriptor `desc` to X(kdft_register) for planner `p`.
 */
Chris@42 336 void XSIMD(codelet_n1fv_16) (planner *p) {
Chris@42 337 X(kdft_register) (p, n1fv_16, &desc);
Chris@42 338 }
Chris@42 339
Chris@42 340 #endif /* HAVE_FMA */