annotate src/fftw-3.3.5/rdft/scalar/r2cb/hc2cbdft2_4.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:52:05 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include hc2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 30 FP additions, 12 FP multiplications,
Chris@42 32 * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
Chris@42 33 * 35 stack variables, 0 constants, and 16 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cb.h"
Chris@42 36
/*
 * hc2cbdft2_4 (HAVE_FMA variant): machine-generated codelet; per the
 * generator command line in this file it was produced with -n 4, -sign 1
 * and -dif.
 *
 * For every m in [mb, me) the loop body loads two values from each of
 * Rp, Ip, Rm and Im (at offsets 0 and WS(rs, 1)), combines them with the
 * six twiddle values W[0..5], and stores eight results back to the same
 * eight locations, i.e. the computation is in place.
 *
 * Parameters, as used by the code below:
 *   Rp, Ip  - input/output pointers, advanced by +ms after each iteration
 *   Rm, Im  - input/output pointers, advanced by -ms after each iteration
 *   W       - read-only twiddle table; iteration starts at
 *             W + (mb - 1) * 6 and advances by 6 values per iteration
 *   rs      - stride handed to WS() to address the second element
 *   mb, me  - half-open range of the loop counter m
 *   ms      - per-iteration step applied to the four data pointers
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; presumably a * b + c and c - a * b respectively -- confirm
 * against the codelet headers.
 */
Chris@42 37 static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 {
Chris@42 40 INT m;
/* One pass per m; the increment clause steps all pointers and W together. */
Chris@42 41 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 42 E Ty, TB, Tw, TE, TA, TF, Tz, TG, TC;
Chris@42 43 {
Chris@42 44 E T4, Tg, T3, Tm, Tc, T5, Th, Ti;
Chris@42 45 {
/* Load the eight inputs; the first sums/differences are interleaved with the loads. */
Chris@42 46 E T1, T2, Ta, Tb;
Chris@42 47 T1 = Rp[0];
Chris@42 48 T2 = Rm[WS(rs, 1)];
Chris@42 49 Ta = Ip[0];
Chris@42 50 Tb = Im[WS(rs, 1)];
Chris@42 51 T4 = Rp[WS(rs, 1)];
Chris@42 52 Tg = T1 - T2;
Chris@42 53 T3 = T1 + T2;
Chris@42 54 Tm = Ta - Tb;
Chris@42 55 Tc = Ta + Tb;
Chris@42 56 T5 = Rm[0];
Chris@42 57 Th = Ip[WS(rs, 1)];
Chris@42 58 Ti = Im[0];
Chris@42 59 }
Chris@42 60 {
Chris@42 61 E T8, Td, T7, Ts, To, Tv, Tk, Te, Tf;
Chris@42 62 T8 = W[0];
Chris@42 63 {
/* Second level of sums/differences; Te begins the W[0]/W[1] product. */
Chris@42 64 E T9, T6, Tn, Tj;
Chris@42 65 T9 = T4 - T5;
Chris@42 66 T6 = T4 + T5;
Chris@42 67 Tn = Th - Ti;
Chris@42 68 Tj = Th + Ti;
Chris@42 69 Ty = Tc - T9;
Chris@42 70 Td = T9 + Tc;
Chris@42 71 T7 = T3 + T6;
Chris@42 72 Ts = T3 - T6;
Chris@42 73 To = Tm + Tn;
Chris@42 74 Tv = Tm - Tn;
Chris@42 75 TB = Tg + Tj;
Chris@42 76 Tk = Tg - Tj;
Chris@42 77 Te = T8 * Td;
Chris@42 78 }
Chris@42 79 Tf = W[1];
Chris@42 80 {
Chris@42 81 E Tr, Tu, Tt, TD, Tx, Tp, Tl, Tq;
Chris@42 82 Tr = W[2];
Chris@42 83 Tp = T8 * Tk;
Chris@42 84 Tu = W[3];
Chris@42 85 Tl = FMA(Tf, Tk, Te); /* combines W[0], W[1] with Td, Tk */
Chris@42 86 Tt = Tr * Ts;
Chris@42 87 Tq = FNMS(Tf, Td, Tp); /* combines W[0], W[1] with Tk, Td */
Chris@42 88 TD = Tu * Ts;
/* Store the four outputs at offset 0. */
Chris@42 89 Rm[0] = T7 + Tl;
Chris@42 90 Rp[0] = T7 - Tl;
Chris@42 91 Im[0] = Tq - To;
Chris@42 92 Ip[0] = To + Tq;
Chris@42 93 Tx = W[4];
Chris@42 94 Tw = FNMS(Tu, Tv, Tt); /* combines W[2], W[3] with Ts, Tv */
Chris@42 95 TE = FMA(Tr, Tv, TD); /* combines W[2], W[3] with Tv, Ts */
Chris@42 96 TA = W[5];
Chris@42 97 TF = Tx * TB;
Chris@42 98 Tz = Tx * Ty;
Chris@42 99 }
Chris@42 100 }
Chris@42 101 }
/* Finish the W[4]/W[5] products, then store the four outputs at offset WS(rs, 1). */
Chris@42 102 TG = FNMS(TA, Ty, TF);
Chris@42 103 TC = FMA(TA, TB, Tz);
Chris@42 104 Im[WS(rs, 1)] = TG - TE;
Chris@42 105 Ip[WS(rs, 1)] = TE + TG;
Chris@42 106 Rm[WS(rs, 1)] = Tw + TC;
Chris@42 107 Rp[WS(rs, 1)] = Tw - TC;
Chris@42 108 }
Chris@42 109 }
Chris@42 110 }
Chris@42 111
/*
 * Twiddle instruction table referenced by the codelet descriptor below:
 * a TW_FULL entry followed by a TW_NEXT entry.
 * NOTE(review): presumably requests the full twiddle set for size 4 and
 * then terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 112 static const tw_instr twinstr[] = {
Chris@42 113 {TW_FULL, 1, 4},
Chris@42 114 {TW_NEXT, 1, 0}
Chris@42 115 };
Chris@42 116
/*
 * Codelet descriptor: the value 4, the codelet name string, the twiddle
 * instruction table, &GENUS, and the tuple {24, 6, 6, 0}.
 * NOTE(review): the tuple matches the "24 additions, 6 multiplications,
 * 6 fused multiply/add" counts quoted in the generated header comment, so
 * it is presumably the operation count -- confirm against hc2c_desc.
 */
Chris@42 117 static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, {24, 6, 6, 0} };
Chris@42 118
/*
 * Registration entry point: hands the hc2cbdft2_4 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
Chris@42 119 void X(codelet_hc2cbdft2_4) (planner *p) {
Chris@42 120 X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
Chris@42 121 }
Chris@42 122 #else /* HAVE_FMA */
Chris@42 123
Chris@42 124 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 4 -dif -name hc2cbdft2_4 -include hc2cb.h */
Chris@42 125
Chris@42 126 /*
Chris@42 127 * This function contains 30 FP additions, 12 FP multiplications,
Chris@42 128 * (or, 24 additions, 6 multiplications, 6 fused multiply/add),
Chris@42 129 * 19 stack variables, 0 constants, and 16 memory accesses
Chris@42 130 */
Chris@42 131 #include "hc2cb.h"
Chris@42 132
/*
 * hc2cbdft2_4 (variant compiled when HAVE_FMA is not defined):
 * machine-generated codelet; per the generator command line in this file
 * it was produced with -n 4, -sign 1 and -dif.
 *
 * For every m in [mb, me) the loop body loads two values from each of
 * Rp, Ip, Rm and Im (at offsets 0 and WS(rs, 1)), combines them with the
 * six twiddle values W[0..5], and stores eight results back to the same
 * eight locations, i.e. the computation is in place.
 *
 * Parameters, as used by the code below:
 *   Rp, Ip  - input/output pointers, advanced by +ms after each iteration
 *   Rm, Im  - input/output pointers, advanced by -ms after each iteration
 *   W       - read-only twiddle table; iteration starts at
 *             W + (mb - 1) * 6 and advances by 6 values per iteration
 *   rs      - stride handed to WS() to address the second element
 *   mb, me  - half-open range of the loop counter m
 *   ms      - per-iteration step applied to the four data pointers
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; presumably a * b + c and c - a * b respectively -- confirm
 * against the codelet headers.
 */
Chris@42 133 static void hc2cbdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 134 {
Chris@42 135 {
Chris@42 136 INT m;
/* One pass per m; the increment clause steps all pointers and W together. */
Chris@42 137 for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@42 138 E T3, Tl, T6, Tm, Td, Tj, Tx, Tv, Ts, Tq;
Chris@42 139 {
Chris@42 140 E Tf, Tc, T9, Ti;
Chris@42 141 {
/* Inputs Rp[0], Rm[WS(rs, 1)], Ip[0], Im[WS(rs, 1)] and their sums/differences. */
Chris@42 142 E T1, T2, Ta, Tb;
Chris@42 143 T1 = Rp[0];
Chris@42 144 T2 = Rm[WS(rs, 1)];
Chris@42 145 T3 = T1 + T2;
Chris@42 146 Tf = T1 - T2;
Chris@42 147 Ta = Ip[0];
Chris@42 148 Tb = Im[WS(rs, 1)];
Chris@42 149 Tc = Ta + Tb;
Chris@42 150 Tl = Ta - Tb;
Chris@42 151 }
Chris@42 152 {
/* Inputs Rp[WS(rs, 1)], Rm[0], Ip[WS(rs, 1)], Im[0] and their sums/differences. */
Chris@42 153 E T4, T5, Tg, Th;
Chris@42 154 T4 = Rp[WS(rs, 1)];
Chris@42 155 T5 = Rm[0];
Chris@42 156 T6 = T4 + T5;
Chris@42 157 T9 = T4 - T5;
Chris@42 158 Tg = Ip[WS(rs, 1)];
Chris@42 159 Th = Im[0];
Chris@42 160 Ti = Tg + Th;
Chris@42 161 Tm = Tg - Th;
Chris@42 162 }
/* Second level of sums/differences, consumed by the twiddle products below. */
Chris@42 163 Td = T9 + Tc;
Chris@42 164 Tj = Tf - Ti;
Chris@42 165 Tx = Tf + Ti;
Chris@42 166 Tv = Tc - T9;
Chris@42 167 Ts = Tl - Tm;
Chris@42 168 Tq = T3 - T6;
Chris@42 169 }
Chris@42 170 {
/* Offset-0 outputs: Td and Tj are combined with W[0], W[1]. */
Chris@42 171 E T7, Tn, Tk, To, T8, Te;
Chris@42 172 T7 = T3 + T6;
Chris@42 173 Tn = Tl + Tm;
Chris@42 174 T8 = W[0];
Chris@42 175 Te = W[1];
Chris@42 176 Tk = FMA(T8, Td, Te * Tj);
Chris@42 177 To = FNMS(Te, Td, T8 * Tj);
Chris@42 178 Rp[0] = T7 - Tk;
Chris@42 179 Ip[0] = Tn + To;
Chris@42 180 Rm[0] = T7 + Tk;
Chris@42 181 Im[0] = To - Tn;
Chris@42 182 }
Chris@42 183 {
/* Outputs at offset WS(rs, 1): Tq, Ts use W[2], W[3]; Tv, Tx use W[4], W[5]. */
Chris@42 184 E Tt, Tz, Ty, TA;
Chris@42 185 {
Chris@42 186 E Tp, Tr, Tu, Tw;
Chris@42 187 Tp = W[2];
Chris@42 188 Tr = W[3];
Chris@42 189 Tt = FNMS(Tr, Ts, Tp * Tq);
Chris@42 190 Tz = FMA(Tr, Tq, Tp * Ts);
Chris@42 191 Tu = W[4];
Chris@42 192 Tw = W[5];
Chris@42 193 Ty = FMA(Tu, Tv, Tw * Tx);
Chris@42 194 TA = FNMS(Tw, Tv, Tu * Tx);
Chris@42 195 }
Chris@42 196 Rp[WS(rs, 1)] = Tt - Ty;
Chris@42 197 Ip[WS(rs, 1)] = Tz + TA;
Chris@42 198 Rm[WS(rs, 1)] = Tt + Ty;
Chris@42 199 Im[WS(rs, 1)] = TA - Tz;
Chris@42 200 }
Chris@42 201 }
Chris@42 202 }
Chris@42 203 }
Chris@42 204
/*
 * Twiddle instruction table referenced by the codelet descriptor below:
 * a TW_FULL entry followed by a TW_NEXT entry.
 * NOTE(review): presumably requests the full twiddle set for size 4 and
 * then terminates the list -- confirm against the tw_instr definition.
 */
Chris@42 205 static const tw_instr twinstr[] = {
Chris@42 206 {TW_FULL, 1, 4},
Chris@42 207 {TW_NEXT, 1, 0}
Chris@42 208 };
Chris@42 209
/*
 * Codelet descriptor: the value 4, the codelet name string, the twiddle
 * instruction table, &GENUS, and the tuple {24, 6, 6, 0}.
 * NOTE(review): the tuple matches the "24 additions, 6 multiplications,
 * 6 fused multiply/add" counts quoted in the generated header comment, so
 * it is presumably the operation count -- confirm against hc2c_desc.
 */
Chris@42 210 static const hc2c_desc desc = { 4, "hc2cbdft2_4", twinstr, &GENUS, {24, 6, 6, 0} };
Chris@42 211
/*
 * Registration entry point: hands the hc2cbdft2_4 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
Chris@42 212 void X(codelet_hc2cbdft2_4) (planner *p) {
Chris@42 213 X(khc2c_register) (p, hc2cbdft2_4, &desc, HC2C_VIA_DFT);
Chris@42 214 }
Chris@42 215 #endif /* HAVE_FMA */