annotate src/fftw-3.3.8/rdft/scalar/r2cb/hb_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:32 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 66 FP additions, 36 FP multiplications,
Chris@82 32 * (or, 44 additions, 14 multiplications, 22 fused multiply/add),
Chris@82 33 * 33 stack variables, 1 constants, and 32 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hb.h"
Chris@82 36
/*
 * hb_8 (FMA variant): size-8 twiddle codelet emitted by genfft's gen_hc2hc
 * (see the generator command line earlier in this file: -n 8 -dif -sign 1).
 *
 * For each m in [mb, me) the loop body
 *   - loads 16 reals: cr[0], cr[WS(rs, 1..7)] and ci[0], ci[WS(rs, 1..7)];
 *   - combines them with additions/subtractions and the single constant
 *     KP707106781 (0.7071..., numerically sqrt(2)/2);
 *   - stores 16 reals back in place to the same cr/ci slots.
 * Slot 0 is stored without any twiddle factor; slot k (1..7) is combined
 * with the pair W[2k-2], W[2k-1].
 *
 * Parameters:
 *   cr, ci - in/out arrays; cr advances by +ms and ci by -ms per iteration.
 *   W      - twiddle table; 14 reals are consumed per iteration, and the
 *            pointer is pre-offset by (mb - 1) * 14 before the first one.
 *   rs     - stride argument handed to WS() to address slot k.
 *   mb, me - half-open iteration range [mb, me).
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; presumably a*b + c and c - a*b respectively -- confirm in the
 * included headers. The inline formulas below rely on that assumption.
 */
Chris@82 37 static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Declares the constant KP707106781 (DK is a macro defined elsewhere). */
Chris@82 39 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 40 {
Chris@82 41 INT m;
/* cr walks forward and ci walks backward by ms; W moves on by 14 reals. */
Chris@82 42 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@82 43 E T7, T1i, T1n, Tk, TD, TV, T1b, TQ, Te, T1e, T1o, T1j, TE, TF, TR;
Chris@82 44 E Tv, TW;
/* First group: sums/differences of cr slots 0, 2, 4, 6 and ci slots 1, 3, 5, 7. */
Chris@82 45 {
Chris@82 46 E T3, Tg, TC, T19, T6, Tz, Tj, T1a;
Chris@82 47 {
Chris@82 48 E T1, T2, TA, TB;
Chris@82 49 T1 = cr[0];
Chris@82 50 T2 = ci[WS(rs, 3)];
Chris@82 51 T3 = T1 + T2;
Chris@82 52 Tg = T1 - T2;
Chris@82 53 TA = ci[WS(rs, 7)];
Chris@82 54 TB = cr[WS(rs, 4)];
Chris@82 55 TC = TA + TB;
Chris@82 56 T19 = TA - TB;
Chris@82 57 }
Chris@82 58 {
Chris@82 59 E T4, T5, Th, Ti;
Chris@82 60 T4 = cr[WS(rs, 2)];
Chris@82 61 T5 = ci[WS(rs, 1)];
Chris@82 62 T6 = T4 + T5;
Chris@82 63 Tz = T4 - T5;
Chris@82 64 Th = ci[WS(rs, 5)];
Chris@82 65 Ti = cr[WS(rs, 6)];
Chris@82 66 Tj = Th + Ti;
Chris@82 67 T1a = Th - Ti;
Chris@82 68 }
Chris@82 69 T7 = T3 + T6;
Chris@82 70 T1i = T3 - T6;
Chris@82 71 T1n = T19 - T1a;
Chris@82 72 Tk = Tg - Tj;
Chris@82 73 TD = Tz + TC;
Chris@82 74 TV = TC - Tz;
Chris@82 75 T1b = T19 + T1a;
Chris@82 76 TQ = Tg + Tj;
Chris@82 77 }
/* Second group: sums/differences of cr slots 1, 3, 5, 7 and ci slots 0, 2, 4, 6. */
Chris@82 78 {
Chris@82 79 E Ta, Tl, Tt, T1d, Td, Tq, To, T1c, Tp, Tu;
Chris@82 80 {
Chris@82 81 E T8, T9, Tr, Ts;
Chris@82 82 T8 = cr[WS(rs, 1)];
Chris@82 83 T9 = ci[WS(rs, 2)];
Chris@82 84 Ta = T8 + T9;
Chris@82 85 Tl = T8 - T9;
Chris@82 86 Tr = ci[WS(rs, 4)];
Chris@82 87 Ts = cr[WS(rs, 7)];
Chris@82 88 Tt = Tr + Ts;
Chris@82 89 T1d = Tr - Ts;
Chris@82 90 }
Chris@82 91 {
Chris@82 92 E Tb, Tc, Tm, Tn;
Chris@82 93 Tb = ci[0];
Chris@82 94 Tc = cr[WS(rs, 3)];
Chris@82 95 Td = Tb + Tc;
Chris@82 96 Tq = Tb - Tc;
Chris@82 97 Tm = ci[WS(rs, 6)];
Chris@82 98 Tn = cr[WS(rs, 5)];
Chris@82 99 To = Tm + Tn;
Chris@82 100 T1c = Tm - Tn;
Chris@82 101 }
Chris@82 102 Te = Ta + Td;
Chris@82 103 T1e = T1c + T1d;
Chris@82 104 T1o = Ta - Td;
Chris@82 105 T1j = T1d - T1c;
Chris@82 106 TE = Tl + To;
Chris@82 107 TF = Tq + Tt;
Chris@82 108 TR = TE + TF;
Chris@82 109 Tp = Tl - To;
Chris@82 110 Tu = Tq - Tt;
Chris@82 111 Tv = Tp + Tu;
Chris@82 112 TW = Tp - Tu;
Chris@82 113 }
/* Slot 0 is stored directly: no W factor is applied. */
Chris@82 114 cr[0] = T7 + Te;
Chris@82 115 ci[0] = T1b + T1e;
/* Slot 3, using W[4] and W[5]: cr = W[4]*TS - W[5]*TX, ci = W[5]*TS + W[4]*TX (assuming the FMA/FNMS semantics noted above). */
Chris@82 116 {
Chris@82 117 E TS, TX, TT, TY, TP, TU;
Chris@82 118 TS = FNMS(KP707106781, TR, TQ);
Chris@82 119 TX = FMA(KP707106781, TW, TV);
Chris@82 120 TP = W[4];
Chris@82 121 TT = TP * TS;
Chris@82 122 TY = TP * TX;
Chris@82 123 TU = W[5];
Chris@82 124 cr[WS(rs, 3)] = FNMS(TU, TX, TT);
Chris@82 125 ci[WS(rs, 3)] = FMA(TU, TS, TY);
Chris@82 126 }
/* Slot 2, using W[2] and W[3]. */
Chris@82 127 {
Chris@82 128 E T1s, T1v, T1t, T1w, T1r, T1u;
Chris@82 129 T1s = T1i + T1j;
Chris@82 130 T1v = T1o + T1n;
Chris@82 131 T1r = W[2];
Chris@82 132 T1t = T1r * T1s;
Chris@82 133 T1w = T1r * T1v;
Chris@82 134 T1u = W[3];
Chris@82 135 cr[WS(rs, 2)] = FNMS(T1u, T1v, T1t);
Chris@82 136 ci[WS(rs, 2)] = FMA(T1u, T1s, T1w);
Chris@82 137 }
/* Slot 7, using W[12] and W[13]; same TR/TW terms as slot 3 with the KP707106781 signs swapped. */
Chris@82 138 {
Chris@82 139 E T10, T13, T11, T14, TZ, T12;
Chris@82 140 T10 = FMA(KP707106781, TR, TQ);
Chris@82 141 T13 = FNMS(KP707106781, TW, TV);
Chris@82 142 TZ = W[12];
Chris@82 143 T11 = TZ * T10;
Chris@82 144 T14 = TZ * T13;
Chris@82 145 T12 = W[13];
Chris@82 146 cr[WS(rs, 7)] = FNMS(T12, T13, T11);
Chris@82 147 ci[WS(rs, 7)] = FMA(T12, T10, T14);
Chris@82 148 }
/* Slot 4, using W[6] and W[7]. */
Chris@82 149 {
Chris@82 150 E T1f, T15, T17, T18, T1g, T16;
Chris@82 151 T1f = T1b - T1e;
Chris@82 152 T16 = T7 - Te;
Chris@82 153 T15 = W[6];
Chris@82 154 T17 = T15 * T16;
Chris@82 155 T18 = W[7];
Chris@82 156 T1g = T18 * T16;
Chris@82 157 cr[WS(rs, 4)] = FNMS(T18, T1f, T17);
Chris@82 158 ci[WS(rs, 4)] = FMA(T15, T1f, T1g);
Chris@82 159 }
/* Slot 6, using W[10] and W[11]. */
Chris@82 160 {
Chris@82 161 E T1k, T1p, T1l, T1q, T1h, T1m;
Chris@82 162 T1k = T1i - T1j;
Chris@82 163 T1p = T1n - T1o;
Chris@82 164 T1h = W[10];
Chris@82 165 T1l = T1h * T1k;
Chris@82 166 T1q = T1h * T1p;
Chris@82 167 T1m = W[11];
Chris@82 168 cr[WS(rs, 6)] = FNMS(T1m, T1p, T1l);
Chris@82 169 ci[WS(rs, 6)] = FMA(T1m, T1k, T1q);
Chris@82 170 }
/* Slots 5 (W[8], W[9]) and 1 (W[0], W[1]) share the TG and Tv terms, scaled by KP707106781 with opposite signs. */
Chris@82 171 {
Chris@82 172 E TH, TN, TJ, TL, TM, TO, Tf, Tx, Ty, TI, TG, TK, Tw;
Chris@82 173 TG = TE - TF;
Chris@82 174 TH = FNMS(KP707106781, TG, TD);
Chris@82 175 TN = FMA(KP707106781, TG, TD);
Chris@82 176 TK = FMA(KP707106781, Tv, Tk);
Chris@82 177 TJ = W[0];
Chris@82 178 TL = TJ * TK;
Chris@82 179 TM = W[1];
Chris@82 180 TO = TM * TK;
Chris@82 181 Tw = FNMS(KP707106781, Tv, Tk);
Chris@82 182 Tf = W[8];
Chris@82 183 Tx = Tf * Tw;
Chris@82 184 Ty = W[9];
Chris@82 185 TI = Ty * Tw;
Chris@82 186 cr[WS(rs, 5)] = FNMS(Ty, TH, Tx);
Chris@82 187 ci[WS(rs, 5)] = FMA(Tf, TH, TI);
Chris@82 188 cr[WS(rs, 1)] = FNMS(TM, TN, TL);
Chris@82 189 ci[WS(rs, 1)] = FMA(TJ, TN, TO);
Chris@82 190 }
Chris@82 191 }
Chris@82 192 }
Chris@82 193 }
Chris@82 194
/*
 * Twiddle instruction table referenced by desc: one TW_FULL entry (1, 8)
 * followed by a TW_NEXT entry (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file. Presumably TW_FULL requests the full twiddle set for size 8 (which
 * would match the 14 reals of W consumed per iteration by hb_8) and TW_NEXT
 * ends the list -- confirm against the tw_instr definition.
 */
Chris@82 195 static const tw_instr twinstr[] = {
Chris@82 196 {TW_FULL, 1, 8},
Chris@82 197 {TW_NEXT, 1, 0}
Chris@82 198 };
Chris@82 199
/*
 * Codelet descriptor handed to the planner at registration: the value 8,
 * the name "hb_8", the twiddle instruction table, &GENUS, and the counts
 * {44, 14, 22, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/adds quoted in the generated header
 * comment of this FMA branch.
 * NOTE(review): the hc2hc_desc field layout and GENUS are defined outside
 * this file; the 8 is presumably the transform radix -- confirm.
 */
Chris@82 200 static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, {44, 14, 22, 0} };
Chris@82 201
/*
 * Registration entry point for the FMA branch: passes the hb_8 kernel and
 * its descriptor to X(khc2hc_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-name prefixing macro -- confirm.
 */
Chris@82 202 void X(codelet_hb_8) (planner *p) {
Chris@82 203 X(khc2hc_register) (p, hb_8, &desc);
Chris@82 204 }
Chris@82 205 #else
Chris@82 206
Chris@82 207 /* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 8 -dif -name hb_8 -include rdft/scalar/hb.h */
Chris@82 208
Chris@82 209 /*
Chris@82 210 * This function contains 66 FP additions, 32 FP multiplications,
Chris@82 211 * (or, 52 additions, 18 multiplications, 14 fused multiply/add),
Chris@82 212 * 30 stack variables, 1 constants, and 32 memory accesses
Chris@82 213 */
Chris@82 214 #include "rdft/scalar/hb.h"
Chris@82 215
/*
 * hb_8 (non-FMA variant): size-8 twiddle codelet emitted by genfft's
 * gen_hc2hc (see the generator command line above: -n 8 -dif -sign 1,
 * without -fma). Compiled only when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) the loop body
 *   - loads 16 reals: cr[0], cr[WS(rs, 1..7)] and ci[0], ci[WS(rs, 1..7)];
 *   - combines them with additions/subtractions and the single constant
 *     KP707106781 (0.7071..., numerically sqrt(2)/2);
 *   - stores 16 reals back in place to the same cr/ci slots.
 * Slot 0 is stored without any twiddle factor; slot k (1..7) is combined
 * with the pair W[2k-2], W[2k-1].
 *
 * Parameters:
 *   cr, ci - in/out arrays; cr advances by +ms and ci by -ms per iteration.
 *   W      - twiddle table; 14 reals are consumed per iteration, and the
 *            pointer is pre-offset by (mb - 1) * 14 before the first one.
 *   rs     - stride argument handed to WS() to address slot k.
 *   mb, me - half-open iteration range [mb, me).
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros defined outside
 * this file; presumably a*b + c and c - a*b respectively -- confirm in the
 * included headers.
 */
Chris@82 216 static void hb_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 217 {
/* Declares the constant KP707106781 (DK is a macro defined elsewhere). */
Chris@82 218 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 219 {
Chris@82 220 INT m;
/* cr walks forward and ci walks backward by ms; W moves on by 14 reals. */
Chris@82 221 for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
Chris@82 222 E T7, T18, T1c, To, Ty, TM, TY, TC, Te, TZ, T10, Tv, Tz, TP, TS;
Chris@82 223 E TD;
/* First group: sums/differences of cr slots 0, 2, 4, 6 and ci slots 1, 3, 5, 7. */
Chris@82 224 {
Chris@82 225 E T3, TK, Tn, TL, T6, TW, Tk, TX;
Chris@82 226 {
Chris@82 227 E T1, T2, Tl, Tm;
Chris@82 228 T1 = cr[0];
Chris@82 229 T2 = ci[WS(rs, 3)];
Chris@82 230 T3 = T1 + T2;
Chris@82 231 TK = T1 - T2;
Chris@82 232 Tl = ci[WS(rs, 5)];
Chris@82 233 Tm = cr[WS(rs, 6)];
Chris@82 234 Tn = Tl - Tm;
Chris@82 235 TL = Tl + Tm;
Chris@82 236 }
Chris@82 237 {
Chris@82 238 E T4, T5, Ti, Tj;
Chris@82 239 T4 = cr[WS(rs, 2)];
Chris@82 240 T5 = ci[WS(rs, 1)];
Chris@82 241 T6 = T4 + T5;
Chris@82 242 TW = T4 - T5;
Chris@82 243 Ti = ci[WS(rs, 7)];
Chris@82 244 Tj = cr[WS(rs, 4)];
Chris@82 245 Tk = Ti - Tj;
Chris@82 246 TX = Ti + Tj;
Chris@82 247 }
Chris@82 248 T7 = T3 + T6;
Chris@82 249 T18 = TK + TL;
Chris@82 250 T1c = TX - TW;
Chris@82 251 To = Tk + Tn;
Chris@82 252 Ty = T3 - T6;
Chris@82 253 TM = TK - TL;
Chris@82 254 TY = TW + TX;
Chris@82 255 TC = Tk - Tn;
Chris@82 256 }
/* Second group: sums/differences of cr slots 1, 3, 5, 7 and ci slots 0, 2, 4, 6. */
Chris@82 257 {
Chris@82 258 E Ta, TN, Tu, TR, Td, TQ, Tr, TO;
Chris@82 259 {
Chris@82 260 E T8, T9, Ts, Tt;
Chris@82 261 T8 = cr[WS(rs, 1)];
Chris@82 262 T9 = ci[WS(rs, 2)];
Chris@82 263 Ta = T8 + T9;
Chris@82 264 TN = T8 - T9;
Chris@82 265 Ts = ci[WS(rs, 4)];
Chris@82 266 Tt = cr[WS(rs, 7)];
Chris@82 267 Tu = Ts - Tt;
Chris@82 268 TR = Ts + Tt;
Chris@82 269 }
Chris@82 270 {
Chris@82 271 E Tb, Tc, Tp, Tq;
Chris@82 272 Tb = ci[0];
Chris@82 273 Tc = cr[WS(rs, 3)];
Chris@82 274 Td = Tb + Tc;
Chris@82 275 TQ = Tb - Tc;
Chris@82 276 Tp = ci[WS(rs, 6)];
Chris@82 277 Tq = cr[WS(rs, 5)];
Chris@82 278 Tr = Tp - Tq;
Chris@82 279 TO = Tp + Tq;
Chris@82 280 }
Chris@82 281 Te = Ta + Td;
Chris@82 282 TZ = TN + TO;
Chris@82 283 T10 = TQ + TR;
Chris@82 284 Tv = Tr + Tu;
Chris@82 285 Tz = Tu - Tr;
Chris@82 286 TP = TN - TO;
Chris@82 287 TS = TQ - TR;
Chris@82 288 TD = Ta - Td;
Chris@82 289 }
/* Slot 0 is stored directly: no W factor is applied. */
Chris@82 290 cr[0] = T7 + Te;
Chris@82 291 ci[0] = To + Tv;
/* Slot 4, using W[6] and W[7]. */
Chris@82 292 {
Chris@82 293 E Tg, Tw, Tf, Th;
Chris@82 294 Tg = T7 - Te;
Chris@82 295 Tw = To - Tv;
Chris@82 296 Tf = W[6];
Chris@82 297 Th = W[7];
Chris@82 298 cr[WS(rs, 4)] = FNMS(Th, Tw, Tf * Tg);
Chris@82 299 ci[WS(rs, 4)] = FMA(Th, Tg, Tf * Tw);
Chris@82 300 }
/* Slot 2, using W[2] and W[3]. */
Chris@82 301 {
Chris@82 302 E TG, TI, TF, TH;
Chris@82 303 TG = Ty + Tz;
Chris@82 304 TI = TD + TC;
Chris@82 305 TF = W[2];
Chris@82 306 TH = W[3];
Chris@82 307 cr[WS(rs, 2)] = FNMS(TH, TI, TF * TG);
Chris@82 308 ci[WS(rs, 2)] = FMA(TF, TI, TH * TG);
Chris@82 309 }
/* Slot 6, using W[10] and W[11]. */
Chris@82 310 {
Chris@82 311 E TA, TE, Tx, TB;
Chris@82 312 TA = Ty - Tz;
Chris@82 313 TE = TC - TD;
Chris@82 314 Tx = W[10];
Chris@82 315 TB = W[11];
Chris@82 316 cr[WS(rs, 6)] = FNMS(TB, TE, Tx * TA);
Chris@82 317 ci[WS(rs, 6)] = FMA(Tx, TE, TB * TA);
Chris@82 318 }
/* Slots 3 (W[4], W[5]) and 7 (W[12], W[13]) share the scaled terms T19 and T1d, applied with opposite signs. */
Chris@82 319 {
Chris@82 320 E T1a, T1g, T1e, T1i, T19, T1d;
Chris@82 321 T19 = KP707106781 * (TZ + T10);
Chris@82 322 T1a = T18 - T19;
Chris@82 323 T1g = T18 + T19;
Chris@82 324 T1d = KP707106781 * (TP - TS);
Chris@82 325 T1e = T1c + T1d;
Chris@82 326 T1i = T1c - T1d;
Chris@82 327 {
Chris@82 328 E T17, T1b, T1f, T1h;
Chris@82 329 T17 = W[4];
Chris@82 330 T1b = W[5];
Chris@82 331 cr[WS(rs, 3)] = FNMS(T1b, T1e, T17 * T1a);
Chris@82 332 ci[WS(rs, 3)] = FMA(T17, T1e, T1b * T1a);
Chris@82 333 T1f = W[12];
Chris@82 334 T1h = W[13];
Chris@82 335 cr[WS(rs, 7)] = FNMS(T1h, T1i, T1f * T1g);
Chris@82 336 ci[WS(rs, 7)] = FMA(T1f, T1i, T1h * T1g);
Chris@82 337 }
Chris@82 338 }
/* Slots 5 (W[8], W[9]) and 1 (W[0], W[1]) share the scaled terms TT and T11, applied with opposite signs. */
Chris@82 339 {
Chris@82 340 E TU, T14, T12, T16, TT, T11;
Chris@82 341 TT = KP707106781 * (TP + TS);
Chris@82 342 TU = TM - TT;
Chris@82 343 T14 = TM + TT;
Chris@82 344 T11 = KP707106781 * (TZ - T10);
Chris@82 345 T12 = TY - T11;
Chris@82 346 T16 = TY + T11;
Chris@82 347 {
Chris@82 348 E TJ, TV, T13, T15;
Chris@82 349 TJ = W[8];
Chris@82 350 TV = W[9];
Chris@82 351 cr[WS(rs, 5)] = FNMS(TV, T12, TJ * TU);
Chris@82 352 ci[WS(rs, 5)] = FMA(TV, TU, TJ * T12);
Chris@82 353 T13 = W[0];
Chris@82 354 T15 = W[1];
Chris@82 355 cr[WS(rs, 1)] = FNMS(T15, T16, T13 * T14);
Chris@82 356 ci[WS(rs, 1)] = FMA(T15, T14, T13 * T16);
Chris@82 357 }
Chris@82 358 }
Chris@82 359 }
Chris@82 360 }
Chris@82 361 }
Chris@82 362
/*
 * Twiddle instruction table referenced by desc: one TW_FULL entry (1, 8)
 * followed by a TW_NEXT entry (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file. Presumably TW_FULL requests the full twiddle set for size 8 (which
 * would match the 14 reals of W consumed per iteration by hb_8) and TW_NEXT
 * ends the list -- confirm against the tw_instr definition.
 */
Chris@82 363 static const tw_instr twinstr[] = {
Chris@82 364 {TW_FULL, 1, 8},
Chris@82 365 {TW_NEXT, 1, 0}
Chris@82 366 };
Chris@82 367
/*
 * Codelet descriptor handed to the planner at registration: the value 8,
 * the name "hb_8", the twiddle instruction table, &GENUS, and the counts
 * {52, 18, 14, 0}. The first three counts equal the additions,
 * multiplications and fused multiply/adds quoted in the generated header
 * comment of this non-FMA branch.
 * NOTE(review): the hc2hc_desc field layout and GENUS are defined outside
 * this file; the 8 is presumably the transform radix -- confirm.
 */
Chris@82 368 static const hc2hc_desc desc = { 8, "hb_8", twinstr, &GENUS, {52, 18, 14, 0} };
Chris@82 369
/*
 * Registration entry point for the non-FMA branch: passes the hb_8 kernel
 * and its descriptor to X(khc2hc_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file, presumably a
 * symbol-name prefixing macro -- confirm.
 */
Chris@82 370 void X(codelet_hb_8) (planner *p) {
Chris@82 371 X(khc2hc_register) (p, hb_8, &desc);
Chris@82 372 }
Chris@82 373 #endif