annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cb_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:49:31 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include r2cb.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 156 FP additions, 84 FP multiplications,
Chris@42 32 * (or, 72 additions, 0 multiplications, 84 fused multiply/add),
Chris@42 33 * 82 stack variables, 9 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cb.h"
Chris@42 36
/*
 * r2cb_32, HAVE_FMA variant: straight-line kernel emitted by genfft's
 * gen_r2cb with "-n 32 -sign 1" (see the "Generated by" comment of this
 * #ifdef branch).
 *
 * Each loop iteration reads Cr[0], Cr[WS(csr, k)] for k = 1..16 and
 * Ci[WS(csi, k)] for k = 1..15 (32 loads; Ci at index 0 and 16 is never
 * read), and stores R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..15
 * (32 stores).
 *
 *   R0, R1       output arrays, advanced by ovs after every iteration
 *   Cr, Ci       input arrays, advanced by ivs after every iteration
 *   rs           stride argument used to index R0/R1 through WS()
 *   csr, csi     stride arguments used to index Cr/Ci through WS()
 *   v            iteration count; the loop runs while i > 0, so v <= 0
 *                performs no work
 *   ivs, ovs     per-iteration pointer increments for inputs/outputs
 *
 * NOTE(review): presumably Cr/Ci hold the real/imaginary parts of a
 * half-complex spectrum and R0/R1 receive the even/odd real samples of
 * the size-32 backward transform -- confirm against FFTW's rdft codelet
 * conventions.
 * NOTE(review): E, R, INT, stride, DK, WS, FMA and FNMS are declared outside
 * this file; presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm in FFTW's headers.
 */
Chris@42 37 static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/*
 * Constants, by their literal values: 2, 1.41421... (= sqrt(2)),
 * 0.70710... (= 1/sqrt(2)); 0.41421..., 0.19891..., 0.66817... agree
 * numerically with tan(pi/8), tan(pi/16), tan(3*pi/16); 1.84775...,
 * 1.96157..., 1.66293... agree numerically with 2*cos(pi/8),
 * 2*cos(pi/16), 2*cos(3*pi/16).
 */
Chris@42 39 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 40 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 41 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 42 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 43 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 44 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 45 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 46 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 47 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 48 {
Chris@42 49 INT i;
/*
 * One set of 32 outputs per iteration. The MAKE_VOLATILE_STRIDE(...)
 * expressions in the increment clause are macros defined outside this file
 * (presumably a compiler hint about the strides -- confirm in FFTW's
 * headers).
 */
Chris@42 50 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* These six temporaries outlive the nested blocks; they feed the last four R1 stores. */
Chris@42 51 E T1F, T1C, T1H, T1z, T1G, T1I;
Chris@42 52 {
Chris@42 53 E T8, T1t, Tz, T1R, T5, T1S, T1u, TE, T1w, TP, T1U, Tg, T2m, T1X, T1x;
Chris@42 54 E TK, T1D, T1d, T20, To, T2p, T28, T1A, TW, T11, T1e, Tv, T25, T23, T2q;
Chris@42 55 E T16, T1f, TA, TD;
/* Input stage: load every Cr/Ci element and form pairwise sums and differences. */
Chris@42 56 {
Chris@42 57 E T4, Ty, T1, T2, T6, T7;
Chris@42 58 T4 = Cr[WS(csr, 8)];
Chris@42 59 Ty = Ci[WS(csi, 8)];
Chris@42 60 T1 = Cr[0];
Chris@42 61 T2 = Cr[WS(csr, 16)];
Chris@42 62 T6 = Cr[WS(csr, 4)];
Chris@42 63 T7 = Cr[WS(csr, 12)];
Chris@42 64 {
Chris@42 65 E TB, Tx, T3, TC;
Chris@42 66 TB = Ci[WS(csi, 4)];
Chris@42 67 Tx = T1 - T2;
Chris@42 68 T3 = T1 + T2;
Chris@42 69 TA = T6 - T7;
Chris@42 70 T8 = T6 + T7;
Chris@42 71 TC = Ci[WS(csi, 12)];
Chris@42 72 T1t = FMA(KP2_000000000, Ty, Tx);
Chris@42 73 Tz = FNMS(KP2_000000000, Ty, Tx);
Chris@42 74 T1R = FNMS(KP2_000000000, T4, T3);
Chris@42 75 T5 = FMA(KP2_000000000, T4, T3);
Chris@42 76 TD = TB + TC;
Chris@42 77 T1S = TB - TC;
Chris@42 78 }
Chris@42 79 }
Chris@42 80 {
Chris@42 81 E Td, TG, Tc, T1V, TO, Te, TH, TI;
Chris@42 82 {
Chris@42 83 E Ta, Tb, TM, TN;
Chris@42 84 Ta = Cr[WS(csr, 2)];
Chris@42 85 T1u = TA + TD;
Chris@42 86 TE = TA - TD;
Chris@42 87 Tb = Cr[WS(csr, 14)];
Chris@42 88 TM = Ci[WS(csi, 2)];
Chris@42 89 TN = Ci[WS(csi, 14)];
Chris@42 90 Td = Cr[WS(csr, 10)];
Chris@42 91 TG = Ta - Tb;
Chris@42 92 Tc = Ta + Tb;
Chris@42 93 T1V = TM - TN;
Chris@42 94 TO = TM + TN;
Chris@42 95 Te = Cr[WS(csr, 6)];
Chris@42 96 TH = Ci[WS(csi, 10)];
Chris@42 97 TI = Ci[WS(csi, 6)];
Chris@42 98 }
Chris@42 99 {
Chris@42 100 E Tl, TS, Tk, T26, T1c, Tm, TT, TU;
Chris@42 101 {
Chris@42 102 E Ti, Tj, T1a, T1b;
Chris@42 103 Ti = Cr[WS(csr, 1)];
Chris@42 104 {
Chris@42 105 E TL, Tf, T1W, TJ;
Chris@42 106 TL = Td - Te;
Chris@42 107 Tf = Td + Te;
Chris@42 108 T1W = TH - TI;
Chris@42 109 TJ = TH + TI;
Chris@42 110 T1w = TO - TL;
Chris@42 111 TP = TL + TO;
Chris@42 112 T1U = Tc - Tf;
Chris@42 113 Tg = Tc + Tf;
Chris@42 114 T2m = T1W + T1V;
Chris@42 115 T1X = T1V - T1W;
Chris@42 116 T1x = TG + TJ;
Chris@42 117 TK = TG - TJ;
Chris@42 118 Tj = Cr[WS(csr, 15)];
Chris@42 119 }
Chris@42 120 T1a = Ci[WS(csi, 1)];
Chris@42 121 T1b = Ci[WS(csi, 15)];
Chris@42 122 Tl = Cr[WS(csr, 9)];
Chris@42 123 TS = Ti - Tj;
Chris@42 124 Tk = Ti + Tj;
Chris@42 125 T26 = T1a - T1b;
Chris@42 126 T1c = T1a + T1b;
Chris@42 127 Tm = Cr[WS(csr, 7)];
Chris@42 128 TT = Ci[WS(csi, 9)];
Chris@42 129 TU = Ci[WS(csi, 7)];
Chris@42 130 }
Chris@42 131 {
Chris@42 132 E Ts, TX, Tr, T22, T10, Tt, T13, T14;
Chris@42 133 {
Chris@42 134 E Tp, Tq, TY, TZ;
Chris@42 135 Tp = Cr[WS(csr, 5)];
Chris@42 136 {
Chris@42 137 E T19, Tn, T27, TV;
Chris@42 138 T19 = Tl - Tm;
Chris@42 139 Tn = Tl + Tm;
Chris@42 140 T27 = TT - TU;
Chris@42 141 TV = TT + TU;
Chris@42 142 T1D = T1c - T19;
Chris@42 143 T1d = T19 + T1c;
Chris@42 144 T20 = Tk - Tn;
Chris@42 145 To = Tk + Tn;
Chris@42 146 T2p = T27 + T26;
Chris@42 147 T28 = T26 - T27;
Chris@42 148 T1A = TS + TV;
Chris@42 149 TW = TS - TV;
Chris@42 150 Tq = Cr[WS(csr, 11)];
Chris@42 151 }
Chris@42 152 TY = Ci[WS(csi, 5)];
Chris@42 153 TZ = Ci[WS(csi, 11)];
Chris@42 154 Ts = Cr[WS(csr, 3)];
Chris@42 155 TX = Tp - Tq;
Chris@42 156 Tr = Tp + Tq;
Chris@42 157 T22 = TY - TZ;
Chris@42 158 T10 = TY + TZ;
Chris@42 159 Tt = Cr[WS(csr, 13)];
Chris@42 160 T13 = Ci[WS(csi, 3)];
Chris@42 161 T14 = Ci[WS(csi, 13)];
Chris@42 162 }
Chris@42 163 {
Chris@42 164 E T12, Tu, T21, T15;
Chris@42 165 T11 = TX - T10;
Chris@42 166 T1e = TX + T10;
Chris@42 167 T12 = Ts - Tt;
Chris@42 168 Tu = Ts + Tt;
Chris@42 169 T21 = T14 - T13;
Chris@42 170 T15 = T13 + T14;
Chris@42 171 Tv = Tr + Tu;
Chris@42 172 T25 = Tr - Tu;
Chris@42 173 T23 = T21 - T22;
Chris@42 174 T2q = T22 + T21;
Chris@42 175 T16 = T12 - T15;
Chris@42 176 T1f = T12 + T15;
Chris@42 177 }
Chris@42 178 }
Chris@42 179 }
Chris@42 180 }
/* Combination stage: all loads are done; the remaining code combines temporaries and stores R0/R1. */
Chris@42 181 {
Chris@42 182 E T1B, T1E, T1l, T1m, T1p, T1o, T1T, T1Y, T29, T2g, T2j, T2f, T2h, T24;
Chris@42 183 {
Chris@42 184 E T1g, T17, T2n, T2t, T2u, T2s;
Chris@42 185 {
Chris@42 186 E T2o, Tw, T2w, T2r, T2l, T9, Th, T2v;
Chris@42 187 T2o = To - Tv;
Chris@42 188 Tw = To + Tv;
Chris@42 189 T2w = T2q + T2p;
Chris@42 190 T2r = T2p - T2q;
Chris@42 191 T1g = T1e - T1f;
Chris@42 192 T1B = T1e + T1f;
Chris@42 193 T17 = T11 + T16;
Chris@42 194 T1E = T16 - T11;
Chris@42 195 T2l = FNMS(KP2_000000000, T8, T5);
Chris@42 196 T9 = FMA(KP2_000000000, T8, T5);
Chris@42 197 Th = FMA(KP2_000000000, Tg, T9);
Chris@42 198 T2v = FNMS(KP2_000000000, Tg, T9);
Chris@42 199 T2n = FNMS(KP2_000000000, T2m, T2l);
Chris@42 200 T2t = FMA(KP2_000000000, T2m, T2l);
Chris@42 201 R0[WS(rs, 4)] = FNMS(KP2_000000000, T2w, T2v);
Chris@42 202 R0[WS(rs, 12)] = FMA(KP2_000000000, T2w, T2v);
Chris@42 203 R0[0] = FMA(KP2_000000000, Tw, Th);
Chris@42 204 R0[WS(rs, 8)] = FNMS(KP2_000000000, Tw, Th);
Chris@42 205 T2u = T2o + T2r;
Chris@42 206 T2s = T2o - T2r;
Chris@42 207 }
Chris@42 208 {
Chris@42 209 E T1j, TR, T18, T1h, TF, TQ;
Chris@42 210 T1l = FNMS(KP1_414213562, TE, Tz);
Chris@42 211 TF = FMA(KP1_414213562, TE, Tz);
Chris@42 212 TQ = FNMS(KP414213562, TP, TK);
Chris@42 213 T1m = FMA(KP414213562, TK, TP);
Chris@42 214 R0[WS(rs, 2)] = FMA(KP1_414213562, T2s, T2n);
Chris@42 215 R0[WS(rs, 10)] = FNMS(KP1_414213562, T2s, T2n);
Chris@42 216 R0[WS(rs, 6)] = FNMS(KP1_414213562, T2u, T2t);
Chris@42 217 R0[WS(rs, 14)] = FMA(KP1_414213562, T2u, T2t);
Chris@42 218 T1j = FNMS(KP1_847759065, TQ, TF);
Chris@42 219 TR = FMA(KP1_847759065, TQ, TF);
Chris@42 220 T1p = FNMS(KP707106781, T17, TW);
Chris@42 221 T18 = FMA(KP707106781, T17, TW);
Chris@42 222 T1h = FMA(KP707106781, T1g, T1d);
Chris@42 223 T1o = FNMS(KP707106781, T1g, T1d);
Chris@42 224 {
Chris@42 225 E T2d, T2e, T1k, T1i;
Chris@42 226 T1T = FNMS(KP2_000000000, T1S, T1R);
Chris@42 227 T2d = FMA(KP2_000000000, T1S, T1R);
Chris@42 228 T2e = T1U + T1X;
Chris@42 229 T1Y = T1U - T1X;
Chris@42 230 T29 = T25 + T28;
Chris@42 231 T2g = T28 - T25;
Chris@42 232 T1k = FMA(KP198912367, T18, T1h);
Chris@42 233 T1i = FNMS(KP198912367, T1h, T18);
Chris@42 234 T2j = FMA(KP1_414213562, T2e, T2d);
Chris@42 235 T2f = FNMS(KP1_414213562, T2e, T2d);
Chris@42 236 R1[WS(rs, 4)] = FNMS(KP1_961570560, T1k, T1j);
Chris@42 237 R1[WS(rs, 12)] = FMA(KP1_961570560, T1k, T1j);
Chris@42 238 R1[0] = FMA(KP1_961570560, T1i, TR);
Chris@42 239 R1[WS(rs, 8)] = FNMS(KP1_961570560, T1i, TR);
Chris@42 240 T2h = T20 - T23;
Chris@42 241 T24 = T20 + T23;
Chris@42 242 }
Chris@42 243 }
Chris@42 244 }
Chris@42 245 {
Chris@42 246 E T1v, T1y, T1M, T1P, T1L, T1N;
Chris@42 247 {
Chris@42 248 E T1r, T1n, T2k, T2i;
Chris@42 249 T2k = FMA(KP414213562, T2g, T2h);
Chris@42 250 T2i = FNMS(KP414213562, T2h, T2g);
Chris@42 251 T1r = FMA(KP1_847759065, T1m, T1l);
Chris@42 252 T1n = FNMS(KP1_847759065, T1m, T1l);
Chris@42 253 R0[WS(rs, 7)] = FNMS(KP1_847759065, T2k, T2j);
Chris@42 254 R0[WS(rs, 15)] = FMA(KP1_847759065, T2k, T2j);
Chris@42 255 R0[WS(rs, 11)] = FMA(KP1_847759065, T2i, T2f);
Chris@42 256 R0[WS(rs, 3)] = FNMS(KP1_847759065, T2i, T2f);
Chris@42 257 {
Chris@42 258 E T1J, T1K, T1s, T1q;
Chris@42 259 T1v = FNMS(KP1_414213562, T1u, T1t);
Chris@42 260 T1J = FMA(KP1_414213562, T1u, T1t);
Chris@42 261 T1K = FMA(KP414213562, T1w, T1x);
Chris@42 262 T1y = FNMS(KP414213562, T1x, T1w);
Chris@42 263 T1F = FNMS(KP707106781, T1E, T1D);
Chris@42 264 T1M = FMA(KP707106781, T1E, T1D);
Chris@42 265 T1s = FMA(KP668178637, T1o, T1p);
Chris@42 266 T1q = FNMS(KP668178637, T1p, T1o);
Chris@42 267 T1P = FMA(KP1_847759065, T1K, T1J);
Chris@42 268 T1L = FNMS(KP1_847759065, T1K, T1J);
Chris@42 269 R1[WS(rs, 6)] = FNMS(KP1_662939224, T1s, T1r);
Chris@42 270 R1[WS(rs, 14)] = FMA(KP1_662939224, T1s, T1r);
Chris@42 271 R1[WS(rs, 10)] = FMA(KP1_662939224, T1q, T1n);
Chris@42 272 R1[WS(rs, 2)] = FNMS(KP1_662939224, T1q, T1n);
Chris@42 273 T1N = FMA(KP707106781, T1B, T1A);
Chris@42 274 T1C = FNMS(KP707106781, T1B, T1A);
Chris@42 275 }
Chris@42 276 }
Chris@42 277 {
Chris@42 278 E T2b, T1Z, T1Q, T1O, T2c, T2a;
Chris@42 279 T1Q = FMA(KP198912367, T1M, T1N);
Chris@42 280 T1O = FNMS(KP198912367, T1N, T1M);
Chris@42 281 T2b = FNMS(KP1_414213562, T1Y, T1T);
Chris@42 282 T1Z = FMA(KP1_414213562, T1Y, T1T);
Chris@42 283 R1[WS(rs, 7)] = FNMS(KP1_961570560, T1Q, T1P);
Chris@42 284 R1[WS(rs, 15)] = FMA(KP1_961570560, T1Q, T1P);
Chris@42 285 R1[WS(rs, 11)] = FMA(KP1_961570560, T1O, T1L);
Chris@42 286 R1[WS(rs, 3)] = FNMS(KP1_961570560, T1O, T1L);
Chris@42 287 T2c = FMA(KP414213562, T24, T29);
Chris@42 288 T2a = FNMS(KP414213562, T29, T24);
Chris@42 289 T1H = FMA(KP1_847759065, T1y, T1v);
Chris@42 290 T1z = FNMS(KP1_847759065, T1y, T1v);
Chris@42 291 R0[WS(rs, 5)] = FNMS(KP1_847759065, T2c, T2b);
Chris@42 292 R0[WS(rs, 13)] = FMA(KP1_847759065, T2c, T2b);
Chris@42 293 R0[WS(rs, 1)] = FMA(KP1_847759065, T2a, T1Z);
Chris@42 294 R0[WS(rs, 9)] = FNMS(KP1_847759065, T2a, T1Z);
Chris@42 295 }
Chris@42 296 }
Chris@42 297 }
Chris@42 298 }
/* Final four R1 outputs, built from T1F, T1C, T1H and T1z assigned inside the nested blocks above. */
Chris@42 299 T1G = FNMS(KP668178637, T1F, T1C);
Chris@42 300 T1I = FMA(KP668178637, T1C, T1F);
Chris@42 301 R1[WS(rs, 5)] = FNMS(KP1_662939224, T1I, T1H);
Chris@42 302 R1[WS(rs, 13)] = FMA(KP1_662939224, T1I, T1H);
Chris@42 303 R1[WS(rs, 1)] = FMA(KP1_662939224, T1G, T1z);
Chris@42 304 R1[WS(rs, 9)] = FNMS(KP1_662939224, T1G, T1z);
Chris@42 305 }
Chris@42 306 }
Chris@42 307 }
Chris@42 308
/*
 * Descriptor handed to X(kr2c_register) together with the HAVE_FMA kernel.
 * It carries 32 (presumably the transform size -- confirm against the
 * kr2c_desc declaration), the name "r2cb_32", and the counts {72, 0, 84, 0}:
 * the first three match the "72 additions, 0 multiplications, 84 fused
 * multiply/add" in the generated op-count comment of this #ifdef branch; the
 * meaning of the fourth entry is not visible in this file. GENUS is defined
 * outside this file.
 */
Chris@42 309 static const kr2c_desc desc = { 32, "r2cb_32", {72, 0, 84, 0}, &GENUS };
Chris@42 310
/*
 * Registration entry point (HAVE_FMA build): passes the r2cb_32 kernel and
 * its descriptor to X(kr2c_register) for the planner p. X(...) is a macro
 * defined outside this file (presumably FFTW's symbol-name prefixing --
 * confirm in FFTW's headers).
 */
Chris@42 311 void X(codelet_r2cb_32) (planner *p) {
Chris@42 312 X(kr2c_register) (p, r2cb_32, &desc);
Chris@42 313 }
Chris@42 314
Chris@42 315 #else /* HAVE_FMA */
Chris@42 316
Chris@42 317 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cb_32 -include r2cb.h */
Chris@42 318
Chris@42 319 /*
Chris@42 320 * This function contains 156 FP additions, 50 FP multiplications,
Chris@42 321 * (or, 140 additions, 34 multiplications, 16 fused multiply/add),
Chris@42 322 * 54 stack variables, 9 constants, and 64 memory accesses
Chris@42 323 */
Chris@42 324 #include "r2cb.h"
Chris@42 325
/*
 * r2cb_32, non-FMA variant (compiled when HAVE_FMA is not defined):
 * straight-line kernel emitted by genfft's gen_r2cb with "-n 32 -sign 1"
 * (see the "Generated by" comment of this #else branch).
 *
 * Each loop iteration reads Cr[0], Cr[WS(csr, k)] for k = 1..16 and
 * Ci[WS(csi, k)] for k = 1..15 (32 loads; Ci at index 0 and 16 is never
 * read), and stores R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..15
 * (32 stores).
 *
 *   R0, R1       output arrays, advanced by ovs after every iteration
 *   Cr, Ci       input arrays, advanced by ivs after every iteration
 *   rs           stride argument used to index R0/R1 through WS()
 *   csr, csi     stride arguments used to index Cr/Ci through WS()
 *   v            iteration count; the loop runs while i > 0, so v <= 0
 *                performs no work
 *   ivs, ovs     per-iteration pointer increments for inputs/outputs
 *
 * NOTE(review): presumably Cr/Ci hold the real/imaginary parts of a
 * half-complex spectrum and R0/R1 receive the even/odd real samples of
 * the size-32 backward transform -- confirm against FFTW's rdft codelet
 * conventions.
 * NOTE(review): E, R, INT, stride, DK, WS, FMA and FNMS are declared outside
 * this file; presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b
 * -- confirm in FFTW's headers.
 */
Chris@42 326 static void r2cb_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 327 {
/*
 * Constants, by their literal values: 2, 1.41421... (= sqrt(2)),
 * 0.70710... (= 1/sqrt(2)); 1.84775..., 1.96157..., 1.66293... agree
 * numerically with 2*cos(pi/8), 2*cos(pi/16), 2*cos(3*pi/16);
 * 0.76536..., 0.39018..., 1.11114... agree numerically with 2*sin(pi/8),
 * 2*sin(pi/16), 2*sin(3*pi/16).
 */
Chris@42 328 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 329 DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
Chris@42 330 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 331 DK(KP390180644, +0.390180644032256535696569736954044481855383236);
Chris@42 332 DK(KP765366864, +0.765366864730179543456919968060797733522689125);
Chris@42 333 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 334 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 335 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 336 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 337 {
Chris@42 338 INT i;
/*
 * One set of 32 outputs per iteration. The MAKE_VOLATILE_STRIDE(...)
 * expressions in the increment clause are macros defined outside this file
 * (presumably a compiler hint about the strides -- confirm in FFTW's
 * headers).
 */
Chris@42 339 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* Temporaries shared between the input stage and the output stage below. */
Chris@42 340 E T9, T2c, TB, T1y, T6, T2b, Ty, T1v, Th, T2e, T2f, TD, TK, T1C, T1F;
Chris@42 341 E T1h, Tp, T2i, T2m, TN, T13, T1K, T1Y, T1k, Tw, TU, T1l, TW, T1V, T2j;
Chris@42 342 E T1R, T2l;
/* Input stage: the next five blocks load every Cr/Ci element and form sums, differences and scaled combinations. */
Chris@42 343 {
Chris@42 344 E T7, T8, T1w, Tz, TA, T1x;
Chris@42 345 T7 = Cr[WS(csr, 4)];
Chris@42 346 T8 = Cr[WS(csr, 12)];
Chris@42 347 T1w = T7 - T8;
Chris@42 348 Tz = Ci[WS(csi, 4)];
Chris@42 349 TA = Ci[WS(csi, 12)];
Chris@42 350 T1x = Tz + TA;
Chris@42 351 T9 = KP2_000000000 * (T7 + T8);
Chris@42 352 T2c = KP1_414213562 * (T1w + T1x);
Chris@42 353 TB = KP2_000000000 * (Tz - TA);
Chris@42 354 T1y = KP1_414213562 * (T1w - T1x);
Chris@42 355 }
Chris@42 356 {
Chris@42 357 E T5, T1u, T3, T1s;
Chris@42 358 {
Chris@42 359 E T4, T1t, T1, T2;
Chris@42 360 T4 = Cr[WS(csr, 8)];
Chris@42 361 T5 = KP2_000000000 * T4;
Chris@42 362 T1t = Ci[WS(csi, 8)];
Chris@42 363 T1u = KP2_000000000 * T1t;
Chris@42 364 T1 = Cr[0];
Chris@42 365 T2 = Cr[WS(csr, 16)];
Chris@42 366 T3 = T1 + T2;
Chris@42 367 T1s = T1 - T2;
Chris@42 368 }
Chris@42 369 T6 = T3 + T5;
Chris@42 370 T2b = T1s + T1u;
Chris@42 371 Ty = T3 - T5;
Chris@42 372 T1v = T1s - T1u;
Chris@42 373 }
Chris@42 374 {
Chris@42 375 E Td, T1A, TG, T1E, Tg, T1D, TJ, T1B;
Chris@42 376 {
Chris@42 377 E Tb, Tc, TE, TF;
Chris@42 378 Tb = Cr[WS(csr, 2)];
Chris@42 379 Tc = Cr[WS(csr, 14)];
Chris@42 380 Td = Tb + Tc;
Chris@42 381 T1A = Tb - Tc;
Chris@42 382 TE = Ci[WS(csi, 2)];
Chris@42 383 TF = Ci[WS(csi, 14)];
Chris@42 384 TG = TE - TF;
Chris@42 385 T1E = TE + TF;
Chris@42 386 }
Chris@42 387 {
Chris@42 388 E Te, Tf, TH, TI;
Chris@42 389 Te = Cr[WS(csr, 10)];
Chris@42 390 Tf = Cr[WS(csr, 6)];
Chris@42 391 Tg = Te + Tf;
Chris@42 392 T1D = Te - Tf;
Chris@42 393 TH = Ci[WS(csi, 10)];
Chris@42 394 TI = Ci[WS(csi, 6)];
Chris@42 395 TJ = TH - TI;
Chris@42 396 T1B = TH + TI;
Chris@42 397 }
Chris@42 398 Th = KP2_000000000 * (Td + Tg);
Chris@42 399 T2e = T1A + T1B;
Chris@42 400 T2f = T1E - T1D;
Chris@42 401 TD = Td - Tg;
Chris@42 402 TK = TG - TJ;
Chris@42 403 T1C = T1A - T1B;
Chris@42 404 T1F = T1D + T1E;
Chris@42 405 T1h = KP2_000000000 * (TJ + TG);
Chris@42 406 }
Chris@42 407 {
Chris@42 408 E Tl, T1I, TZ, T1X, To, T1W, T12, T1J;
Chris@42 409 {
Chris@42 410 E Tj, Tk, TX, TY;
Chris@42 411 Tj = Cr[WS(csr, 1)];
Chris@42 412 Tk = Cr[WS(csr, 15)];
Chris@42 413 Tl = Tj + Tk;
Chris@42 414 T1I = Tj - Tk;
Chris@42 415 TX = Ci[WS(csi, 1)];
Chris@42 416 TY = Ci[WS(csi, 15)];
Chris@42 417 TZ = TX - TY;
Chris@42 418 T1X = TX + TY;
Chris@42 419 }
Chris@42 420 {
Chris@42 421 E Tm, Tn, T10, T11;
Chris@42 422 Tm = Cr[WS(csr, 9)];
Chris@42 423 Tn = Cr[WS(csr, 7)];
Chris@42 424 To = Tm + Tn;
Chris@42 425 T1W = Tm - Tn;
Chris@42 426 T10 = Ci[WS(csi, 9)];
Chris@42 427 T11 = Ci[WS(csi, 7)];
Chris@42 428 T12 = T10 - T11;
Chris@42 429 T1J = T10 + T11;
Chris@42 430 }
Chris@42 431 Tp = Tl + To;
Chris@42 432 T2i = T1I + T1J;
Chris@42 433 T2m = T1X - T1W;
Chris@42 434 TN = Tl - To;
Chris@42 435 T13 = TZ - T12;
Chris@42 436 T1K = T1I - T1J;
Chris@42 437 T1Y = T1W + T1X;
Chris@42 438 T1k = T12 + TZ;
Chris@42 439 }
Chris@42 440 {
Chris@42 441 E Ts, T1L, TT, T1M, Tv, T1O, TQ, T1P;
Chris@42 442 {
Chris@42 443 E Tq, Tr, TR, TS;
Chris@42 444 Tq = Cr[WS(csr, 5)];
Chris@42 445 Tr = Cr[WS(csr, 11)];
Chris@42 446 Ts = Tq + Tr;
Chris@42 447 T1L = Tq - Tr;
Chris@42 448 TR = Ci[WS(csi, 5)];
Chris@42 449 TS = Ci[WS(csi, 11)];
Chris@42 450 TT = TR - TS;
Chris@42 451 T1M = TR + TS;
Chris@42 452 }
Chris@42 453 {
Chris@42 454 E Tt, Tu, TO, TP;
Chris@42 455 Tt = Cr[WS(csr, 3)];
Chris@42 456 Tu = Cr[WS(csr, 13)];
Chris@42 457 Tv = Tt + Tu;
Chris@42 458 T1O = Tt - Tu;
Chris@42 459 TO = Ci[WS(csi, 13)];
Chris@42 460 TP = Ci[WS(csi, 3)];
Chris@42 461 TQ = TO - TP;
Chris@42 462 T1P = TP + TO;
Chris@42 463 }
Chris@42 464 Tw = Ts + Tv;
Chris@42 465 TU = TQ - TT;
Chris@42 466 T1l = TT + TQ;
Chris@42 467 TW = Ts - Tv;
Chris@42 468 {
Chris@42 469 E T1T, T1U, T1N, T1Q;
Chris@42 470 T1T = T1L + T1M;
Chris@42 471 T1U = T1O + T1P;
Chris@42 472 T1V = KP707106781 * (T1T - T1U);
Chris@42 473 T2j = KP707106781 * (T1T + T1U);
Chris@42 474 T1N = T1L - T1M;
Chris@42 475 T1Q = T1O - T1P;
Chris@42 476 T1R = KP707106781 * (T1N + T1Q);
Chris@42 477 T2l = KP707106781 * (T1N - T1Q);
Chris@42 478 }
Chris@42 479 }
/* Output stage: each of the following eight blocks combines the temporaries above and stores four outputs. */
Chris@42 480 {
Chris@42 481 E Tx, T1r, Ti, T1q, Ta;
Chris@42 482 Tx = KP2_000000000 * (Tp + Tw);
Chris@42 483 T1r = KP2_000000000 * (T1l + T1k);
Chris@42 484 Ta = T6 + T9;
Chris@42 485 Ti = Ta + Th;
Chris@42 486 T1q = Ta - Th;
Chris@42 487 R0[WS(rs, 8)] = Ti - Tx;
Chris@42 488 R0[WS(rs, 12)] = T1q + T1r;
Chris@42 489 R0[0] = Ti + Tx;
Chris@42 490 R0[WS(rs, 4)] = T1q - T1r;
Chris@42 491 }
Chris@42 492 {
Chris@42 493 E T1i, T1o, T1n, T1p, T1g, T1j, T1m;
Chris@42 494 T1g = T6 - T9;
Chris@42 495 T1i = T1g - T1h;
Chris@42 496 T1o = T1g + T1h;
Chris@42 497 T1j = Tp - Tw;
Chris@42 498 T1m = T1k - T1l;
Chris@42 499 T1n = KP1_414213562 * (T1j - T1m);
Chris@42 500 T1p = KP1_414213562 * (T1j + T1m);
Chris@42 501 R0[WS(rs, 10)] = T1i - T1n;
Chris@42 502 R0[WS(rs, 14)] = T1o + T1p;
Chris@42 503 R0[WS(rs, 2)] = T1i + T1n;
Chris@42 504 R0[WS(rs, 6)] = T1o - T1p;
Chris@42 505 }
Chris@42 506 {
Chris@42 507 E TM, T16, T15, T17;
Chris@42 508 {
Chris@42 509 E TC, TL, TV, T14;
Chris@42 510 TC = Ty - TB;
Chris@42 511 TL = KP1_414213562 * (TD - TK);
Chris@42 512 TM = TC + TL;
Chris@42 513 T16 = TC - TL;
Chris@42 514 TV = TN + TU;
Chris@42 515 T14 = TW + T13;
Chris@42 516 T15 = FNMS(KP765366864, T14, KP1_847759065 * TV);
Chris@42 517 T17 = FMA(KP765366864, TV, KP1_847759065 * T14);
Chris@42 518 }
Chris@42 519 R0[WS(rs, 9)] = TM - T15;
Chris@42 520 R0[WS(rs, 13)] = T16 + T17;
Chris@42 521 R0[WS(rs, 1)] = TM + T15;
Chris@42 522 R0[WS(rs, 5)] = T16 - T17;
Chris@42 523 }
Chris@42 524 {
Chris@42 525 E T2t, T2x, T2w, T2y;
Chris@42 526 {
Chris@42 527 E T2r, T2s, T2u, T2v;
Chris@42 528 T2r = T2b + T2c;
Chris@42 529 T2s = FMA(KP1_847759065, T2e, KP765366864 * T2f);
Chris@42 530 T2t = T2r - T2s;
Chris@42 531 T2x = T2r + T2s;
Chris@42 532 T2u = T2i + T2j;
Chris@42 533 T2v = T2m - T2l;
Chris@42 534 T2w = FNMS(KP1_961570560, T2v, KP390180644 * T2u);
Chris@42 535 T2y = FMA(KP1_961570560, T2u, KP390180644 * T2v);
Chris@42 536 }
Chris@42 537 R1[WS(rs, 11)] = T2t - T2w;
Chris@42 538 R1[WS(rs, 15)] = T2x + T2y;
Chris@42 539 R1[WS(rs, 3)] = T2t + T2w;
Chris@42 540 R1[WS(rs, 7)] = T2x - T2y;
Chris@42 541 }
Chris@42 542 {
Chris@42 543 E T1a, T1e, T1d, T1f;
Chris@42 544 {
Chris@42 545 E T18, T19, T1b, T1c;
Chris@42 546 T18 = Ty + TB;
Chris@42 547 T19 = KP1_414213562 * (TD + TK);
Chris@42 548 T1a = T18 - T19;
Chris@42 549 T1e = T18 + T19;
Chris@42 550 T1b = TN - TU;
Chris@42 551 T1c = T13 - TW;
Chris@42 552 T1d = FNMS(KP1_847759065, T1c, KP765366864 * T1b);
Chris@42 553 T1f = FMA(KP1_847759065, T1b, KP765366864 * T1c);
Chris@42 554 }
Chris@42 555 R0[WS(rs, 11)] = T1a - T1d;
Chris@42 556 R0[WS(rs, 15)] = T1e + T1f;
Chris@42 557 R0[WS(rs, 3)] = T1a + T1d;
Chris@42 558 R0[WS(rs, 7)] = T1e - T1f;
Chris@42 559 }
Chris@42 560 {
Chris@42 561 E T25, T29, T28, T2a;
Chris@42 562 {
Chris@42 563 E T23, T24, T26, T27;
Chris@42 564 T23 = T1v - T1y;
Chris@42 565 T24 = FMA(KP765366864, T1C, KP1_847759065 * T1F);
Chris@42 566 T25 = T23 - T24;
Chris@42 567 T29 = T23 + T24;
Chris@42 568 T26 = T1K - T1R;
Chris@42 569 T27 = T1Y - T1V;
Chris@42 570 T28 = FNMS(KP1_662939224, T27, KP1_111140466 * T26);
Chris@42 571 T2a = FMA(KP1_662939224, T26, KP1_111140466 * T27);
Chris@42 572 }
Chris@42 573 R1[WS(rs, 10)] = T25 - T28;
Chris@42 574 R1[WS(rs, 14)] = T29 + T2a;
Chris@42 575 R1[WS(rs, 2)] = T25 + T28;
Chris@42 576 R1[WS(rs, 6)] = T29 - T2a;
Chris@42 577 }
Chris@42 578 {
Chris@42 579 E T2h, T2p, T2o, T2q;
Chris@42 580 {
Chris@42 581 E T2d, T2g, T2k, T2n;
Chris@42 582 T2d = T2b - T2c;
Chris@42 583 T2g = FNMS(KP1_847759065, T2f, KP765366864 * T2e);
Chris@42 584 T2h = T2d + T2g;
Chris@42 585 T2p = T2d - T2g;
Chris@42 586 T2k = T2i - T2j;
Chris@42 587 T2n = T2l + T2m;
Chris@42 588 T2o = FNMS(KP1_111140466, T2n, KP1_662939224 * T2k);
Chris@42 589 T2q = FMA(KP1_111140466, T2k, KP1_662939224 * T2n);
Chris@42 590 }
Chris@42 591 R1[WS(rs, 9)] = T2h - T2o;
Chris@42 592 R1[WS(rs, 13)] = T2p + T2q;
Chris@42 593 R1[WS(rs, 1)] = T2h + T2o;
Chris@42 594 R1[WS(rs, 5)] = T2p - T2q;
Chris@42 595 }
Chris@42 596 {
Chris@42 597 E T1H, T21, T20, T22;
Chris@42 598 {
Chris@42 599 E T1z, T1G, T1S, T1Z;
Chris@42 600 T1z = T1v + T1y;
Chris@42 601 T1G = FNMS(KP765366864, T1F, KP1_847759065 * T1C);
Chris@42 602 T1H = T1z + T1G;
Chris@42 603 T21 = T1z - T1G;
Chris@42 604 T1S = T1K + T1R;
Chris@42 605 T1Z = T1V + T1Y;
Chris@42 606 T20 = FNMS(KP390180644, T1Z, KP1_961570560 * T1S);
Chris@42 607 T22 = FMA(KP390180644, T1S, KP1_961570560 * T1Z);
Chris@42 608 }
Chris@42 609 R1[WS(rs, 8)] = T1H - T20;
Chris@42 610 R1[WS(rs, 12)] = T21 + T22;
Chris@42 611 R1[0] = T1H + T20;
Chris@42 612 R1[WS(rs, 4)] = T21 - T22;
Chris@42 613 }
Chris@42 614 }
Chris@42 615 }
Chris@42 616 }
Chris@42 617
/*
 * Descriptor handed to X(kr2c_register) together with the non-FMA kernel.
 * It carries 32 (presumably the transform size -- confirm against the
 * kr2c_desc declaration), the name "r2cb_32", and the counts
 * {140, 34, 16, 0}: the first three match the "140 additions,
 * 34 multiplications, 16 fused multiply/add" in the generated op-count
 * comment of this #else branch; the meaning of the fourth entry is not
 * visible in this file. GENUS is defined outside this file.
 */
Chris@42 618 static const kr2c_desc desc = { 32, "r2cb_32", {140, 34, 16, 0}, &GENUS };
Chris@42 619
/*
 * Registration entry point (non-FMA build): passes the r2cb_32 kernel and
 * its descriptor to X(kr2c_register) for the planner p. X(...) is a macro
 * defined outside this file (presumably FFTW's symbol-name prefixing --
 * confirm in FFTW's headers).
 */
Chris@42 620 void X(codelet_r2cb_32) (planner *p) {
Chris@42 621 X(kr2c_register) (p, r2cb_32, &desc);
Chris@42 622 }
Chris@42 623
Chris@42 624 #endif /* HAVE_FMA */