annotate src/fftw-3.3.5/rdft/scalar/r2cb/r2cbIII_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:50:45 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_r2cb.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cbIII_32 -dft-III -include r2cbIII.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@42 32 * (or, 106 additions, 32 multiplications, 68 fused multiply/add),
Chris@42 33 * 101 stack variables, 18 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "r2cbIII.h"
Chris@42 36
/*
 * r2cbIII_32, HAVE_FMA variant: machine-generated size-32 kernel (see the
 * genfft command line above: -n 32 -dft-III -sign 1 -fma).
 *
 * Each loop iteration reads 16 values from Cr (Cr[0] and Cr[WS(csr, 1..15)])
 * and 16 values from Ci (Ci[0] and Ci[WS(csi, 1..15)]), and writes 16 values
 * to R0 and 16 values to R1 (index 0 and WS(rs, 1..15) in each).
 *
 * Parameters:
 *   R0, R1   - output arrays, advanced by ovs after every iteration
 *   Cr, Ci   - input arrays, advanced by ivs after every iteration
 *   rs       - stride object used to index R0 and R1 through WS()
 *   csr, csi - stride objects used to index Cr and Ci through WS()
 *   v        - number of iterations; nothing is done when v <= 0
 *   ivs, ovs - per-iteration pointer increments for inputs / outputs
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined in headers not visible here. Presumably
 * FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm against
 * the header before relying on this.
 */
Chris@42 37 static void r2cbIII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 38 {
/* Numeric constants; each name spells out the leading digits of its value. */
Chris@42 39 DK(KP534511135, +0.534511135950791641089685961295362908582039528);
Chris@42 40 DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
Chris@42 41 DK(KP303346683, +0.303346683607342391675883946941299872384187453);
Chris@42 42 DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
Chris@42 43 DK(KP098491403, +0.098491403357164253077197521291327432293052451);
Chris@42 44 DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
Chris@42 45 DK(KP820678790, +0.820678790828660330972281985331011598767386482);
Chris@42 46 DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
Chris@42 47 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 48 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 49 DK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@42 50 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 51 DK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@42 52 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 53 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 54 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 55 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 56 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 57 {
Chris@42 58 INT i;
/*
 * One pass of the kernel per iteration, counting i down from v. The step
 * expression advances all four data pointers and also invokes
 * MAKE_VOLATILE_STRIDE on each stride (purpose not visible here --
 * presumably a compiler optimisation aid; confirm in the header).
 */
Chris@42 59 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/*
 * T1N, T1K, T1Q and T1H are assigned inside the nested block below but
 * consumed only after it closes (final four R1 stores), hence this scope.
 */
Chris@42 60 E T1N, T1K, T1Q, T1H, T1O, T1P;
Chris@42 61 {
Chris@42 62 E T1I, T1e, T1Z, T7, T2E, T2i, T1x, Tz, Te, T2j, T22, T2F, T1h, T1y, TK;
Chris@42 63 E T1J, Tm, T2B, TX, Tp, T2m, T28, T1M, T1C, T1k, TW, TY, T2a, T14, T15;
Chris@42 64 E Ts, TZ;
Chris@42 65 {
Chris@42 66 E TE, T1g, TJ, T1f;
Chris@42 67 {
Chris@42 68 E T4, Tv, T3, T2g, T1d, T5, Tw, Tx;
Chris@42 69 {
Chris@42 70 E T1, T2, T1b, T1c;
Chris@42 71 T1 = Cr[0];
Chris@42 72 T2 = Cr[WS(csr, 15)];
Chris@42 73 T1b = Ci[0];
Chris@42 74 T1c = Ci[WS(csi, 15)];
Chris@42 75 T4 = Cr[WS(csr, 8)];
Chris@42 76 Tv = T1 - T2;
Chris@42 77 T3 = T1 + T2;
Chris@42 78 T2g = T1c - T1b;
Chris@42 79 T1d = T1b + T1c;
Chris@42 80 T5 = Cr[WS(csr, 7)];
Chris@42 81 Tw = Ci[WS(csi, 8)];
Chris@42 82 Tx = Ci[WS(csi, 7)];
Chris@42 83 }
Chris@42 84 {
Chris@42 85 E Tb, TA, Ta, T20, TD, Tc, TG, TH;
Chris@42 86 {
Chris@42 87 E T8, T9, TB, TC;
Chris@42 88 T8 = Cr[WS(csr, 4)];
Chris@42 89 {
Chris@42 90 E T1a, T6, T2h, Ty;
Chris@42 91 T1a = T4 - T5;
Chris@42 92 T6 = T4 + T5;
Chris@42 93 T2h = Tx - Tw;
Chris@42 94 Ty = Tw + Tx;
Chris@42 95 T1I = T1a - T1d;
Chris@42 96 T1e = T1a + T1d;
Chris@42 97 T1Z = T3 - T6;
Chris@42 98 T7 = T3 + T6;
Chris@42 99 T2E = T2h + T2g;
Chris@42 100 T2i = T2g - T2h;
Chris@42 101 T1x = Tv + Ty;
Chris@42 102 Tz = Tv - Ty;
Chris@42 103 T9 = Cr[WS(csr, 11)];
Chris@42 104 }
Chris@42 105 TB = Ci[WS(csi, 4)];
Chris@42 106 TC = Ci[WS(csi, 11)];
Chris@42 107 Tb = Cr[WS(csr, 3)];
Chris@42 108 TA = T8 - T9;
Chris@42 109 Ta = T8 + T9;
Chris@42 110 T20 = TC - TB;
Chris@42 111 TD = TB + TC;
Chris@42 112 Tc = Cr[WS(csr, 12)];
Chris@42 113 TG = Ci[WS(csi, 3)];
Chris@42 114 TH = Ci[WS(csi, 12)];
Chris@42 115 }
Chris@42 116 {
Chris@42 117 E TF, Td, T21, TI;
Chris@42 118 TE = TA - TD;
Chris@42 119 T1g = TA + TD;
Chris@42 120 TF = Tb - Tc;
Chris@42 121 Td = Tb + Tc;
Chris@42 122 T21 = TG - TH;
Chris@42 123 TI = TG + TH;
Chris@42 124 Te = Ta + Td;
Chris@42 125 T2j = Ta - Td;
Chris@42 126 T22 = T20 - T21;
Chris@42 127 T2F = T20 + T21;
Chris@42 128 TJ = TF - TI;
Chris@42 129 T1f = TF + TI;
Chris@42 130 }
Chris@42 131 }
Chris@42 132 }
Chris@42 133 {
Chris@42 134 E TM, Ti, TN, T25, TU, TR, Tl, TO;
Chris@42 135 {
Chris@42 136 E TS, TT, Tg, Th, Tj, Tk;
Chris@42 137 Tg = Cr[WS(csr, 2)];
Chris@42 138 Th = Cr[WS(csr, 13)];
Chris@42 139 T1h = T1f - T1g;
Chris@42 140 T1y = T1g + T1f;
Chris@42 141 TK = TE + TJ;
Chris@42 142 T1J = TE - TJ;
Chris@42 143 TM = Tg - Th;
Chris@42 144 Ti = Tg + Th;
Chris@42 145 TS = Ci[WS(csi, 2)];
Chris@42 146 TT = Ci[WS(csi, 13)];
Chris@42 147 Tj = Cr[WS(csr, 10)];
Chris@42 148 Tk = Cr[WS(csr, 5)];
Chris@42 149 TN = Ci[WS(csi, 10)];
Chris@42 150 T25 = TS - TT;
Chris@42 151 TU = TS + TT;
Chris@42 152 TR = Tj - Tk;
Chris@42 153 Tl = Tj + Tk;
Chris@42 154 TO = Ci[WS(csi, 5)];
Chris@42 155 }
Chris@42 156 {
Chris@42 157 E T12, T13, Tq, Tr;
Chris@42 158 {
Chris@42 159 E Tn, T1A, TV, T24, T26, TP, To, T27, T1B, TQ;
Chris@42 160 Tn = Cr[WS(csr, 1)];
Chris@42 161 T1A = TR - TU;
Chris@42 162 TV = TR + TU;
Chris@42 163 T24 = Ti - Tl;
Chris@42 164 Tm = Ti + Tl;
Chris@42 165 T26 = TN - TO;
Chris@42 166 TP = TN + TO;
Chris@42 167 To = Cr[WS(csr, 14)];
Chris@42 168 T12 = Ci[WS(csi, 1)];
Chris@42 169 T27 = T25 - T26;
Chris@42 170 T2B = T26 + T25;
Chris@42 171 T1B = TM + TP;
Chris@42 172 TQ = TM - TP;
Chris@42 173 TX = Tn - To;
Chris@42 174 Tp = Tn + To;
Chris@42 175 T2m = T24 + T27;
Chris@42 176 T28 = T24 - T27;
Chris@42 177 T1M = FNMS(KP414213562, T1A, T1B);
Chris@42 178 T1C = FMA(KP414213562, T1B, T1A);
Chris@42 179 T1k = FMA(KP414213562, TQ, TV);
Chris@42 180 TW = FNMS(KP414213562, TV, TQ);
Chris@42 181 T13 = Ci[WS(csi, 14)];
Chris@42 182 }
Chris@42 183 Tq = Cr[WS(csr, 6)];
Chris@42 184 Tr = Cr[WS(csr, 9)];
Chris@42 185 TY = Ci[WS(csi, 6)];
Chris@42 186 T2a = T13 - T12;
Chris@42 187 T14 = T12 + T13;
Chris@42 188 T15 = Tq - Tr;
Chris@42 189 Ts = Tq + Tr;
Chris@42 190 TZ = Ci[WS(csi, 9)];
Chris@42 191 }
Chris@42 192 }
Chris@42 193 }
/* All 32 inputs have been loaded by this point; the rest combines and stores. */
Chris@42 194 {
Chris@42 195 E T1L, T1F, T23, T2n, T2k, T2e, T1p, T1t, T1s, T1i, T1o, T19, T1l, T1q;
Chris@42 196 {
Chris@42 197 E T2z, T2G, T2H, T2C, T1j, T17, T2r, T2s, T2u, T2v, T2K, T2D;
Chris@42 198 {
Chris@42 199 E T2L, T2d, T2l, T2O;
Chris@42 200 {
Chris@42 201 E Tf, T2N, Tu, T2M;
Chris@42 202 {
Chris@42 203 E T1D, T16, T29, Tt, T2b, T10;
Chris@42 204 T2z = T7 - Te;
Chris@42 205 Tf = T7 + Te;
Chris@42 206 T1D = T15 + T14;
Chris@42 207 T16 = T14 - T15;
Chris@42 208 T29 = Tp - Ts;
Chris@42 209 Tt = Tp + Ts;
Chris@42 210 T2b = TY - TZ;
Chris@42 211 T10 = TY + TZ;
Chris@42 212 T2N = T2F + T2E;
Chris@42 213 T2G = T2E - T2F;
Chris@42 214 T2H = Tm - Tt;
Chris@42 215 Tu = Tm + Tt;
Chris@42 216 {
Chris@42 217 E T2c, T2A, T1E, T11;
Chris@42 218 T2c = T2a - T2b;
Chris@42 219 T2A = T2b + T2a;
Chris@42 220 T1E = TX + T10;
Chris@42 221 T11 = TX - T10;
Chris@42 222 T2L = Tf - Tu;
Chris@42 223 T2d = T29 + T2c;
Chris@42 224 T2l = T29 - T2c;
Chris@42 225 T2C = T2A - T2B;
Chris@42 226 T2M = T2B + T2A;
Chris@42 227 T1L = FMA(KP414213562, T1D, T1E);
Chris@42 228 T1F = FNMS(KP414213562, T1E, T1D);
Chris@42 229 T1j = FMA(KP414213562, T11, T16);
Chris@42 230 T17 = FNMS(KP414213562, T16, T11);
Chris@42 231 T2O = T2M + T2N;
Chris@42 232 }
Chris@42 233 }
Chris@42 234 R0[0] = KP2_000000000 * (Tf + Tu);
Chris@42 235 R0[WS(rs, 8)] = KP2_000000000 * (T2N - T2M);
Chris@42 236 }
Chris@42 237 T23 = T1Z + T22;
Chris@42 238 T2r = T1Z - T22;
Chris@42 239 R0[WS(rs, 12)] = KP1_414213562 * (T2O - T2L);
Chris@42 240 R0[WS(rs, 4)] = KP1_414213562 * (T2L + T2O);
Chris@42 241 T2s = T2m + T2l;
Chris@42 242 T2n = T2l - T2m;
Chris@42 243 T2k = T2i - T2j;
Chris@42 244 T2u = T2j + T2i;
Chris@42 245 T2v = T28 - T2d;
Chris@42 246 T2e = T28 + T2d;
Chris@42 247 }
Chris@42 248 {
Chris@42 249 E T2y, T2t, T2x, T2w;
Chris@42 250 T2y = FMA(KP707106781, T2s, T2r);
Chris@42 251 T2t = FNMS(KP707106781, T2s, T2r);
Chris@42 252 T2x = FMA(KP707106781, T2v, T2u);
Chris@42 253 T2w = FNMS(KP707106781, T2v, T2u);
Chris@42 254 R0[WS(rs, 7)] = KP1_961570560 * (FMA(KP198912367, T2y, T2x));
Chris@42 255 R0[WS(rs, 15)] = -(KP1_961570560 * (FNMS(KP198912367, T2x, T2y)));
Chris@42 256 R0[WS(rs, 11)] = KP1_662939224 * (FNMS(KP668178637, T2t, T2w));
Chris@42 257 R0[WS(rs, 3)] = KP1_662939224 * (FMA(KP668178637, T2w, T2t));
Chris@42 258 T2K = T2z - T2C;
Chris@42 259 T2D = T2z + T2C;
Chris@42 260 }
Chris@42 261 {
Chris@42 262 E TL, T18, T2J, T2I;
Chris@42 263 T1p = FNMS(KP707106781, TK, Tz);
Chris@42 264 TL = FMA(KP707106781, TK, Tz);
Chris@42 265 T18 = TW + T17;
Chris@42 266 T1t = TW - T17;
Chris@42 267 T1s = FMA(KP707106781, T1h, T1e);
Chris@42 268 T1i = FNMS(KP707106781, T1h, T1e);
Chris@42 269 T2J = T2H + T2G;
Chris@42 270 T2I = T2G - T2H;
Chris@42 271 T1o = FNMS(KP923879532, T18, TL);
Chris@42 272 T19 = FMA(KP923879532, T18, TL);
Chris@42 273 R0[WS(rs, 6)] = KP1_847759065 * (FMA(KP414213562, T2K, T2J));
Chris@42 274 R0[WS(rs, 14)] = -(KP1_847759065 * (FNMS(KP414213562, T2J, T2K)));
Chris@42 275 R0[WS(rs, 10)] = KP1_847759065 * (FNMS(KP414213562, T2D, T2I));
Chris@42 276 R0[WS(rs, 2)] = KP1_847759065 * (FMA(KP414213562, T2I, T2D));
Chris@42 277 T1l = T1j - T1k;
Chris@42 278 T1q = T1k + T1j;
Chris@42 279 }
Chris@42 280 }
Chris@42 281 {
Chris@42 282 E T1z, T1U, T1Y, T1T, T1V, T1G;
Chris@42 283 {
Chris@42 284 E T1w, T1r, T1n, T1m;
Chris@42 285 T1n = FMA(KP923879532, T1l, T1i);
Chris@42 286 T1m = FNMS(KP923879532, T1l, T1i);
Chris@42 287 T1w = FMA(KP923879532, T1q, T1p);
Chris@42 288 T1r = FNMS(KP923879532, T1q, T1p);
Chris@42 289 R1[WS(rs, 4)] = -(KP1_546020906 * (FNMS(KP820678790, T1o, T1n)));
Chris@42 290 R1[WS(rs, 12)] = -(KP1_546020906 * (FMA(KP820678790, T1n, T1o)));
Chris@42 291 R1[WS(rs, 8)] = -(KP1_990369453 * (FMA(KP098491403, T19, T1m)));
Chris@42 292 R1[0] = KP1_990369453 * (FNMS(KP098491403, T1m, T19));
Chris@42 293 {
Chris@42 294 E T1R, T1S, T1v, T1u;
Chris@42 295 T1z = FNMS(KP707106781, T1y, T1x);
Chris@42 296 T1R = FMA(KP707106781, T1y, T1x);
Chris@42 297 T1S = T1M + T1L;
Chris@42 298 T1N = T1L - T1M;
Chris@42 299 T1K = FNMS(KP707106781, T1J, T1I);
Chris@42 300 T1U = FMA(KP707106781, T1J, T1I);
Chris@42 301 T1v = FNMS(KP923879532, T1t, T1s);
Chris@42 302 T1u = FMA(KP923879532, T1t, T1s);
Chris@42 303 T1Y = FMA(KP923879532, T1S, T1R);
Chris@42 304 T1T = FNMS(KP923879532, T1S, T1R);
Chris@42 305 R1[WS(rs, 6)] = -(KP1_913880671 * (FNMS(KP303346683, T1w, T1v)));
Chris@42 306 R1[WS(rs, 14)] = -(KP1_913880671 * (FMA(KP303346683, T1v, T1w)));
Chris@42 307 R1[WS(rs, 10)] = -(KP1_763842528 * (FMA(KP534511135, T1r, T1u)));
Chris@42 308 R1[WS(rs, 2)] = KP1_763842528 * (FNMS(KP534511135, T1u, T1r));
Chris@42 309 T1V = T1C + T1F;
Chris@42 310 T1G = T1C - T1F;
Chris@42 311 }
Chris@42 312 }
Chris@42 313 {
Chris@42 314 E T2q, T2f, T1X, T1W, T2p, T2o;
Chris@42 315 T1X = FMA(KP923879532, T1V, T1U);
Chris@42 316 T1W = FNMS(KP923879532, T1V, T1U);
Chris@42 317 T2q = FNMS(KP707106781, T2e, T23);
Chris@42 318 T2f = FMA(KP707106781, T2e, T23);
Chris@42 319 R1[WS(rs, 7)] = KP1_990369453 * (FMA(KP098491403, T1Y, T1X));
Chris@42 320 R1[WS(rs, 15)] = -(KP1_990369453 * (FNMS(KP098491403, T1X, T1Y)));
Chris@42 321 R1[WS(rs, 11)] = KP1_546020906 * (FNMS(KP820678790, T1T, T1W));
Chris@42 322 R1[WS(rs, 3)] = KP1_546020906 * (FMA(KP820678790, T1W, T1T));
Chris@42 323 T2p = FNMS(KP707106781, T2n, T2k);
Chris@42 324 T2o = FMA(KP707106781, T2n, T2k);
Chris@42 325 T1Q = FNMS(KP923879532, T1G, T1z);
Chris@42 326 T1H = FMA(KP923879532, T1G, T1z);
Chris@42 327 R0[WS(rs, 5)] = KP1_662939224 * (FMA(KP668178637, T2q, T2p));
Chris@42 328 R0[WS(rs, 13)] = -(KP1_662939224 * (FNMS(KP668178637, T2p, T2q)));
Chris@42 329 R0[WS(rs, 9)] = KP1_961570560 * (FNMS(KP198912367, T2f, T2o));
Chris@42 330 R0[WS(rs, 1)] = KP1_961570560 * (FMA(KP198912367, T2o, T2f));
Chris@42 331 }
Chris@42 332 }
Chris@42 333 }
Chris@42 334 }
/* Last four R1 outputs, built from the temporaries declared at loop scope. */
Chris@42 335 T1O = FMA(KP923879532, T1N, T1K);
Chris@42 336 T1P = FNMS(KP923879532, T1N, T1K);
Chris@42 337 R1[WS(rs, 5)] = KP1_763842528 * (FMA(KP534511135, T1Q, T1P));
Chris@42 338 R1[WS(rs, 13)] = -(KP1_763842528 * (FNMS(KP534511135, T1P, T1Q)));
Chris@42 339 R1[WS(rs, 9)] = KP1_913880671 * (FNMS(KP303346683, T1H, T1O));
Chris@42 340 R1[WS(rs, 1)] = KP1_913880671 * (FMA(KP303346683, T1O, T1H));
Chris@42 341 }
Chris@42 342 }
Chris@42 343 }
Chris@42 344
/*
 * Descriptor for the HAVE_FMA kernel: size 32, name "r2cbIII_32", and the
 * operation counts {106, 32, 68, 0}. The first three match the additions /
 * multiplications / fused multiply-adds quoted in the generator comment for
 * this variant. The meaning of the fourth field and of GENUS is not visible
 * here (GENUS is presumably supplied by r2cbIII.h -- confirm).
 */
Chris@42 345 static const kr2c_desc desc = { 32, "r2cbIII_32", {106, 32, 68, 0}, &GENUS };
Chris@42 346
/*
 * Registration entry point (HAVE_FMA build): passes the kernel function
 * r2cbIII_32 and its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * applies the library's symbol prefix -- confirm.
 */
Chris@42 347 void X(codelet_r2cbIII_32) (planner *p) {
Chris@42 348 X(kr2c_register) (p, r2cbIII_32, &desc);
Chris@42 349 }
Chris@42 350
Chris@42 351 #else /* HAVE_FMA */
Chris@42 352
Chris@42 353 /* Generated by: ../../../genfft/gen_r2cb.native -compact -variables 4 -pipeline-latency 4 -sign 1 -n 32 -name r2cbIII_32 -dft-III -include r2cbIII.h */
Chris@42 354
Chris@42 355 /*
Chris@42 356 * This function contains 174 FP additions, 84 FP multiplications,
Chris@42 357 * (or, 138 additions, 48 multiplications, 36 fused multiply/add),
Chris@42 358 * 66 stack variables, 19 constants, and 64 memory accesses
Chris@42 359 */
Chris@42 360 #include "r2cbIII.h"
Chris@42 361
/*
 * r2cbIII_32, non-FMA variant: machine-generated size-32 kernel (see the
 * genfft command line above: -n 32 -dft-III -sign 1, without -fma).
 * Interface and memory access pattern are the same as the HAVE_FMA build's
 * function of the same name; only one of the two is compiled.
 *
 * Each loop iteration reads 16 values from Cr (Cr[0] and Cr[WS(csr, 1..15)])
 * and 16 values from Ci (Ci[0] and Ci[WS(csi, 1..15)]), and writes 16 values
 * to R0 and 16 values to R1 (index 0 and WS(rs, 1..15) in each).
 *
 * Parameters:
 *   R0, R1   - output arrays, advanced by ovs after every iteration
 *   Cr, Ci   - input arrays, advanced by ivs after every iteration
 *   rs       - stride object used to index R0 and R1 through WS()
 *   csr, csi - stride objects used to index Cr and Ci through WS()
 *   v        - number of iterations; nothing is done when v <= 0
 *   ivs, ovs - per-iteration pointer increments for inputs / outputs
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined in headers not visible here. Presumably
 * FMA(a, b, c) is a*b + c and FNMS(a, b, c) is c - a*b -- confirm against
 * the header before relying on this.
 */
Chris@42 362 static void r2cbIII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
Chris@42 363 {
/* Numeric constants; each name spells out the leading digits of its value. */
Chris@42 364 DK(KP1_913880671, +1.913880671464417729871595773960539938965698411);
Chris@42 365 DK(KP580569354, +0.580569354508924735272384751634790549382952557);
Chris@42 366 DK(KP942793473, +0.942793473651995297112775251810508755314920638);
Chris@42 367 DK(KP1_763842528, +1.763842528696710059425513727320776699016885241);
Chris@42 368 DK(KP1_546020906, +1.546020906725473921621813219516939601942082586);
Chris@42 369 DK(KP1_268786568, +1.268786568327290996430343226450986741351374190);
Chris@42 370 DK(KP196034280, +0.196034280659121203988391127777283691722273346);
Chris@42 371 DK(KP1_990369453, +1.990369453344393772489673906218959843150949737);
Chris@42 372 DK(KP765366864, +0.765366864730179543456919968060797733522689125);
Chris@42 373 DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
Chris@42 374 DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
Chris@42 375 DK(KP390180644, +0.390180644032256535696569736954044481855383236);
Chris@42 376 DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
Chris@42 377 DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
Chris@42 378 DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
Chris@42 379 DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
Chris@42 380 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@42 381 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 382 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 383 {
Chris@42 384 INT i;
/*
 * One pass of the kernel per iteration, counting i down from v. The step
 * expression advances all four data pointers and also invokes
 * MAKE_VOLATILE_STRIDE on each stride (purpose not visible here --
 * presumably a compiler optimisation aid; confirm in the header).
 */
Chris@42 385 for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/*
 * Intermediates shared between the load blocks and the store blocks.
 * The first four nested blocks below perform all Cr/Ci loads and form
 * sums, differences and scaled combinations; the eight blocks after
 * them each compute and store four outputs.
 */
Chris@42 386 E T7, T2i, T2F, Tz, T1k, T1I, T1Z, T1x, Te, T22, T2E, T2j, T1f, T1y, TK;
Chris@42 387 E T1J, Tm, T2B, TW, T1a, T1C, T1L, T28, T2l, Tt, T2A, T17, T1b, T1F, T1M;
Chris@42 388 E T2d, T2m;
/* Loads: Cr/Ci indices 0, 15, 8, 7. */
Chris@42 389 {
Chris@42 390 E T3, Tv, T1j, T2h, T6, T1g, Ty, T2g;
Chris@42 391 {
Chris@42 392 E T1, T2, T1h, T1i;
Chris@42 393 T1 = Cr[0];
Chris@42 394 T2 = Cr[WS(csr, 15)];
Chris@42 395 T3 = T1 + T2;
Chris@42 396 Tv = T1 - T2;
Chris@42 397 T1h = Ci[0];
Chris@42 398 T1i = Ci[WS(csi, 15)];
Chris@42 399 T1j = T1h + T1i;
Chris@42 400 T2h = T1i - T1h;
Chris@42 401 }
Chris@42 402 {
Chris@42 403 E T4, T5, Tw, Tx;
Chris@42 404 T4 = Cr[WS(csr, 8)];
Chris@42 405 T5 = Cr[WS(csr, 7)];
Chris@42 406 T6 = T4 + T5;
Chris@42 407 T1g = T4 - T5;
Chris@42 408 Tw = Ci[WS(csi, 8)];
Chris@42 409 Tx = Ci[WS(csi, 7)];
Chris@42 410 Ty = Tw + Tx;
Chris@42 411 T2g = Tw - Tx;
Chris@42 412 }
Chris@42 413 T7 = T3 + T6;
Chris@42 414 T2i = T2g + T2h;
Chris@42 415 T2F = T2h - T2g;
Chris@42 416 Tz = Tv - Ty;
Chris@42 417 T1k = T1g + T1j;
Chris@42 418 T1I = T1g - T1j;
Chris@42 419 T1Z = T3 - T6;
Chris@42 420 T1x = Tv + Ty;
Chris@42 421 }
/* Loads: Cr/Ci indices 4, 11, 3, 12. */
Chris@42 422 {
Chris@42 423 E Ta, TA, TD, T21, Td, TF, TI, T20;
Chris@42 424 {
Chris@42 425 E T8, T9, TB, TC;
Chris@42 426 T8 = Cr[WS(csr, 4)];
Chris@42 427 T9 = Cr[WS(csr, 11)];
Chris@42 428 Ta = T8 + T9;
Chris@42 429 TA = T8 - T9;
Chris@42 430 TB = Ci[WS(csi, 4)];
Chris@42 431 TC = Ci[WS(csi, 11)];
Chris@42 432 TD = TB + TC;
Chris@42 433 T21 = TB - TC;
Chris@42 434 }
Chris@42 435 {
Chris@42 436 E Tb, Tc, TG, TH;
Chris@42 437 Tb = Cr[WS(csr, 3)];
Chris@42 438 Tc = Cr[WS(csr, 12)];
Chris@42 439 Td = Tb + Tc;
Chris@42 440 TF = Tb - Tc;
Chris@42 441 TG = Ci[WS(csi, 3)];
Chris@42 442 TH = Ci[WS(csi, 12)];
Chris@42 443 TI = TG + TH;
Chris@42 444 T20 = TH - TG;
Chris@42 445 }
Chris@42 446 Te = Ta + Td;
Chris@42 447 T22 = T20 - T21;
Chris@42 448 T2E = T21 + T20;
Chris@42 449 T2j = Ta - Td;
Chris@42 450 {
Chris@42 451 E T1d, T1e, TE, TJ;
Chris@42 452 T1d = TA + TD;
Chris@42 453 T1e = TF + TI;
Chris@42 454 T1f = KP707106781 * (T1d - T1e);
Chris@42 455 T1y = KP707106781 * (T1d + T1e);
Chris@42 456 TE = TA - TD;
Chris@42 457 TJ = TF - TI;
Chris@42 458 TK = KP707106781 * (TE + TJ);
Chris@42 459 T1J = KP707106781 * (TE - TJ);
Chris@42 460 }
Chris@42 461 }
/* Loads: Cr/Ci indices 2, 13, 10, 5. */
Chris@42 462 {
Chris@42 463 E Ti, TM, TU, T25, Tl, TR, TP, T26, TQ, TV;
Chris@42 464 {
Chris@42 465 E Tg, Th, TS, TT;
Chris@42 466 Tg = Cr[WS(csr, 2)];
Chris@42 467 Th = Cr[WS(csr, 13)];
Chris@42 468 Ti = Tg + Th;
Chris@42 469 TM = Tg - Th;
Chris@42 470 TS = Ci[WS(csi, 2)];
Chris@42 471 TT = Ci[WS(csi, 13)];
Chris@42 472 TU = TS + TT;
Chris@42 473 T25 = TS - TT;
Chris@42 474 }
Chris@42 475 {
Chris@42 476 E Tj, Tk, TN, TO;
Chris@42 477 Tj = Cr[WS(csr, 10)];
Chris@42 478 Tk = Cr[WS(csr, 5)];
Chris@42 479 Tl = Tj + Tk;
Chris@42 480 TR = Tj - Tk;
Chris@42 481 TN = Ci[WS(csi, 10)];
Chris@42 482 TO = Ci[WS(csi, 5)];
Chris@42 483 TP = TN + TO;
Chris@42 484 T26 = TN - TO;
Chris@42 485 }
Chris@42 486 Tm = Ti + Tl;
Chris@42 487 T2B = T26 + T25;
Chris@42 488 TQ = TM - TP;
Chris@42 489 TV = TR + TU;
Chris@42 490 TW = FNMS(KP382683432, TV, KP923879532 * TQ);
Chris@42 491 T1a = FMA(KP382683432, TQ, KP923879532 * TV);
Chris@42 492 {
Chris@42 493 E T1A, T1B, T24, T27;
Chris@42 494 T1A = TM + TP;
Chris@42 495 T1B = TU - TR;
Chris@42 496 T1C = FNMS(KP923879532, T1B, KP382683432 * T1A);
Chris@42 497 T1L = FMA(KP923879532, T1A, KP382683432 * T1B);
Chris@42 498 T24 = Ti - Tl;
Chris@42 499 T27 = T25 - T26;
Chris@42 500 T28 = T24 - T27;
Chris@42 501 T2l = T24 + T27;
Chris@42 502 }
Chris@42 503 }
/* Loads: Cr/Ci indices 1, 14, 6, 9. */
Chris@42 504 {
Chris@42 505 E Tp, TX, T15, T2a, Ts, T12, T10, T2b, T11, T16;
Chris@42 506 {
Chris@42 507 E Tn, To, T13, T14;
Chris@42 508 Tn = Cr[WS(csr, 1)];
Chris@42 509 To = Cr[WS(csr, 14)];
Chris@42 510 Tp = Tn + To;
Chris@42 511 TX = Tn - To;
Chris@42 512 T13 = Ci[WS(csi, 1)];
Chris@42 513 T14 = Ci[WS(csi, 14)];
Chris@42 514 T15 = T13 + T14;
Chris@42 515 T2a = T14 - T13;
Chris@42 516 }
Chris@42 517 {
Chris@42 518 E Tq, Tr, TY, TZ;
Chris@42 519 Tq = Cr[WS(csr, 6)];
Chris@42 520 Tr = Cr[WS(csr, 9)];
Chris@42 521 Ts = Tq + Tr;
Chris@42 522 T12 = Tq - Tr;
Chris@42 523 TY = Ci[WS(csi, 6)];
Chris@42 524 TZ = Ci[WS(csi, 9)];
Chris@42 525 T10 = TY + TZ;
Chris@42 526 T2b = TY - TZ;
Chris@42 527 }
Chris@42 528 Tt = Tp + Ts;
Chris@42 529 T2A = T2b + T2a;
Chris@42 530 T11 = TX - T10;
Chris@42 531 T16 = T12 - T15;
Chris@42 532 T17 = FMA(KP923879532, T11, KP382683432 * T16);
Chris@42 533 T1b = FNMS(KP382683432, T11, KP923879532 * T16);
Chris@42 534 {
Chris@42 535 E T1D, T1E, T29, T2c;
Chris@42 536 T1D = TX + T10;
Chris@42 537 T1E = T12 + T15;
Chris@42 538 T1F = FNMS(KP923879532, T1E, KP382683432 * T1D);
Chris@42 539 T1M = FMA(KP923879532, T1D, KP382683432 * T1E);
Chris@42 540 T29 = Tp - Ts;
Chris@42 541 T2c = T2a - T2b;
Chris@42 542 T2d = T29 + T2c;
Chris@42 543 T2m = T2c - T29;
Chris@42 544 }
Chris@42 545 }
/* Stores: R0 indices 0, 8, 4, 12. */
Chris@42 546 {
Chris@42 547 E Tf, Tu, T2L, T2M, T2N, T2O;
Chris@42 548 Tf = T7 + Te;
Chris@42 549 Tu = Tm + Tt;
Chris@42 550 T2L = Tf - Tu;
Chris@42 551 T2M = T2B + T2A;
Chris@42 552 T2N = T2F - T2E;
Chris@42 553 T2O = T2M + T2N;
Chris@42 554 R0[0] = KP2_000000000 * (Tf + Tu);
Chris@42 555 R0[WS(rs, 8)] = KP2_000000000 * (T2N - T2M);
Chris@42 556 R0[WS(rs, 4)] = KP1_414213562 * (T2L + T2O);
Chris@42 557 R0[WS(rs, 12)] = KP1_414213562 * (T2O - T2L);
Chris@42 558 }
/* Stores: R0 indices 3, 15, 11, 7. */
Chris@42 559 {
Chris@42 560 E T2t, T2x, T2w, T2y;
Chris@42 561 {
Chris@42 562 E T2r, T2s, T2u, T2v;
Chris@42 563 T2r = T1Z - T22;
Chris@42 564 T2s = KP707106781 * (T2m - T2l);
Chris@42 565 T2t = T2r + T2s;
Chris@42 566 T2x = T2r - T2s;
Chris@42 567 T2u = T2j + T2i;
Chris@42 568 T2v = KP707106781 * (T28 - T2d);
Chris@42 569 T2w = T2u - T2v;
Chris@42 570 T2y = T2v + T2u;
Chris@42 571 }
Chris@42 572 R0[WS(rs, 3)] = FMA(KP1_662939224, T2t, KP1_111140466 * T2w);
Chris@42 573 R0[WS(rs, 15)] = FNMS(KP1_961570560, T2x, KP390180644 * T2y);
Chris@42 574 R0[WS(rs, 11)] = FNMS(KP1_111140466, T2t, KP1_662939224 * T2w);
Chris@42 575 R0[WS(rs, 7)] = FMA(KP390180644, T2x, KP1_961570560 * T2y);
Chris@42 576 }
/* Stores: R0 indices 2, 14, 10, 6. */
Chris@42 577 {
Chris@42 578 E T2D, T2J, T2I, T2K;
Chris@42 579 {
Chris@42 580 E T2z, T2C, T2G, T2H;
Chris@42 581 T2z = T7 - Te;
Chris@42 582 T2C = T2A - T2B;
Chris@42 583 T2D = T2z + T2C;
Chris@42 584 T2J = T2z - T2C;
Chris@42 585 T2G = T2E + T2F;
Chris@42 586 T2H = Tm - Tt;
Chris@42 587 T2I = T2G - T2H;
Chris@42 588 T2K = T2H + T2G;
Chris@42 589 }
Chris@42 590 R0[WS(rs, 2)] = FMA(KP1_847759065, T2D, KP765366864 * T2I);
Chris@42 591 R0[WS(rs, 14)] = FNMS(KP1_847759065, T2J, KP765366864 * T2K);
Chris@42 592 R0[WS(rs, 10)] = FNMS(KP765366864, T2D, KP1_847759065 * T2I);
Chris@42 593 R0[WS(rs, 6)] = FMA(KP765366864, T2J, KP1_847759065 * T2K);
Chris@42 594 }
/* Stores: R1 indices 0, 12, 8, 4. */
Chris@42 595 {
Chris@42 596 E T19, T1n, T1m, T1o;
Chris@42 597 {
Chris@42 598 E TL, T18, T1c, T1l;
Chris@42 599 TL = Tz + TK;
Chris@42 600 T18 = TW + T17;
Chris@42 601 T19 = TL + T18;
Chris@42 602 T1n = TL - T18;
Chris@42 603 T1c = T1a + T1b;
Chris@42 604 T1l = T1f + T1k;
Chris@42 605 T1m = T1c + T1l;
Chris@42 606 T1o = T1c - T1l;
Chris@42 607 }
Chris@42 608 R1[0] = FNMS(KP196034280, T1m, KP1_990369453 * T19);
Chris@42 609 R1[WS(rs, 12)] = FNMS(KP1_546020906, T1n, KP1_268786568 * T1o);
Chris@42 610 R1[WS(rs, 8)] = -(FMA(KP196034280, T19, KP1_990369453 * T1m));
Chris@42 611 R1[WS(rs, 4)] = FMA(KP1_268786568, T1n, KP1_546020906 * T1o);
Chris@42 612 }
/* Stores: R1 indices 2, 14, 10, 6. */
Chris@42 613 {
Chris@42 614 E T1r, T1v, T1u, T1w;
Chris@42 615 {
Chris@42 616 E T1p, T1q, T1s, T1t;
Chris@42 617 T1p = Tz - TK;
Chris@42 618 T1q = T1b - T1a;
Chris@42 619 T1r = T1p + T1q;
Chris@42 620 T1v = T1p - T1q;
Chris@42 621 T1s = T1f - T1k;
Chris@42 622 T1t = TW - T17;
Chris@42 623 T1u = T1s - T1t;
Chris@42 624 T1w = T1t + T1s;
Chris@42 625 }
Chris@42 626 R1[WS(rs, 2)] = FMA(KP1_763842528, T1r, KP942793473 * T1u);
Chris@42 627 R1[WS(rs, 14)] = FNMS(KP1_913880671, T1v, KP580569354 * T1w);
Chris@42 628 R1[WS(rs, 10)] = FNMS(KP942793473, T1r, KP1_763842528 * T1u);
Chris@42 629 R1[WS(rs, 6)] = FMA(KP580569354, T1v, KP1_913880671 * T1w);
Chris@42 630 }
/* Stores: R1 indices 3, 15, 11, 7. */
Chris@42 631 {
Chris@42 632 E T1T, T1X, T1W, T1Y;
Chris@42 633 {
Chris@42 634 E T1R, T1S, T1U, T1V;
Chris@42 635 T1R = T1x + T1y;
Chris@42 636 T1S = T1L + T1M;
Chris@42 637 T1T = T1R - T1S;
Chris@42 638 T1X = T1R + T1S;
Chris@42 639 T1U = T1J + T1I;
Chris@42 640 T1V = T1C - T1F;
Chris@42 641 T1W = T1U - T1V;
Chris@42 642 T1Y = T1V + T1U;
Chris@42 643 }
Chris@42 644 R1[WS(rs, 3)] = FMA(KP1_546020906, T1T, KP1_268786568 * T1W);
Chris@42 645 R1[WS(rs, 15)] = FNMS(KP1_990369453, T1X, KP196034280 * T1Y);
Chris@42 646 R1[WS(rs, 11)] = FNMS(KP1_268786568, T1T, KP1_546020906 * T1W);
Chris@42 647 R1[WS(rs, 7)] = FMA(KP196034280, T1X, KP1_990369453 * T1Y);
Chris@42 648 }
/* Stores: R0 indices 1, 13, 9, 5. */
Chris@42 649 {
Chris@42 650 E T2f, T2p, T2o, T2q;
Chris@42 651 {
Chris@42 652 E T23, T2e, T2k, T2n;
Chris@42 653 T23 = T1Z + T22;
Chris@42 654 T2e = KP707106781 * (T28 + T2d);
Chris@42 655 T2f = T23 + T2e;
Chris@42 656 T2p = T23 - T2e;
Chris@42 657 T2k = T2i - T2j;
Chris@42 658 T2n = KP707106781 * (T2l + T2m);
Chris@42 659 T2o = T2k - T2n;
Chris@42 660 T2q = T2n + T2k;
Chris@42 661 }
Chris@42 662 R0[WS(rs, 1)] = FMA(KP1_961570560, T2f, KP390180644 * T2o);
Chris@42 663 R0[WS(rs, 13)] = FNMS(KP1_662939224, T2p, KP1_111140466 * T2q);
Chris@42 664 R0[WS(rs, 9)] = FNMS(KP390180644, T2f, KP1_961570560 * T2o);
Chris@42 665 R0[WS(rs, 5)] = FMA(KP1_111140466, T2p, KP1_662939224 * T2q);
Chris@42 666 }
/* Stores: R1 indices 1, 13, 9, 5. */
Chris@42 667 {
Chris@42 668 E T1H, T1P, T1O, T1Q;
Chris@42 669 {
Chris@42 670 E T1z, T1G, T1K, T1N;
Chris@42 671 T1z = T1x - T1y;
Chris@42 672 T1G = T1C + T1F;
Chris@42 673 T1H = T1z + T1G;
Chris@42 674 T1P = T1z - T1G;
Chris@42 675 T1K = T1I - T1J;
Chris@42 676 T1N = T1L - T1M;
Chris@42 677 T1O = T1K - T1N;
Chris@42 678 T1Q = T1N + T1K;
Chris@42 679 }
Chris@42 680 R1[WS(rs, 1)] = FMA(KP1_913880671, T1H, KP580569354 * T1O);
Chris@42 681 R1[WS(rs, 13)] = FNMS(KP1_763842528, T1P, KP942793473 * T1Q);
Chris@42 682 R1[WS(rs, 9)] = FNMS(KP580569354, T1H, KP1_913880671 * T1O);
Chris@42 683 R1[WS(rs, 5)] = FMA(KP942793473, T1P, KP1_763842528 * T1Q);
Chris@42 684 }
Chris@42 685 }
Chris@42 686 }
Chris@42 687 }
Chris@42 688
/*
 * Descriptor for the non-FMA kernel: size 32, name "r2cbIII_32", and the
 * operation counts {138, 48, 36, 0}. The first three match the additions /
 * multiplications / fused multiply-adds quoted in the generator comment for
 * this variant. The meaning of the fourth field and of GENUS is not visible
 * here (GENUS is presumably supplied by r2cbIII.h -- confirm).
 */
Chris@42 689 static const kr2c_desc desc = { 32, "r2cbIII_32", {138, 48, 36, 0}, &GENUS };
Chris@42 690
/*
 * Registration entry point (non-FMA build): passes the kernel function
 * r2cbIII_32 and its descriptor to X(kr2c_register) for planner p.
 * NOTE(review): X() is a macro defined outside this file; presumably it
 * applies the library's symbol prefix -- confirm.
 */
Chris@42 691 void X(codelet_r2cbIII_32) (planner *p) {
Chris@42 692 X(kr2c_register) (p, r2cbIII_32, &desc);
Chris@42 693 }
Chris@42 694
Chris@42 695 #endif /* HAVE_FMA */