annotate src/fftw-3.3.5/dft/simd/common/t2bv_25.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:44:45 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include t2b.h -sign 1 */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 248 FP additions, 241 FP multiplications,
Chris@42 32 * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
Chris@42 33 * 208 stack variables, 67 constants, and 50 memory accesses
Chris@42 34 */
Chris@42 35 #include "t2b.h"
Chris@42 36
/*
 * t2bv_25 (HAVE_FMA variant).
 *
 * Processes the 25 vectors located at x[WS(rs, 0)] .. x[WS(rs, 24)], where
 * x starts at ii, and writes the 25 results back to the same locations.
 * For every k in 1..24 the loaded vector is first passed through
 * BYTW(&(W[TWVL * 2 * (k - 1)]), ...); element 0 is used as loaded.
 *
 * Parameters:
 *   ri      - not referenced anywhere in this function body.
 *   ii      - base pointer of the data that is read and overwritten.
 *   W       - table consumed by BYTW; offset by mb * ((TWVL / VL) * 48)
 *             before the first iteration and by TWVL * 48 after each one.
 *   rs      - stride handed to WS() to locate the 25 elements.
 *   mb, me  - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms      - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * NOTE(review): V, LD, ST, BYTW, VFMA, VFNMS, VFMS, VFMAI, VFNMSI, DVK, LDK,
 * WS and VLEAVE come from the included headers and are not defined in this
 * file; their exact semantics should be checked there.
 */
Chris@42 37 static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* 67 numeric constants, declared with DVK and read back with LDK below. */
Chris@42 39 DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
Chris@42 40 DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
Chris@42 41 DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
Chris@42 42 DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
Chris@42 43 DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
Chris@42 44 DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
Chris@42 45 DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
Chris@42 46 DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
Chris@42 47 DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
Chris@42 48 DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
Chris@42 49 DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
Chris@42 50 DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
Chris@42 51 DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
Chris@42 52 DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
Chris@42 53 DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
Chris@42 54 DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
Chris@42 55 DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
Chris@42 56 DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
Chris@42 57 DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
Chris@42 58 DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
Chris@42 59 DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
Chris@42 60 DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
Chris@42 61 DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
Chris@42 62 DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
Chris@42 63 DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
Chris@42 64 DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
Chris@42 65 DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
Chris@42 66 DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
Chris@42 67 DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
Chris@42 68 DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
Chris@42 69 DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
Chris@42 70 DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
Chris@42 71 DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
Chris@42 72 DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
Chris@42 73 DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
Chris@42 74 DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
Chris@42 75 DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
Chris@42 76 DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
Chris@42 77 DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
Chris@42 78 DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
Chris@42 79 DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
Chris@42 80 DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
Chris@42 81 DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
Chris@42 82 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 83 DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
Chris@42 84 DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
Chris@42 85 DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
Chris@42 86 DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
Chris@42 87 DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
Chris@42 88 DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
Chris@42 89 DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
Chris@42 90 DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
Chris@42 91 DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
Chris@42 92 DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
Chris@42 93 DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
Chris@42 94 DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
Chris@42 95 DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
Chris@42 96 DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
Chris@42 97 DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
Chris@42 98 DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
Chris@42 99 DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
Chris@42 100 DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
Chris@42 101 DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
Chris@42 102 DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
Chris@42 103 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 104 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 105 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 106 {
Chris@42 107 INT m;
Chris@42 108 R *x;
/* All loads and stores go through x, which starts at ii; ri is never used. */
Chris@42 109 x = ii;
/* Each pass covers VL values of m. The BYTW offsets used below are
 * TWVL * 0, 2, ..., 46, hence the W step of TWVL * 48 per pass. */
Chris@42 110 for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
/* Values that are still needed by the final block of stores. */
Chris@42 111 V T25, T1B, T2y, T1K, T2s, T23, T1S, T26, T20, T1X;
Chris@42 112 {
Chris@42 113 V T1O, T2X, Te, T3L, Td, T3Q, T3j, T3b, T2R, T2M, T2f, T27, T1y, T1H, T3M;
Chris@42 114 V TW, TR, TK, T2B, T3n, T3e, T2U, T2F, T2i, T2a, Tz, T1C, T3N, TQ, T11;
Chris@42 115 V T1b, T1c, T16;
Chris@42 116 {
Chris@42 117 V T1, T1g, T1i, T1p, T1k, T1m, Tb, T1N, T6, T1M;
Chris@42 118 {
Chris@42 119 V T7, T9, T2, T4, T1f, T1h, T1o;
/* Load element 0 (used without BYTW) and elements 10, 15, 5, 20, 3, 8, 18,
 * 23, 13; element k is later paired with W[TWVL * 2 * (k - 1)]. */
Chris@42 120 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@42 121 T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 122 T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 123 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 124 T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 125 T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 126 T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 127 T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 128 {
Chris@42 129 V T8, Ta, T3, T5, T1j;
Chris@42 130 T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 131 T8 = BYTW(&(W[TWVL * 18]), T7);
Chris@42 132 Ta = BYTW(&(W[TWVL * 28]), T9);
Chris@42 133 T3 = BYTW(&(W[TWVL * 8]), T2);
Chris@42 134 T5 = BYTW(&(W[TWVL * 38]), T4);
Chris@42 135 T1g = BYTW(&(W[TWVL * 4]), T1f);
Chris@42 136 T1i = BYTW(&(W[TWVL * 14]), T1h);
Chris@42 137 T1p = BYTW(&(W[TWVL * 34]), T1o);
Chris@42 138 T1k = BYTW(&(W[TWVL * 44]), T1j);
Chris@42 139 T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
/* Sums and differences of the (10, 15) and (5, 20) pairs. */
Chris@42 140 Tb = VADD(T8, Ta);
Chris@42 141 T1N = VSUB(T8, Ta);
Chris@42 142 T6 = VADD(T3, T5);
Chris@42 143 T1M = VSUB(T3, T5);
Chris@42 144 }
Chris@42 145 }
Chris@42 146 {
Chris@42 147 V T1v, T1l, Th, Tj, T1w, T1q, Tq, Tk, Tn, Tg;
/* Elements 1, 6, 16, 21, 11 are loaded in this scope. */
Chris@42 148 Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 149 {
Chris@42 150 V Tc, Ti, T1n, Tp;
Chris@42 151 Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 152 T1v = VSUB(T1i, T1k);
Chris@42 153 T1l = VADD(T1i, T1k);
Chris@42 154 T1n = BYTW(&(W[TWVL * 24]), T1m);
Chris@42 155 Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 156 T1O = VFMA(LDK(KP618033988), T1N, T1M);
Chris@42 157 T2X = VFNMS(LDK(KP618033988), T1M, T1N);
Chris@42 158 Te = VSUB(T6, Tb);
Chris@42 159 Tc = VADD(T6, Tb);
Chris@42 160 Th = BYTW(&(W[0]), Tg);
Chris@42 161 Tj = BYTW(&(W[TWVL * 10]), Ti);
Chris@42 162 T1w = VSUB(T1n, T1p);
Chris@42 163 T1q = VADD(T1n, T1p);
Chris@42 164 Tq = BYTW(&(W[TWVL * 30]), Tp);
Chris@42 165 Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
/* T3L: element 0 plus the BYTW results for elements 5, 10, 15, 20. */
Chris@42 166 T3L = VADD(T1, Tc);
Chris@42 167 Td = VFNMS(LDK(KP250000000), Tc, T1);
Chris@42 168 Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 169 }
Chris@42 170 {
Chris@42 171 V T1x, T2K, TM, TB, Tw, Tm, Tx, Tr, TI, T2L, T1u, TD, TF, TL;
/* Elements 4, 24, 14, 9, 19 are loaded in this scope. */
Chris@42 172 TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 173 {
Chris@42 174 V T1t, Tl, To, TH, T1s, T1r, TA, TC;
Chris@42 175 TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 176 T1r = VADD(T1l, T1q);
Chris@42 177 T1t = VSUB(T1q, T1l);
Chris@42 178 T1x = VFMA(LDK(KP618033988), T1w, T1v);
Chris@42 179 T2K = VFNMS(LDK(KP618033988), T1v, T1w);
Chris@42 180 Tl = BYTW(&(W[TWVL * 40]), Tk);
Chris@42 181 To = BYTW(&(W[TWVL * 20]), Tn);
Chris@42 182 TM = BYTW(&(W[TWVL * 6]), TL);
Chris@42 183 TB = BYTW(&(W[TWVL * 46]), TA);
Chris@42 184 TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 185 T1s = VFNMS(LDK(KP250000000), T1r, T1g);
/* T3Q: sum of the BYTW results for elements 3, 8, 13, 18, 23. */
Chris@42 186 T3Q = VADD(T1g, T1r);
Chris@42 187 TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 188 Tw = VSUB(Tj, Tl);
Chris@42 189 Tm = VADD(Tj, Tl);
Chris@42 190 Tx = VSUB(Tq, To);
Chris@42 191 Tr = VADD(To, Tq);
Chris@42 192 TI = BYTW(&(W[TWVL * 26]), TH);
Chris@42 193 T2L = VFMA(LDK(KP559016994), T1t, T1s);
Chris@42 194 T1u = VFNMS(LDK(KP559016994), T1t, T1s);
Chris@42 195 TD = BYTW(&(W[TWVL * 16]), TC);
Chris@42 196 TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 197 }
Chris@42 198 {
Chris@42 199 V Tu, Ty, T2E, TE, TN, TG, Tt, TV, Ts;
/* Element 2 is loaded here; 7, 17, 22, 12 follow in the inner scope. */
Chris@42 200 TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 201 Ts = VADD(Tm, Tr);
Chris@42 202 Tu = VSUB(Tm, Tr);
Chris@42 203 Ty = VFNMS(LDK(KP618033988), Tx, Tw);
Chris@42 204 T2E = VFMA(LDK(KP618033988), Tw, Tx);
Chris@42 205 T3j = VFNMS(LDK(KP059835404), T2K, T2L);
Chris@42 206 T3b = VFMA(LDK(KP066152395), T2L, T2K);
Chris@42 207 T2R = VFNMS(LDK(KP786782374), T2K, T2L);
Chris@42 208 T2M = VFMA(LDK(KP869845200), T2L, T2K);
Chris@42 209 T2f = VFMA(LDK(KP132830569), T1u, T1x);
Chris@42 210 T27 = VFNMS(LDK(KP120146378), T1x, T1u);
Chris@42 211 T1y = VFNMS(LDK(KP893101515), T1x, T1u);
Chris@42 212 T1H = VFMA(LDK(KP987388751), T1u, T1x);
Chris@42 213 TE = VSUB(TB, TD);
Chris@42 214 TN = VADD(TD, TB);
Chris@42 215 TG = BYTW(&(W[TWVL * 36]), TF);
Chris@42 216 Tt = VFNMS(LDK(KP250000000), Ts, Th);
/* T3M: sum of the BYTW results for elements 1, 6, 11, 16, 21. */
Chris@42 217 T3M = VADD(Th, Ts);
Chris@42 218 TW = BYTW(&(W[TWVL * 2]), TV);
Chris@42 219 {
Chris@42 220 V TJ, TO, Tv, T2D, TY, T15, T10, T13, TP;
Chris@42 221 {
Chris@42 222 V TX, T14, TZ, T12;
Chris@42 223 TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 224 T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 225 TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 226 T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 227 TJ = VSUB(TG, TI);
Chris@42 228 TO = VADD(TI, TG);
Chris@42 229 Tv = VFMA(LDK(KP559016994), Tu, Tt);
Chris@42 230 T2D = VFNMS(LDK(KP559016994), Tu, Tt);
Chris@42 231 TY = BYTW(&(W[TWVL * 12]), TX);
Chris@42 232 T15 = BYTW(&(W[TWVL * 32]), T14);
Chris@42 233 T10 = BYTW(&(W[TWVL * 42]), TZ);
Chris@42 234 T13 = BYTW(&(W[TWVL * 22]), T12);
Chris@42 235 }
Chris@42 236 TP = VADD(TN, TO);
Chris@42 237 TR = VSUB(TN, TO);
Chris@42 238 TK = VFMA(LDK(KP618033988), TJ, TE);
Chris@42 239 T2B = VFNMS(LDK(KP618033988), TE, TJ);
Chris@42 240 T3n = VFMA(LDK(KP578046249), T2D, T2E);
Chris@42 241 T3e = VFNMS(LDK(KP522847744), T2E, T2D);
Chris@42 242 T2U = VFNMS(LDK(KP987388751), T2D, T2E);
Chris@42 243 T2F = VFMA(LDK(KP893101515), T2E, T2D);
Chris@42 244 T2i = VFNMS(LDK(KP603558818), Ty, Tv);
Chris@42 245 T2a = VFMA(LDK(KP667278218), Tv, Ty);
Chris@42 246 Tz = VFNMS(LDK(KP244189809), Ty, Tv);
Chris@42 247 T1C = VFMA(LDK(KP269969613), Tv, Ty);
/* T3N: sum of the BYTW results for elements 4, 9, 14, 19, 24. */
Chris@42 248 T3N = VADD(TM, TP);
/* NOTE(review): this is the only VFMS in the function; the other four
 * KP250000000 terms (Td, T1s, Tt, T18) use VFNMS. This is generator output,
 * so do not change it by hand without regenerating and re-verifying. */
Chris@42 249 TQ = VFMS(LDK(KP250000000), TP, TM);
Chris@42 250 T11 = VADD(TY, T10);
Chris@42 251 T1b = VSUB(TY, T10);
Chris@42 252 T1c = VSUB(T15, T13);
Chris@42 253 T16 = VADD(T13, T15);
Chris@42 254 }
Chris@42 255 }
Chris@42 256 }
Chris@42 257 }
Chris@42 258 }
/* All 25 elements have been loaded by this point; the rest of the loop body
 * combines the intermediate values and stores the 25 results. */
Chris@42 259 {
Chris@42 260 V T2z, Tf, T3W, T3O, T1d, T2H, T3m, T2j, T2b, TT, T1D, T2G, T35, T2V, T2Z;
Chris@42 261 V T3A, T3g, T2I, T1a, T3R, T3X;
Chris@42 262 T2z = VFNMS(LDK(KP559016994), Te, Td);
Chris@42 263 Tf = VFMA(LDK(KP559016994), Te, Td);
Chris@42 264 {
Chris@42 265 V TS, T2A, T17, T19;
Chris@42 266 TS = VFNMS(LDK(KP559016994), TR, TQ);
Chris@42 267 T2A = VFMA(LDK(KP559016994), TR, TQ);
Chris@42 268 T3W = VSUB(T3M, T3N);
Chris@42 269 T3O = VADD(T3M, T3N);
Chris@42 270 T1d = VFNMS(LDK(KP618033988), T1c, T1b);
Chris@42 271 T2H = VFMA(LDK(KP618033988), T1b, T1c);
Chris@42 272 T17 = VADD(T11, T16);
Chris@42 273 T19 = VSUB(T16, T11);
Chris@42 274 {
Chris@42 275 V T3f, T2T, T2C, T18, T3P;
Chris@42 276 T3m = VFMA(LDK(KP447533225), T2B, T2A);
Chris@42 277 T3f = VFNMS(LDK(KP494780565), T2A, T2B);
Chris@42 278 T2T = VFNMS(LDK(KP132830569), T2A, T2B);
Chris@42 279 T2C = VFMA(LDK(KP120146378), T2B, T2A);
Chris@42 280 T2j = VFNMS(LDK(KP786782374), TK, TS);
Chris@42 281 T2b = VFMA(LDK(KP869845200), TS, TK);
Chris@42 282 TT = VFNMS(LDK(KP667278218), TS, TK);
Chris@42 283 T1D = VFMA(LDK(KP603558818), TK, TS);
Chris@42 284 T18 = VFNMS(LDK(KP250000000), T17, TW);
/* T3P: sum of the BYTW results for elements 2, 7, 12, 17, 22. */
Chris@42 285 T3P = VADD(TW, T17);
Chris@42 286 T2G = VFMA(LDK(KP734762448), T2F, T2C);
Chris@42 287 T35 = VFNMS(LDK(KP734762448), T2F, T2C);
Chris@42 288 T2V = VFNMS(LDK(KP734762448), T2U, T2T);
Chris@42 289 T2Z = VFMA(LDK(KP734762448), T2U, T2T);
Chris@42 290 T3A = VFMA(LDK(KP982009705), T3f, T3e);
Chris@42 291 T3g = VFNMS(LDK(KP982009705), T3f, T3e);
Chris@42 292 T2I = VFMA(LDK(KP559016994), T19, T18);
Chris@42 293 T1a = VFNMS(LDK(KP559016994), T19, T18);
Chris@42 294 T3R = VADD(T3P, T3Q);
Chris@42 295 T3X = VSUB(T3P, T3Q);
Chris@42 296 }
Chris@42 297 }
Chris@42 298 {
Chris@42 299 V T2n, T2t, T1V, T22, T2l, T2d, T1Q, T1I, T2w, T1A, T1F, T2q;
Chris@42 300 {
Chris@42 301 V T2k, T1G, T28, T2g, T3K, T3E, T3a, T34, T3x, T3H, T2c, TU, T1T, T1U, T1z;
Chris@42 302 V T3o, T3t;
Chris@42 303 T2n = VFNMS(LDK(KP912575812), T2j, T2i);
Chris@42 304 T2k = VFMA(LDK(KP912575812), T2j, T2i);
Chris@42 305 T3o = VFNMS(LDK(KP921078979), T3n, T3m);
Chris@42 306 T3t = VFMA(LDK(KP921078979), T3n, T3m);
Chris@42 307 {
Chris@42 308 V T3c, T2Q, T2J, T3k, T1e;
Chris@42 309 T3c = VFNMS(LDK(KP667278218), T2I, T2H);
Chris@42 310 T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
Chris@42 311 T2J = VFMA(LDK(KP066152395), T2I, T2H);
Chris@42 312 T3k = VFMA(LDK(KP603558818), T2H, T2I);
Chris@42 313 T1G = VFMA(LDK(KP578046249), T1a, T1d);
Chris@42 314 T1e = VFNMS(LDK(KP522847744), T1d, T1a);
Chris@42 315 T28 = VFNMS(LDK(KP494780565), T1a, T1d);
Chris@42 316 T2g = VFMA(LDK(KP447533225), T1d, T1a);
Chris@42 317 {
Chris@42 318 V T3U, T3S, T40, T3Y;
Chris@42 319 T3U = VSUB(T3O, T3R);
Chris@42 320 T3S = VADD(T3O, T3R);
Chris@42 321 T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
Chris@42 322 T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
Chris@42 323 {
Chris@42 324 V T3s, T3l, T2N, T36;
Chris@42 325 T3s = VFNMS(LDK(KP845997307), T3k, T3j);
Chris@42 326 T3l = VFMA(LDK(KP845997307), T3k, T3j);
Chris@42 327 T2N = VFNMS(LDK(KP772036680), T2M, T2J);
Chris@42 328 T36 = VFMA(LDK(KP772036680), T2M, T2J);
Chris@42 329 {
Chris@42 330 V T30, T2S, T3d, T3z, T3T;
Chris@42 331 T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
Chris@42 332 T2S = VFMA(LDK(KP772036680), T2R, T2Q);
Chris@42 333 T3d = VFNMS(LDK(KP845997307), T3c, T3b);
Chris@42 334 T3z = VFMA(LDK(KP845997307), T3c, T3b);
/* Result 0: T3S + T3L adds up the five group totals T3L, T3M, T3N, T3P and
 * T3Q, i.e. element 0 plus the BYTW results of elements 1..24. */
Chris@42 335 ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
Chris@42 336 T3T = VFNMS(LDK(KP250000000), T3S, T3L);
Chris@42 337 {
Chris@42 338 V T3C, T3p, T2O, T37;
Chris@42 339 T3C = VFMA(LDK(KP906616052), T3o, T3l);
Chris@42 340 T3p = VFNMS(LDK(KP906616052), T3o, T3l);
Chris@42 341 T2O = VFMA(LDK(KP956723877), T2N, T2G);
Chris@42 342 T37 = VFMA(LDK(KP522616830), T2V, T36);
Chris@42 343 {
Chris@42 344 V T31, T2W, T3u, T3h;
Chris@42 345 T31 = VFNMS(LDK(KP522616830), T2G, T30);
Chris@42 346 T2W = VFMA(LDK(KP945422727), T2V, T2S);
Chris@42 347 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
Chris@42 348 T3h = VFMA(LDK(KP923225144), T3g, T3d);
Chris@42 349 {
Chris@42 350 V T3I, T3B, T3V, T3Z;
Chris@42 351 T3I = VFNMS(LDK(KP669429328), T3z, T3A);
Chris@42 352 T3B = VFMA(LDK(KP570584518), T3A, T3z);
Chris@42 353 T3V = VFMA(LDK(KP559016994), T3U, T3T);
Chris@42 354 T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
Chris@42 355 {
Chris@42 356 V T3y, T3q, T2P, T38;
Chris@42 357 T3y = VFMA(LDK(KP262346850), T3p, T2X);
Chris@42 358 T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
Chris@42 359 T2P = VFMA(LDK(KP992114701), T2O, T2z);
Chris@42 360 T38 = VFNMS(LDK(KP690983005), T37, T2S);
Chris@42 361 {
Chris@42 362 V T32, T2Y, T3v, T3F;
Chris@42 363 T32 = VFMA(LDK(KP763932022), T31, T2N);
Chris@42 364 T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
Chris@42 365 T3v = VFNMS(LDK(KP997675361), T3u, T3t);
Chris@42 366 T3F = VFNMS(LDK(KP904508497), T3u, T3s);
Chris@42 367 {
Chris@42 368 V T3i, T3r, T3J, T3D;
Chris@42 369 T3i = VFMA(LDK(KP949179823), T3h, T2z);
Chris@42 370 T3r = VFNMS(LDK(KP237294955), T3h, T2z);
Chris@42 371 T3J = VFNMS(LDK(KP669429328), T3C, T3I);
Chris@42 372 T3D = VFMA(LDK(KP618033988), T3C, T3B);
/* Results 5, 10, 15, 20 are built only from the five group totals (through
 * T3T, T3U, T3W, T3X). Every store pair from here on applies VFMAI and
 * VFNMSI to the same two operands and writes indices k and 25 - k. */
Chris@42 373 ST(&(x[WS(rs, 20)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
Chris@42 374 ST(&(x[WS(rs, 5)]), VFMAI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
Chris@42 375 ST(&(x[WS(rs, 15)]), VFMAI(T40, T3Z), ms, &(x[WS(rs, 1)]));
Chris@42 376 ST(&(x[WS(rs, 10)]), VFNMSI(T40, T3Z), ms, &(x[0]));
Chris@42 377 {
Chris@42 378 V T39, T33, T3w, T3G;
Chris@42 379 T39 = VFMA(LDK(KP855719849), T38, T35);
Chris@42 380 T33 = VFNMS(LDK(KP855719849), T32, T2Z);
Chris@42 381 ST(&(x[WS(rs, 3)]), VFMAI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
Chris@42 382 ST(&(x[WS(rs, 22)]), VFNMSI(T2Y, T2P), ms, &(x[0]));
Chris@42 383 T3w = VFMA(LDK(KP560319534), T3v, T3s);
Chris@42 384 T3G = VFNMS(LDK(KP681693190), T3F, T3t);
Chris@42 385 ST(&(x[WS(rs, 2)]), VFMAI(T3q, T3i), ms, &(x[0]));
Chris@42 386 ST(&(x[WS(rs, 23)]), VFNMSI(T3q, T3i), ms, &(x[WS(rs, 1)]));
Chris@42 387 T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
Chris@42 388 T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
Chris@42 389 T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
Chris@42 390 T34 = VFMA(LDK(KP897376177), T33, T2z);
Chris@42 391 T3x = VFNMS(LDK(KP949179823), T3w, T3r);
Chris@42 392 T3H = VFNMS(LDK(KP860541664), T3G, T3r);
Chris@42 393 T2t = VFNMS(LDK(KP912575812), T2b, T2a);
Chris@42 394 T2c = VFMA(LDK(KP912575812), T2b, T2a);
Chris@42 395 TU = VFMA(LDK(KP829049696), TT, Tz);
Chris@42 396 T1T = VFNMS(LDK(KP829049696), TT, Tz);
Chris@42 397 T1U = VFNMS(LDK(KP831864738), T1y, T1e);
Chris@42 398 T1z = VFMA(LDK(KP831864738), T1y, T1e);
Chris@42 399 }
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
Chris@42 403 }
Chris@42 404 }
Chris@42 405 }
Chris@42 406 }
Chris@42 407 }
Chris@42 408 }
Chris@42 409 }
Chris@42 410 {
Chris@42 411 V T2o, T2h, T29, T2u, T2v, T2p;
Chris@42 412 T2o = VFNMS(LDK(KP958953096), T2g, T2f);
Chris@42 413 T2h = VFMA(LDK(KP958953096), T2g, T2f);
/* Results 17/8, 13/12 and 7/18, using values computed in the scope above. */
Chris@42 414 ST(&(x[WS(rs, 17)]), VFNMSI(T3a, T34), ms, &(x[WS(rs, 1)]));
Chris@42 415 ST(&(x[WS(rs, 8)]), VFMAI(T3a, T34), ms, &(x[0]));
Chris@42 416 ST(&(x[WS(rs, 13)]), VFMAI(T3E, T3x), ms, &(x[WS(rs, 1)]));
Chris@42 417 ST(&(x[WS(rs, 12)]), VFNMSI(T3E, T3x), ms, &(x[0]));
Chris@42 418 ST(&(x[WS(rs, 7)]), VFNMSI(T3K, T3H), ms, &(x[WS(rs, 1)]));
Chris@42 419 ST(&(x[WS(rs, 18)]), VFMAI(T3K, T3H), ms, &(x[0]));
Chris@42 420 T1V = VFMA(LDK(KP559154169), T1U, T1T);
Chris@42 421 T22 = VFNMS(LDK(KP683113946), T1T, T1U);
Chris@42 422 T29 = VFNMS(LDK(KP867381224), T28, T27);
Chris@42 423 T2u = VFMA(LDK(KP867381224), T28, T27);
Chris@42 424 T2l = VFMA(LDK(KP894834959), T2k, T2h);
Chris@42 425 T2v = VFMA(LDK(KP447417479), T2k, T2u);
Chris@42 426 T2d = VFNMS(LDK(KP809385824), T2c, T29);
Chris@42 427 T2p = VFMA(LDK(KP447417479), T2c, T2o);
Chris@42 428 T1Q = VFMA(LDK(KP831864738), T1H, T1G);
Chris@42 429 T1I = VFNMS(LDK(KP831864738), T1H, T1G);
Chris@42 430 T2w = VFNMS(LDK(KP763932022), T2v, T2h);
Chris@42 431 T1A = VFMA(LDK(KP904730450), T1z, TU);
Chris@42 432 T1F = VFNMS(LDK(KP904730450), T1z, TU);
Chris@42 433 T2q = VFMA(LDK(KP690983005), T2p, T29);
Chris@42 434 }
Chris@42 435 }
Chris@42 436 {
Chris@42 437 V T2e, T1E, T1P, T2m;
Chris@42 438 T2e = VFNMS(LDK(KP992114701), T2d, Tf);
Chris@42 439 T1E = VFMA(LDK(KP916574801), T1D, T1C);
Chris@42 440 T1P = VFNMS(LDK(KP916574801), T1D, T1C);
Chris@42 441 T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
Chris@42 442 {
Chris@42 443 V T1J, T2r, T1R, T1W, T1Z, T2x;
Chris@42 444 T2x = VFNMS(LDK(KP999544308), T2w, T2t);
Chris@42 445 T1J = VFNMS(LDK(KP904730450), T1I, T1F);
Chris@42 446 T25 = VFMA(LDK(KP968583161), T1A, Tf);
Chris@42 447 T1B = VFNMS(LDK(KP242145790), T1A, Tf);
Chris@42 448 T2r = VFNMS(LDK(KP999544308), T2q, T2n);
Chris@42 449 T1R = VFMA(LDK(KP904730450), T1Q, T1P);
Chris@42 450 T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
Chris@42 451 T1Z = VADD(T1E, T1F);
Chris@42 452 ST(&(x[WS(rs, 21)]), VFMAI(T2m, T2e), ms, &(x[WS(rs, 1)]));
Chris@42 453 ST(&(x[WS(rs, 4)]), VFNMSI(T2m, T2e), ms, &(x[0]));
/* The assignments below target variables declared at the top of the loop
 * body, so they remain available to the last block of stores. */
Chris@42 454 T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
Chris@42 455 T1K = VFNMS(LDK(KP618033988), T1J, T1E);
Chris@42 456 T2s = VFNMS(LDK(KP803003575), T2r, Tf);
Chris@42 457 T23 = VFMA(LDK(KP617882369), T1W, T22);
Chris@42 458 T1S = VFNMS(LDK(KP242145790), T1R, T1O);
Chris@42 459 T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
Chris@42 460 T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
Chris@42 461 T1X = VFMA(LDK(KP559016994), T1W, T1V);
Chris@42 462 }
Chris@42 463 }
Chris@42 464 }
Chris@42 465 }
Chris@42 466 }
/* Final stores: results 16/9, 24/1, 11/14 and 19/6. */
Chris@42 467 {
Chris@42 468 V T1L, T24, T21, T1Y;
Chris@42 469 T1L = VFNMS(LDK(KP876091699), T1K, T1B);
Chris@42 470 ST(&(x[WS(rs, 16)]), VFMAI(T2y, T2s), ms, &(x[0]));
Chris@42 471 ST(&(x[WS(rs, 9)]), VFNMSI(T2y, T2s), ms, &(x[WS(rs, 1)]));
Chris@42 472 T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
Chris@42 473 ST(&(x[WS(rs, 24)]), VFNMSI(T26, T25), ms, &(x[0]));
Chris@42 474 ST(&(x[WS(rs, 1)]), VFMAI(T26, T25), ms, &(x[WS(rs, 1)]));
Chris@42 475 T21 = VFMA(LDK(KP792626838), T20, T1B);
Chris@42 476 T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
Chris@42 477 ST(&(x[WS(rs, 11)]), VFMAI(T24, T21), ms, &(x[WS(rs, 1)]));
Chris@42 478 ST(&(x[WS(rs, 14)]), VFNMSI(T24, T21), ms, &(x[0]));
Chris@42 479 ST(&(x[WS(rs, 19)]), VFNMSI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
Chris@42 480 ST(&(x[WS(rs, 6)]), VFMAI(T1Y, T1L), ms, &(x[0]));
Chris@42 481 }
Chris@42 482 }
Chris@42 483 }
/* Invoked once, after the loop has finished. */
Chris@42 484 VLEAVE();
Chris@42 485 }
Chris@42 486
/*
 * Twiddle instruction table for this codelet: one VTW(0, k) entry for each
 * k = 1..24, followed by a closing {TW_NEXT, VL, 0} entry. It is referenced
 * from the codelet descriptor.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; the 24 entries
 * line up with the 24 BYTW calls per iteration in t2bv_25.
 */
Chris@42 487 static const tw_instr twinstr[] = {
Chris@42 488 VTW(0, 1),
Chris@42 489 VTW(0, 2),
Chris@42 490 VTW(0, 3),
Chris@42 491 VTW(0, 4),
Chris@42 492 VTW(0, 5),
Chris@42 493 VTW(0, 6),
Chris@42 494 VTW(0, 7),
Chris@42 495 VTW(0, 8),
Chris@42 496 VTW(0, 9),
Chris@42 497 VTW(0, 10),
Chris@42 498 VTW(0, 11),
Chris@42 499 VTW(0, 12),
Chris@42 500 VTW(0, 13),
Chris@42 501 VTW(0, 14),
Chris@42 502 VTW(0, 15),
Chris@42 503 VTW(0, 16),
Chris@42 504 VTW(0, 17),
Chris@42 505 VTW(0, 18),
Chris@42 506 VTW(0, 19),
Chris@42 507 VTW(0, 20),
Chris@42 508 VTW(0, 21),
Chris@42 509 VTW(0, 22),
Chris@42 510 VTW(0, 23),
Chris@42 511 VTW(0, 24),
Chris@42 512 {TW_NEXT, VL, 0}
Chris@42 513 };
Chris@42 514
/*
 * Codelet descriptor: size 25, the name string "t2bv_25", the twiddle table
 * twinstr and &GENUS. The first three numbers of {67, 60, 181, 0} equal the
 * 67 additions / 60 multiplications / 181 fused multiply/add quoted in the
 * comment above the HAVE_FMA t2bv_25.
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros depends on the ct_desc declaration, which is not in this file.
 */
Chris@42 515 static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
Chris@42 516
/*
 * Entry point with external linkage: hands t2bv_25 and its descriptor to
 * X(kdft_dit_register) for the given planner p.
 */
Chris@42 517 void XSIMD(codelet_t2bv_25) (planner *p) {
Chris@42 518 X(kdft_dit_register) (p, t2bv_25, &desc);
Chris@42 519 }
Chris@42 520 #else /* HAVE_FMA */
Chris@42 521
Chris@42 522 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include t2b.h -sign 1 */
Chris@42 523
Chris@42 524 /*
Chris@42 525 * This function contains 248 FP additions, 188 FP multiplications,
Chris@42 526 * (or, 171 additions, 111 multiplications, 77 fused multiply/add),
Chris@42 527 * 100 stack variables, 40 constants, and 50 memory accesses
Chris@42 528 */
Chris@42 529 #include "t2b.h"
Chris@42 530
Chris@42 531 static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 532 {
Chris@42 533 DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
Chris@42 534 DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
Chris@42 535 DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
Chris@42 536 DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
Chris@42 537 DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
Chris@42 538 DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
Chris@42 539 DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
Chris@42 540 DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
Chris@42 541 DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
Chris@42 542 DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
Chris@42 543 DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
Chris@42 544 DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
Chris@42 545 DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
Chris@42 546 DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
Chris@42 547 DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
Chris@42 548 DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
Chris@42 549 DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
Chris@42 550 DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
Chris@42 551 DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
Chris@42 552 DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
Chris@42 553 DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
Chris@42 554 DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
Chris@42 555 DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
Chris@42 556 DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
Chris@42 557 DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
Chris@42 558 DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
Chris@42 559 DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
Chris@42 560 DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
Chris@42 561 DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
Chris@42 562 DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
Chris@42 563 DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
Chris@42 564 DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
Chris@42 565 DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
Chris@42 566 DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
Chris@42 567 DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@42 568 DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@42 569 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 570 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 571 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 572 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 573 {
Chris@42 574 INT m;
Chris@42 575 R *x;
Chris@42 576 x = ii;
Chris@42 577 for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
Chris@42 578 V T1A, T1z, T1R, T1S, T1B, T1C, T1Q, T2L, T1l, T2v, T1i, T3e, T2u, Tb, T2i;
Chris@42 579 V Tj, T3b, T2h, Tv, T2k, TD, T3a, T2l, T11, T2s, TY, T3d, T2r;
Chris@42 580 {
Chris@42 581 V T1v, T1x, T1y, T1q, T1s, T1t, T1P;
Chris@42 582 T1A = LD(&(x[0]), ms, &(x[0]));
Chris@42 583 {
Chris@42 584 V T1u, T1w, T1p, T1r;
Chris@42 585 T1u = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@42 586 T1v = BYTW(&(W[TWVL * 18]), T1u);
Chris@42 587 T1w = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@42 588 T1x = BYTW(&(W[TWVL * 28]), T1w);
Chris@42 589 T1y = VADD(T1v, T1x);
Chris@42 590 T1p = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@42 591 T1q = BYTW(&(W[TWVL * 8]), T1p);
Chris@42 592 T1r = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@42 593 T1s = BYTW(&(W[TWVL * 38]), T1r);
Chris@42 594 T1t = VADD(T1q, T1s);
Chris@42 595 }
Chris@42 596 T1z = VMUL(LDK(KP559016994), VSUB(T1t, T1y));
Chris@42 597 T1R = VSUB(T1v, T1x);
Chris@42 598 T1S = VMUL(LDK(KP587785252), T1R);
Chris@42 599 T1B = VADD(T1t, T1y);
Chris@42 600 T1C = VFNMS(LDK(KP250000000), T1B, T1A);
Chris@42 601 T1P = VSUB(T1q, T1s);
Chris@42 602 T1Q = VMUL(LDK(KP951056516), T1P);
Chris@42 603 T2L = VMUL(LDK(KP587785252), T1P);
Chris@42 604 }
Chris@42 605 {
Chris@42 606 V T1f, T19, T1b, T1c, T14, T16, T17, T1e;
Chris@42 607 T1e = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@42 608 T1f = BYTW(&(W[TWVL * 4]), T1e);
Chris@42 609 {
Chris@42 610 V T18, T1a, T13, T15;
Chris@42 611 T18 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@42 612 T19 = BYTW(&(W[TWVL * 24]), T18);
Chris@42 613 T1a = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@42 614 T1b = BYTW(&(W[TWVL * 34]), T1a);
Chris@42 615 T1c = VADD(T19, T1b);
Chris@42 616 T13 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@42 617 T14 = BYTW(&(W[TWVL * 14]), T13);
Chris@42 618 T15 = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@42 619 T16 = BYTW(&(W[TWVL * 44]), T15);
Chris@42 620 T17 = VADD(T14, T16);
Chris@42 621 }
Chris@42 622 {
Chris@42 623 V T1j, T1k, T1d, T1g, T1h;
Chris@42 624 T1j = VSUB(T14, T16);
Chris@42 625 T1k = VSUB(T19, T1b);
Chris@42 626 T1l = VFMA(LDK(KP475528258), T1j, VMUL(LDK(KP293892626), T1k));
Chris@42 627 T2v = VFNMS(LDK(KP475528258), T1k, VMUL(LDK(KP293892626), T1j));
Chris@42 628 T1d = VMUL(LDK(KP559016994), VSUB(T17, T1c));
Chris@42 629 T1g = VADD(T17, T1c);
Chris@42 630 T1h = VFNMS(LDK(KP250000000), T1g, T1f);
Chris@42 631 T1i = VADD(T1d, T1h);
Chris@42 632 T3e = VADD(T1f, T1g);
Chris@42 633 T2u = VSUB(T1h, T1d);
Chris@42 634 }
Chris@42 635 }
Chris@42 636 {
Chris@42 637 V Tg, T7, T9, Td, T2, T4, Tc, Tf;
Chris@42 638 Tf = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@42 639 Tg = BYTW(&(W[TWVL * 6]), Tf);
Chris@42 640 {
Chris@42 641 V T6, T8, T1, T3;
Chris@42 642 T6 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@42 643 T7 = BYTW(&(W[TWVL * 26]), T6);
Chris@42 644 T8 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@42 645 T9 = BYTW(&(W[TWVL * 36]), T8);
Chris@42 646 Td = VADD(T7, T9);
Chris@42 647 T1 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@42 648 T2 = BYTW(&(W[TWVL * 16]), T1);
Chris@42 649 T3 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@42 650 T4 = BYTW(&(W[TWVL * 46]), T3);
Chris@42 651 Tc = VADD(T2, T4);
Chris@42 652 }
Chris@42 653 {
Chris@42 654 V T5, Ta, Te, Th, Ti;
Chris@42 655 T5 = VSUB(T2, T4);
Chris@42 656 Ta = VSUB(T7, T9);
Chris@42 657 Tb = VFMA(LDK(KP475528258), T5, VMUL(LDK(KP293892626), Ta));
Chris@42 658 T2i = VFNMS(LDK(KP475528258), Ta, VMUL(LDK(KP293892626), T5));
Chris@42 659 Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
Chris@42 660 Th = VADD(Tc, Td);
Chris@42 661 Ti = VFNMS(LDK(KP250000000), Th, Tg);
Chris@42 662 Tj = VADD(Te, Ti);
Chris@42 663 T3b = VADD(Tg, Th);
Chris@42 664 T2h = VSUB(Ti, Te);
Chris@42 665 }
Chris@42 666 }
Chris@42 667 {
Chris@42 668 V TA, Tr, Tt, Tx, Tm, To, Tw, Tz;
Chris@42 669 Tz = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@42 670 TA = BYTW(&(W[0]), Tz);
Chris@42 671 {
Chris@42 672 V Tq, Ts, Tl, Tn;
Chris@42 673 Tq = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@42 674 Tr = BYTW(&(W[TWVL * 20]), Tq);
Chris@42 675 Ts = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@42 676 Tt = BYTW(&(W[TWVL * 30]), Ts);
Chris@42 677 Tx = VADD(Tr, Tt);
Chris@42 678 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@42 679 Tm = BYTW(&(W[TWVL * 10]), Tl);
Chris@42 680 Tn = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@42 681 To = BYTW(&(W[TWVL * 40]), Tn);
Chris@42 682 Tw = VADD(Tm, To);
Chris@42 683 }
Chris@42 684 {
Chris@42 685 V Tp, Tu, Ty, TB, TC;
Chris@42 686 Tp = VSUB(Tm, To);
Chris@42 687 Tu = VSUB(Tr, Tt);
Chris@42 688 Tv = VFMA(LDK(KP475528258), Tp, VMUL(LDK(KP293892626), Tu));
Chris@42 689 T2k = VFNMS(LDK(KP475528258), Tu, VMUL(LDK(KP293892626), Tp));
Chris@42 690 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
Chris@42 691 TB = VADD(Tw, Tx);
Chris@42 692 TC = VFNMS(LDK(KP250000000), TB, TA);
Chris@42 693 TD = VADD(Ty, TC);
Chris@42 694 T3a = VADD(TA, TB);
Chris@42 695 T2l = VSUB(TC, Ty);
Chris@42 696 }
Chris@42 697 }
Chris@42 698 {
Chris@42 699 V TV, TP, TR, TS, TK, TM, TN, TU;
Chris@42 700 TU = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@42 701 TV = BYTW(&(W[TWVL * 2]), TU);
Chris@42 702 {
Chris@42 703 V TO, TQ, TJ, TL;
Chris@42 704 TO = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@42 705 TP = BYTW(&(W[TWVL * 22]), TO);
Chris@42 706 TQ = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@42 707 TR = BYTW(&(W[TWVL * 32]), TQ);
Chris@42 708 TS = VADD(TP, TR);
Chris@42 709 TJ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@42 710 TK = BYTW(&(W[TWVL * 12]), TJ);
Chris@42 711 TL = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@42 712 TM = BYTW(&(W[TWVL * 42]), TL);
Chris@42 713 TN = VADD(TK, TM);
Chris@42 714 }
Chris@42 715 {
Chris@42 716 V TZ, T10, TT, TW, TX;
Chris@42 717 TZ = VSUB(TK, TM);
Chris@42 718 T10 = VSUB(TP, TR);
Chris@42 719 T11 = VFMA(LDK(KP475528258), TZ, VMUL(LDK(KP293892626), T10));
Chris@42 720 T2s = VFNMS(LDK(KP475528258), T10, VMUL(LDK(KP293892626), TZ));
Chris@42 721 TT = VMUL(LDK(KP559016994), VSUB(TN, TS));
Chris@42 722 TW = VADD(TN, TS);
Chris@42 723 TX = VFNMS(LDK(KP250000000), TW, TV);
Chris@42 724 TY = VADD(TT, TX);
Chris@42 725 T3d = VADD(TV, TW);
Chris@42 726 T2r = VSUB(TX, TT);
Chris@42 727 }
Chris@42 728 }
Chris@42 729 {
Chris@42 730 V T3g, T3o, T3k, T3l, T3j, T3m, T3p, T3n;
Chris@42 731 {
Chris@42 732 V T3c, T3f, T3h, T3i;
Chris@42 733 T3c = VSUB(T3a, T3b);
Chris@42 734 T3f = VSUB(T3d, T3e);
Chris@42 735 T3g = VBYI(VFMA(LDK(KP951056516), T3c, VMUL(LDK(KP587785252), T3f)));
Chris@42 736 T3o = VBYI(VFNMS(LDK(KP951056516), T3f, VMUL(LDK(KP587785252), T3c)));
Chris@42 737 T3k = VADD(T1A, T1B);
Chris@42 738 T3h = VADD(T3a, T3b);
Chris@42 739 T3i = VADD(T3d, T3e);
Chris@42 740 T3l = VADD(T3h, T3i);
Chris@42 741 T3j = VMUL(LDK(KP559016994), VSUB(T3h, T3i));
Chris@42 742 T3m = VFNMS(LDK(KP250000000), T3l, T3k);
Chris@42 743 }
Chris@42 744 ST(&(x[0]), VADD(T3k, T3l), ms, &(x[0]));
Chris@42 745 T3p = VSUB(T3m, T3j);
Chris@42 746 ST(&(x[WS(rs, 10)]), VADD(T3o, T3p), ms, &(x[0]));
Chris@42 747 ST(&(x[WS(rs, 15)]), VSUB(T3p, T3o), ms, &(x[WS(rs, 1)]));
Chris@42 748 T3n = VADD(T3j, T3m);
Chris@42 749 ST(&(x[WS(rs, 5)]), VADD(T3g, T3n), ms, &(x[WS(rs, 1)]));
Chris@42 750 ST(&(x[WS(rs, 20)]), VSUB(T3n, T3g), ms, &(x[0]));
Chris@42 751 }
Chris@42 752 {
Chris@42 753 V T2z, T2M, T2U, T2V, T2W, T34, T35, T36, T2X, T2Y, T2Z, T31, T32, T33, T2n;
Chris@42 754 V T2N, T2E, T2K, T2y, T2H, T2A, T2G, T38, T39;
Chris@42 755 T2z = VSUB(T1C, T1z);
Chris@42 756 T2M = VFNMS(LDK(KP951056516), T1R, T2L);
Chris@42 757 T2U = VFMA(LDK(KP1_369094211), T2k, VMUL(LDK(KP728968627), T2l));
Chris@42 758 T2V = VFNMS(LDK(KP992114701), T2h, VMUL(LDK(KP250666467), T2i));
Chris@42 759 T2W = VADD(T2U, T2V);
Chris@42 760 T34 = VFNMS(LDK(KP125581039), T2s, VMUL(LDK(KP998026728), T2r));
Chris@42 761 T35 = VFMA(LDK(KP1_274847979), T2v, VMUL(LDK(KP770513242), T2u));
Chris@42 762 T36 = VADD(T34, T35);
Chris@42 763 T2X = VFMA(LDK(KP1_996053456), T2s, VMUL(LDK(KP062790519), T2r));
Chris@42 764 T2Y = VFNMS(LDK(KP637423989), T2u, VMUL(LDK(KP1_541026485), T2v));
Chris@42 765 T2Z = VADD(T2X, T2Y);
Chris@42 766 T31 = VFNMS(LDK(KP1_457937254), T2k, VMUL(LDK(KP684547105), T2l));
Chris@42 767 T32 = VFMA(LDK(KP1_984229402), T2i, VMUL(LDK(KP125333233), T2h));
Chris@42 768 T33 = VADD(T31, T32);
Chris@42 769 {
Chris@42 770 V T2j, T2m, T2I, T2C, T2D, T2J;
Chris@42 771 T2j = VFNMS(LDK(KP851558583), T2i, VMUL(LDK(KP904827052), T2h));
Chris@42 772 T2m = VFMA(LDK(KP1_752613360), T2k, VMUL(LDK(KP481753674), T2l));
Chris@42 773 T2I = VADD(T2m, T2j);
Chris@42 774 T2C = VFMA(LDK(KP1_071653589), T2s, VMUL(LDK(KP844327925), T2r));
Chris@42 775 T2D = VFMA(LDK(KP125581039), T2v, VMUL(LDK(KP998026728), T2u));
Chris@42 776 T2J = VADD(T2C, T2D);
Chris@42 777 T2n = VSUB(T2j, T2m);
Chris@42 778 T2N = VADD(T2I, T2J);
Chris@42 779 T2E = VSUB(T2C, T2D);
Chris@42 780 T2K = VMUL(LDK(KP559016994), VSUB(T2I, T2J));
Chris@42 781 }
Chris@42 782 {
Chris@42 783 V T2o, T2p, T2q, T2t, T2w, T2x;
Chris@42 784 T2o = VFNMS(LDK(KP963507348), T2k, VMUL(LDK(KP876306680), T2l));
Chris@42 785 T2p = VFMA(LDK(KP1_809654104), T2i, VMUL(LDK(KP425779291), T2h));
Chris@42 786 T2q = VSUB(T2o, T2p);
Chris@42 787 T2t = VFNMS(LDK(KP1_688655851), T2s, VMUL(LDK(KP535826794), T2r));
Chris@42 788 T2w = VFNMS(LDK(KP1_996053456), T2v, VMUL(LDK(KP062790519), T2u));
Chris@42 789 T2x = VADD(T2t, T2w);
Chris@42 790 T2y = VMUL(LDK(KP559016994), VSUB(T2q, T2x));
Chris@42 791 T2H = VSUB(T2t, T2w);
Chris@42 792 T2A = VADD(T2q, T2x);
Chris@42 793 T2G = VADD(T2o, T2p);
Chris@42 794 }
Chris@42 795 {
Chris@42 796 V T2S, T2T, T30, T37;
Chris@42 797 T2S = VADD(T2z, T2A);
Chris@42 798 T2T = VBYI(VADD(T2M, T2N));
Chris@42 799 ST(&(x[WS(rs, 23)]), VSUB(T2S, T2T), ms, &(x[WS(rs, 1)]));
Chris@42 800 ST(&(x[WS(rs, 2)]), VADD(T2S, T2T), ms, &(x[0]));
Chris@42 801 T30 = VADD(T2z, VADD(T2W, T2Z));
Chris@42 802 T37 = VBYI(VSUB(VADD(T33, T36), T2M));
Chris@42 803 ST(&(x[WS(rs, 22)]), VSUB(T30, T37), ms, &(x[0]));
Chris@42 804 ST(&(x[WS(rs, 3)]), VADD(T30, T37), ms, &(x[WS(rs, 1)]));
Chris@42 805 }
Chris@42 806 T38 = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T2U, T2V), VFMA(LDK(KP309016994), T33, VFNMS(LDK(KP809016994), T36, VMUL(LDK(KP587785252), VSUB(T2X, T2Y))))), T2M));
Chris@42 807 T39 = VFMA(LDK(KP309016994), T2W, VFMA(LDK(KP951056516), VSUB(T32, T31), VFMA(LDK(KP587785252), VSUB(T35, T34), VFNMS(LDK(KP809016994), T2Z, T2z))));
Chris@42 808 ST(&(x[WS(rs, 8)]), VADD(T38, T39), ms, &(x[0]));
Chris@42 809 ST(&(x[WS(rs, 17)]), VSUB(T39, T38), ms, &(x[WS(rs, 1)]));
Chris@42 810 {
Chris@42 811 V T2F, T2Q, T2P, T2R, T2B, T2O;
Chris@42 812 T2B = VFNMS(LDK(KP250000000), T2A, T2z);
Chris@42 813 T2F = VFMA(LDK(KP951056516), T2n, VADD(T2y, VFNMS(LDK(KP587785252), T2E, T2B)));
Chris@42 814 T2Q = VFMA(LDK(KP587785252), T2n, VFMA(LDK(KP951056516), T2E, VSUB(T2B, T2y)));
Chris@42 815 T2O = VFNMS(LDK(KP250000000), T2N, T2M);
Chris@42 816 T2P = VBYI(VADD(VFMA(LDK(KP951056516), T2G, VMUL(LDK(KP587785252), T2H)), VADD(T2K, T2O)));
Chris@42 817 T2R = VBYI(VADD(VFNMS(LDK(KP951056516), T2H, VMUL(LDK(KP587785252), T2G)), VSUB(T2O, T2K)));
Chris@42 818 ST(&(x[WS(rs, 18)]), VSUB(T2F, T2P), ms, &(x[0]));
Chris@42 819 ST(&(x[WS(rs, 12)]), VADD(T2Q, T2R), ms, &(x[0]));
Chris@42 820 ST(&(x[WS(rs, 7)]), VADD(T2F, T2P), ms, &(x[WS(rs, 1)]));
Chris@42 821 ST(&(x[WS(rs, 13)]), VSUB(T2Q, T2R), ms, &(x[WS(rs, 1)]));
Chris@42 822 }
Chris@42 823 }
Chris@42 824 {
Chris@42 825 V T1D, T1T, T21, T22, T23, T2b, T2c, T2d, T24, T25, T26, T28, T29, T2a, TF;
Chris@42 826 V T1U, T1I, T1O, T1o, T1L, T1E, T1K, T2f, T2g;
Chris@42 827 T1D = VADD(T1z, T1C);
Chris@42 828 T1T = VADD(T1Q, T1S);
Chris@42 829 T21 = VFMA(LDK(KP1_688655851), Tv, VMUL(LDK(KP535826794), TD));
Chris@42 830 T22 = VFMA(LDK(KP1_541026485), Tb, VMUL(LDK(KP637423989), Tj));
Chris@42 831 T23 = VSUB(T21, T22);
Chris@42 832 T2b = VFMA(LDK(KP851558583), T11, VMUL(LDK(KP904827052), TY));
Chris@42 833 T2c = VFMA(LDK(KP1_984229402), T1l, VMUL(LDK(KP125333233), T1i));
Chris@42 834 T2d = VADD(T2b, T2c);
Chris@42 835 T24 = VFNMS(LDK(KP425779291), TY, VMUL(LDK(KP1_809654104), T11));
Chris@42 836 T25 = VFNMS(LDK(KP992114701), T1i, VMUL(LDK(KP250666467), T1l));
Chris@42 837 T26 = VADD(T24, T25);
Chris@42 838 T28 = VFNMS(LDK(KP1_071653589), Tv, VMUL(LDK(KP844327925), TD));
Chris@42 839 T29 = VFNMS(LDK(KP770513242), Tj, VMUL(LDK(KP1_274847979), Tb));
Chris@42 840 T2a = VADD(T28, T29);
Chris@42 841 {
Chris@42 842 V Tk, TE, T1M, T1G, T1H, T1N;
Chris@42 843 Tk = VFMA(LDK(KP1_071653589), Tb, VMUL(LDK(KP844327925), Tj));
Chris@42 844 TE = VFMA(LDK(KP1_937166322), Tv, VMUL(LDK(KP248689887), TD));
Chris@42 845 T1M = VADD(TE, Tk);
Chris@42 846 T1G = VFMA(LDK(KP1_752613360), T11, VMUL(LDK(KP481753674), TY));
Chris@42 847 T1H = VFMA(LDK(KP1_457937254), T1l, VMUL(LDK(KP684547105), T1i));
Chris@42 848 T1N = VADD(T1G, T1H);
Chris@42 849 TF = VSUB(Tk, TE);
Chris@42 850 T1U = VADD(T1M, T1N);
Chris@42 851 T1I = VSUB(T1G, T1H);
Chris@42 852 T1O = VMUL(LDK(KP559016994), VSUB(T1M, T1N));
Chris@42 853 }
Chris@42 854 {
Chris@42 855 V TG, TH, TI, T12, T1m, T1n;
Chris@42 856 TG = VFNMS(LDK(KP497379774), Tv, VMUL(LDK(KP968583161), TD));
Chris@42 857 TH = VFNMS(LDK(KP1_688655851), Tb, VMUL(LDK(KP535826794), Tj));
Chris@42 858 TI = VADD(TG, TH);
Chris@42 859 T12 = VFNMS(LDK(KP963507348), T11, VMUL(LDK(KP876306680), TY));
Chris@42 860 T1m = VFNMS(LDK(KP1_369094211), T1l, VMUL(LDK(KP728968627), T1i));
Chris@42 861 T1n = VADD(T12, T1m);
Chris@42 862 T1o = VMUL(LDK(KP559016994), VSUB(TI, T1n));
Chris@42 863 T1L = VSUB(T12, T1m);
Chris@42 864 T1E = VADD(TI, T1n);
Chris@42 865 T1K = VSUB(TG, TH);
Chris@42 866 }
Chris@42 867 {
Chris@42 868 V T1Z, T20, T27, T2e;
Chris@42 869 T1Z = VADD(T1D, T1E);
Chris@42 870 T20 = VBYI(VADD(T1T, T1U));
Chris@42 871 ST(&(x[WS(rs, 24)]), VSUB(T1Z, T20), ms, &(x[0]));
Chris@42 872 ST(&(x[WS(rs, 1)]), VADD(T1Z, T20), ms, &(x[WS(rs, 1)]));
Chris@42 873 T27 = VADD(T1D, VADD(T23, T26));
Chris@42 874 T2e = VBYI(VSUB(VADD(T2a, T2d), T1T));
Chris@42 875 ST(&(x[WS(rs, 21)]), VSUB(T27, T2e), ms, &(x[WS(rs, 1)]));
Chris@42 876 ST(&(x[WS(rs, 4)]), VADD(T27, T2e), ms, &(x[0]));
Chris@42 877 }
Chris@42 878 T2f = VBYI(VSUB(VFMA(LDK(KP309016994), T2a, VFMA(LDK(KP951056516), VADD(T21, T22), VFNMS(LDK(KP809016994), T2d, VMUL(LDK(KP587785252), VSUB(T24, T25))))), T1T));
Chris@42 879 T2g = VFMA(LDK(KP951056516), VSUB(T29, T28), VFMA(LDK(KP309016994), T23, VFMA(LDK(KP587785252), VSUB(T2c, T2b), VFNMS(LDK(KP809016994), T26, T1D))));
Chris@42 880 ST(&(x[WS(rs, 9)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@42 881 ST(&(x[WS(rs, 16)]), VSUB(T2g, T2f), ms, &(x[0]));
Chris@42 882 {
Chris@42 883 V T1J, T1X, T1W, T1Y, T1F, T1V;
Chris@42 884 T1F = VFNMS(LDK(KP250000000), T1E, T1D);
Chris@42 885 T1J = VFMA(LDK(KP951056516), TF, VADD(T1o, VFNMS(LDK(KP587785252), T1I, T1F)));
Chris@42 886 T1X = VFMA(LDK(KP587785252), TF, VFMA(LDK(KP951056516), T1I, VSUB(T1F, T1o)));
Chris@42 887 T1V = VFNMS(LDK(KP250000000), T1U, T1T);
Chris@42 888 T1W = VBYI(VADD(VFMA(LDK(KP951056516), T1K, VMUL(LDK(KP587785252), T1L)), VADD(T1O, T1V)));
Chris@42 889 T1Y = VBYI(VADD(VFNMS(LDK(KP951056516), T1L, VMUL(LDK(KP587785252), T1K)), VSUB(T1V, T1O)));
Chris@42 890 ST(&(x[WS(rs, 19)]), VSUB(T1J, T1W), ms, &(x[WS(rs, 1)]));
Chris@42 891 ST(&(x[WS(rs, 11)]), VADD(T1X, T1Y), ms, &(x[WS(rs, 1)]));
Chris@42 892 ST(&(x[WS(rs, 6)]), VADD(T1J, T1W), ms, &(x[0]));
Chris@42 893 ST(&(x[WS(rs, 14)]), VSUB(T1X, T1Y), ms, &(x[0]));
Chris@42 894 }
Chris@42 895 }
Chris@42 896 }
Chris@42 897 }
Chris@42 898 VLEAVE();
Chris@42 899 }
Chris@42 900
/* Twiddle-factor instruction table for the t2bv_25 codelet: one VTW(0, k)
 * entry for each k = 1..24, closed by a TW_NEXT record.  In the codelet body
 * above, input x[WS(rs, k)] is passed through BYTW together with the twiddle
 * at W[TWVL * 2 * (k - 1)], while x[0] is loaded without a twiddle, which is
 * why the table starts at 1 rather than 0.
 * NOTE(review): VTW, TW_NEXT and VL are defined outside this file; their
 * exact expansion and the meaning of the leading 0 argument are presumably
 * given by the SIMD support headers -- confirm there. */
Chris@42 901 static const tw_instr twinstr[] = {
Chris@42 902 VTW(0, 1),
Chris@42 903 VTW(0, 2),
Chris@42 904 VTW(0, 3),
Chris@42 905 VTW(0, 4),
Chris@42 906 VTW(0, 5),
Chris@42 907 VTW(0, 6),
Chris@42 908 VTW(0, 7),
Chris@42 909 VTW(0, 8),
Chris@42 910 VTW(0, 9),
Chris@42 911 VTW(0, 10),
Chris@42 912 VTW(0, 11),
Chris@42 913 VTW(0, 12),
Chris@42 914 VTW(0, 13),
Chris@42 915 VTW(0, 14),
Chris@42 916 VTW(0, 15),
Chris@42 917 VTW(0, 16),
Chris@42 918 VTW(0, 17),
Chris@42 919 VTW(0, 18),
Chris@42 920 VTW(0, 19),
Chris@42 921 VTW(0, 20),
Chris@42 922 VTW(0, 21),
Chris@42 923 VTW(0, 22),
Chris@42 924 VTW(0, 23),
Chris@42 925 VTW(0, 24),
/* End-of-table record; presumably tells the twiddle generator to advance by
 * VL (the SIMD vector length) for the next batch -- TODO confirm. */
Chris@42 926 {TW_NEXT, VL, 0}
Chris@42 927 };
Chris@42 928
/* Codelet descriptor passed to the planner when this codelet is registered.
 * Fields as written: 25, the name string "t2bv_25", the twinstr table, the
 * address of GENUS, a four-element brace group and three trailing zeros.
 * NOTE(review): 25 is presumably the transform size (radix), and
 * {171, 111, 77, 0} is presumably the operation count (additions,
 * multiplications, fused multiply-adds, other) -- confirm both, and the
 * meaning of the trailing zeros, against the ct_desc declaration. */
Chris@42 929 static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {171, 111, 77, 0}, 0, 0, 0 };
Chris@42 930
/* Registration entry point for this codelet: hands the t2bv_25 function and
 * its descriptor `desc` to planner `p` through X(kdft_dit_register).
 * The function name is wrapped in XSIMD(), a macro defined outside this file
 * (presumably adding a per-SIMD-variant prefix -- confirm). */
Chris@42 931 void XSIMD(codelet_t2bv_25) (planner *p) {
Chris@42 932 X(kdft_dit_register) (p, t2bv_25, &desc);
Chris@42 933 }
Chris@42 934 #endif /* HAVE_FMA */