annotate src/fftw-3.3.8/dft/simd/common/t2bv_25.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:05 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include dft/simd/t2b.h -sign 1 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 248 FP additions, 241 FP multiplications,
Chris@82 32 * (or, 67 additions, 60 multiplications, 181 fused multiply/add),
Chris@82 33 * 147 stack variables, 67 constants, and 50 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/t2b.h"
Chris@82 36
Chris@82 37 static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP617882369, +0.617882369114440893914546919006756321695042882);
Chris@82 40 DVK(KP792626838, +0.792626838241819413632131824093538848057784557);
Chris@82 41 DVK(KP876091699, +0.876091699473550838204498029706869638173524346);
Chris@82 42 DVK(KP803003575, +0.803003575438660414833440593570376004635464850);
Chris@82 43 DVK(KP999544308, +0.999544308746292983948881682379742149196758193);
Chris@82 44 DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
Chris@82 45 DVK(KP242145790, +0.242145790282157779872542093866183953459003101);
Chris@82 46 DVK(KP916574801, +0.916574801383451584742370439148878693530976769);
Chris@82 47 DVK(KP269969613, +0.269969613759572083574752974412347470060951301);
Chris@82 48 DVK(KP904730450, +0.904730450839922351881287709692877908104763647);
Chris@82 49 DVK(KP809385824, +0.809385824416008241660603814668679683846476688);
Chris@82 50 DVK(KP894834959, +0.894834959464455102997960030820114611498661386);
Chris@82 51 DVK(KP447417479, +0.447417479732227551498980015410057305749330693);
Chris@82 52 DVK(KP867381224, +0.867381224396525206773171885031575671309956167);
Chris@82 53 DVK(KP958953096, +0.958953096729998668045963838399037225970891871);
Chris@82 54 DVK(KP683113946, +0.683113946453479238701949862233725244439656928);
Chris@82 55 DVK(KP559154169, +0.559154169276087864842202529084232643714075927);
Chris@82 56 DVK(KP831864738, +0.831864738706457140726048799369896829771167132);
Chris@82 57 DVK(KP829049696, +0.829049696159252993975487806364305442437946767);
Chris@82 58 DVK(KP912575812, +0.912575812670962425556968549836277086778922727);
Chris@82 59 DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
Chris@82 60 DVK(KP262346850, +0.262346850930607871785420028382979691334784273);
Chris@82 61 DVK(KP860541664, +0.860541664367944677098261680920518816412804187);
Chris@82 62 DVK(KP681693190, +0.681693190061530575150324149145440022633095390);
Chris@82 63 DVK(KP560319534, +0.560319534973832390111614715371676131169633784);
Chris@82 64 DVK(KP897376177, +0.897376177523557693138608077137219684419427330);
Chris@82 65 DVK(KP855719849, +0.855719849902058969314654733608091555096772472);
Chris@82 66 DVK(KP949179823, +0.949179823508441261575555465843363271711583843);
Chris@82 67 DVK(KP952936919, +0.952936919628306576880750665357914584765951388);
Chris@82 68 DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
Chris@82 69 DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
Chris@82 70 DVK(KP997675361, +0.997675361079556513670859573984492383596555031);
Chris@82 71 DVK(KP237294955, +0.237294955877110315393888866460840817927895961);
Chris@82 72 DVK(KP904508497, +0.904508497187473712051146708591409529430077295);
Chris@82 73 DVK(KP906616052, +0.906616052148196230441134447086066874408359177);
Chris@82 74 DVK(KP923225144, +0.923225144846402650453449441572664695995209956);
Chris@82 75 DVK(KP921078979, +0.921078979742360627699756128143719920817673854);
Chris@82 76 DVK(KP578046249, +0.578046249379945007321754579646815604023525655);
Chris@82 77 DVK(KP763932022, +0.763932022500210303590826331268723764559381640);
Chris@82 78 DVK(KP956723877, +0.956723877038460305821989399535483155872969262);
Chris@82 79 DVK(KP690983005, +0.690983005625052575897706582817180941139845410);
Chris@82 80 DVK(KP945422727, +0.945422727388575946270360266328811958657216298);
Chris@82 81 DVK(KP522616830, +0.522616830205754336872861364785224694908468440);
Chris@82 82 DVK(KP772036680, +0.772036680810363904029489473607579825330539880);
Chris@82 83 DVK(KP669429328, +0.669429328479476605641803240971985825917022098);
Chris@82 84 DVK(KP570584518, +0.570584518783621657366766175430996792655723863);
Chris@82 85 DVK(KP982009705, +0.982009705009746369461829878184175962711969869);
Chris@82 86 DVK(KP845997307, +0.845997307939530944175097360758058292389769300);
Chris@82 87 DVK(KP734762448, +0.734762448793050413546343770063151342619912334);
Chris@82 88 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 89 DVK(KP447533225, +0.447533225982656890041886979663652563063114397);
Chris@82 90 DVK(KP059835404, +0.059835404262124915169548397419498386427871950);
Chris@82 91 DVK(KP494780565, +0.494780565770515410344588413655324772219443730);
Chris@82 92 DVK(KP603558818, +0.603558818296015001454675132653458027918768137);
Chris@82 93 DVK(KP987388751, +0.987388751065621252324603216482382109400433949);
Chris@82 94 DVK(KP522847744, +0.522847744331509716623755382187077770911012542);
Chris@82 95 DVK(KP667278218, +0.667278218140296670899089292254759909713898805);
Chris@82 96 DVK(KP244189809, +0.244189809627953270309879511234821255780225091);
Chris@82 97 DVK(KP132830569, +0.132830569247582714407653942074819768844536507);
Chris@82 98 DVK(KP869845200, +0.869845200362138853122720822420327157933056305);
Chris@82 99 DVK(KP786782374, +0.786782374965295178365099601674911834788448471);
Chris@82 100 DVK(KP066152395, +0.066152395967733048213034281011006031460903353);
Chris@82 101 DVK(KP120146378, +0.120146378570687701782758537356596213647956445);
Chris@82 102 DVK(KP893101515, +0.893101515366181661711202267938416198338079437);
Chris@82 103 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 104 DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 105 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 106 {
Chris@82 107 INT m;
Chris@82 108 R *x;
Chris@82 109 x = ii;
Chris@82 110 for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
Chris@82 111 V T1, Te, Tc, Td, T1O, T2X, T3Q, T1x, T2K, T1u, T2L, T1y, T27, T3b, T2R;
Chris@82 112 V T2M, T2f, T3M, Ty, T2E, Tv, T2D, Tz, T2a, T3e, T2U, T2F, T2i, T3N, TK;
Chris@82 113 V T2B, TS, T2A, TT, T2b, T3f, T2T, T2C, T2j, T3P, T1d, T2H, T1a, T2I, T1e;
Chris@82 114 V T28, T3c, T2Q, T2J, T2g;
Chris@82 115 {
Chris@82 116 V T8, Ta, Tb, T3, T5, T6, T1M, T1N;
Chris@82 117 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 118 {
Chris@82 119 V T7, T9, T2, T4;
Chris@82 120 T7 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 121 T8 = BYTW(&(W[TWVL * 18]), T7);
Chris@82 122 T9 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 123 Ta = BYTW(&(W[TWVL * 28]), T9);
Chris@82 124 Tb = VADD(T8, Ta);
Chris@82 125 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 126 T3 = BYTW(&(W[TWVL * 8]), T2);
Chris@82 127 T4 = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 128 T5 = BYTW(&(W[TWVL * 38]), T4);
Chris@82 129 T6 = VADD(T3, T5);
Chris@82 130 }
Chris@82 131 Te = VSUB(T6, Tb);
Chris@82 132 Tc = VADD(T6, Tb);
Chris@82 133 Td = VFNMS(LDK(KP250000000), Tc, T1);
Chris@82 134 T1M = VSUB(T3, T5);
Chris@82 135 T1N = VSUB(T8, Ta);
Chris@82 136 T1O = VFMA(LDK(KP618033988), T1N, T1M);
Chris@82 137 T2X = VFNMS(LDK(KP618033988), T1M, T1N);
Chris@82 138 }
Chris@82 139 {
Chris@82 140 V T1g, T1v, T1w, T1l, T1q, T1r, T1f, T1s, T1t;
Chris@82 141 T1f = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 142 T1g = BYTW(&(W[TWVL * 4]), T1f);
Chris@82 143 {
Chris@82 144 V T1i, T1p, T1k, T1n;
Chris@82 145 {
Chris@82 146 V T1h, T1o, T1j, T1m;
Chris@82 147 T1h = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 148 T1i = BYTW(&(W[TWVL * 14]), T1h);
Chris@82 149 T1o = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 150 T1p = BYTW(&(W[TWVL * 34]), T1o);
Chris@82 151 T1j = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 152 T1k = BYTW(&(W[TWVL * 44]), T1j);
Chris@82 153 T1m = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 154 T1n = BYTW(&(W[TWVL * 24]), T1m);
Chris@82 155 }
Chris@82 156 T1v = VSUB(T1i, T1k);
Chris@82 157 T1w = VSUB(T1n, T1p);
Chris@82 158 T1l = VADD(T1i, T1k);
Chris@82 159 T1q = VADD(T1n, T1p);
Chris@82 160 T1r = VADD(T1l, T1q);
Chris@82 161 }
Chris@82 162 T3Q = VADD(T1g, T1r);
Chris@82 163 T1x = VFMA(LDK(KP618033988), T1w, T1v);
Chris@82 164 T2K = VFNMS(LDK(KP618033988), T1v, T1w);
Chris@82 165 T1s = VFNMS(LDK(KP250000000), T1r, T1g);
Chris@82 166 T1t = VSUB(T1q, T1l);
Chris@82 167 T1u = VFNMS(LDK(KP559016994), T1t, T1s);
Chris@82 168 T2L = VFMA(LDK(KP559016994), T1t, T1s);
Chris@82 169 T1y = VFNMS(LDK(KP893101515), T1x, T1u);
Chris@82 170 T27 = VFNMS(LDK(KP120146378), T1x, T1u);
Chris@82 171 T3b = VFMA(LDK(KP066152395), T2L, T2K);
Chris@82 172 T2R = VFNMS(LDK(KP786782374), T2K, T2L);
Chris@82 173 T2M = VFMA(LDK(KP869845200), T2L, T2K);
Chris@82 174 T2f = VFMA(LDK(KP132830569), T1u, T1x);
Chris@82 175 }
Chris@82 176 {
Chris@82 177 V Th, Tw, Tx, Tm, Tr, Ts, Tg, Tt, Tu;
Chris@82 178 Tg = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 179 Th = BYTW(&(W[0]), Tg);
Chris@82 180 {
Chris@82 181 V Tj, Tq, Tl, To;
Chris@82 182 {
Chris@82 183 V Ti, Tp, Tk, Tn;
Chris@82 184 Ti = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 185 Tj = BYTW(&(W[TWVL * 10]), Ti);
Chris@82 186 Tp = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 187 Tq = BYTW(&(W[TWVL * 30]), Tp);
Chris@82 188 Tk = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 189 Tl = BYTW(&(W[TWVL * 40]), Tk);
Chris@82 190 Tn = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 191 To = BYTW(&(W[TWVL * 20]), Tn);
Chris@82 192 }
Chris@82 193 Tw = VSUB(Tj, Tl);
Chris@82 194 Tx = VSUB(Tq, To);
Chris@82 195 Tm = VADD(Tj, Tl);
Chris@82 196 Tr = VADD(To, Tq);
Chris@82 197 Ts = VADD(Tm, Tr);
Chris@82 198 }
Chris@82 199 T3M = VADD(Th, Ts);
Chris@82 200 Ty = VFNMS(LDK(KP618033988), Tx, Tw);
Chris@82 201 T2E = VFMA(LDK(KP618033988), Tw, Tx);
Chris@82 202 Tt = VFNMS(LDK(KP250000000), Ts, Th);
Chris@82 203 Tu = VSUB(Tm, Tr);
Chris@82 204 Tv = VFMA(LDK(KP559016994), Tu, Tt);
Chris@82 205 T2D = VFNMS(LDK(KP559016994), Tu, Tt);
Chris@82 206 Tz = VFNMS(LDK(KP244189809), Ty, Tv);
Chris@82 207 T2a = VFMA(LDK(KP667278218), Tv, Ty);
Chris@82 208 T3e = VFNMS(LDK(KP522847744), T2E, T2D);
Chris@82 209 T2U = VFNMS(LDK(KP987388751), T2D, T2E);
Chris@82 210 T2F = VFMA(LDK(KP893101515), T2E, T2D);
Chris@82 211 T2i = VFNMS(LDK(KP603558818), Ty, Tv);
Chris@82 212 }
Chris@82 213 {
Chris@82 214 V TM, TE, TJ, TN, TO, TP, TL, TQ, TR;
Chris@82 215 TL = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 216 TM = BYTW(&(W[TWVL * 6]), TL);
Chris@82 217 {
Chris@82 218 V TB, TI, TD, TG;
Chris@82 219 {
Chris@82 220 V TA, TH, TC, TF;
Chris@82 221 TA = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 222 TB = BYTW(&(W[TWVL * 46]), TA);
Chris@82 223 TH = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 224 TI = BYTW(&(W[TWVL * 26]), TH);
Chris@82 225 TC = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 226 TD = BYTW(&(W[TWVL * 16]), TC);
Chris@82 227 TF = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 228 TG = BYTW(&(W[TWVL * 36]), TF);
Chris@82 229 }
Chris@82 230 TE = VSUB(TB, TD);
Chris@82 231 TJ = VSUB(TG, TI);
Chris@82 232 TN = VADD(TD, TB);
Chris@82 233 TO = VADD(TI, TG);
Chris@82 234 TP = VADD(TN, TO);
Chris@82 235 }
Chris@82 236 T3N = VADD(TM, TP);
Chris@82 237 TK = VFMA(LDK(KP618033988), TJ, TE);
Chris@82 238 T2B = VFNMS(LDK(KP618033988), TE, TJ);
Chris@82 239 TQ = VFMS(LDK(KP250000000), TP, TM);
Chris@82 240 TR = VSUB(TN, TO);
Chris@82 241 TS = VFNMS(LDK(KP559016994), TR, TQ);
Chris@82 242 T2A = VFMA(LDK(KP559016994), TR, TQ);
Chris@82 243 TT = VFNMS(LDK(KP667278218), TS, TK);
Chris@82 244 T2b = VFMA(LDK(KP869845200), TS, TK);
Chris@82 245 T3f = VFNMS(LDK(KP494780565), T2A, T2B);
Chris@82 246 T2T = VFNMS(LDK(KP132830569), T2A, T2B);
Chris@82 247 T2C = VFMA(LDK(KP120146378), T2B, T2A);
Chris@82 248 T2j = VFNMS(LDK(KP786782374), TK, TS);
Chris@82 249 }
Chris@82 250 {
Chris@82 251 V TW, T1b, T1c, T11, T16, T17, TV, T18, T19;
Chris@82 252 TV = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 253 TW = BYTW(&(W[TWVL * 2]), TV);
Chris@82 254 {
Chris@82 255 V TY, T15, T10, T13;
Chris@82 256 {
Chris@82 257 V TX, T14, TZ, T12;
Chris@82 258 TX = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 259 TY = BYTW(&(W[TWVL * 12]), TX);
Chris@82 260 T14 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 261 T15 = BYTW(&(W[TWVL * 32]), T14);
Chris@82 262 TZ = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 263 T10 = BYTW(&(W[TWVL * 42]), TZ);
Chris@82 264 T12 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 265 T13 = BYTW(&(W[TWVL * 22]), T12);
Chris@82 266 }
Chris@82 267 T1b = VSUB(TY, T10);
Chris@82 268 T1c = VSUB(T15, T13);
Chris@82 269 T11 = VADD(TY, T10);
Chris@82 270 T16 = VADD(T13, T15);
Chris@82 271 T17 = VADD(T11, T16);
Chris@82 272 }
Chris@82 273 T3P = VADD(TW, T17);
Chris@82 274 T1d = VFNMS(LDK(KP618033988), T1c, T1b);
Chris@82 275 T2H = VFMA(LDK(KP618033988), T1b, T1c);
Chris@82 276 T18 = VFNMS(LDK(KP250000000), T17, TW);
Chris@82 277 T19 = VSUB(T16, T11);
Chris@82 278 T1a = VFNMS(LDK(KP559016994), T19, T18);
Chris@82 279 T2I = VFMA(LDK(KP559016994), T19, T18);
Chris@82 280 T1e = VFNMS(LDK(KP522847744), T1d, T1a);
Chris@82 281 T28 = VFNMS(LDK(KP494780565), T1a, T1d);
Chris@82 282 T3c = VFNMS(LDK(KP667278218), T2I, T2H);
Chris@82 283 T2Q = VFNMS(LDK(KP059835404), T2H, T2I);
Chris@82 284 T2J = VFMA(LDK(KP066152395), T2I, T2H);
Chris@82 285 T2g = VFMA(LDK(KP447533225), T1d, T1a);
Chris@82 286 }
Chris@82 287 {
Chris@82 288 V T3Y, T40, T3L, T3S, T3T, T3U, T3Z, T3V;
Chris@82 289 {
Chris@82 290 V T3W, T3X, T3O, T3R;
Chris@82 291 T3W = VSUB(T3M, T3N);
Chris@82 292 T3X = VSUB(T3P, T3Q);
Chris@82 293 T3Y = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T3X, T3W));
Chris@82 294 T40 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T3W, T3X));
Chris@82 295 T3L = VADD(T1, Tc);
Chris@82 296 T3O = VADD(T3M, T3N);
Chris@82 297 T3R = VADD(T3P, T3Q);
Chris@82 298 T3S = VADD(T3O, T3R);
Chris@82 299 T3T = VFNMS(LDK(KP250000000), T3S, T3L);
Chris@82 300 T3U = VSUB(T3O, T3R);
Chris@82 301 }
Chris@82 302 ST(&(x[0]), VADD(T3S, T3L), ms, &(x[0]));
Chris@82 303 T3Z = VFNMS(LDK(KP559016994), T3U, T3T);
Chris@82 304 ST(&(x[WS(rs, 10)]), VFNMSI(T40, T3Z), ms, &(x[0]));
Chris@82 305 ST(&(x[WS(rs, 15)]), VFMAI(T40, T3Z), ms, &(x[WS(rs, 1)]));
Chris@82 306 T3V = VFMA(LDK(KP559016994), T3U, T3T);
Chris@82 307 ST(&(x[WS(rs, 5)]), VFMAI(T3Y, T3V), ms, &(x[WS(rs, 1)]));
Chris@82 308 ST(&(x[WS(rs, 20)]), VFNMSI(T3Y, T3V), ms, &(x[0]));
Chris@82 309 }
Chris@82 310 {
Chris@82 311 V T2Z, T35, T3B, T3I, T2W, T38, T2O, T32, T2z, T3t, T3h, T3s, T3p, T3F, T3r;
Chris@82 312 V T3v, T3C, T3z, T3A;
Chris@82 313 T2Z = VFMA(LDK(KP734762448), T2U, T2T);
Chris@82 314 T35 = VFNMS(LDK(KP734762448), T2F, T2C);
Chris@82 315 T3z = VFMA(LDK(KP845997307), T3c, T3b);
Chris@82 316 T3A = VFMA(LDK(KP982009705), T3f, T3e);
Chris@82 317 T3B = VFMA(LDK(KP570584518), T3A, T3z);
Chris@82 318 T3I = VFNMS(LDK(KP669429328), T3z, T3A);
Chris@82 319 {
Chris@82 320 V T2S, T2V, T37, T36;
Chris@82 321 T2S = VFMA(LDK(KP772036680), T2R, T2Q);
Chris@82 322 T2V = VFNMS(LDK(KP734762448), T2U, T2T);
Chris@82 323 T36 = VFMA(LDK(KP772036680), T2M, T2J);
Chris@82 324 T37 = VFMA(LDK(KP522616830), T2V, T36);
Chris@82 325 T2W = VFMA(LDK(KP945422727), T2V, T2S);
Chris@82 326 T38 = VFNMS(LDK(KP690983005), T37, T2S);
Chris@82 327 }
Chris@82 328 {
Chris@82 329 V T2N, T2G, T31, T30;
Chris@82 330 T2N = VFNMS(LDK(KP772036680), T2M, T2J);
Chris@82 331 T2G = VFMA(LDK(KP734762448), T2F, T2C);
Chris@82 332 T30 = VFNMS(LDK(KP772036680), T2R, T2Q);
Chris@82 333 T31 = VFNMS(LDK(KP522616830), T2G, T30);
Chris@82 334 T2O = VFMA(LDK(KP956723877), T2N, T2G);
Chris@82 335 T32 = VFMA(LDK(KP763932022), T31, T2N);
Chris@82 336 }
Chris@82 337 {
Chris@82 338 V T3o, T3u, T3l, T3m, T3n;
Chris@82 339 T2z = VFNMS(LDK(KP559016994), Te, Td);
Chris@82 340 T3m = VFMA(LDK(KP447533225), T2B, T2A);
Chris@82 341 T3n = VFMA(LDK(KP578046249), T2D, T2E);
Chris@82 342 T3o = VFNMS(LDK(KP921078979), T3n, T3m);
Chris@82 343 T3t = VFMA(LDK(KP921078979), T3n, T3m);
Chris@82 344 {
Chris@82 345 V T3d, T3g, T3j, T3k;
Chris@82 346 T3d = VFNMS(LDK(KP845997307), T3c, T3b);
Chris@82 347 T3g = VFNMS(LDK(KP982009705), T3f, T3e);
Chris@82 348 T3h = VFMA(LDK(KP923225144), T3g, T3d);
Chris@82 349 T3u = VFNMS(LDK(KP923225144), T3g, T3d);
Chris@82 350 T3j = VFNMS(LDK(KP059835404), T2K, T2L);
Chris@82 351 T3k = VFMA(LDK(KP603558818), T2H, T2I);
Chris@82 352 T3l = VFMA(LDK(KP845997307), T3k, T3j);
Chris@82 353 T3s = VFNMS(LDK(KP845997307), T3k, T3j);
Chris@82 354 }
Chris@82 355 T3p = VFNMS(LDK(KP906616052), T3o, T3l);
Chris@82 356 T3F = VFNMS(LDK(KP904508497), T3u, T3s);
Chris@82 357 T3r = VFNMS(LDK(KP237294955), T3h, T2z);
Chris@82 358 T3v = VFNMS(LDK(KP997675361), T3u, T3t);
Chris@82 359 T3C = VFMA(LDK(KP906616052), T3o, T3l);
Chris@82 360 }
Chris@82 361 {
Chris@82 362 V T2P, T2Y, T3i, T3q;
Chris@82 363 T2P = VFMA(LDK(KP992114701), T2O, T2z);
Chris@82 364 T2Y = VMUL(LDK(KP998026728), VFMA(LDK(KP952936919), T2X, T2W));
Chris@82 365 ST(&(x[WS(rs, 22)]), VFNMSI(T2Y, T2P), ms, &(x[0]));
Chris@82 366 ST(&(x[WS(rs, 3)]), VFMAI(T2Y, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 367 T3i = VFMA(LDK(KP949179823), T3h, T2z);
Chris@82 368 T3q = VMUL(LDK(KP998026728), VFNMS(LDK(KP952936919), T2X, T3p));
Chris@82 369 ST(&(x[WS(rs, 23)]), VFNMSI(T3q, T3i), ms, &(x[WS(rs, 1)]));
Chris@82 370 ST(&(x[WS(rs, 2)]), VFMAI(T3q, T3i), ms, &(x[0]));
Chris@82 371 }
Chris@82 372 {
Chris@82 373 V T34, T3a, T33, T39;
Chris@82 374 T33 = VFNMS(LDK(KP855719849), T32, T2Z);
Chris@82 375 T34 = VFMA(LDK(KP897376177), T33, T2z);
Chris@82 376 T39 = VFMA(LDK(KP855719849), T38, T35);
Chris@82 377 T3a = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T39, T2X));
Chris@82 378 ST(&(x[WS(rs, 8)]), VFMAI(T3a, T34), ms, &(x[0]));
Chris@82 379 ST(&(x[WS(rs, 17)]), VFNMSI(T3a, T34), ms, &(x[WS(rs, 1)]));
Chris@82 380 }
Chris@82 381 {
Chris@82 382 V T3x, T3H, T3E, T3K, T3w;
Chris@82 383 T3w = VFMA(LDK(KP560319534), T3v, T3s);
Chris@82 384 T3x = VFNMS(LDK(KP949179823), T3w, T3r);
Chris@82 385 {
Chris@82 386 V T3G, T3y, T3J, T3D;
Chris@82 387 T3G = VFNMS(LDK(KP681693190), T3F, T3t);
Chris@82 388 T3H = VFNMS(LDK(KP860541664), T3G, T3r);
Chris@82 389 T3y = VFMA(LDK(KP262346850), T3p, T2X);
Chris@82 390 T3J = VFNMS(LDK(KP669429328), T3C, T3I);
Chris@82 391 T3D = VFMA(LDK(KP618033988), T3C, T3B);
Chris@82 392 T3E = VMUL(LDK(KP951056516), VFNMS(LDK(KP949179823), T3D, T3y));
Chris@82 393 T3K = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T3J, T3y));
Chris@82 394 }
Chris@82 395 ST(&(x[WS(rs, 12)]), VFNMSI(T3E, T3x), ms, &(x[0]));
Chris@82 396 ST(&(x[WS(rs, 18)]), VFMAI(T3K, T3H), ms, &(x[0]));
Chris@82 397 ST(&(x[WS(rs, 13)]), VFMAI(T3E, T3x), ms, &(x[WS(rs, 1)]));
Chris@82 398 ST(&(x[WS(rs, 7)]), VFNMSI(T3K, T3H), ms, &(x[WS(rs, 1)]));
Chris@82 399 }
Chris@82 400 }
Chris@82 401 {
Chris@82 402 V T2n, T2t, T1V, T22, T2l, T2w, T2d, T2q, Tf, T1I, T1A, T1E, T1B, T1Z, T1J;
Chris@82 403 V T1R, T1W, T1T, T1U;
Chris@82 404 T2n = VFNMS(LDK(KP912575812), T2j, T2i);
Chris@82 405 T2t = VFNMS(LDK(KP912575812), T2b, T2a);
Chris@82 406 T1T = VFNMS(LDK(KP829049696), TT, Tz);
Chris@82 407 T1U = VFNMS(LDK(KP831864738), T1y, T1e);
Chris@82 408 T1V = VFMA(LDK(KP559154169), T1U, T1T);
Chris@82 409 T22 = VFNMS(LDK(KP683113946), T1T, T1U);
Chris@82 410 {
Chris@82 411 V T2h, T2k, T2v, T2u;
Chris@82 412 T2h = VFMA(LDK(KP958953096), T2g, T2f);
Chris@82 413 T2k = VFMA(LDK(KP912575812), T2j, T2i);
Chris@82 414 T2u = VFMA(LDK(KP867381224), T28, T27);
Chris@82 415 T2v = VFMA(LDK(KP447417479), T2k, T2u);
Chris@82 416 T2l = VFMA(LDK(KP894834959), T2k, T2h);
Chris@82 417 T2w = VFNMS(LDK(KP763932022), T2v, T2h);
Chris@82 418 }
Chris@82 419 {
Chris@82 420 V T29, T2c, T2p, T2o;
Chris@82 421 T29 = VFNMS(LDK(KP867381224), T28, T27);
Chris@82 422 T2c = VFMA(LDK(KP912575812), T2b, T2a);
Chris@82 423 T2o = VFNMS(LDK(KP958953096), T2g, T2f);
Chris@82 424 T2p = VFMA(LDK(KP447417479), T2c, T2o);
Chris@82 425 T2d = VFNMS(LDK(KP809385824), T2c, T29);
Chris@82 426 T2q = VFMA(LDK(KP690983005), T2p, T29);
Chris@82 427 }
Chris@82 428 {
Chris@82 429 V T1Q, T1F, T1P, T1G, T1H;
Chris@82 430 Tf = VFMA(LDK(KP559016994), Te, Td);
Chris@82 431 T1G = VFMA(LDK(KP578046249), T1a, T1d);
Chris@82 432 T1H = VFMA(LDK(KP987388751), T1u, T1x);
Chris@82 433 T1I = VFNMS(LDK(KP831864738), T1H, T1G);
Chris@82 434 T1Q = VFMA(LDK(KP831864738), T1H, T1G);
Chris@82 435 {
Chris@82 436 V TU, T1z, T1C, T1D;
Chris@82 437 TU = VFMA(LDK(KP829049696), TT, Tz);
Chris@82 438 T1z = VFMA(LDK(KP831864738), T1y, T1e);
Chris@82 439 T1A = VFMA(LDK(KP904730450), T1z, TU);
Chris@82 440 T1F = VFNMS(LDK(KP904730450), T1z, TU);
Chris@82 441 T1C = VFMA(LDK(KP269969613), Tv, Ty);
Chris@82 442 T1D = VFMA(LDK(KP603558818), TK, TS);
Chris@82 443 T1E = VFMA(LDK(KP916574801), T1D, T1C);
Chris@82 444 T1P = VFNMS(LDK(KP916574801), T1D, T1C);
Chris@82 445 }
Chris@82 446 T1B = VFNMS(LDK(KP242145790), T1A, Tf);
Chris@82 447 T1Z = VADD(T1E, T1F);
Chris@82 448 T1J = VFNMS(LDK(KP904730450), T1I, T1F);
Chris@82 449 T1R = VFMA(LDK(KP904730450), T1Q, T1P);
Chris@82 450 T1W = VFNMS(LDK(KP904730450), T1Q, T1P);
Chris@82 451 }
Chris@82 452 {
Chris@82 453 V T25, T26, T2e, T2m;
Chris@82 454 T25 = VFMA(LDK(KP968583161), T1A, Tf);
Chris@82 455 T26 = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1R, T1O));
Chris@82 456 ST(&(x[WS(rs, 1)]), VFMAI(T26, T25), ms, &(x[WS(rs, 1)]));
Chris@82 457 ST(&(x[WS(rs, 24)]), VFNMSI(T26, T25), ms, &(x[0]));
Chris@82 458 T2e = VFNMS(LDK(KP992114701), T2d, Tf);
Chris@82 459 T2m = VMUL(LDK(KP951056516), VFNMS(LDK(KP992114701), T2l, T1O));
Chris@82 460 ST(&(x[WS(rs, 4)]), VFNMSI(T2m, T2e), ms, &(x[0]));
Chris@82 461 ST(&(x[WS(rs, 21)]), VFMAI(T2m, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 462 }
Chris@82 463 {
Chris@82 464 V T2s, T2y, T2r, T2x;
Chris@82 465 T2r = VFNMS(LDK(KP999544308), T2q, T2n);
Chris@82 466 T2s = VFNMS(LDK(KP803003575), T2r, Tf);
Chris@82 467 T2x = VFNMS(LDK(KP999544308), T2w, T2t);
Chris@82 468 T2y = VMUL(LDK(KP951056516), VFNMS(LDK(KP803003575), T2x, T1O));
Chris@82 469 ST(&(x[WS(rs, 9)]), VFNMSI(T2y, T2s), ms, &(x[WS(rs, 1)]));
Chris@82 470 ST(&(x[WS(rs, 16)]), VFMAI(T2y, T2s), ms, &(x[0]));
Chris@82 471 }
Chris@82 472 {
Chris@82 473 V T1L, T21, T1Y, T24, T1K;
Chris@82 474 T1K = VFNMS(LDK(KP618033988), T1J, T1E);
Chris@82 475 T1L = VFNMS(LDK(KP876091699), T1K, T1B);
Chris@82 476 {
Chris@82 477 V T20, T1S, T23, T1X;
Chris@82 478 T20 = VFNMS(LDK(KP683113946), T1Z, T1I);
Chris@82 479 T21 = VFMA(LDK(KP792626838), T20, T1B);
Chris@82 480 T1S = VFNMS(LDK(KP242145790), T1R, T1O);
Chris@82 481 T23 = VFMA(LDK(KP617882369), T1W, T22);
Chris@82 482 T1X = VFMA(LDK(KP559016994), T1W, T1V);
Chris@82 483 T1Y = VMUL(LDK(KP951056516), VFMA(LDK(KP968583161), T1X, T1S));
Chris@82 484 T24 = VMUL(LDK(KP951056516), VFNMS(LDK(KP876306680), T23, T1S));
Chris@82 485 }
Chris@82 486 ST(&(x[WS(rs, 6)]), VFMAI(T1Y, T1L), ms, &(x[0]));
Chris@82 487 ST(&(x[WS(rs, 14)]), VFNMSI(T24, T21), ms, &(x[0]));
Chris@82 488 ST(&(x[WS(rs, 19)]), VFNMSI(T1Y, T1L), ms, &(x[WS(rs, 1)]));
Chris@82 489 ST(&(x[WS(rs, 11)]), VFMAI(T24, T21), ms, &(x[WS(rs, 1)]));
Chris@82 490 }
Chris@82 491 }
Chris@82 492 }
Chris@82 493 }
Chris@82 494 VLEAVE();
Chris@82 495 }
Chris@82 496
Chris@82 497 static const tw_instr twinstr[] = {
Chris@82 498 VTW(0, 1),
Chris@82 499 VTW(0, 2),
Chris@82 500 VTW(0, 3),
Chris@82 501 VTW(0, 4),
Chris@82 502 VTW(0, 5),
Chris@82 503 VTW(0, 6),
Chris@82 504 VTW(0, 7),
Chris@82 505 VTW(0, 8),
Chris@82 506 VTW(0, 9),
Chris@82 507 VTW(0, 10),
Chris@82 508 VTW(0, 11),
Chris@82 509 VTW(0, 12),
Chris@82 510 VTW(0, 13),
Chris@82 511 VTW(0, 14),
Chris@82 512 VTW(0, 15),
Chris@82 513 VTW(0, 16),
Chris@82 514 VTW(0, 17),
Chris@82 515 VTW(0, 18),
Chris@82 516 VTW(0, 19),
Chris@82 517 VTW(0, 20),
Chris@82 518 VTW(0, 21),
Chris@82 519 VTW(0, 22),
Chris@82 520 VTW(0, 23),
Chris@82 521 VTW(0, 24),
Chris@82 522 {TW_NEXT, VL, 0}
Chris@82 523 };
Chris@82 524
Chris@82 525 static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {67, 60, 181, 0}, 0, 0, 0 };
Chris@82 526
Chris@82 527 void XSIMD(codelet_t2bv_25) (planner *p) {
Chris@82 528 X(kdft_dit_register) (p, t2bv_25, &desc);
Chris@82 529 }
Chris@82 530 #else
Chris@82 531
Chris@82 532 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 25 -name t2bv_25 -include dft/simd/t2b.h -sign 1 */
Chris@82 533
Chris@82 534 /*
Chris@82 535 * This function contains 248 FP additions, 188 FP multiplications,
Chris@82 536 * (or, 171 additions, 111 multiplications, 77 fused multiply/add),
Chris@82 537 * 100 stack variables, 40 constants, and 50 memory accesses
Chris@82 538 */
Chris@82 539 #include "dft/simd/t2b.h"
Chris@82 540
Chris@82 541 static void t2bv_25(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 542 {
Chris@82 543 DVK(KP497379774, +0.497379774329709576484567492012895936835134813);
Chris@82 544 DVK(KP968583161, +0.968583161128631119490168375464735813836012403);
Chris@82 545 DVK(KP248689887, +0.248689887164854788242283746006447968417567406);
Chris@82 546 DVK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
Chris@82 547 DVK(KP809016994, +0.809016994374947424102293417182819058860154590);
Chris@82 548 DVK(KP309016994, +0.309016994374947424102293417182819058860154590);
Chris@82 549 DVK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
Chris@82 550 DVK(KP535826794, +0.535826794978996618271308767867639978063575346);
Chris@82 551 DVK(KP425779291, +0.425779291565072648862502445744251703979973042);
Chris@82 552 DVK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
Chris@82 553 DVK(KP963507348, +0.963507348203430549974383005744259307057084020);
Chris@82 554 DVK(KP876306680, +0.876306680043863587308115903922062583399064238);
Chris@82 555 DVK(KP844327925, +0.844327925502015078548558063966681505381659241);
Chris@82 556 DVK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
Chris@82 557 DVK(KP481753674, +0.481753674101715274987191502872129653528542010);
Chris@82 558 DVK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
Chris@82 559 DVK(KP851558583, +0.851558583130145297725004891488503407959946084);
Chris@82 560 DVK(KP904827052, +0.904827052466019527713668647932697593970413911);
Chris@82 561 DVK(KP125333233, +0.125333233564304245373118759816508793942918247);
Chris@82 562 DVK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
Chris@82 563 DVK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
Chris@82 564 DVK(KP684547105, +0.684547105928688673732283357621209269889519233);
Chris@82 565 DVK(KP637423989, +0.637423989748689710176712811676016195434917298);
Chris@82 566 DVK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
Chris@82 567 DVK(KP062790519, +0.062790519529313376076178224565631133122484832);
Chris@82 568 DVK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
Chris@82 569 DVK(KP770513242, +0.770513242775789230803009636396177847271667672);
Chris@82 570 DVK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
Chris@82 571 DVK(KP125581039, +0.125581039058626752152356449131262266244969664);
Chris@82 572 DVK(KP998026728, +0.998026728428271561952336806863450553336905220);
Chris@82 573 DVK(KP992114701, +0.992114701314477831049793042785778521453036709);
Chris@82 574 DVK(KP250666467, +0.250666467128608490746237519633017587885836494);
Chris@82 575 DVK(KP728968627, +0.728968627421411523146730319055259111372571664);
Chris@82 576 DVK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
Chris@82 577 DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
Chris@82 578 DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
Chris@82 579 DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 580 DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 581 DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 582 DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 583 {
Chris@82 584 INT m;
Chris@82 585 R *x;
Chris@82 586 x = ii;
Chris@82 587 for (m = mb, W = W + (mb * ((TWVL / VL) * 48)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 48), MAKE_VOLATILE_STRIDE(25, rs)) {
Chris@82 588 V T1A, T1z, T1R, T1S, T1B, T1C, T1Q, T2L, T1l, T2v, T1i, T3e, T2u, Tb, T2i;
Chris@82 589 V Tj, T3b, T2h, Tv, T2k, TD, T3a, T2l, T11, T2s, TY, T3d, T2r;
Chris@82 590 {
Chris@82 591 V T1v, T1x, T1y, T1q, T1s, T1t, T1P;
Chris@82 592 T1A = LD(&(x[0]), ms, &(x[0]));
Chris@82 593 {
Chris@82 594 V T1u, T1w, T1p, T1r;
Chris@82 595 T1u = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
Chris@82 596 T1v = BYTW(&(W[TWVL * 18]), T1u);
Chris@82 597 T1w = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
Chris@82 598 T1x = BYTW(&(W[TWVL * 28]), T1w);
Chris@82 599 T1y = VADD(T1v, T1x);
Chris@82 600 T1p = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 601 T1q = BYTW(&(W[TWVL * 8]), T1p);
Chris@82 602 T1r = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
Chris@82 603 T1s = BYTW(&(W[TWVL * 38]), T1r);
Chris@82 604 T1t = VADD(T1q, T1s);
Chris@82 605 }
Chris@82 606 T1z = VMUL(LDK(KP559016994), VSUB(T1t, T1y));
Chris@82 607 T1R = VSUB(T1v, T1x);
Chris@82 608 T1S = VMUL(LDK(KP587785252), T1R);
Chris@82 609 T1B = VADD(T1t, T1y);
Chris@82 610 T1C = VFNMS(LDK(KP250000000), T1B, T1A);
Chris@82 611 T1P = VSUB(T1q, T1s);
Chris@82 612 T1Q = VMUL(LDK(KP951056516), T1P);
Chris@82 613 T2L = VMUL(LDK(KP587785252), T1P);
Chris@82 614 }
Chris@82 615 {
Chris@82 616 V T1f, T19, T1b, T1c, T14, T16, T17, T1e;
Chris@82 617 T1e = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 618 T1f = BYTW(&(W[TWVL * 4]), T1e);
Chris@82 619 {
Chris@82 620 V T18, T1a, T13, T15;
Chris@82 621 T18 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
Chris@82 622 T19 = BYTW(&(W[TWVL * 24]), T18);
Chris@82 623 T1a = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
Chris@82 624 T1b = BYTW(&(W[TWVL * 34]), T1a);
Chris@82 625 T1c = VADD(T19, T1b);
Chris@82 626 T13 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
Chris@82 627 T14 = BYTW(&(W[TWVL * 14]), T13);
Chris@82 628 T15 = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
Chris@82 629 T16 = BYTW(&(W[TWVL * 44]), T15);
Chris@82 630 T17 = VADD(T14, T16);
Chris@82 631 }
Chris@82 632 {
Chris@82 633 V T1j, T1k, T1d, T1g, T1h;
Chris@82 634 T1j = VSUB(T14, T16);
Chris@82 635 T1k = VSUB(T19, T1b);
Chris@82 636 T1l = VFMA(LDK(KP475528258), T1j, VMUL(LDK(KP293892626), T1k));
Chris@82 637 T2v = VFNMS(LDK(KP475528258), T1k, VMUL(LDK(KP293892626), T1j));
Chris@82 638 T1d = VMUL(LDK(KP559016994), VSUB(T17, T1c));
Chris@82 639 T1g = VADD(T17, T1c);
Chris@82 640 T1h = VFNMS(LDK(KP250000000), T1g, T1f);
Chris@82 641 T1i = VADD(T1d, T1h);
Chris@82 642 T3e = VADD(T1f, T1g);
Chris@82 643 T2u = VSUB(T1h, T1d);
Chris@82 644 }
Chris@82 645 }
Chris@82 646 {
Chris@82 647 V Tg, T7, T9, Td, T2, T4, Tc, Tf;
Chris@82 648 Tf = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 649 Tg = BYTW(&(W[TWVL * 6]), Tf);
Chris@82 650 {
Chris@82 651 V T6, T8, T1, T3;
Chris@82 652 T6 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
Chris@82 653 T7 = BYTW(&(W[TWVL * 26]), T6);
Chris@82 654 T8 = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
Chris@82 655 T9 = BYTW(&(W[TWVL * 36]), T8);
Chris@82 656 Td = VADD(T7, T9);
Chris@82 657 T1 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
Chris@82 658 T2 = BYTW(&(W[TWVL * 16]), T1);
Chris@82 659 T3 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
Chris@82 660 T4 = BYTW(&(W[TWVL * 46]), T3);
Chris@82 661 Tc = VADD(T2, T4);
Chris@82 662 }
Chris@82 663 {
Chris@82 664 V T5, Ta, Te, Th, Ti;
Chris@82 665 T5 = VSUB(T2, T4);
Chris@82 666 Ta = VSUB(T7, T9);
Chris@82 667 Tb = VFMA(LDK(KP475528258), T5, VMUL(LDK(KP293892626), Ta));
Chris@82 668 T2i = VFNMS(LDK(KP475528258), Ta, VMUL(LDK(KP293892626), T5));
Chris@82 669 Te = VMUL(LDK(KP559016994), VSUB(Tc, Td));
Chris@82 670 Th = VADD(Tc, Td);
Chris@82 671 Ti = VFNMS(LDK(KP250000000), Th, Tg);
Chris@82 672 Tj = VADD(Te, Ti);
Chris@82 673 T3b = VADD(Tg, Th);
Chris@82 674 T2h = VSUB(Ti, Te);
Chris@82 675 }
Chris@82 676 }
Chris@82 677 {
Chris@82 678 V TA, Tr, Tt, Tx, Tm, To, Tw, Tz;
Chris@82 679 Tz = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 680 TA = BYTW(&(W[0]), Tz);
Chris@82 681 {
Chris@82 682 V Tq, Ts, Tl, Tn;
Chris@82 683 Tq = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
Chris@82 684 Tr = BYTW(&(W[TWVL * 20]), Tq);
Chris@82 685 Ts = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
Chris@82 686 Tt = BYTW(&(W[TWVL * 30]), Ts);
Chris@82 687 Tx = VADD(Tr, Tt);
Chris@82 688 Tl = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 689 Tm = BYTW(&(W[TWVL * 10]), Tl);
Chris@82 690 Tn = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
Chris@82 691 To = BYTW(&(W[TWVL * 40]), Tn);
Chris@82 692 Tw = VADD(Tm, To);
Chris@82 693 }
Chris@82 694 {
Chris@82 695 V Tp, Tu, Ty, TB, TC;
Chris@82 696 Tp = VSUB(Tm, To);
Chris@82 697 Tu = VSUB(Tr, Tt);
Chris@82 698 Tv = VFMA(LDK(KP475528258), Tp, VMUL(LDK(KP293892626), Tu));
Chris@82 699 T2k = VFNMS(LDK(KP475528258), Tu, VMUL(LDK(KP293892626), Tp));
Chris@82 700 Ty = VMUL(LDK(KP559016994), VSUB(Tw, Tx));
Chris@82 701 TB = VADD(Tw, Tx);
Chris@82 702 TC = VFNMS(LDK(KP250000000), TB, TA);
Chris@82 703 TD = VADD(Ty, TC);
Chris@82 704 T3a = VADD(TA, TB);
Chris@82 705 T2l = VSUB(TC, Ty);
Chris@82 706 }
Chris@82 707 }
Chris@82 708 {
Chris@82 709 V TV, TP, TR, TS, TK, TM, TN, TU;
Chris@82 710 TU = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 711 TV = BYTW(&(W[TWVL * 2]), TU);
Chris@82 712 {
Chris@82 713 V TO, TQ, TJ, TL;
Chris@82 714 TO = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
Chris@82 715 TP = BYTW(&(W[TWVL * 22]), TO);
Chris@82 716 TQ = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
Chris@82 717 TR = BYTW(&(W[TWVL * 32]), TQ);
Chris@82 718 TS = VADD(TP, TR);
Chris@82 719 TJ = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 720 TK = BYTW(&(W[TWVL * 12]), TJ);
Chris@82 721 TL = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
Chris@82 722 TM = BYTW(&(W[TWVL * 42]), TL);
Chris@82 723 TN = VADD(TK, TM);
Chris@82 724 }
Chris@82 725 {
Chris@82 726 V TZ, T10, TT, TW, TX;
Chris@82 727 TZ = VSUB(TK, TM);
Chris@82 728 T10 = VSUB(TP, TR);
Chris@82 729 T11 = VFMA(LDK(KP475528258), TZ, VMUL(LDK(KP293892626), T10));
Chris@82 730 T2s = VFNMS(LDK(KP475528258), T10, VMUL(LDK(KP293892626), TZ));
Chris@82 731 TT = VMUL(LDK(KP559016994), VSUB(TN, TS));
Chris@82 732 TW = VADD(TN, TS);
Chris@82 733 TX = VFNMS(LDK(KP250000000), TW, TV);
Chris@82 734 TY = VADD(TT, TX);
Chris@82 735 T3d = VADD(TV, TW);
Chris@82 736 T2r = VSUB(TX, TT);
Chris@82 737 }
Chris@82 738 }
Chris@82 739 {
Chris@82 740 V T3g, T3o, T3k, T3l, T3j, T3m, T3p, T3n;
Chris@82 741 {
Chris@82 742 V T3c, T3f, T3h, T3i;
Chris@82 743 T3c = VSUB(T3a, T3b);
Chris@82 744 T3f = VSUB(T3d, T3e);
Chris@82 745 T3g = VBYI(VFMA(LDK(KP951056516), T3c, VMUL(LDK(KP587785252), T3f)));
Chris@82 746 T3o = VBYI(VFNMS(LDK(KP951056516), T3f, VMUL(LDK(KP587785252), T3c)));
Chris@82 747 T3k = VADD(T1A, T1B);
Chris@82 748 T3h = VADD(T3a, T3b);
Chris@82 749 T3i = VADD(T3d, T3e);
Chris@82 750 T3l = VADD(T3h, T3i);
Chris@82 751 T3j = VMUL(LDK(KP559016994), VSUB(T3h, T3i));
Chris@82 752 T3m = VFNMS(LDK(KP250000000), T3l, T3k);
Chris@82 753 }
Chris@82 754 ST(&(x[0]), VADD(T3k, T3l), ms, &(x[0]));
Chris@82 755 T3p = VSUB(T3m, T3j);
Chris@82 756 ST(&(x[WS(rs, 10)]), VADD(T3o, T3p), ms, &(x[0]));
Chris@82 757 ST(&(x[WS(rs, 15)]), VSUB(T3p, T3o), ms, &(x[WS(rs, 1)]));
Chris@82 758 T3n = VADD(T3j, T3m);
Chris@82 759 ST(&(x[WS(rs, 5)]), VADD(T3g, T3n), ms, &(x[WS(rs, 1)]));
Chris@82 760 ST(&(x[WS(rs, 20)]), VSUB(T3n, T3g), ms, &(x[0]));
Chris@82 761 }
Chris@82 762 {
Chris@82 763 V T2z, T2M, T2U, T2V, T2W, T34, T35, T36, T2X, T2Y, T2Z, T31, T32, T33, T2n;
Chris@82 764 V T2N, T2E, T2K, T2y, T2H, T2A, T2G, T38, T39;
Chris@82 765 T2z = VSUB(T1C, T1z);
Chris@82 766 T2M = VFNMS(LDK(KP951056516), T1R, T2L);
Chris@82 767 T2U = VFMA(LDK(KP1_369094211), T2k, VMUL(LDK(KP728968627), T2l));
Chris@82 768 T2V = VFNMS(LDK(KP992114701), T2h, VMUL(LDK(KP250666467), T2i));
Chris@82 769 T2W = VADD(T2U, T2V);
Chris@82 770 T34 = VFNMS(LDK(KP125581039), T2s, VMUL(LDK(KP998026728), T2r));
Chris@82 771 T35 = VFMA(LDK(KP1_274847979), T2v, VMUL(LDK(KP770513242), T2u));
Chris@82 772 T36 = VADD(T34, T35);
Chris@82 773 T2X = VFMA(LDK(KP1_996053456), T2s, VMUL(LDK(KP062790519), T2r));
Chris@82 774 T2Y = VFNMS(LDK(KP637423989), T2u, VMUL(LDK(KP1_541026485), T2v));
Chris@82 775 T2Z = VADD(T2X, T2Y);
Chris@82 776 T31 = VFNMS(LDK(KP1_457937254), T2k, VMUL(LDK(KP684547105), T2l));
Chris@82 777 T32 = VFMA(LDK(KP1_984229402), T2i, VMUL(LDK(KP125333233), T2h));
Chris@82 778 T33 = VADD(T31, T32);
Chris@82 779 {
Chris@82 780 V T2j, T2m, T2I, T2C, T2D, T2J;
Chris@82 781 T2j = VFNMS(LDK(KP851558583), T2i, VMUL(LDK(KP904827052), T2h));
Chris@82 782 T2m = VFMA(LDK(KP1_752613360), T2k, VMUL(LDK(KP481753674), T2l));
Chris@82 783 T2I = VADD(T2m, T2j);
Chris@82 784 T2C = VFMA(LDK(KP1_071653589), T2s, VMUL(LDK(KP844327925), T2r));
Chris@82 785 T2D = VFMA(LDK(KP125581039), T2v, VMUL(LDK(KP998026728), T2u));
Chris@82 786 T2J = VADD(T2C, T2D);
Chris@82 787 T2n = VSUB(T2j, T2m);
Chris@82 788 T2N = VADD(T2I, T2J);
Chris@82 789 T2E = VSUB(T2C, T2D);
Chris@82 790 T2K = VMUL(LDK(KP559016994), VSUB(T2I, T2J));
Chris@82 791 }
Chris@82 792 {
Chris@82 793 V T2o, T2p, T2q, T2t, T2w, T2x;
Chris@82 794 T2o = VFNMS(LDK(KP963507348), T2k, VMUL(LDK(KP876306680), T2l));
Chris@82 795 T2p = VFMA(LDK(KP1_809654104), T2i, VMUL(LDK(KP425779291), T2h));
Chris@82 796 T2q = VSUB(T2o, T2p);
Chris@82 797 T2t = VFNMS(LDK(KP1_688655851), T2s, VMUL(LDK(KP535826794), T2r));
Chris@82 798 T2w = VFNMS(LDK(KP1_996053456), T2v, VMUL(LDK(KP062790519), T2u));
Chris@82 799 T2x = VADD(T2t, T2w);
Chris@82 800 T2y = VMUL(LDK(KP559016994), VSUB(T2q, T2x));
Chris@82 801 T2H = VSUB(T2t, T2w);
Chris@82 802 T2A = VADD(T2q, T2x);
Chris@82 803 T2G = VADD(T2o, T2p);
Chris@82 804 }
Chris@82 805 {
Chris@82 806 V T2S, T2T, T30, T37;
Chris@82 807 T2S = VADD(T2z, T2A);
Chris@82 808 T2T = VBYI(VADD(T2M, T2N));
Chris@82 809 ST(&(x[WS(rs, 23)]), VSUB(T2S, T2T), ms, &(x[WS(rs, 1)]));
Chris@82 810 ST(&(x[WS(rs, 2)]), VADD(T2S, T2T), ms, &(x[0]));
Chris@82 811 T30 = VADD(T2z, VADD(T2W, T2Z));
Chris@82 812 T37 = VBYI(VSUB(VADD(T33, T36), T2M));
Chris@82 813 ST(&(x[WS(rs, 22)]), VSUB(T30, T37), ms, &(x[0]));
Chris@82 814 ST(&(x[WS(rs, 3)]), VADD(T30, T37), ms, &(x[WS(rs, 1)]));
Chris@82 815 }
Chris@82 816 T38 = VBYI(VSUB(VFMA(LDK(KP951056516), VSUB(T2U, T2V), VFMA(LDK(KP309016994), T33, VFNMS(LDK(KP809016994), T36, VMUL(LDK(KP587785252), VSUB(T2X, T2Y))))), T2M));
Chris@82 817 T39 = VFMA(LDK(KP309016994), T2W, VFMA(LDK(KP951056516), VSUB(T32, T31), VFMA(LDK(KP587785252), VSUB(T35, T34), VFNMS(LDK(KP809016994), T2Z, T2z))));
Chris@82 818 ST(&(x[WS(rs, 8)]), VADD(T38, T39), ms, &(x[0]));
Chris@82 819 ST(&(x[WS(rs, 17)]), VSUB(T39, T38), ms, &(x[WS(rs, 1)]));
Chris@82 820 {
Chris@82 821 V T2F, T2Q, T2P, T2R, T2B, T2O;
Chris@82 822 T2B = VFNMS(LDK(KP250000000), T2A, T2z);
Chris@82 823 T2F = VFMA(LDK(KP951056516), T2n, VADD(T2y, VFNMS(LDK(KP587785252), T2E, T2B)));
Chris@82 824 T2Q = VFMA(LDK(KP587785252), T2n, VFMA(LDK(KP951056516), T2E, VSUB(T2B, T2y)));
Chris@82 825 T2O = VFNMS(LDK(KP250000000), T2N, T2M);
Chris@82 826 T2P = VBYI(VADD(VFMA(LDK(KP951056516), T2G, VMUL(LDK(KP587785252), T2H)), VADD(T2K, T2O)));
Chris@82 827 T2R = VBYI(VADD(VFNMS(LDK(KP951056516), T2H, VMUL(LDK(KP587785252), T2G)), VSUB(T2O, T2K)));
Chris@82 828 ST(&(x[WS(rs, 18)]), VSUB(T2F, T2P), ms, &(x[0]));
Chris@82 829 ST(&(x[WS(rs, 12)]), VADD(T2Q, T2R), ms, &(x[0]));
Chris@82 830 ST(&(x[WS(rs, 7)]), VADD(T2F, T2P), ms, &(x[WS(rs, 1)]));
Chris@82 831 ST(&(x[WS(rs, 13)]), VSUB(T2Q, T2R), ms, &(x[WS(rs, 1)]));
Chris@82 832 }
Chris@82 833 }
Chris@82 834 {
Chris@82 835 V T1D, T1T, T21, T22, T23, T2b, T2c, T2d, T24, T25, T26, T28, T29, T2a, TF;
Chris@82 836 V T1U, T1I, T1O, T1o, T1L, T1E, T1K, T2f, T2g;
Chris@82 837 T1D = VADD(T1z, T1C);
Chris@82 838 T1T = VADD(T1Q, T1S);
Chris@82 839 T21 = VFMA(LDK(KP1_688655851), Tv, VMUL(LDK(KP535826794), TD));
Chris@82 840 T22 = VFMA(LDK(KP1_541026485), Tb, VMUL(LDK(KP637423989), Tj));
Chris@82 841 T23 = VSUB(T21, T22);
Chris@82 842 T2b = VFMA(LDK(KP851558583), T11, VMUL(LDK(KP904827052), TY));
Chris@82 843 T2c = VFMA(LDK(KP1_984229402), T1l, VMUL(LDK(KP125333233), T1i));
Chris@82 844 T2d = VADD(T2b, T2c);
Chris@82 845 T24 = VFNMS(LDK(KP425779291), TY, VMUL(LDK(KP1_809654104), T11));
Chris@82 846 T25 = VFNMS(LDK(KP992114701), T1i, VMUL(LDK(KP250666467), T1l));
Chris@82 847 T26 = VADD(T24, T25);
Chris@82 848 T28 = VFNMS(LDK(KP1_071653589), Tv, VMUL(LDK(KP844327925), TD));
Chris@82 849 T29 = VFNMS(LDK(KP770513242), Tj, VMUL(LDK(KP1_274847979), Tb));
Chris@82 850 T2a = VADD(T28, T29);
Chris@82 851 {
Chris@82 852 V Tk, TE, T1M, T1G, T1H, T1N;
Chris@82 853 Tk = VFMA(LDK(KP1_071653589), Tb, VMUL(LDK(KP844327925), Tj));
Chris@82 854 TE = VFMA(LDK(KP1_937166322), Tv, VMUL(LDK(KP248689887), TD));
Chris@82 855 T1M = VADD(TE, Tk);
Chris@82 856 T1G = VFMA(LDK(KP1_752613360), T11, VMUL(LDK(KP481753674), TY));
Chris@82 857 T1H = VFMA(LDK(KP1_457937254), T1l, VMUL(LDK(KP684547105), T1i));
Chris@82 858 T1N = VADD(T1G, T1H);
Chris@82 859 TF = VSUB(Tk, TE);
Chris@82 860 T1U = VADD(T1M, T1N);
Chris@82 861 T1I = VSUB(T1G, T1H);
Chris@82 862 T1O = VMUL(LDK(KP559016994), VSUB(T1M, T1N));
Chris@82 863 }
Chris@82 864 {
Chris@82 865 V TG, TH, TI, T12, T1m, T1n;
Chris@82 866 TG = VFNMS(LDK(KP497379774), Tv, VMUL(LDK(KP968583161), TD));
Chris@82 867 TH = VFNMS(LDK(KP1_688655851), Tb, VMUL(LDK(KP535826794), Tj));
Chris@82 868 TI = VADD(TG, TH);
Chris@82 869 T12 = VFNMS(LDK(KP963507348), T11, VMUL(LDK(KP876306680), TY));
Chris@82 870 T1m = VFNMS(LDK(KP1_369094211), T1l, VMUL(LDK(KP728968627), T1i));
Chris@82 871 T1n = VADD(T12, T1m);
Chris@82 872 T1o = VMUL(LDK(KP559016994), VSUB(TI, T1n));
Chris@82 873 T1L = VSUB(T12, T1m);
Chris@82 874 T1E = VADD(TI, T1n);
Chris@82 875 T1K = VSUB(TG, TH);
Chris@82 876 }
Chris@82 877 {
Chris@82 878 V T1Z, T20, T27, T2e;
Chris@82 879 T1Z = VADD(T1D, T1E);
Chris@82 880 T20 = VBYI(VADD(T1T, T1U));
Chris@82 881 ST(&(x[WS(rs, 24)]), VSUB(T1Z, T20), ms, &(x[0]));
Chris@82 882 ST(&(x[WS(rs, 1)]), VADD(T1Z, T20), ms, &(x[WS(rs, 1)]));
Chris@82 883 T27 = VADD(T1D, VADD(T23, T26));
Chris@82 884 T2e = VBYI(VSUB(VADD(T2a, T2d), T1T));
Chris@82 885 ST(&(x[WS(rs, 21)]), VSUB(T27, T2e), ms, &(x[WS(rs, 1)]));
Chris@82 886 ST(&(x[WS(rs, 4)]), VADD(T27, T2e), ms, &(x[0]));
Chris@82 887 }
Chris@82 888 T2f = VBYI(VSUB(VFMA(LDK(KP309016994), T2a, VFMA(LDK(KP951056516), VADD(T21, T22), VFNMS(LDK(KP809016994), T2d, VMUL(LDK(KP587785252), VSUB(T24, T25))))), T1T));
Chris@82 889 T2g = VFMA(LDK(KP951056516), VSUB(T29, T28), VFMA(LDK(KP309016994), T23, VFMA(LDK(KP587785252), VSUB(T2c, T2b), VFNMS(LDK(KP809016994), T26, T1D))));
Chris@82 890 ST(&(x[WS(rs, 9)]), VADD(T2f, T2g), ms, &(x[WS(rs, 1)]));
Chris@82 891 ST(&(x[WS(rs, 16)]), VSUB(T2g, T2f), ms, &(x[0]));
Chris@82 892 {
Chris@82 893 V T1J, T1X, T1W, T1Y, T1F, T1V;
Chris@82 894 T1F = VFNMS(LDK(KP250000000), T1E, T1D);
Chris@82 895 T1J = VFMA(LDK(KP951056516), TF, VADD(T1o, VFNMS(LDK(KP587785252), T1I, T1F)));
Chris@82 896 T1X = VFMA(LDK(KP587785252), TF, VFMA(LDK(KP951056516), T1I, VSUB(T1F, T1o)));
Chris@82 897 T1V = VFNMS(LDK(KP250000000), T1U, T1T);
Chris@82 898 T1W = VBYI(VADD(VFMA(LDK(KP951056516), T1K, VMUL(LDK(KP587785252), T1L)), VADD(T1O, T1V)));
Chris@82 899 T1Y = VBYI(VADD(VFNMS(LDK(KP951056516), T1L, VMUL(LDK(KP587785252), T1K)), VSUB(T1V, T1O)));
Chris@82 900 ST(&(x[WS(rs, 19)]), VSUB(T1J, T1W), ms, &(x[WS(rs, 1)]));
Chris@82 901 ST(&(x[WS(rs, 11)]), VADD(T1X, T1Y), ms, &(x[WS(rs, 1)]));
Chris@82 902 ST(&(x[WS(rs, 6)]), VADD(T1J, T1W), ms, &(x[0]));
Chris@82 903 ST(&(x[WS(rs, 14)]), VSUB(T1X, T1Y), ms, &(x[0]));
Chris@82 904 }
Chris@82 905 }
Chris@82 906 }
Chris@82 907 }
Chris@82 908 VLEAVE();
Chris@82 909 }
Chris@82 910
Chris@82 911 static const tw_instr twinstr[] = {
Chris@82 912 VTW(0, 1),
Chris@82 913 VTW(0, 2),
Chris@82 914 VTW(0, 3),
Chris@82 915 VTW(0, 4),
Chris@82 916 VTW(0, 5),
Chris@82 917 VTW(0, 6),
Chris@82 918 VTW(0, 7),
Chris@82 919 VTW(0, 8),
Chris@82 920 VTW(0, 9),
Chris@82 921 VTW(0, 10),
Chris@82 922 VTW(0, 11),
Chris@82 923 VTW(0, 12),
Chris@82 924 VTW(0, 13),
Chris@82 925 VTW(0, 14),
Chris@82 926 VTW(0, 15),
Chris@82 927 VTW(0, 16),
Chris@82 928 VTW(0, 17),
Chris@82 929 VTW(0, 18),
Chris@82 930 VTW(0, 19),
Chris@82 931 VTW(0, 20),
Chris@82 932 VTW(0, 21),
Chris@82 933 VTW(0, 22),
Chris@82 934 VTW(0, 23),
Chris@82 935 VTW(0, 24),
Chris@82 936 {TW_NEXT, VL, 0}
Chris@82 937 };
Chris@82 938
Chris@82 939 static const ct_desc desc = { 25, XSIMD_STRING("t2bv_25"), twinstr, &GENUS, {171, 111, 77, 0}, 0, 0, 0 };
Chris@82 940
Chris@82 941 void XSIMD(codelet_t2bv_25) (planner *p) {
Chris@82 942 X(kdft_dit_register) (p, t2bv_25, &desc);
Chris@82 943 }
Chris@82 944 #endif