annotate src/fftw-3.3.5/dft/simd/common/q1fv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul 30 16:45:26 EDT 2016 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include q1f.h */

/*
 * This function contains 264 FP additions, 192 FP multiplications,
 * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
 * 117 stack variables, 1 constants, and 128 memory accesses
 */
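/*
 * The single DVK constant below, KP707106781, is 1/sqrt(2) = cos(pi/4)
 * = sin(pi/4), the only irrational twiddle magnitude a size-8 butterfly
 * needs.  A minimal standalone check (a sketch, not part of the
 * generated code):
 *
 *     #include <assert.h>
 *     #include <math.h>
 *
 *     void check_kp(void)
 *     {
 *          assert(fabs(0.707106781186547524400844362104849039284835938
 *                      - sqrt(0.5)) < 1e-15);
 *     }
 */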
#include "q1f.h"

static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
               V T42, T43, T1U, T1V, T2Y, T2Z, TT, TS;
               {
                    V T3, Te, T1E, T1P, Tu, Tp, T25, T20, T2b, T2m, T3M, T2x, T2C, T3X, TA;
                    V TL, T48, T4d, T17, T11, TW, T1i, T2I, T1y, T1t, T2T, T3f, T3q, T34, T39;
                    V T3G, T3B, Ts, Tv, Tf, Ta, T23, T26, T1Q, T1L, T2A, T2D, T2n, T2i, T4b;
                    V T4e, T3Y, T3T, TZ, T12, TM, TH, T35, T2L, T3j, T1w, T1z, T1j, T1e, T36;
                    V T2O, T3C, T3i, T3k;
                    {
                         V T3d, T32, T3e, T3o, T3p, T33;
                         {
                              V T2v, T2w, T3V, T46, T3W;
                              {
                                   V T1, T2, Tc, Td, T1C, T1D, T1N, T1O;
                                   T1 = LD(&(x[0]), ms, &(x[0]));
                                   T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                                   Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                                   Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                                   T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                                   T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
                                   T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                                   T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
                                   {
                                        V T29, T1Y, T1Z, T2a, T2k, T2l, Tn, To, T3K, T3L;
                                        T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
                                        T3 = VSUB(T1, T2);
                                        Tn = VADD(T1, T2);
                                        Te = VSUB(Tc, Td);
                                        To = VADD(Tc, Td);
                                        T1E = VSUB(T1C, T1D);
                                        T1Y = VADD(T1C, T1D);
                                        T1P = VSUB(T1N, T1O);
                                        T1Z = VADD(T1N, T1O);
                                        T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
                                        T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
                                        T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
                                        Tu = VSUB(Tn, To);
                                        Tp = VADD(Tn, To);
                                        T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
                                        T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
                                        T25 = VSUB(T1Y, T1Z);
                                        T20 = VADD(T1Y, T1Z);
                                        T2v = VADD(T29, T2a);
                                        T2b = VSUB(T29, T2a);
                                        T2w = VADD(T2k, T2l);
                                        T2m = VSUB(T2k, T2l);
                                        T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
                                        T46 = VADD(T3K, T3L);
                                        T3M = VSUB(T3K, T3L);
                                        T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
                                   }
                              }
                              {
                                   V T15, TU, T16, T1g, TV, T1h;
                                   {
                                        V Ty, Tz, TJ, TK, T47;
                                        Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                                        Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
                                        TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                                        T2x = VADD(T2v, T2w);
                                        T2C = VSUB(T2v, T2w);
                                        TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
                                        T47 = VADD(T3V, T3W);
                                        T3X = VSUB(T3V, T3W);
                                        T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                                        TA = VSUB(Ty, Tz);
                                        TU = VADD(Ty, Tz);
                                        T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
                                        T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                                        TL = VSUB(TJ, TK);
                                        TV = VADD(TJ, TK);
                                        T48 = VADD(T46, T47);
                                        T4d = VSUB(T46, T47);
                                        T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
                                   }
                                   {
                                        V T2G, T1r, T2H, T2R, T1s, T2S;
                                        T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
                                        T17 = VSUB(T15, T16);
                                        T1r = VADD(T15, T16);
                                        T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
                                        T11 = VSUB(TU, TV);
                                        TW = VADD(TU, TV);
                                        T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
                                        T1i = VSUB(T1g, T1h);
                                        T1s = VADD(T1g, T1h);
                                        T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
                                        T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
                                        T2I = VSUB(T2G, T2H);
                                        T32 = VADD(T2G, T2H);
                                        T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
                                        T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
                                        T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
                                        T1y = VSUB(T1r, T1s);
                                        T1t = VADD(T1r, T1s);
                                        T33 = VADD(T2R, T2S);
                                        T2T = VSUB(T2R, T2S);
                                   }
                              }
                         }
                         {
                              V T2y, T2e, T3Q, T2z, T2h, T49, T3P, T3R;
                              {
                                   V T6, Tq, T1I, Tr, T9, T21, T1H, T1J;
                                   {
                                        V T4, T3z, T3A, T5, T7, T8, T1F, T1G;
                                        T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                                        T3f = VSUB(T3d, T3e);
                                        T3z = VADD(T3d, T3e);
                                        T3q = VSUB(T3o, T3p);
                                        T3A = VADD(T3o, T3p);
                                        T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                                        T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                                        T34 = VADD(T32, T33);
                                        T39 = VSUB(T32, T33);
                                        T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                                        T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                                        T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                                        T3G = VSUB(T3z, T3A);
                                        T3B = VADD(T3z, T3A);
                                        T6 = VSUB(T4, T5);
                                        Tq = VADD(T4, T5);
                                        T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                                        Tr = VADD(T7, T8);
                                        T9 = VSUB(T7, T8);
                                        T21 = VADD(T1F, T1G);
                                        T1H = VSUB(T1F, T1G);
                                        T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                                   }
                                   {
                                        V T2f, T22, T1K, T2g, T2c, T2d, T3N, T3O;
                                        T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                        T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                        T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                        Ts = VADD(Tq, Tr);
                                        Tv = VSUB(Tr, Tq);
                                        Tf = VSUB(T9, T6);
                                        Ta = VADD(T6, T9);
                                        T22 = VADD(T1I, T1J);
                                        T1K = VSUB(T1I, T1J);
                                        T2y = VADD(T2c, T2d);
                                        T2e = VSUB(T2c, T2d);
                                        T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                        T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                        T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                        T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                        T23 = VADD(T21, T22);
                                        T26 = VSUB(T22, T21);
                                        T1Q = VSUB(T1K, T1H);
                                        T1L = VADD(T1H, T1K);
                                        T2z = VADD(T2f, T2g);
                                        T2h = VSUB(T2f, T2g);
                                        T49 = VADD(T3N, T3O);
                                        T3P = VSUB(T3N, T3O);
                                        T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                   }
                              }
                              {
                                   V TX, TD, T1b, TY, TG, T1u, T1a, T1c;
                                   {
                                        V TE, T4a, T3S, TF, TB, TC, T18, T19;
                                        TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                        TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                        TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                        T2A = VADD(T2y, T2z);
                                        T2D = VSUB(T2z, T2y);
                                        T2n = VSUB(T2h, T2e);
                                        T2i = VADD(T2e, T2h);
                                        T4a = VADD(T3Q, T3R);
                                        T3S = VSUB(T3Q, T3R);
                                        TX = VADD(TB, TC);
                                        TD = VSUB(TB, TC);
                                        TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                        T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T4b = VADD(T49, T4a);
                                        T4e = VSUB(T4a, T49);
                                        T3Y = VSUB(T3S, T3P);
                                        T3T = VADD(T3P, T3S);
                                        TY = VADD(TE, TF);
                                        TG = VSUB(TE, TF);
                                        T1u = VADD(T18, T19);
                                        T1a = VSUB(T18, T19);
                                        T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                   }
                                   {
                                        V T2M, T1v, T1d, T2N, T2J, T2K, T3g, T3h;
                                        T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                                        T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                                        T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                                        TZ = VADD(TX, TY);
                                        T12 = VSUB(TY, TX);
                                        TM = VSUB(TG, TD);
                                        TH = VADD(TD, TG);
                                        T1v = VADD(T1b, T1c);
                                        T1d = VSUB(T1b, T1c);
                                        T35 = VADD(T2J, T2K);
                                        T2L = VSUB(T2J, T2K);
                                        T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                                        T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                        T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                        T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                        T1w = VADD(T1u, T1v);
                                        T1z = VSUB(T1v, T1u);
                                        T1j = VSUB(T1d, T1a);
                                        T1e = VADD(T1a, T1d);
                                        T36 = VADD(T2M, T2N);
                                        T2O = VSUB(T2M, T2N);
                                        T3C = VADD(T3g, T3h);
                                        T3i = VSUB(T3g, T3h);
                                        T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                   }
                              }
                         }
                    }
                    {
                         V T3a, T2U, T2P, T3H, T3r, T3m, T13, T27, T3b, T4f;
                         {
                              V T37, T3E, T2B, T24;
                              {
                                   V T3D, T3l, Tt, T4c;
                                   ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
                                   ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
                                   ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
                                   T37 = VADD(T35, T36);
                                   T3a = VSUB(T36, T35);
                                   T2U = VSUB(T2O, T2L);
                                   T2P = VADD(T2L, T2O);
                                   T3D = VADD(T3j, T3k);
                                   T3l = VSUB(T3j, T3k);
                                   ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
                                   ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
                                   ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
                                   Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
                                   T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
                                   T3E = VADD(T3C, T3D);
                                   T3H = VSUB(T3D, T3C);
                                   T3r = VSUB(T3l, T3i);
                                   T3m = VADD(T3i, T3l);
                                   T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
                                   T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
                                   ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
                                   ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                   ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
                              }
                              {
                                   V T38, T1A, Tw, T10, T1x, T3F, T2E, T3I;
                                   T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
                                   T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
                                   T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
                                   ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
                                   ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                   T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
                                   T1A = BYTWJ(&(W[TWVL * 10]), VFNMSI(T1z, T1y));
                                   Tw = BYTWJ(&(W[TWVL * 10]), VFNMSI(Tv, Tu));
                                   ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                   ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
                                   ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
                                   T2E = BYTWJ(&(W[TWVL * 10]), VFNMSI(T2D, T2C));
                                   T3I = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3H, T3G));
                                   ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                                   ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
                                   ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
                                   T13 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T12, T11));
                                   T27 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T26, T25));
                                   T3b = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3a, T39));
                                   ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
                                   ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
                                   T4f = BYTWJ(&(W[TWVL * 10]), VFNMSI(T4e, T4d));
                              }
                         }
                         {
                              V Tj, Tk, T2r, T2j, Ti, Th, T2o, T2s, T1M, T1R, T41, T40;
                              {
                                   V T3c, T4g, T3J, T2F, Tx, T1B;
                                   Tx = BYTWJ(&(W[TWVL * 2]), VFMAI(Tv, Tu));
                                   T1B = BYTWJ(&(W[TWVL * 2]), VFMAI(T1z, T1y));
                                   ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                   ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                   ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                   T3c = BYTWJ(&(W[TWVL * 2]), VFMAI(T3a, T39));
                                   T4g = BYTWJ(&(W[TWVL * 2]), VFMAI(T4e, T4d));
                                   ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                                   ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                                   ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
                                   T3J = BYTWJ(&(W[TWVL * 2]), VFMAI(T3H, T3G));
                                   T2F = BYTWJ(&(W[TWVL * 2]), VFMAI(T2D, T2C));
                                   {
                                        V T14, Tb, Tg, T28, T3U, T3Z;
                                        T28 = BYTWJ(&(W[TWVL * 2]), VFMAI(T26, T25));
                                        ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T14 = BYTWJ(&(W[TWVL * 2]), VFMAI(T12, T11));
                                        Tj = VFNMS(LDK(KP707106781), Ta, T3);
                                        Tb = VFMA(LDK(KP707106781), Ta, T3);
                                        Tg = VFNMS(LDK(KP707106781), Tf, Te);
                                        Tk = VFMA(LDK(KP707106781), Tf, Te);
                                        ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
                                        ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
                                        ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T3U = VFMA(LDK(KP707106781), T3T, T3M);
                                        T42 = VFNMS(LDK(KP707106781), T3T, T3M);
                                        T43 = VFMA(LDK(KP707106781), T3Y, T3X);
                                        T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
                                        ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                                        T2r = VFNMS(LDK(KP707106781), T2i, T2b);
                                        T2j = VFMA(LDK(KP707106781), T2i, T2b);
                                        Ti = BYTWJ(&(W[TWVL * 12]), VFMAI(Tg, Tb));
                                        Th = BYTWJ(&(W[0]), VFNMSI(Tg, Tb));
                                        T2o = VFNMS(LDK(KP707106781), T2n, T2m);
                                        T2s = VFMA(LDK(KP707106781), T2n, T2m);
                                        T1U = VFNMS(LDK(KP707106781), T1L, T1E);
                                        T1M = VFMA(LDK(KP707106781), T1L, T1E);
                                        T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
                                        T1V = VFMA(LDK(KP707106781), T1Q, T1P);
                                        T41 = BYTWJ(&(W[TWVL * 12]), VFMAI(T3Z, T3U));
                                        T40 = BYTWJ(&(W[0]), VFNMSI(T3Z, T3U));
                                   }
                              }
                              {
                                   V TQ, TR, T1n, T1o, T3v, T3w;
                                   {
                                        V T1f, T1k, T3n, TP, TO, T3s, T2Q, T2V;
                                        {
                                             V TI, T2q, T2p, T1T, T1S, TN;
                                             TQ = VFNMS(LDK(KP707106781), TH, TA);
                                             TI = VFMA(LDK(KP707106781), TH, TA);
                                             ST(&(x[WS(vs, 7)]), Ti, ms, &(x[WS(vs, 7)]));
                                             ST(&(x[WS(vs, 1)]), Th, ms, &(x[WS(vs, 1)]));
                                             T2q = BYTWJ(&(W[TWVL * 12]), VFMAI(T2o, T2j));
                                             T2p = BYTWJ(&(W[0]), VFNMSI(T2o, T2j));
                                             T1T = BYTWJ(&(W[TWVL * 12]), VFMAI(T1R, T1M));
                                             T1S = BYTWJ(&(W[0]), VFNMSI(T1R, T1M));
                                             ST(&(x[WS(vs, 7) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                             TN = VFNMS(LDK(KP707106781), TM, TL);
                                             TR = VFMA(LDK(KP707106781), TM, TL);
                                             T1n = VFNMS(LDK(KP707106781), T1e, T17);
                                             T1f = VFMA(LDK(KP707106781), T1e, T17);
                                             ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 7)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 1)]));
                                             ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                             T1k = VFNMS(LDK(KP707106781), T1j, T1i);
                                             T1o = VFMA(LDK(KP707106781), T1j, T1i);
                                             T3v = VFNMS(LDK(KP707106781), T3m, T3f);
                                             T3n = VFMA(LDK(KP707106781), T3m, T3f);
                                             TP = BYTWJ(&(W[TWVL * 12]), VFMAI(TN, TI));
                                             TO = BYTWJ(&(W[0]), VFNMSI(TN, TI));
                                             T3s = VFNMS(LDK(KP707106781), T3r, T3q);
                                             T3w = VFMA(LDK(KP707106781), T3r, T3q);
                                        }
                                        T2Y = VFNMS(LDK(KP707106781), T2P, T2I);
                                        T2Q = VFMA(LDK(KP707106781), T2P, T2I);
                                        T2V = VFNMS(LDK(KP707106781), T2U, T2T);
                                        T2Z = VFMA(LDK(KP707106781), T2U, T2T);
                                        {
                                             V T3u, T3t, T2X, T2W, T1m, T1l;
                                             T1m = BYTWJ(&(W[TWVL * 12]), VFMAI(T1k, T1f));
                                             T1l = BYTWJ(&(W[0]), VFNMSI(T1k, T1f));
                                             ST(&(x[WS(vs, 7) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                             T3u = BYTWJ(&(W[TWVL * 12]), VFMAI(T3s, T3n));
                                             T3t = BYTWJ(&(W[0]), VFNMSI(T3s, T3n));
                                             T2X = BYTWJ(&(W[TWVL * 12]), VFMAI(T2V, T2Q));
                                             T2W = BYTWJ(&(W[0]), VFNMSI(T2V, T2Q));
                                             ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 7)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 1)]));
                                             ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 7)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 1)]));
                                             ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                                             ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                                        }
                                   }
                                   {
                                        V T2u, T2t, T3y, T3x;
                                        {
                                             V T1q, T1p, Tm, Tl;
                                             T1q = BYTWJ(&(W[TWVL * 4]), VFMAI(T1o, T1n));
                                             T1p = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1o, T1n));
                                             Tm = BYTWJ(&(W[TWVL * 4]), VFMAI(Tk, Tj));
                                             Tl = BYTWJ(&(W[TWVL * 8]), VFNMSI(Tk, Tj));
                                             ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
                                             ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
                                             T2u = BYTWJ(&(W[TWVL * 4]), VFMAI(T2s, T2r));
                                             T2t = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2s, T2r));
                                             T3y = BYTWJ(&(W[TWVL * 4]), VFMAI(T3w, T3v));
                                             T3x = BYTWJ(&(W[TWVL * 8]), VFNMSI(T3w, T3v));
                                             ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
                                             ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
                                        }
                                        ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
                                        ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
                                        ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
                                        ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
                                        TT = BYTWJ(&(W[TWVL * 4]), VFMAI(TR, TQ));
                                        TS = BYTWJ(&(W[TWVL * 8]), VFNMSI(TR, TQ));
                                   }
                              }
                         }
                    }
               }
               {
                    V T31, T30, T45, T44, T1X, T1W;
                    T1X = BYTWJ(&(W[TWVL * 4]), VFMAI(T1V, T1U));
                    T1W = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1V, T1U));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    T31 = BYTWJ(&(W[TWVL * 4]), VFMAI(T2Z, T2Y));
                    T30 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2Z, T2Y));
                    T45 = BYTWJ(&(W[TWVL * 4]), VFMAI(T43, T42));
                    T44 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T43, T42));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}
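/*
 * Loop structure, as a minimal sketch: this is a "twiddle-square" DIF
 * codelet (registered via X(kdft_difsq_register) below), so each pass
 * around the m loop transforms one vector of VL columns in place and
 * consumes the 7 twiddle factors w^1..w^7 declared in twinstr (14 reals),
 * which is why W starts at mb * ((TWVL / VL) * 14) and then advances by
 * TWVL * 14 per iteration:
 *
 *     for (m = mb; m < me; m += VL) {
 *          ... 8 x 8 DIF pass over x, twiddles from W[0 .. TWVL * 14) ...
 *          x += VL * ms;
 *          W += TWVL * 14;
 *     }
 */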

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
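/* The opcnt field {184, 112, 80, 0} in desc restates the counts from the
   header comment above: 184 additions, 112 multiplications, and 80 fused
   multiply/adds. */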

void XSIMD(codelet_q1fv_8) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_8, &desc);
}
#else /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include q1f.h */

/*
 * This function contains 264 FP additions, 128 FP multiplications,
 * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
 * 77 stack variables, 1 constants, and 128 memory accesses
 */
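/*
 * Relative to the HAVE_FMA variant above (184 adds, 112 muls, 80 FMAs),
 * this schedule spells everything out with plain VADD/VSUB/VMUL and
 * VBYI (multiplication by i) instead of the fused VFMA/VFMAI/VFNMSI
 * forms, which is where the 264 additions and 128 multiplications with
 * no fused multiply/adds come from.
 */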
#include "q1f.h"

static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ri;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
               V T3, Tu, Tf, Tp, T1E, T25, T1Q, T20, T2b, T2C, T2n, T2x, T3M, T4d, T3Y;
               V T48, TA, T11, TM, TW, T17, T1y, T1j, T1t, T2I, T39, T2U, T34, T3f, T3G;
               V T3r, T3B, Ta, Tv, Tc, Ts, T1L, T26, T1N, T23, T2i, T2D, T2k, T2A, T3T;
               V T4e, T3V, T4b, TH, T12, TJ, TZ, T1e, T1z, T1g, T1w, T2P, T3a, T2R, T37;
               V T3m, T3H, T3o, T3E, T28, T14;
               {
                    V T1, T2, Tn, Td, Te, To;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                    Tn = VADD(T1, T2);
                    Td = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                    To = VADD(Td, Te);
                    T3 = VSUB(T1, T2);
                    Tu = VSUB(Tn, To);
                    Tf = VSUB(Td, Te);
                    Tp = VADD(Tn, To);
               }
               {
                    V T1C, T1D, T1Y, T1O, T1P, T1Z;
                    T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
                    T1Y = VADD(T1C, T1D);
                    T1O = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    T1P = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
                    T1Z = VADD(T1O, T1P);
                    T1E = VSUB(T1C, T1D);
                    T25 = VSUB(T1Y, T1Z);
                    T1Q = VSUB(T1O, T1P);
                    T20 = VADD(T1Y, T1Z);
               }
               {
                    V T29, T2a, T2v, T2l, T2m, T2w;
                    T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2v = VADD(T29, T2a);
                    T2l = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
                    T2m = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
                    T2w = VADD(T2l, T2m);
                    T2b = VSUB(T29, T2a);
                    T2C = VSUB(T2v, T2w);
                    T2n = VSUB(T2l, T2m);
                    T2x = VADD(T2v, T2w);
               }
               {
                    V T3K, T3L, T46, T3W, T3X, T47;
                    T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
                    T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
                    T46 = VADD(T3K, T3L);
                    T3W = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
                    T3X = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
                    T47 = VADD(T3W, T3X);
                    T3M = VSUB(T3K, T3L);
                    T4d = VSUB(T46, T47);
                    T3Y = VSUB(T3W, T3X);
                    T48 = VADD(T46, T47);
               }
               {
                    V Ty, Tz, TU, TK, TL, TV;
                    Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
                    TU = VADD(Ty, Tz);
                    TK = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    TL = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
                    TV = VADD(TK, TL);
                    TA = VSUB(Ty, Tz);
                    T11 = VSUB(TU, TV);
                    TM = VSUB(TK, TL);
                    TW = VADD(TU, TV);
               }
               {
                    V T15, T16, T1r, T1h, T1i, T1s;
                    T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
                    T1r = VADD(T15, T16);
                    T1h = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    T1i = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
                    T1s = VADD(T1h, T1i);
                    T17 = VSUB(T15, T16);
                    T1y = VSUB(T1r, T1s);
                    T1j = VSUB(T1h, T1i);
                    T1t = VADD(T1r, T1s);
               }
               {
                    V T2G, T2H, T32, T2S, T2T, T33;
                    T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
                    T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
                    T32 = VADD(T2G, T2H);
                    T2S = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
                    T2T = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
                    T33 = VADD(T2S, T2T);
                    T2I = VSUB(T2G, T2H);
                    T39 = VSUB(T32, T33);
                    T2U = VSUB(T2S, T2T);
                    T34 = VADD(T32, T33);
               }
               {
                    V T3d, T3e, T3z, T3p, T3q, T3A;
                    T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
                    T3z = VADD(T3d, T3e);
                    T3p = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
                    T3q = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3A = VADD(T3p, T3q);
                    T3f = VSUB(T3d, T3e);
                    T3G = VSUB(T3z, T3A);
                    T3r = VSUB(T3p, T3q);
                    T3B = VADD(T3z, T3A);
               }
               {
                    V T6, Tq, T9, Tr;
                    {
                         V T4, T5, T7, T8;
                         T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                         T6 = VSUB(T4, T5);
                         Tq = VADD(T4, T5);
                         T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         T9 = VSUB(T7, T8);
                         Tr = VADD(T7, T8);
                    }
                    Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
                    Tv = VBYI(VSUB(Tr, Tq));
                    Tc = VMUL(LDK(KP707106781), VSUB(T9, T6));
                    Ts = VADD(Tq, Tr);
               }
               {
                    V T1H, T21, T1K, T22;
                    {
                         V T1F, T1G, T1I, T1J;
                         T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1H = VSUB(T1F, T1G);
                         T21 = VADD(T1F, T1G);
                         T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1K = VSUB(T1I, T1J);
                         T22 = VADD(T1I, T1J);
                    }
                    T1L = VMUL(LDK(KP707106781), VADD(T1H, T1K));
                    T26 = VBYI(VSUB(T22, T21));
                    T1N = VMUL(LDK(KP707106781), VSUB(T1K, T1H));
                    T23 = VADD(T21, T22);
               }
               {
                    V T2e, T2y, T2h, T2z;
                    {
                         V T2c, T2d, T2f, T2g;
                         T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2e = VSUB(T2c, T2d);
                         T2y = VADD(T2c, T2d);
                         T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2h = VSUB(T2f, T2g);
                         T2z = VADD(T2f, T2g);
                    }
                    T2i = VMUL(LDK(KP707106781), VADD(T2e, T2h));
                    T2D = VBYI(VSUB(T2z, T2y));
                    T2k = VMUL(LDK(KP707106781), VSUB(T2h, T2e));
                    T2A = VADD(T2y, T2z);
               }
               {
                    V T3P, T49, T3S, T4a;
                    {
                         V T3N, T3O, T3Q, T3R;
                         T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3P = VSUB(T3N, T3O);
                         T49 = VADD(T3N, T3O);
                         T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3S = VSUB(T3Q, T3R);
                         T4a = VADD(T3Q, T3R);
                    }
                    T3T = VMUL(LDK(KP707106781), VADD(T3P, T3S));
                    T4e = VBYI(VSUB(T4a, T49));
                    T3V = VMUL(LDK(KP707106781), VSUB(T3S, T3P));
                    T4b = VADD(T49, T4a);
               }
               {
                    V TD, TX, TG, TY;
                    {
                         V TB, TC, TE, TF;
                         TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TD = VSUB(TB, TC);
                         TX = VADD(TB, TC);
                         TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TG = VSUB(TE, TF);
                         TY = VADD(TE, TF);
                    }
                    TH = VMUL(LDK(KP707106781), VADD(TD, TG));
                    T12 = VBYI(VSUB(TY, TX));
                    TJ = VMUL(LDK(KP707106781), VSUB(TG, TD));
                    TZ = VADD(TX, TY);
               }
               {
                    V T1a, T1u, T1d, T1v;
                    {
                         V T18, T19, T1b, T1c;
                         T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1a = VSUB(T18, T19);
                         T1u = VADD(T18, T19);
                         T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1d = VSUB(T1b, T1c);
                         T1v = VADD(T1b, T1c);
                    }
                    T1e = VMUL(LDK(KP707106781), VADD(T1a, T1d));
                    T1z = VBYI(VSUB(T1v, T1u));
                    T1g = VMUL(LDK(KP707106781), VSUB(T1d, T1a));
                    T1w = VADD(T1u, T1v);
               }
               {
                    V T2L, T35, T2O, T36;
                    {
                         V T2J, T2K, T2M, T2N;
                         T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2L = VSUB(T2J, T2K);
                         T35 = VADD(T2J, T2K);
                         T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2O = VSUB(T2M, T2N);
                         T36 = VADD(T2M, T2N);
                    }
                    T2P = VMUL(LDK(KP707106781), VADD(T2L, T2O));
                    T3a = VBYI(VSUB(T36, T35));
                    T2R = VMUL(LDK(KP707106781), VSUB(T2O, T2L));
                    T37 = VADD(T35, T36);
               }
               {
                    V T3i, T3C, T3l, T3D;
                    {
                         V T3g, T3h, T3j, T3k;
                         T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3i = VSUB(T3g, T3h);
                         T3C = VADD(T3g, T3h);
                         T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3l = VSUB(T3j, T3k);
                         T3D = VADD(T3j, T3k);
                    }
                    T3m = VMUL(LDK(KP707106781), VADD(T3i, T3l));
                    T3H = VBYI(VSUB(T3D, T3C));
                    T3o = VMUL(LDK(KP707106781), VSUB(T3l, T3i));
                    T3E = VADD(T3C, T3D);
               }
               ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
               ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
               ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
               ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
               {
                    V Tt, T4c, T2B, T24;
                    ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
                    Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
                    ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
                    T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
                    ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
                    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
                    T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
                    ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
               }
               {
                    V T10, T1x, T3F, T38, T1A, Tw;
                    T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
                    ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
                    ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
                    T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
                    ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
                    T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
                    ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T1A = BYTWJ(&(W[TWVL * 10]), VSUB(T1y, T1z));
                    ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
                    Tw = BYTWJ(&(W[TWVL * 10]), VSUB(Tu, Tv));
                    ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
               }
               {
                    V T2E, T3I, T13, T27, T3b, T4f;
                    T2E = BYTWJ(&(W[TWVL * 10]), VSUB(T2C, T2D));
                    ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
                    T3I = BYTWJ(&(W[TWVL * 10]), VSUB(T3G, T3H));
                    ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
                    T13 = BYTWJ(&(W[TWVL * 10]), VSUB(T11, T12));
                    ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T27 = BYTWJ(&(W[TWVL * 10]), VSUB(T25, T26));
                    ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T3b = BYTWJ(&(W[TWVL * 10]), VSUB(T39, T3a));
                    ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T4f = BYTWJ(&(W[TWVL * 10]), VSUB(T4d, T4e));
                    ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
               }
               {
                    V Tx, T1B, T3c, T4g, T3J, T2F;
                    Tx = BYTWJ(&(W[TWVL * 2]), VADD(Tu, Tv));
                    ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
                    T1B = BYTWJ(&(W[TWVL * 2]), VADD(T1y, T1z));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
                    T3c = BYTWJ(&(W[TWVL * 2]), VADD(T39, T3a));
                    ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T4g = BYTWJ(&(W[TWVL * 2]), VADD(T4d, T4e));
                    ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T3J = BYTWJ(&(W[TWVL * 2]), VADD(T3G, T3H));
                    ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
                    T2F = BYTWJ(&(W[TWVL * 2]), VADD(T2C, T2D));
                    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
               }
               T28 = BYTWJ(&(W[TWVL * 2]), VADD(T25, T26));
               ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               T14 = BYTWJ(&(W[TWVL * 2]), VADD(T11, T12));
               ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               {
                    V Th, Ti, Tb, Tg;
                    Tb = VADD(T3, Ta);
                    Tg = VBYI(VSUB(Tc, Tf));
                    Th = BYTWJ(&(W[TWVL * 12]), VSUB(Tb, Tg));
                    Ti = BYTWJ(&(W[0]), VADD(Tb, Tg));
                    ST(&(x[WS(vs, 7)]), Th, ms, &(x[WS(vs, 7)]));
                    ST(&(x[WS(vs, 1)]), Ti, ms, &(x[WS(vs, 1)]));
               }
               {
                    V T40, T41, T3U, T3Z;
                    T3U = VADD(T3M, T3T);
                    T3Z = VBYI(VSUB(T3V, T3Y));
                    T40 = BYTWJ(&(W[TWVL * 12]), VSUB(T3U, T3Z));
                    T41 = BYTWJ(&(W[0]), VADD(T3U, T3Z));
                    ST(&(x[WS(vs, 7) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T2p, T2q, T2j, T2o;
                    T2j = VADD(T2b, T2i);
                    T2o = VBYI(VSUB(T2k, T2n));
                    T2p = BYTWJ(&(W[TWVL * 12]), VSUB(T2j, T2o));
                    T2q = BYTWJ(&(W[0]), VADD(T2j, T2o));
                    ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 7)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 1)]));
               }
               {
                    V T1S, T1T, T1M, T1R;
                    T1M = VADD(T1E, T1L);
                    T1R = VBYI(VSUB(T1N, T1Q));
                    T1S = BYTWJ(&(W[TWVL * 12]), VSUB(T1M, T1R));
                    T1T = BYTWJ(&(W[0]), VADD(T1M, T1R));
                    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V TO, TP, TI, TN;
                    TI = VADD(TA, TH);
                    TN = VBYI(VSUB(TJ, TM));
                    TO = BYTWJ(&(W[TWVL * 12]), VSUB(TI, TN));
                    TP = BYTWJ(&(W[0]), VADD(TI, TN));
                    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T1l, T1m, T1f, T1k;
                    T1f = VADD(T17, T1e);
                    T1k = VBYI(VSUB(T1g, T1j));
                    T1l = BYTWJ(&(W[TWVL * 12]), VSUB(T1f, T1k));
                    T1m = BYTWJ(&(W[0]), VADD(T1f, T1k));
                    ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 7)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 1)]));
               }
               {
                    V T3t, T3u, T3n, T3s;
                    T3n = VADD(T3f, T3m);
                    T3s = VBYI(VSUB(T3o, T3r));
                    T3t = BYTWJ(&(W[TWVL * 12]), VSUB(T3n, T3s));
                    T3u = BYTWJ(&(W[0]), VADD(T3n, T3s));
                    ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 7)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 1)]));
               }
               {
                    V T2W, T2X, T2Q, T2V;
                    T2Q = VADD(T2I, T2P);
                    T2V = VBYI(VSUB(T2R, T2U));
                    T2W = BYTWJ(&(W[TWVL * 12]), VSUB(T2Q, T2V));
                    T2X = BYTWJ(&(W[0]), VADD(T2Q, T2V));
                    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
               }
               {
                    V T1p, T1q, T1n, T1o;
                    T1n = VSUB(T17, T1e);
                    T1o = VBYI(VADD(T1j, T1g));
                    T1p = BYTWJ(&(W[TWVL * 8]), VSUB(T1n, T1o));
                    T1q = BYTWJ(&(W[TWVL * 4]), VADD(T1n, T1o));
                    ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
               }
               {
                    V Tl, Tm, Tj, Tk;
                    Tj = VSUB(T3, Ta);
                    Tk = VBYI(VADD(Tf, Tc));
                    Tl = BYTWJ(&(W[TWVL * 8]), VSUB(Tj, Tk));
                    Tm = BYTWJ(&(W[TWVL * 4]), VADD(Tj, Tk));
                    ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
                    ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
               }
               {
                    V T2t, T2u, T2r, T2s;
                    T2r = VSUB(T2b, T2i);
                    T2s = VBYI(VADD(T2n, T2k));
                    T2t = BYTWJ(&(W[TWVL * 8]), VSUB(T2r, T2s));
                    T2u = BYTWJ(&(W[TWVL * 4]), VADD(T2r, T2s));
                    ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
               }
               {
                    V T3x, T3y, T3v, T3w;
                    T3v = VSUB(T3f, T3m);
                    T3w = VBYI(VADD(T3r, T3o));
                    T3x = BYTWJ(&(W[TWVL * 8]), VSUB(T3v, T3w));
                    T3y = BYTWJ(&(W[TWVL * 4]), VADD(T3v, T3w));
                    ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
               }
               {
                    V TS, TT, TQ, TR;
                    TQ = VSUB(TA, TH);
                    TR = VBYI(VADD(TM, TJ));
                    TS = BYTWJ(&(W[TWVL * 8]), VSUB(TQ, TR));
                    TT = BYTWJ(&(W[TWVL * 4]), VADD(TQ, TR));
                    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V T1W, T1X, T1U, T1V;
                    T1U = VSUB(T1E, T1L);
                    T1V = VBYI(VADD(T1Q, T1N));
                    T1W = BYTWJ(&(W[TWVL * 8]), VSUB(T1U, T1V));
                    T1X = BYTWJ(&(W[TWVL * 4]), VADD(T1U, T1V));
                    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V T30, T31, T2Y, T2Z;
                    T2Y = VSUB(T2I, T2P);
                    T2Z = VBYI(VADD(T2U, T2R));
                    T30 = BYTWJ(&(W[TWVL * 8]), VSUB(T2Y, T2Z));
                    T31 = BYTWJ(&(W[TWVL * 4]), VADD(T2Y, T2Z));
                    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
               {
                    V T44, T45, T42, T43;
                    T42 = VSUB(T3M, T3T);
                    T43 = VBYI(VADD(T3Y, T3V));
                    T44 = BYTWJ(&(W[TWVL * 8]), VSUB(T42, T43));
                    T45 = BYTWJ(&(W[TWVL * 4]), VADD(T42, T43));
                    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };

void XSIMD(codelet_q1fv_8) (planner *p) {
     X(kdft_difsq_register) (p, q1fv_8, &desc);
}
#endif /* HAVE_FMA */
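/*
 * The seven VTW(0, k) twiddle-table entries request w^k = exp(-2*pi*i*k/8)
 * for k = 1..7, FFTW's forward-transform sign convention.  A hedged,
 * standalone sketch (a separate program, not part of this codelet) that
 * prints them:
 *
 *     #include <math.h>
 *     #include <stdio.h>
 *
 *     int main(void)
 *     {
 *          const double pi = 3.14159265358979323846;
 *          int k;
 *          for (k = 1; k <= 7; ++k)
 *               printf("w^%d = %+.15f %+.15fi\n",
 *                      k, cos(2 * pi * k / 8), -sin(2 * pi * k / 8));
 *          return 0;
 *     }
 *
 * Every component that appears is 0, +-1, or +-1/sqrt(2), matching the
 * single KP707106781 constant used by both variants above.
 */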