annotate src/fftw-3.3.3/dft/simd/common/q1bv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:39:33 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twidsq_c.native -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include q1b.h -sign 1 */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 264 FP additions, 192 FP multiplications,
Chris@10 32 * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
Chris@10 33 * 121 stack variables, 1 constant, and 128 memory accesses
Chris@10 34 */
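
As a reading aid for the flags quoted above (-n 8, -dif, -sign 1), the following hypothetical scalar routine, ref_dft8, spells out a plain 8-point complex DFT with exponent sign +1. It is only a sketch of the underlying transform: it performs none of the twiddle multiplications, the vs-strided block handling, or the SIMD vectorisation that the generated codelet below implements.

    #include <complex.h>
    #include <math.h>

    /* Hypothetical reference: one 8-point complex DFT with exponent sign +1.
       Roughly speaking, the generated codelet additionally applies twiddle
       factors from W and handles a block of eight such transforms spread
       across the vs stride. */
    static void ref_dft8(const double complex in[8], double complex out[8])
    {
        const double two_pi = 8.0 * atan(1.0);   /* 2*pi without relying on M_PI */
        for (int k = 0; k < 8; k++) {
            double complex acc = 0.0;
            for (int n = 0; n < 8; n++)
                acc += in[n] * cexp(I * two_pi * (double)(n * k) / 8.0);
            out[k] = acc;
        }
    }
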
Chris@10 35 #include "q1b.h"
Chris@10 36
Chris@10 37 static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
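
The single constant declared here is the familiar half square root of two:

    KP707106781 = \cos(\pi/4) = \sin(\pi/4) = 1/\sqrt{2} \approx 0.70710678118654752440

It is the only coefficient the size-8 butterflies need; the remaining nontrivial multiplications in the loop body apply twiddle factors loaded from W via BYTW.
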
Chris@10 40 {
Chris@10 41 INT m;
Chris@10 42 R *x;
Chris@10 43 x = ii;
Chris@10 44 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
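
The twiddle-pointer arithmetic in this loop header matches the twinstr table later in the file: seven twiddle factors, VTW(0, 1) through VTW(0, 7), are consumed per column, and the BYTW calls below read them at offsets W[0], W[TWVL * 2], ..., W[TWVL * 12], i.e. spaced TWVL * 2 scalar slots apart, so each vector iteration advances

    7 * (TWVL * 2) = TWVL * 14

slots, which is exactly the W = W + (TWVL * 14) step above; the initial offset mb * ((TWVL / VL) * 14) positions W at the first column handled by this call.
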
Chris@10 45 V T42, T43, T1U, T1V, T2Y, T2Z, TT, TS, T45, T44;
Chris@10 46 {
Chris@10 47 V T3, Te, T1E, T1P, Tv, Tp, T26, T20, T2b, T2m, T3M, T2x, T2D, T3X, TA;
Chris@10 48 V TL, T48, T4e, T17, T12, TW, T1i, T2I, T1z, T1t, T2T, T3f, T3q, T34, T3a;
Chris@10 49 V T3H, T3B, Ts, Tw, Tf, Ta, T23, T27, T1Q, T1L, T2A, T2E, T2n, T2i, T4b;
Chris@10 50 V T4f, T3Y, T3T, TZ, T13, TM, TH, T35, T2L, T3j, T1w, T1A, T1j, T1e, T36;
Chris@10 51 V T2O, T3C, T3i, T3k;
Chris@10 52 {
Chris@10 53 V T3d, T32, T3e, T3o, T3p, T33;
Chris@10 54 {
Chris@10 55 V T2v, T2w, T3V, T46, T3W;
Chris@10 56 {
Chris@10 57 V T1, T2, Tc, Td, T1C, T1D, T1N, T1O;
Chris@10 58 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@10 59 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 60 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 61 Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 62 T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@10 63 T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@10 64 T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@10 65 T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
Chris@10 66 {
Chris@10 67 V T29, T1Y, T1Z, T2a, T2k, T2l, Tn, To, T3K, T3L;
Chris@10 68 T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 69 T3 = VSUB(T1, T2);
Chris@10 70 Tn = VADD(T1, T2);
Chris@10 71 Te = VSUB(Tc, Td);
Chris@10 72 To = VADD(Tc, Td);
Chris@10 73 T1E = VSUB(T1C, T1D);
Chris@10 74 T1Y = VADD(T1C, T1D);
Chris@10 75 T1P = VSUB(T1N, T1O);
Chris@10 76 T1Z = VADD(T1N, T1O);
Chris@10 77 T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 78 T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@10 79 T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
Chris@10 80 Tv = VADD(Tn, To);
Chris@10 81 Tp = VSUB(Tn, To);
Chris@10 82 T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
Chris@10 83 T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
Chris@10 84 T26 = VADD(T1Y, T1Z);
Chris@10 85 T20 = VSUB(T1Y, T1Z);
Chris@10 86 T2v = VADD(T29, T2a);
Chris@10 87 T2b = VSUB(T29, T2a);
Chris@10 88 T2w = VADD(T2k, T2l);
Chris@10 89 T2m = VSUB(T2k, T2l);
Chris@10 90 T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
Chris@10 91 T46 = VADD(T3K, T3L);
Chris@10 92 T3M = VSUB(T3K, T3L);
Chris@10 93 T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
Chris@10 94 }
Chris@10 95 }
Chris@10 96 {
Chris@10 97 V T15, TU, T16, T1g, TV, T1h;
Chris@10 98 {
Chris@10 99 V Ty, Tz, TJ, TK, T47;
Chris@10 100 Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@10 101 Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@10 102 TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@10 103 T2x = VSUB(T2v, T2w);
Chris@10 104 T2D = VADD(T2v, T2w);
Chris@10 105 TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
Chris@10 106 T47 = VADD(T3V, T3W);
Chris@10 107 T3X = VSUB(T3V, T3W);
Chris@10 108 T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 109 TA = VSUB(Ty, Tz);
Chris@10 110 TU = VADD(Ty, Tz);
Chris@10 111 T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@10 112 T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 113 TL = VSUB(TJ, TK);
Chris@10 114 TV = VADD(TJ, TK);
Chris@10 115 T48 = VSUB(T46, T47);
Chris@10 116 T4e = VADD(T46, T47);
Chris@10 117 T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
Chris@10 118 }
Chris@10 119 {
Chris@10 120 V T2G, T1r, T2H, T2R, T1s, T2S;
Chris@10 121 T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
Chris@10 122 T17 = VSUB(T15, T16);
Chris@10 123 T1r = VADD(T15, T16);
Chris@10 124 T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
Chris@10 125 T12 = VADD(TU, TV);
Chris@10 126 TW = VSUB(TU, TV);
Chris@10 127 T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
Chris@10 128 T1i = VSUB(T1g, T1h);
Chris@10 129 T1s = VADD(T1g, T1h);
Chris@10 130 T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
Chris@10 131 T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@10 132 T2I = VSUB(T2G, T2H);
Chris@10 133 T32 = VADD(T2G, T2H);
Chris@10 134 T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
Chris@10 135 T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
Chris@10 136 T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@10 137 T1z = VADD(T1r, T1s);
Chris@10 138 T1t = VSUB(T1r, T1s);
Chris@10 139 T33 = VADD(T2R, T2S);
Chris@10 140 T2T = VSUB(T2R, T2S);
Chris@10 141 }
Chris@10 142 }
Chris@10 143 }
Chris@10 144 {
Chris@10 145 V T2y, T2e, T3Q, T2z, T2h, T49, T3P, T3R;
Chris@10 146 {
Chris@10 147 V T6, Tq, T1I, Tr, T9, T21, T1H, T1J;
Chris@10 148 {
Chris@10 149 V T4, T3z, T3A, T5, T7, T8, T1F, T1G;
Chris@10 150 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 151 T3f = VSUB(T3d, T3e);
Chris@10 152 T3z = VADD(T3d, T3e);
Chris@10 153 T3q = VSUB(T3o, T3p);
Chris@10 154 T3A = VADD(T3o, T3p);
Chris@10 155 T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 156 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 157 T34 = VSUB(T32, T33);
Chris@10 158 T3a = VADD(T32, T33);
Chris@10 159 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 160 T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 161 T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 162 T3H = VADD(T3z, T3A);
Chris@10 163 T3B = VSUB(T3z, T3A);
Chris@10 164 T6 = VSUB(T4, T5);
Chris@10 165 Tq = VADD(T4, T5);
Chris@10 166 T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 167 Tr = VADD(T7, T8);
Chris@10 168 T9 = VSUB(T7, T8);
Chris@10 169 T21 = VADD(T1F, T1G);
Chris@10 170 T1H = VSUB(T1F, T1G);
Chris@10 171 T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 172 }
Chris@10 173 {
Chris@10 174 V T2f, T22, T1K, T2g, T2c, T2d, T3N, T3O;
Chris@10 175 T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 176 T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 177 T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 178 Ts = VSUB(Tq, Tr);
Chris@10 179 Tw = VADD(Tq, Tr);
Chris@10 180 Tf = VSUB(T6, T9);
Chris@10 181 Ta = VADD(T6, T9);
Chris@10 182 T22 = VADD(T1I, T1J);
Chris@10 183 T1K = VSUB(T1I, T1J);
Chris@10 184 T2y = VADD(T2c, T2d);
Chris@10 185 T2e = VSUB(T2c, T2d);
Chris@10 186 T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 187 T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 188 T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 189 T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 190 T23 = VSUB(T21, T22);
Chris@10 191 T27 = VADD(T21, T22);
Chris@10 192 T1Q = VSUB(T1H, T1K);
Chris@10 193 T1L = VADD(T1H, T1K);
Chris@10 194 T2z = VADD(T2f, T2g);
Chris@10 195 T2h = VSUB(T2f, T2g);
Chris@10 196 T49 = VADD(T3N, T3O);
Chris@10 197 T3P = VSUB(T3N, T3O);
Chris@10 198 T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 199 }
Chris@10 200 }
Chris@10 201 {
Chris@10 202 V TX, TD, T1b, TY, TG, T1u, T1a, T1c;
Chris@10 203 {
Chris@10 204 V TE, T4a, T3S, TF, TB, TC, T18, T19;
Chris@10 205 TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 206 TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 207 TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 208 T2A = VSUB(T2y, T2z);
Chris@10 209 T2E = VADD(T2y, T2z);
Chris@10 210 T2n = VSUB(T2e, T2h);
Chris@10 211 T2i = VADD(T2e, T2h);
Chris@10 212 T4a = VADD(T3Q, T3R);
Chris@10 213 T3S = VSUB(T3Q, T3R);
Chris@10 214 TX = VADD(TB, TC);
Chris@10 215 TD = VSUB(TB, TC);
Chris@10 216 TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 217 T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 218 T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 219 T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 220 T4b = VSUB(T49, T4a);
Chris@10 221 T4f = VADD(T49, T4a);
Chris@10 222 T3Y = VSUB(T3P, T3S);
Chris@10 223 T3T = VADD(T3P, T3S);
Chris@10 224 TY = VADD(TE, TF);
Chris@10 225 TG = VSUB(TE, TF);
Chris@10 226 T1u = VADD(T18, T19);
Chris@10 227 T1a = VSUB(T18, T19);
Chris@10 228 T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 229 }
Chris@10 230 {
Chris@10 231 V T2M, T1v, T1d, T2N, T2J, T2K, T3g, T3h;
Chris@10 232 T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 233 T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 234 T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 235 TZ = VSUB(TX, TY);
Chris@10 236 T13 = VADD(TX, TY);
Chris@10 237 TM = VSUB(TD, TG);
Chris@10 238 TH = VADD(TD, TG);
Chris@10 239 T1v = VADD(T1b, T1c);
Chris@10 240 T1d = VSUB(T1b, T1c);
Chris@10 241 T35 = VADD(T2J, T2K);
Chris@10 242 T2L = VSUB(T2J, T2K);
Chris@10 243 T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 244 T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 245 T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 246 T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 247 T1w = VSUB(T1u, T1v);
Chris@10 248 T1A = VADD(T1u, T1v);
Chris@10 249 T1j = VSUB(T1a, T1d);
Chris@10 250 T1e = VADD(T1a, T1d);
Chris@10 251 T36 = VADD(T2M, T2N);
Chris@10 252 T2O = VSUB(T2M, T2N);
Chris@10 253 T3C = VADD(T3g, T3h);
Chris@10 254 T3i = VSUB(T3g, T3h);
Chris@10 255 T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 256 }
Chris@10 257 }
Chris@10 258 }
Chris@10 259 }
Chris@10 260 {
Chris@10 261 V T3b, T2U, T2P, T3I, T3r, T3m, T11, T25, T39, T4d;
Chris@10 262 {
Chris@10 263 V T37, T3E, T2B, T24;
Chris@10 264 {
Chris@10 265 V T3D, T3l, Tt, T4c;
Chris@10 266 ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
Chris@10 267 ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
Chris@10 268 ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
Chris@10 269 T37 = VSUB(T35, T36);
Chris@10 270 T3b = VADD(T35, T36);
Chris@10 271 T2U = VSUB(T2L, T2O);
Chris@10 272 T2P = VADD(T2L, T2O);
Chris@10 273 T3D = VADD(T3j, T3k);
Chris@10 274 T3l = VSUB(T3j, T3k);
Chris@10 275 ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
Chris@10 276 ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
Chris@10 277 ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
Chris@10 278 ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
Chris@10 279 Tt = BYTW(&(W[TWVL * 10]), VFNMSI(Ts, Tp));
Chris@10 280 T4c = BYTW(&(W[TWVL * 10]), VFNMSI(T4b, T48));
Chris@10 281 T3E = VSUB(T3C, T3D);
Chris@10 282 T3I = VADD(T3C, T3D);
Chris@10 283 T3r = VSUB(T3i, T3l);
Chris@10 284 T3m = VADD(T3i, T3l);
Chris@10 285 T2B = BYTW(&(W[TWVL * 10]), VFNMSI(T2A, T2x));
Chris@10 286 T24 = BYTW(&(W[TWVL * 10]), VFNMSI(T23, T20));
Chris@10 287 ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
Chris@10 288 ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 289 }
Chris@10 290 {
Chris@10 291 V T38, T1y, Tu, T10, T1x, T3F, T2C, T3G;
Chris@10 292 T10 = BYTW(&(W[TWVL * 10]), VFNMSI(TZ, TW));
Chris@10 293 ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
Chris@10 294 T1x = BYTW(&(W[TWVL * 10]), VFNMSI(T1w, T1t));
Chris@10 295 T3F = BYTW(&(W[TWVL * 10]), VFNMSI(T3E, T3B));
Chris@10 296 ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
Chris@10 297 ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 298 T38 = BYTW(&(W[TWVL * 10]), VFNMSI(T37, T34));
Chris@10 299 T1y = BYTW(&(W[TWVL * 2]), VFMAI(T1w, T1t));
Chris@10 300 ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 301 Tu = BYTW(&(W[TWVL * 2]), VFMAI(Ts, Tp));
Chris@10 302 ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
Chris@10 303 ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
Chris@10 304 T2C = BYTW(&(W[TWVL * 2]), VFMAI(T2A, T2x));
Chris@10 305 T3G = BYTW(&(W[TWVL * 2]), VFMAI(T3E, T3B));
Chris@10 306 ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 307 ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
Chris@10 308 T11 = BYTW(&(W[TWVL * 2]), VFMAI(TZ, TW));
Chris@10 309 ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
Chris@10 310 T25 = BYTW(&(W[TWVL * 2]), VFMAI(T23, T20));
Chris@10 311 T39 = BYTW(&(W[TWVL * 2]), VFMAI(T37, T34));
Chris@10 312 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
Chris@10 313 ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
Chris@10 314 T4d = BYTW(&(W[TWVL * 2]), VFMAI(T4b, T48));
Chris@10 315 }
Chris@10 316 }
Chris@10 317 {
Chris@10 318 V Tj, Tk, T2r, T2j, T2o, T2s, Ti, Th, T1M, T1R, T41, T40;
Chris@10 319 {
Chris@10 320 V T3c, T4g, T3J, T2F, Tx, T1B;
Chris@10 321 Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
Chris@10 322 ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 323 T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
Chris@10 324 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 325 ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 326 T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
Chris@10 327 T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
Chris@10 328 ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 329 ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
Chris@10 330 T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
Chris@10 331 ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
Chris@10 332 T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
Chris@10 333 {
Chris@10 334 V T14, Tb, Tg, T28, T3U, T3Z;
Chris@10 335 T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
Chris@10 336 ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 337 ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 338 T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
Chris@10 339 Tj = VFMA(LDK(KP707106781), Ta, T3);
Chris@10 340 Tb = VFNMS(LDK(KP707106781), Ta, T3);
Chris@10 341 ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
Chris@10 342 Tk = VFMA(LDK(KP707106781), Tf, Te);
Chris@10 343 Tg = VFNMS(LDK(KP707106781), Tf, Te);
Chris@10 344 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
Chris@10 345 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 346 T3U = VFNMS(LDK(KP707106781), T3T, T3M);
Chris@10 347 T42 = VFMA(LDK(KP707106781), T3T, T3M);
Chris@10 348 T43 = VFMA(LDK(KP707106781), T3Y, T3X);
Chris@10 349 T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
Chris@10 350 ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 351 T2r = VFMA(LDK(KP707106781), T2i, T2b);
Chris@10 352 T2j = VFNMS(LDK(KP707106781), T2i, T2b);
Chris@10 353 T2o = VFNMS(LDK(KP707106781), T2n, T2m);
Chris@10 354 T2s = VFMA(LDK(KP707106781), T2n, T2m);
Chris@10 355 Ti = BYTW(&(W[TWVL * 8]), VFMAI(Tg, Tb));
Chris@10 356 Th = BYTW(&(W[TWVL * 4]), VFNMSI(Tg, Tb));
Chris@10 357 T1U = VFMA(LDK(KP707106781), T1L, T1E);
Chris@10 358 T1M = VFNMS(LDK(KP707106781), T1L, T1E);
Chris@10 359 T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
Chris@10 360 T1V = VFMA(LDK(KP707106781), T1Q, T1P);
Chris@10 361 T41 = BYTW(&(W[TWVL * 8]), VFMAI(T3Z, T3U));
Chris@10 362 T40 = BYTW(&(W[TWVL * 4]), VFNMSI(T3Z, T3U));
Chris@10 363 }
Chris@10 364 }
Chris@10 365 {
Chris@10 366 V TQ, TR, T1n, T1o, T3v, T3w;
Chris@10 367 {
Chris@10 368 V TI, TN, T1f, T1k, T3n, T3s;
Chris@10 369 {
Chris@10 370 V T1T, T1S, T2q, T2p;
Chris@10 371 TQ = VFMA(LDK(KP707106781), TH, TA);
Chris@10 372 TI = VFNMS(LDK(KP707106781), TH, TA);
Chris@10 373 T2q = BYTW(&(W[TWVL * 8]), VFMAI(T2o, T2j));
Chris@10 374 T2p = BYTW(&(W[TWVL * 4]), VFNMSI(T2o, T2j));
Chris@10 375 ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
Chris@10 376 ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
Chris@10 377 T1T = BYTW(&(W[TWVL * 8]), VFMAI(T1R, T1M));
Chris@10 378 T1S = BYTW(&(W[TWVL * 4]), VFNMSI(T1R, T1M));
Chris@10 379 ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 380 ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 381 ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
Chris@10 382 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
Chris@10 383 TN = VFNMS(LDK(KP707106781), TM, TL);
Chris@10 384 TR = VFMA(LDK(KP707106781), TM, TL);
Chris@10 385 T1n = VFMA(LDK(KP707106781), T1e, T17);
Chris@10 386 T1f = VFNMS(LDK(KP707106781), T1e, T17);
Chris@10 387 ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 388 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 389 T1k = VFNMS(LDK(KP707106781), T1j, T1i);
Chris@10 390 T1o = VFMA(LDK(KP707106781), T1j, T1i);
Chris@10 391 T3v = VFMA(LDK(KP707106781), T3m, T3f);
Chris@10 392 T3n = VFNMS(LDK(KP707106781), T3m, T3f);
Chris@10 393 T3s = VFNMS(LDK(KP707106781), T3r, T3q);
Chris@10 394 T3w = VFMA(LDK(KP707106781), T3r, T3q);
Chris@10 395 }
Chris@10 396 {
Chris@10 397 V T2Q, TP, TO, T2V, T2X, T2W;
Chris@10 398 T2Y = VFMA(LDK(KP707106781), T2P, T2I);
Chris@10 399 T2Q = VFNMS(LDK(KP707106781), T2P, T2I);
Chris@10 400 TP = BYTW(&(W[TWVL * 8]), VFMAI(TN, TI));
Chris@10 401 TO = BYTW(&(W[TWVL * 4]), VFNMSI(TN, TI));
Chris@10 402 T2V = VFNMS(LDK(KP707106781), T2U, T2T);
Chris@10 403 T2Z = VFMA(LDK(KP707106781), T2U, T2T);
Chris@10 404 {
Chris@10 405 V T1m, T1l, T3u, T3t;
Chris@10 406 T1m = BYTW(&(W[TWVL * 8]), VFMAI(T1k, T1f));
Chris@10 407 T1l = BYTW(&(W[TWVL * 4]), VFNMSI(T1k, T1f));
Chris@10 408 T3u = BYTW(&(W[TWVL * 8]), VFMAI(T3s, T3n));
Chris@10 409 T3t = BYTW(&(W[TWVL * 4]), VFNMSI(T3s, T3n));
Chris@10 410 ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 411 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 412 T2X = BYTW(&(W[TWVL * 8]), VFMAI(T2V, T2Q));
Chris@10 413 T2W = BYTW(&(W[TWVL * 4]), VFNMSI(T2V, T2Q));
Chris@10 414 ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
Chris@10 415 ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
Chris@10 416 ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
Chris@10 417 ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
Chris@10 418 }
Chris@10 419 ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 420 ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 421 }
Chris@10 422 }
Chris@10 423 {
Chris@10 424 V T3y, T3x, T1q, T1p;
Chris@10 425 T1q = BYTW(&(W[TWVL * 12]), VFNMSI(T1o, T1n));
Chris@10 426 T1p = BYTW(&(W[0]), VFMAI(T1o, T1n));
Chris@10 427 {
Chris@10 428 V Tm, Tl, T2u, T2t;
Chris@10 429 Tm = BYTW(&(W[TWVL * 12]), VFNMSI(Tk, Tj));
Chris@10 430 Tl = BYTW(&(W[0]), VFMAI(Tk, Tj));
Chris@10 431 T2u = BYTW(&(W[TWVL * 12]), VFNMSI(T2s, T2r));
Chris@10 432 T2t = BYTW(&(W[0]), VFMAI(T2s, T2r));
Chris@10 433 ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
Chris@10 434 ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
Chris@10 435 T3y = BYTW(&(W[TWVL * 12]), VFNMSI(T3w, T3v));
Chris@10 436 T3x = BYTW(&(W[0]), VFMAI(T3w, T3v));
Chris@10 437 ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
Chris@10 438 ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
Chris@10 439 ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
Chris@10 440 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
Chris@10 441 }
Chris@10 442 ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
Chris@10 443 ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
Chris@10 444 TT = BYTW(&(W[TWVL * 12]), VFNMSI(TR, TQ));
Chris@10 445 TS = BYTW(&(W[0]), VFMAI(TR, TQ));
Chris@10 446 }
Chris@10 447 }
Chris@10 448 }
Chris@10 449 }
Chris@10 450 }
Chris@10 451 {
Chris@10 452 V T1X, T1W, T31, T30;
Chris@10 453 T1X = BYTW(&(W[TWVL * 12]), VFNMSI(T1V, T1U));
Chris@10 454 T1W = BYTW(&(W[0]), VFMAI(T1V, T1U));
Chris@10 455 T31 = BYTW(&(W[TWVL * 12]), VFNMSI(T2Z, T2Y));
Chris@10 456 T30 = BYTW(&(W[0]), VFMAI(T2Z, T2Y));
Chris@10 457 ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 458 ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 459 T45 = BYTW(&(W[TWVL * 12]), VFNMSI(T43, T42));
Chris@10 460 T44 = BYTW(&(W[0]), VFMAI(T43, T42));
Chris@10 461 ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 462 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 463 ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 464 ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 465 }
Chris@10 466 ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 467 ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 468 }
Chris@10 469 }
Chris@10 470 VLEAVE();
Chris@10 471 }
Chris@10 472
Chris@10 473 static const tw_instr twinstr[] = {
Chris@10 474 VTW(0, 1),
Chris@10 475 VTW(0, 2),
Chris@10 476 VTW(0, 3),
Chris@10 477 VTW(0, 4),
Chris@10 478 VTW(0, 5),
Chris@10 479 VTW(0, 6),
Chris@10 480 VTW(0, 7),
Chris@10 481 {TW_NEXT, VL, 0}
Chris@10 482 };
Chris@10 483
Chris@10 484 static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
Chris@10 485
Chris@10 486 void XSIMD(codelet_q1bv_8) (planner *p) {
Chris@10 487 X(kdft_difsq_register) (p, q1bv_8, &desc);
Chris@10 488 }
Chris@10 489 #else /* HAVE_FMA */
Chris@10 490
Chris@10 491 /* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include q1b.h -sign 1 */
Chris@10 492
Chris@10 493 /*
Chris@10 494 * This function contains 264 FP additions, 128 FP multiplications,
Chris@10 495 * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
Chris@10 496 * 77 stack variables, 1 constant, and 128 memory accesses
Chris@10 497 */
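
Relative to the HAVE_FMA branch above, this fallback keeps the additions and multiplications separate (VMUL, VADD, VSUB, and VBYI for multiplications by i) rather than using the VFMA/VFNMS family, which is why the descriptor below records zero fused multiply/adds. Purely as a reading aid for the branch above, here is a hypothetical scalar model of those two fused helpers; the real per-architecture vector definitions live in FFTW's simd-support headers and are assumed here, not quoted.

    /* Hypothetical scalar models of the fused helpers used by the HAVE_FMA
       branch; the actual SIMD macros are defined per architecture elsewhere. */
    static inline double vfma_model(double a, double b, double c)  { return a * b + c; }  /* VFMA(a, b, c)  */
    static inline double vfnms_model(double a, double b, double c) { return c - a * b; }  /* VFNMS(a, b, c) */
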
Chris@10 498 #include "q1b.h"
Chris@10 499
Chris@10 500 static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@10 501 {
Chris@10 502 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 503 {
Chris@10 504 INT m;
Chris@10 505 R *x;
Chris@10 506 x = ii;
Chris@10 507 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
Chris@10 508 V Ta, Tv, Te, Tp, T1L, T26, T1P, T20, T2i, T2D, T2m, T2x, T3T, T4e, T3X;
Chris@10 509 V T48, TH, T12, TL, TW, T1e, T1z, T1i, T1t, T2P, T3a, T2T, T34, T3m, T3H;
Chris@10 510 V T3q, T3B, T7, Tw, Tf, Ts, T1I, T27, T1Q, T23, T2f, T2E, T2n, T2A, T3Q;
Chris@10 511 V T4f, T3Y, T4b, TE, T13, TM, TZ, T1b, T1A, T1j, T1w, T2M, T3b, T2U, T37;
Chris@10 512 V T3j, T3I, T3r, T3E, T28, T14;
Chris@10 513 {
Chris@10 514 V T8, T9, To, Tc, Td, Tn;
Chris@10 515 T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@10 516 T9 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@10 517 To = VADD(T8, T9);
Chris@10 518 Tc = LD(&(x[0]), ms, &(x[0]));
Chris@10 519 Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@10 520 Tn = VADD(Tc, Td);
Chris@10 521 Ta = VSUB(T8, T9);
Chris@10 522 Tv = VADD(Tn, To);
Chris@10 523 Te = VSUB(Tc, Td);
Chris@10 524 Tp = VSUB(Tn, To);
Chris@10 525 }
Chris@10 526 {
Chris@10 527 V T1J, T1K, T1Z, T1N, T1O, T1Y;
Chris@10 528 T1J = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@10 529 T1K = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
Chris@10 530 T1Z = VADD(T1J, T1K);
Chris@10 531 T1N = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@10 532 T1O = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@10 533 T1Y = VADD(T1N, T1O);
Chris@10 534 T1L = VSUB(T1J, T1K);
Chris@10 535 T26 = VADD(T1Y, T1Z);
Chris@10 536 T1P = VSUB(T1N, T1O);
Chris@10 537 T20 = VSUB(T1Y, T1Z);
Chris@10 538 }
Chris@10 539 {
Chris@10 540 V T2g, T2h, T2w, T2k, T2l, T2v;
Chris@10 541 T2g = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@10 542 T2h = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
Chris@10 543 T2w = VADD(T2g, T2h);
Chris@10 544 T2k = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 545 T2l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@10 546 T2v = VADD(T2k, T2l);
Chris@10 547 T2i = VSUB(T2g, T2h);
Chris@10 548 T2D = VADD(T2v, T2w);
Chris@10 549 T2m = VSUB(T2k, T2l);
Chris@10 550 T2x = VSUB(T2v, T2w);
Chris@10 551 }
Chris@10 552 {
Chris@10 553 V T3R, T3S, T47, T3V, T3W, T46;
Chris@10 554 T3R = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
Chris@10 555 T3S = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
Chris@10 556 T47 = VADD(T3R, T3S);
Chris@10 557 T3V = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
Chris@10 558 T3W = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
Chris@10 559 T46 = VADD(T3V, T3W);
Chris@10 560 T3T = VSUB(T3R, T3S);
Chris@10 561 T4e = VADD(T46, T47);
Chris@10 562 T3X = VSUB(T3V, T3W);
Chris@10 563 T48 = VSUB(T46, T47);
Chris@10 564 }
Chris@10 565 {
Chris@10 566 V TF, TG, TV, TJ, TK, TU;
Chris@10 567 TF = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@10 568 TG = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
Chris@10 569 TV = VADD(TF, TG);
Chris@10 570 TJ = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@10 571 TK = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@10 572 TU = VADD(TJ, TK);
Chris@10 573 TH = VSUB(TF, TG);
Chris@10 574 T12 = VADD(TU, TV);
Chris@10 575 TL = VSUB(TJ, TK);
Chris@10 576 TW = VSUB(TU, TV);
Chris@10 577 }
Chris@10 578 {
Chris@10 579 V T1c, T1d, T1s, T1g, T1h, T1r;
Chris@10 580 T1c = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 581 T1d = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
Chris@10 582 T1s = VADD(T1c, T1d);
Chris@10 583 T1g = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@10 584 T1h = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@10 585 T1r = VADD(T1g, T1h);
Chris@10 586 T1e = VSUB(T1c, T1d);
Chris@10 587 T1z = VADD(T1r, T1s);
Chris@10 588 T1i = VSUB(T1g, T1h);
Chris@10 589 T1t = VSUB(T1r, T1s);
Chris@10 590 }
Chris@10 591 {
Chris@10 592 V T2N, T2O, T33, T2R, T2S, T32;
Chris@10 593 T2N = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
Chris@10 594 T2O = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
Chris@10 595 T33 = VADD(T2N, T2O);
Chris@10 596 T2R = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
Chris@10 597 T2S = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
Chris@10 598 T32 = VADD(T2R, T2S);
Chris@10 599 T2P = VSUB(T2N, T2O);
Chris@10 600 T3a = VADD(T32, T33);
Chris@10 601 T2T = VSUB(T2R, T2S);
Chris@10 602 T34 = VSUB(T32, T33);
Chris@10 603 }
Chris@10 604 {
Chris@10 605 V T3k, T3l, T3A, T3o, T3p, T3z;
Chris@10 606 T3k = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
Chris@10 607 T3l = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@10 608 T3A = VADD(T3k, T3l);
Chris@10 609 T3o = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@10 610 T3p = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
Chris@10 611 T3z = VADD(T3o, T3p);
Chris@10 612 T3m = VSUB(T3k, T3l);
Chris@10 613 T3H = VADD(T3z, T3A);
Chris@10 614 T3q = VSUB(T3o, T3p);
Chris@10 615 T3B = VSUB(T3z, T3A);
Chris@10 616 }
Chris@10 617 {
Chris@10 618 V T3, Tq, T6, Tr;
Chris@10 619 {
Chris@10 620 V T1, T2, T4, T5;
Chris@10 621 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@10 622 T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@10 623 T3 = VSUB(T1, T2);
Chris@10 624 Tq = VADD(T1, T2);
Chris@10 625 T4 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@10 626 T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@10 627 T6 = VSUB(T4, T5);
Chris@10 628 Tr = VADD(T4, T5);
Chris@10 629 }
Chris@10 630 T7 = VMUL(LDK(KP707106781), VSUB(T3, T6));
Chris@10 631 Tw = VADD(Tq, Tr);
Chris@10 632 Tf = VMUL(LDK(KP707106781), VADD(T3, T6));
Chris@10 633 Ts = VBYI(VSUB(Tq, Tr));
Chris@10 634 }
Chris@10 635 {
Chris@10 636 V T1E, T21, T1H, T22;
Chris@10 637 {
Chris@10 638 V T1C, T1D, T1F, T1G;
Chris@10 639 T1C = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 640 T1D = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 641 T1E = VSUB(T1C, T1D);
Chris@10 642 T21 = VADD(T1C, T1D);
Chris@10 643 T1F = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 644 T1G = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 645 T1H = VSUB(T1F, T1G);
Chris@10 646 T22 = VADD(T1F, T1G);
Chris@10 647 }
Chris@10 648 T1I = VMUL(LDK(KP707106781), VSUB(T1E, T1H));
Chris@10 649 T27 = VADD(T21, T22);
Chris@10 650 T1Q = VMUL(LDK(KP707106781), VADD(T1E, T1H));
Chris@10 651 T23 = VBYI(VSUB(T21, T22));
Chris@10 652 }
Chris@10 653 {
Chris@10 654 V T2b, T2y, T2e, T2z;
Chris@10 655 {
Chris@10 656 V T29, T2a, T2c, T2d;
Chris@10 657 T29 = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 658 T2a = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 659 T2b = VSUB(T29, T2a);
Chris@10 660 T2y = VADD(T29, T2a);
Chris@10 661 T2c = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 662 T2d = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 663 T2e = VSUB(T2c, T2d);
Chris@10 664 T2z = VADD(T2c, T2d);
Chris@10 665 }
Chris@10 666 T2f = VMUL(LDK(KP707106781), VSUB(T2b, T2e));
Chris@10 667 T2E = VADD(T2y, T2z);
Chris@10 668 T2n = VMUL(LDK(KP707106781), VADD(T2b, T2e));
Chris@10 669 T2A = VBYI(VSUB(T2y, T2z));
Chris@10 670 }
Chris@10 671 {
Chris@10 672 V T3M, T49, T3P, T4a;
Chris@10 673 {
Chris@10 674 V T3K, T3L, T3N, T3O;
Chris@10 675 T3K = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 676 T3L = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 677 T3M = VSUB(T3K, T3L);
Chris@10 678 T49 = VADD(T3K, T3L);
Chris@10 679 T3N = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 680 T3O = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 681 T3P = VSUB(T3N, T3O);
Chris@10 682 T4a = VADD(T3N, T3O);
Chris@10 683 }
Chris@10 684 T3Q = VMUL(LDK(KP707106781), VSUB(T3M, T3P));
Chris@10 685 T4f = VADD(T49, T4a);
Chris@10 686 T3Y = VMUL(LDK(KP707106781), VADD(T3M, T3P));
Chris@10 687 T4b = VBYI(VSUB(T49, T4a));
Chris@10 688 }
Chris@10 689 {
Chris@10 690 V TA, TX, TD, TY;
Chris@10 691 {
Chris@10 692 V Ty, Tz, TB, TC;
Chris@10 693 Ty = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 694 Tz = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 695 TA = VSUB(Ty, Tz);
Chris@10 696 TX = VADD(Ty, Tz);
Chris@10 697 TB = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 698 TC = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 699 TD = VSUB(TB, TC);
Chris@10 700 TY = VADD(TB, TC);
Chris@10 701 }
Chris@10 702 TE = VMUL(LDK(KP707106781), VSUB(TA, TD));
Chris@10 703 T13 = VADD(TX, TY);
Chris@10 704 TM = VMUL(LDK(KP707106781), VADD(TA, TD));
Chris@10 705 TZ = VBYI(VSUB(TX, TY));
Chris@10 706 }
Chris@10 707 {
Chris@10 708 V T17, T1u, T1a, T1v;
Chris@10 709 {
Chris@10 710 V T15, T16, T18, T19;
Chris@10 711 T15 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 712 T16 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 713 T17 = VSUB(T15, T16);
Chris@10 714 T1u = VADD(T15, T16);
Chris@10 715 T18 = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 716 T19 = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 717 T1a = VSUB(T18, T19);
Chris@10 718 T1v = VADD(T18, T19);
Chris@10 719 }
Chris@10 720 T1b = VMUL(LDK(KP707106781), VSUB(T17, T1a));
Chris@10 721 T1A = VADD(T1u, T1v);
Chris@10 722 T1j = VMUL(LDK(KP707106781), VADD(T17, T1a));
Chris@10 723 T1w = VBYI(VSUB(T1u, T1v));
Chris@10 724 }
Chris@10 725 {
Chris@10 726 V T2I, T35, T2L, T36;
Chris@10 727 {
Chris@10 728 V T2G, T2H, T2J, T2K;
Chris@10 729 T2G = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 730 T2H = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 731 T2I = VSUB(T2G, T2H);
Chris@10 732 T35 = VADD(T2G, T2H);
Chris@10 733 T2J = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 734 T2K = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 735 T2L = VSUB(T2J, T2K);
Chris@10 736 T36 = VADD(T2J, T2K);
Chris@10 737 }
Chris@10 738 T2M = VMUL(LDK(KP707106781), VSUB(T2I, T2L));
Chris@10 739 T3b = VADD(T35, T36);
Chris@10 740 T2U = VMUL(LDK(KP707106781), VADD(T2I, T2L));
Chris@10 741 T37 = VBYI(VSUB(T35, T36));
Chris@10 742 }
Chris@10 743 {
Chris@10 744 V T3f, T3C, T3i, T3D;
Chris@10 745 {
Chris@10 746 V T3d, T3e, T3g, T3h;
Chris@10 747 T3d = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 748 T3e = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 749 T3f = VSUB(T3d, T3e);
Chris@10 750 T3C = VADD(T3d, T3e);
Chris@10 751 T3g = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 752 T3h = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 753 T3i = VSUB(T3g, T3h);
Chris@10 754 T3D = VADD(T3g, T3h);
Chris@10 755 }
Chris@10 756 T3j = VMUL(LDK(KP707106781), VSUB(T3f, T3i));
Chris@10 757 T3I = VADD(T3C, T3D);
Chris@10 758 T3r = VMUL(LDK(KP707106781), VADD(T3f, T3i));
Chris@10 759 T3E = VBYI(VSUB(T3C, T3D));
Chris@10 760 }
Chris@10 761 ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
Chris@10 762 ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
Chris@10 763 ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
Chris@10 764 ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
Chris@10 765 ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
Chris@10 766 ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
Chris@10 767 {
Chris@10 768 V Tt, T4c, T2B, T24;
Chris@10 769 ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
Chris@10 770 ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
Chris@10 771 Tt = BYTW(&(W[TWVL * 10]), VSUB(Tp, Ts));
Chris@10 772 ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
Chris@10 773 T4c = BYTW(&(W[TWVL * 10]), VSUB(T48, T4b));
Chris@10 774 ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 775 T2B = BYTW(&(W[TWVL * 10]), VSUB(T2x, T2A));
Chris@10 776 ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
Chris@10 777 T24 = BYTW(&(W[TWVL * 10]), VSUB(T20, T23));
Chris@10 778 ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 779 }
Chris@10 780 {
Chris@10 781 V T10, T1x, T3F, T38, T1y, Tu;
Chris@10 782 T10 = BYTW(&(W[TWVL * 10]), VSUB(TW, TZ));
Chris@10 783 ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 784 T1x = BYTW(&(W[TWVL * 10]), VSUB(T1t, T1w));
Chris@10 785 ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
Chris@10 786 T3F = BYTW(&(W[TWVL * 10]), VSUB(T3B, T3E));
Chris@10 787 ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
Chris@10 788 T38 = BYTW(&(W[TWVL * 10]), VSUB(T34, T37));
Chris@10 789 ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@10 790 T1y = BYTW(&(W[TWVL * 2]), VADD(T1t, T1w));
Chris@10 791 ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
Chris@10 792 Tu = BYTW(&(W[TWVL * 2]), VADD(Tp, Ts));
Chris@10 793 ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
Chris@10 794 }
Chris@10 795 {
Chris@10 796 V T2C, T3G, T11, T25, T39, T4d;
Chris@10 797 T2C = BYTW(&(W[TWVL * 2]), VADD(T2x, T2A));
Chris@10 798 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
Chris@10 799 T3G = BYTW(&(W[TWVL * 2]), VADD(T3B, T3E));
Chris@10 800 ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
Chris@10 801 T11 = BYTW(&(W[TWVL * 2]), VADD(TW, TZ));
Chris@10 802 ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 803 T25 = BYTW(&(W[TWVL * 2]), VADD(T20, T23));
Chris@10 804 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 805 T39 = BYTW(&(W[TWVL * 2]), VADD(T34, T37));
Chris@10 806 ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 807 T4d = BYTW(&(W[TWVL * 2]), VADD(T48, T4b));
Chris@10 808 ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@10 809 }
Chris@10 810 {
Chris@10 811 V Tx, T1B, T3c, T4g, T3J, T2F;
Chris@10 812 Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
Chris@10 813 ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
Chris@10 814 T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
Chris@10 815 ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
Chris@10 816 T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
Chris@10 817 ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 818 T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
Chris@10 819 ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 820 T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
Chris@10 821 ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
Chris@10 822 T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
Chris@10 823 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
Chris@10 824 }
Chris@10 825 T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
Chris@10 826 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 827 T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
Chris@10 828 ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@10 829 {
Chris@10 830 V Th, Ti, Tb, Tg;
Chris@10 831 Tb = VBYI(VSUB(T7, Ta));
Chris@10 832 Tg = VSUB(Te, Tf);
Chris@10 833 Th = BYTW(&(W[TWVL * 4]), VADD(Tb, Tg));
Chris@10 834 Ti = BYTW(&(W[TWVL * 8]), VSUB(Tg, Tb));
Chris@10 835 ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
Chris@10 836 ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
Chris@10 837 }
Chris@10 838 {
Chris@10 839 V T40, T41, T3U, T3Z;
Chris@10 840 T3U = VBYI(VSUB(T3Q, T3T));
Chris@10 841 T3Z = VSUB(T3X, T3Y);
Chris@10 842 T40 = BYTW(&(W[TWVL * 4]), VADD(T3U, T3Z));
Chris@10 843 T41 = BYTW(&(W[TWVL * 8]), VSUB(T3Z, T3U));
Chris@10 844 ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 845 ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 846 }
Chris@10 847 {
Chris@10 848 V T2p, T2q, T2j, T2o;
Chris@10 849 T2j = VBYI(VSUB(T2f, T2i));
Chris@10 850 T2o = VSUB(T2m, T2n);
Chris@10 851 T2p = BYTW(&(W[TWVL * 4]), VADD(T2j, T2o));
Chris@10 852 T2q = BYTW(&(W[TWVL * 8]), VSUB(T2o, T2j));
Chris@10 853 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
Chris@10 854 ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
Chris@10 855 }
Chris@10 856 {
Chris@10 857 V T1S, T1T, T1M, T1R;
Chris@10 858 T1M = VBYI(VSUB(T1I, T1L));
Chris@10 859 T1R = VSUB(T1P, T1Q);
Chris@10 860 T1S = BYTW(&(W[TWVL * 4]), VADD(T1M, T1R));
Chris@10 861 T1T = BYTW(&(W[TWVL * 8]), VSUB(T1R, T1M));
Chris@10 862 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 863 ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 864 }
Chris@10 865 {
Chris@10 866 V TO, TP, TI, TN;
Chris@10 867 TI = VBYI(VSUB(TE, TH));
Chris@10 868 TN = VSUB(TL, TM);
Chris@10 869 TO = BYTW(&(W[TWVL * 4]), VADD(TI, TN));
Chris@10 870 TP = BYTW(&(W[TWVL * 8]), VSUB(TN, TI));
Chris@10 871 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 872 ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 873 }
Chris@10 874 {
Chris@10 875 V T1l, T1m, T1f, T1k;
Chris@10 876 T1f = VBYI(VSUB(T1b, T1e));
Chris@10 877 T1k = VSUB(T1i, T1j);
Chris@10 878 T1l = BYTW(&(W[TWVL * 4]), VADD(T1f, T1k));
Chris@10 879 T1m = BYTW(&(W[TWVL * 8]), VSUB(T1k, T1f));
Chris@10 880 ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
Chris@10 881 ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
Chris@10 882 }
Chris@10 883 {
Chris@10 884 V T3t, T3u, T3n, T3s;
Chris@10 885 T3n = VBYI(VSUB(T3j, T3m));
Chris@10 886 T3s = VSUB(T3q, T3r);
Chris@10 887 T3t = BYTW(&(W[TWVL * 4]), VADD(T3n, T3s));
Chris@10 888 T3u = BYTW(&(W[TWVL * 8]), VSUB(T3s, T3n));
Chris@10 889 ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
Chris@10 890 ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
Chris@10 891 }
Chris@10 892 {
Chris@10 893 V T2W, T2X, T2Q, T2V;
Chris@10 894 T2Q = VBYI(VSUB(T2M, T2P));
Chris@10 895 T2V = VSUB(T2T, T2U);
Chris@10 896 T2W = BYTW(&(W[TWVL * 4]), VADD(T2Q, T2V));
Chris@10 897 T2X = BYTW(&(W[TWVL * 8]), VSUB(T2V, T2Q));
Chris@10 898 ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@10 899 ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@10 900 }
Chris@10 901 {
Chris@10 902 V T1p, T1q, T1n, T1o;
Chris@10 903 T1n = VBYI(VADD(T1e, T1b));
Chris@10 904 T1o = VADD(T1i, T1j);
Chris@10 905 T1p = BYTW(&(W[0]), VADD(T1n, T1o));
Chris@10 906 T1q = BYTW(&(W[TWVL * 12]), VSUB(T1o, T1n));
Chris@10 907 ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
Chris@10 908 ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
Chris@10 909 }
Chris@10 910 {
Chris@10 911 V Tl, Tm, Tj, Tk;
Chris@10 912 Tj = VBYI(VADD(Ta, T7));
Chris@10 913 Tk = VADD(Te, Tf);
Chris@10 914 Tl = BYTW(&(W[0]), VADD(Tj, Tk));
Chris@10 915 Tm = BYTW(&(W[TWVL * 12]), VSUB(Tk, Tj));
Chris@10 916 ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
Chris@10 917 ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
Chris@10 918 }
Chris@10 919 {
Chris@10 920 V T2t, T2u, T2r, T2s;
Chris@10 921 T2r = VBYI(VADD(T2i, T2f));
Chris@10 922 T2s = VADD(T2m, T2n);
Chris@10 923 T2t = BYTW(&(W[0]), VADD(T2r, T2s));
Chris@10 924 T2u = BYTW(&(W[TWVL * 12]), VSUB(T2s, T2r));
Chris@10 925 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
Chris@10 926 ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
Chris@10 927 }
Chris@10 928 {
Chris@10 929 V T3x, T3y, T3v, T3w;
Chris@10 930 T3v = VBYI(VADD(T3m, T3j));
Chris@10 931 T3w = VADD(T3q, T3r);
Chris@10 932 T3x = BYTW(&(W[0]), VADD(T3v, T3w));
Chris@10 933 T3y = BYTW(&(W[TWVL * 12]), VSUB(T3w, T3v));
Chris@10 934 ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
Chris@10 935 ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
Chris@10 936 }
Chris@10 937 {
Chris@10 938 V TS, TT, TQ, TR;
Chris@10 939 TQ = VBYI(VADD(TH, TE));
Chris@10 940 TR = VADD(TL, TM);
Chris@10 941 TS = BYTW(&(W[0]), VADD(TQ, TR));
Chris@10 942 TT = BYTW(&(W[TWVL * 12]), VSUB(TR, TQ));
Chris@10 943 ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 944 ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 945 }
Chris@10 946 {
Chris@10 947 V T1W, T1X, T1U, T1V;
Chris@10 948 T1U = VBYI(VADD(T1L, T1I));
Chris@10 949 T1V = VADD(T1P, T1Q);
Chris@10 950 T1W = BYTW(&(W[0]), VADD(T1U, T1V));
Chris@10 951 T1X = BYTW(&(W[TWVL * 12]), VSUB(T1V, T1U));
Chris@10 952 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 953 ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 954 }
Chris@10 955 {
Chris@10 956 V T30, T31, T2Y, T2Z;
Chris@10 957 T2Y = VBYI(VADD(T2P, T2M));
Chris@10 958 T2Z = VADD(T2T, T2U);
Chris@10 959 T30 = BYTW(&(W[0]), VADD(T2Y, T2Z));
Chris@10 960 T31 = BYTW(&(W[TWVL * 12]), VSUB(T2Z, T2Y));
Chris@10 961 ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 962 ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 963 }
Chris@10 964 {
Chris@10 965 V T44, T45, T42, T43;
Chris@10 966 T42 = VBYI(VADD(T3T, T3Q));
Chris@10 967 T43 = VADD(T3X, T3Y);
Chris@10 968 T44 = BYTW(&(W[0]), VADD(T42, T43));
Chris@10 969 T45 = BYTW(&(W[TWVL * 12]), VSUB(T43, T42));
Chris@10 970 ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@10 971 ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@10 972 }
Chris@10 973 }
Chris@10 974 }
Chris@10 975 VLEAVE();
Chris@10 976 }
Chris@10 977
Chris@10 978 static const tw_instr twinstr[] = {
Chris@10 979 VTW(0, 1),
Chris@10 980 VTW(0, 2),
Chris@10 981 VTW(0, 3),
Chris@10 982 VTW(0, 4),
Chris@10 983 VTW(0, 5),
Chris@10 984 VTW(0, 6),
Chris@10 985 VTW(0, 7),
Chris@10 986 {TW_NEXT, VL, 0}
Chris@10 987 };
Chris@10 988
Chris@10 989 static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };
Chris@10 990
Chris@10 991 void XSIMD(codelet_q1bv_8) (planner *p) {
Chris@10 992 X(kdft_difsq_register) (p, q1bv_8, &desc);
Chris@10 993 }
Chris@10 994 #endif /* HAVE_FMA */