annotate src/fftw-3.3.8/dft/simd/common/q1bv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)

author   Chris Cannam
date     Fri, 07 Feb 2020 11:51:13 +0000
parents  d0c2a83c1364
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:06:15 EDT 2018 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_twidsq_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include dft/simd/q1b.h -sign 1 */

/*
 * This function contains 264 FP additions, 192 FP multiplications,
 * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
 * 77 stack variables, 1 constants, and 128 memory accesses
 */
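
#if 0
/* A minimal scalar model of the fused-arithmetic helpers used in this
   branch, kept out of the build.  This is an illustrative assumption for
   the reader -- the real macros are defined per-ISA in the SIMD headers
   and operate on whole vectors of packed complex data. */
static inline double model_VFMA(double a, double b, double c)  { return a * b + c; } /* VFMA(a, b, c)  ~ a*b + c */
static inline double model_VFNMS(double a, double b, double c) { return c - a * b; } /* VFNMS(a, b, c) ~ c - a*b */
/* On complex values, VFMAI(b, a) ~ a + i*b and VFNMSI(b, a) ~ a - i*b. */
#endif
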
#include "dft/simd/q1b.h"

static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
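               /* Each pass through this loop transforms VL problems at
                  once: x advances by VL * ms and W by TWVL * 14, i.e.
                  seven complex twiddle factors per 8-point DIF step;
                  MAKE_VOLATILE_STRIDE is FFTW's guard against compilers
                  over-optimizing stride arithmetic.  (A descriptive
                  gloss, not generated text.) */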
               V T3, Tv, Te, Tp, T1E, T26, T1P, T20, T2b, T2D, T2m, T2x, T3M, T4e, T3X;
               V T48, TA, T12, TL, TW, T17, T1z, T1i, T1t, T2I, T3a, T2T, T34, T3f, T3H;
               V T3q, T3B, Ta, Tw, Tf, Ts, T1L, T27, T1Q, T23, T2i, T2E, T2n, T2A, T3T;
               V T4f, T3Y, T4b, TH, T13, TM, TZ, T1e, T1A, T1j, T1w, T2P, T3b, T2U, T37;
               V T3m, T3I, T3r, T3E, T28, T14;
               {
                    V T1, T2, Tn, Tc, Td, To;
                    T1 = LD(&(x[0]), ms, &(x[0]));
                    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                    Tn = VADD(T1, T2);
                    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                    To = VADD(Tc, Td);
                    T3 = VSUB(T1, T2);
                    Tv = VADD(Tn, To);
                    Te = VSUB(Tc, Td);
                    Tp = VSUB(Tn, To);
               }
               {
                    V T1C, T1D, T1Y, T1N, T1O, T1Z;
                    T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
                    T1Y = VADD(T1C, T1D);
                    T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
                    T1Z = VADD(T1N, T1O);
                    T1E = VSUB(T1C, T1D);
                    T26 = VADD(T1Y, T1Z);
                    T1P = VSUB(T1N, T1O);
                    T20 = VSUB(T1Y, T1Z);
               }
               {
                    V T29, T2a, T2v, T2k, T2l, T2w;
                    T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2v = VADD(T29, T2a);
                    T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
                    T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
                    T2w = VADD(T2k, T2l);
                    T2b = VSUB(T29, T2a);
                    T2D = VADD(T2v, T2w);
                    T2m = VSUB(T2k, T2l);
                    T2x = VSUB(T2v, T2w);
               }
               {
                    V T3K, T3L, T46, T3V, T3W, T47;
                    T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
                    T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
                    T46 = VADD(T3K, T3L);
                    T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
                    T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
                    T47 = VADD(T3V, T3W);
                    T3M = VSUB(T3K, T3L);
                    T4e = VADD(T46, T47);
                    T3X = VSUB(T3V, T3W);
                    T48 = VSUB(T46, T47);
               }
               {
                    V Ty, Tz, TU, TJ, TK, TV;
                    Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
                    TU = VADD(Ty, Tz);
                    TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
                    TV = VADD(TJ, TK);
                    TA = VSUB(Ty, Tz);
                    T12 = VADD(TU, TV);
                    TL = VSUB(TJ, TK);
                    TW = VSUB(TU, TV);
               }
               {
                    V T15, T16, T1r, T1g, T1h, T1s;
                    T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
                    T1r = VADD(T15, T16);
                    T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
                    T1s = VADD(T1g, T1h);
                    T17 = VSUB(T15, T16);
                    T1z = VADD(T1r, T1s);
                    T1i = VSUB(T1g, T1h);
                    T1t = VSUB(T1r, T1s);
               }
               {
                    V T2G, T2H, T32, T2R, T2S, T33;
                    T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
                    T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
                    T32 = VADD(T2G, T2H);
                    T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
                    T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
                    T33 = VADD(T2R, T2S);
                    T2I = VSUB(T2G, T2H);
                    T3a = VADD(T32, T33);
                    T2T = VSUB(T2R, T2S);
                    T34 = VSUB(T32, T33);
               }
               {
                    V T3d, T3e, T3z, T3o, T3p, T3A;
                    T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
                    T3z = VADD(T3d, T3e);
                    T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
                    T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3A = VADD(T3o, T3p);
                    T3f = VSUB(T3d, T3e);
                    T3H = VADD(T3z, T3A);
                    T3q = VSUB(T3o, T3p);
                    T3B = VSUB(T3z, T3A);
               }
               {
                    V T6, Tq, T9, Tr;
                    {
                         V T4, T5, T7, T8;
                         T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                         T6 = VSUB(T4, T5);
                         Tq = VADD(T4, T5);
                         T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         T9 = VSUB(T7, T8);
                         Tr = VADD(T7, T8);
                    }
                    Ta = VADD(T6, T9);
                    Tw = VADD(Tq, Tr);
                    Tf = VSUB(T6, T9);
                    Ts = VSUB(Tq, Tr);
               }
               {
                    V T1H, T21, T1K, T22;
                    {
                         V T1F, T1G, T1I, T1J;
                         T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1H = VSUB(T1F, T1G);
                         T21 = VADD(T1F, T1G);
                         T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1K = VSUB(T1I, T1J);
                         T22 = VADD(T1I, T1J);
                    }
                    T1L = VADD(T1H, T1K);
                    T27 = VADD(T21, T22);
                    T1Q = VSUB(T1H, T1K);
                    T23 = VSUB(T21, T22);
               }
               {
                    V T2e, T2y, T2h, T2z;
                    {
                         V T2c, T2d, T2f, T2g;
                         T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2e = VSUB(T2c, T2d);
                         T2y = VADD(T2c, T2d);
                         T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2h = VSUB(T2f, T2g);
                         T2z = VADD(T2f, T2g);
                    }
                    T2i = VADD(T2e, T2h);
                    T2E = VADD(T2y, T2z);
                    T2n = VSUB(T2e, T2h);
                    T2A = VSUB(T2y, T2z);
               }
               {
                    V T3P, T49, T3S, T4a;
                    {
                         V T3N, T3O, T3Q, T3R;
                         T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3P = VSUB(T3N, T3O);
                         T49 = VADD(T3N, T3O);
                         T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3S = VSUB(T3Q, T3R);
                         T4a = VADD(T3Q, T3R);
                    }
                    T3T = VADD(T3P, T3S);
                    T4f = VADD(T49, T4a);
                    T3Y = VSUB(T3P, T3S);
                    T4b = VSUB(T49, T4a);
               }
               {
                    V TD, TX, TG, TY;
                    {
                         V TB, TC, TE, TF;
                         TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TD = VSUB(TB, TC);
                         TX = VADD(TB, TC);
                         TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TG = VSUB(TE, TF);
                         TY = VADD(TE, TF);
                    }
                    TH = VADD(TD, TG);
                    T13 = VADD(TX, TY);
                    TM = VSUB(TD, TG);
                    TZ = VSUB(TX, TY);
               }
               {
                    V T1a, T1u, T1d, T1v;
                    {
                         V T18, T19, T1b, T1c;
                         T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1a = VSUB(T18, T19);
                         T1u = VADD(T18, T19);
                         T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1d = VSUB(T1b, T1c);
                         T1v = VADD(T1b, T1c);
                    }
                    T1e = VADD(T1a, T1d);
                    T1A = VADD(T1u, T1v);
                    T1j = VSUB(T1a, T1d);
                    T1w = VSUB(T1u, T1v);
               }
               {
                    V T2L, T35, T2O, T36;
                    {
                         V T2J, T2K, T2M, T2N;
                         T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2L = VSUB(T2J, T2K);
                         T35 = VADD(T2J, T2K);
                         T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2O = VSUB(T2M, T2N);
                         T36 = VADD(T2M, T2N);
                    }
                    T2P = VADD(T2L, T2O);
                    T3b = VADD(T35, T36);
                    T2U = VSUB(T2L, T2O);
                    T37 = VSUB(T35, T36);
               }
               {
                    V T3i, T3C, T3l, T3D;
                    {
                         V T3g, T3h, T3j, T3k;
                         T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3i = VSUB(T3g, T3h);
                         T3C = VADD(T3g, T3h);
                         T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3l = VSUB(T3j, T3k);
                         T3D = VADD(T3j, T3k);
                    }
                    T3m = VADD(T3i, T3l);
                    T3I = VADD(T3C, T3D);
                    T3r = VSUB(T3i, T3l);
                    T3E = VSUB(T3C, T3D);
               }
               ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
               ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
               ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
               {
                    V Tt, T4c, T2B, T24;
                    ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
                    Tt = BYTW(&(W[TWVL * 10]), VFNMSI(Ts, Tp));
                    ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
                    T4c = BYTW(&(W[TWVL * 10]), VFNMSI(T4b, T48));
                    ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T2B = BYTW(&(W[TWVL * 10]), VFNMSI(T2A, T2x));
                    ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
                    T24 = BYTW(&(W[TWVL * 10]), VFNMSI(T23, T20));
                    ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
               }
               {
                    V T10, T1x, T3F, T38, T1y, Tu;
                    T10 = BYTW(&(W[TWVL * 10]), VFNMSI(TZ, TW));
                    ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T1x = BYTW(&(W[TWVL * 10]), VFNMSI(T1w, T1t));
                    ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
                    T3F = BYTW(&(W[TWVL * 10]), VFNMSI(T3E, T3B));
                    ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
                    T38 = BYTW(&(W[TWVL * 10]), VFNMSI(T37, T34));
                    ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T1y = BYTW(&(W[TWVL * 2]), VFMAI(T1w, T1t));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
                    Tu = BYTW(&(W[TWVL * 2]), VFMAI(Ts, Tp));
                    ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
               }
               {
                    V T2C, T3G, T11, T25, T39, T4d;
                    T2C = BYTW(&(W[TWVL * 2]), VFMAI(T2A, T2x));
                    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
                    T3G = BYTW(&(W[TWVL * 2]), VFMAI(T3E, T3B));
                    ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
                    T11 = BYTW(&(W[TWVL * 2]), VFMAI(TZ, TW));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T25 = BYTW(&(W[TWVL * 2]), VFMAI(T23, T20));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T39 = BYTW(&(W[TWVL * 2]), VFMAI(T37, T34));
                    ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T4d = BYTW(&(W[TWVL * 2]), VFMAI(T4b, T48));
                    ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
               {
                    V Tx, T1B, T3c, T4g, T3J, T2F;
                    Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
                    T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
                    ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
                    T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
                    ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
                    ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
                    ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
                    T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
                    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
               }
               T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
               ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
               T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
               ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
               {
                    V Th, Ti, Tb, Tg;
                    Tb = VFNMS(LDK(KP707106781), Ta, T3);
                    Tg = VFNMS(LDK(KP707106781), Tf, Te);
                    Th = BYTW(&(W[TWVL * 4]), VFNMSI(Tg, Tb));
                    Ti = BYTW(&(W[TWVL * 8]), VFMAI(Tg, Tb));
                    ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T40, T41, T3U, T3Z;
                    T3U = VFNMS(LDK(KP707106781), T3T, T3M);
                    T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
                    T40 = BYTW(&(W[TWVL * 4]), VFNMSI(T3Z, T3U));
                    T41 = BYTW(&(W[TWVL * 8]), VFMAI(T3Z, T3U));
                    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T2p, T2q, T2j, T2o;
                    T2j = VFNMS(LDK(KP707106781), T2i, T2b);
                    T2o = VFNMS(LDK(KP707106781), T2n, T2m);
                    T2p = BYTW(&(W[TWVL * 4]), VFNMSI(T2o, T2j));
                    T2q = BYTW(&(W[TWVL * 8]), VFMAI(T2o, T2j));
                    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T1S, T1T, T1M, T1R;
                    T1M = VFNMS(LDK(KP707106781), T1L, T1E);
                    T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
                    T1S = BYTW(&(W[TWVL * 4]), VFNMSI(T1R, T1M));
                    T1T = BYTW(&(W[TWVL * 8]), VFMAI(T1R, T1M));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V TO, TP, TI, TN;
                    TI = VFNMS(LDK(KP707106781), TH, TA);
                    TN = VFNMS(LDK(KP707106781), TM, TL);
                    TO = BYTW(&(W[TWVL * 4]), VFNMSI(TN, TI));
                    TP = BYTW(&(W[TWVL * 8]), VFMAI(TN, TI));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T1l, T1m, T1f, T1k;
                    T1f = VFNMS(LDK(KP707106781), T1e, T17);
                    T1k = VFNMS(LDK(KP707106781), T1j, T1i);
                    T1l = BYTW(&(W[TWVL * 4]), VFNMSI(T1k, T1f));
                    T1m = BYTW(&(W[TWVL * 8]), VFMAI(T1k, T1f));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T3t, T3u, T3n, T3s;
                    T3n = VFNMS(LDK(KP707106781), T3m, T3f);
                    T3s = VFNMS(LDK(KP707106781), T3r, T3q);
                    T3t = BYTW(&(W[TWVL * 4]), VFNMSI(T3s, T3n));
                    T3u = BYTW(&(W[TWVL * 8]), VFMAI(T3s, T3n));
                    ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T2W, T2X, T2Q, T2V;
                    T2Q = VFNMS(LDK(KP707106781), T2P, T2I);
                    T2V = VFNMS(LDK(KP707106781), T2U, T2T);
                    T2W = BYTW(&(W[TWVL * 4]), VFNMSI(T2V, T2Q));
                    T2X = BYTW(&(W[TWVL * 8]), VFMAI(T2V, T2Q));
                    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T1p, T1q, T1n, T1o;
                    T1n = VFMA(LDK(KP707106781), T1e, T17);
                    T1o = VFMA(LDK(KP707106781), T1j, T1i);
                    T1p = BYTW(&(W[0]), VFMAI(T1o, T1n));
                    T1q = BYTW(&(W[TWVL * 12]), VFNMSI(T1o, T1n));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
               }
               {
                    V Tl, Tm, Tj, Tk;
                    Tj = VFMA(LDK(KP707106781), Ta, T3);
                    Tk = VFMA(LDK(KP707106781), Tf, Te);
                    Tl = BYTW(&(W[0]), VFMAI(Tk, Tj));
                    Tm = BYTW(&(W[TWVL * 12]), VFNMSI(Tk, Tj));
                    ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
               }
               {
                    V T2t, T2u, T2r, T2s;
                    T2r = VFMA(LDK(KP707106781), T2i, T2b);
                    T2s = VFMA(LDK(KP707106781), T2n, T2m);
                    T2t = BYTW(&(W[0]), VFMAI(T2s, T2r));
                    T2u = BYTW(&(W[TWVL * 12]), VFNMSI(T2s, T2r));
                    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
               }
               {
                    V T3x, T3y, T3v, T3w;
                    T3v = VFMA(LDK(KP707106781), T3m, T3f);
                    T3w = VFMA(LDK(KP707106781), T3r, T3q);
                    T3x = BYTW(&(W[0]), VFMAI(T3w, T3v));
                    T3y = BYTW(&(W[TWVL * 12]), VFNMSI(T3w, T3v));
                    ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
               }
               {
                    V TS, TT, TQ, TR;
                    TQ = VFMA(LDK(KP707106781), TH, TA);
                    TR = VFMA(LDK(KP707106781), TM, TL);
                    TS = BYTW(&(W[0]), VFMAI(TR, TQ));
                    TT = BYTW(&(W[TWVL * 12]), VFNMSI(TR, TQ));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T1W, T1X, T1U, T1V;
                    T1U = VFMA(LDK(KP707106781), T1L, T1E);
                    T1V = VFMA(LDK(KP707106781), T1Q, T1P);
                    T1W = BYTW(&(W[0]), VFMAI(T1V, T1U));
                    T1X = BYTW(&(W[TWVL * 12]), VFNMSI(T1V, T1U));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T30, T31, T2Y, T2Z;
                    T2Y = VFMA(LDK(KP707106781), T2P, T2I);
                    T2Z = VFMA(LDK(KP707106781), T2U, T2T);
                    T30 = BYTW(&(W[0]), VFMAI(T2Z, T2Y));
                    T31 = BYTW(&(W[TWVL * 12]), VFNMSI(T2Z, T2Y));
                    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T44, T45, T42, T43;
                    T42 = VFMA(LDK(KP707106781), T3T, T3M);
                    T43 = VFMA(LDK(KP707106781), T3Y, T3X);
                    T44 = BYTW(&(W[0]), VFMAI(T43, T42));
                    T45 = BYTW(&(W[TWVL * 12]), VFNMSI(T43, T42));
                    ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, VL, 0}
};
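
/* A note on the table above: each VTW(0, k) entry requests the twiddle
   factor w^k (k = 1..7), and TW_NEXT advances by one vector of transforms;
   seven complex twiddles are why the kernel steps W by TWVL * 14 per
   iteration.  (A reading of the twiddle-descriptor convention, offered as
   a gloss rather than generated text.) */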

static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
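
/* The {184, 112, 80, 0} field mirrors the operation-count comment above:
   additions, multiplications, fused multiply-adds, and other ops; the
   planner weighs these when choosing among candidate codelets. */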

void XSIMD(codelet_q1bv_8) (planner *p) {
     X(kdft_difsq_register) (p, q1bv_8, &desc);
}
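
/* Registration happens once, when the planner initializes its codelet
   tables; thereafter q1bv_8 competes on the cost recorded in desc whenever
   an 8x8 DIF twiddle-square step fits the problem.  (An interpretation of
   the registration API, not generated text.) */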
#else

/* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1bv_8 -include dft/simd/q1b.h -sign 1 */

/*
 * This function contains 264 FP additions, 128 FP multiplications,
 * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
 * 77 stack variables, 1 constants, and 128 memory accesses
 */
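
#if 0
/* In this fallback the same butterflies appear without fused operations.
   An illustrative scalar model of the helpers specific to this branch,
   kept out of the build (an assumption for exposition; the real macros
   are per-ISA and operate on vectors of packed complex numbers): */
#include <complex.h>
static inline double complex model_VBYI(double complex x) { return I * x; }           /* VBYI(x) ~ -cimag(x) + i*creal(x) */
static inline double complex model_VMUL_LDK(double k, double complex x) { return k * x; } /* VMUL(LDK(k), x) */
#endif
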
#include "dft/simd/q1b.h"

static void q1bv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          R *x;
          x = ii;
          for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
               V Ta, Tv, Te, Tp, T1L, T26, T1P, T20, T2i, T2D, T2m, T2x, T3T, T4e, T3X;
               V T48, TH, T12, TL, TW, T1e, T1z, T1i, T1t, T2P, T3a, T2T, T34, T3m, T3H;
               V T3q, T3B, T7, Tw, Tf, Ts, T1I, T27, T1Q, T23, T2f, T2E, T2n, T2A, T3Q;
               V T4f, T3Y, T4b, TE, T13, TM, TZ, T1b, T1A, T1j, T1w, T2M, T3b, T2U, T37;
               V T3j, T3I, T3r, T3E, T28, T14;
               {
                    V T8, T9, To, Tc, Td, Tn;
                    T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
                    T9 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
                    To = VADD(T8, T9);
                    Tc = LD(&(x[0]), ms, &(x[0]));
                    Td = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
                    Tn = VADD(Tc, Td);
                    Ta = VSUB(T8, T9);
                    Tv = VADD(Tn, To);
                    Te = VSUB(Tc, Td);
                    Tp = VSUB(Tn, To);
               }
               {
                    V T1J, T1K, T1Z, T1N, T1O, T1Y;
                    T1J = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
                    T1K = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
                    T1Z = VADD(T1J, T1K);
                    T1N = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
                    T1O = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
                    T1Y = VADD(T1N, T1O);
                    T1L = VSUB(T1J, T1K);
                    T26 = VADD(T1Y, T1Z);
                    T1P = VSUB(T1N, T1O);
                    T20 = VSUB(T1Y, T1Z);
               }
               {
                    V T2g, T2h, T2w, T2k, T2l, T2v;
                    T2g = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
                    T2h = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
                    T2w = VADD(T2g, T2h);
                    T2k = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2l = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
                    T2v = VADD(T2k, T2l);
                    T2i = VSUB(T2g, T2h);
                    T2D = VADD(T2v, T2w);
                    T2m = VSUB(T2k, T2l);
                    T2x = VSUB(T2v, T2w);
               }
               {
                    V T3R, T3S, T47, T3V, T3W, T46;
                    T3R = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
                    T3S = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
                    T47 = VADD(T3R, T3S);
                    T3V = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
                    T3W = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
                    T46 = VADD(T3V, T3W);
                    T3T = VSUB(T3R, T3S);
                    T4e = VADD(T46, T47);
                    T3X = VSUB(T3V, T3W);
                    T48 = VSUB(T46, T47);
               }
               {
                    V TF, TG, TV, TJ, TK, TU;
                    TF = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
                    TG = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
                    TV = VADD(TF, TG);
                    TJ = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
                    TK = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
                    TU = VADD(TJ, TK);
                    TH = VSUB(TF, TG);
                    T12 = VADD(TU, TV);
                    TL = VSUB(TJ, TK);
                    TW = VSUB(TU, TV);
               }
               {
                    V T1c, T1d, T1s, T1g, T1h, T1r;
                    T1c = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
                    T1d = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
                    T1s = VADD(T1c, T1d);
                    T1g = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
                    T1h = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
                    T1r = VADD(T1g, T1h);
                    T1e = VSUB(T1c, T1d);
                    T1z = VADD(T1r, T1s);
                    T1i = VSUB(T1g, T1h);
                    T1t = VSUB(T1r, T1s);
               }
               {
                    V T2N, T2O, T33, T2R, T2S, T32;
                    T2N = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
                    T2O = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
                    T33 = VADD(T2N, T2O);
                    T2R = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
                    T2S = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
                    T32 = VADD(T2R, T2S);
                    T2P = VSUB(T2N, T2O);
                    T3a = VADD(T32, T33);
                    T2T = VSUB(T2R, T2S);
                    T34 = VSUB(T32, T33);
               }
               {
                    V T3k, T3l, T3A, T3o, T3p, T3z;
                    T3k = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
                    T3l = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3A = VADD(T3k, T3l);
                    T3o = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
                    T3p = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
                    T3z = VADD(T3o, T3p);
                    T3m = VSUB(T3k, T3l);
                    T3H = VADD(T3z, T3A);
                    T3q = VSUB(T3o, T3p);
                    T3B = VSUB(T3z, T3A);
               }
               {
                    V T3, Tq, T6, Tr;
                    {
                         V T1, T2, T4, T5;
                         T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
                         T2 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
                         T3 = VSUB(T1, T2);
                         Tq = VADD(T1, T2);
                         T4 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
                         T5 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
                         T6 = VSUB(T4, T5);
                         Tr = VADD(T4, T5);
                    }
                    T7 = VMUL(LDK(KP707106781), VSUB(T3, T6));
                    Tw = VADD(Tq, Tr);
                    Tf = VMUL(LDK(KP707106781), VADD(T3, T6));
                    Ts = VBYI(VSUB(Tq, Tr));
               }
               {
                    V T1E, T21, T1H, T22;
                    {
                         V T1C, T1D, T1F, T1G;
                         T1C = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1D = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1E = VSUB(T1C, T1D);
                         T21 = VADD(T1C, T1D);
                         T1F = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1G = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                         T1H = VSUB(T1F, T1G);
                         T22 = VADD(T1F, T1G);
                    }
                    T1I = VMUL(LDK(KP707106781), VSUB(T1E, T1H));
                    T27 = VADD(T21, T22);
                    T1Q = VMUL(LDK(KP707106781), VADD(T1E, T1H));
                    T23 = VBYI(VSUB(T21, T22));
               }
               {
                    V T2b, T2y, T2e, T2z;
                    {
                         V T29, T2a, T2c, T2d;
                         T29 = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2a = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2b = VSUB(T29, T2a);
                         T2y = VADD(T29, T2a);
                         T2c = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2d = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                         T2e = VSUB(T2c, T2d);
                         T2z = VADD(T2c, T2d);
                    }
                    T2f = VMUL(LDK(KP707106781), VSUB(T2b, T2e));
                    T2E = VADD(T2y, T2z);
                    T2n = VMUL(LDK(KP707106781), VADD(T2b, T2e));
                    T2A = VBYI(VSUB(T2y, T2z));
               }
               {
                    V T3M, T49, T3P, T4a;
                    {
                         V T3K, T3L, T3N, T3O;
                         T3K = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3L = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3M = VSUB(T3K, T3L);
                         T49 = VADD(T3K, T3L);
                         T3N = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3O = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
                         T3P = VSUB(T3N, T3O);
                         T4a = VADD(T3N, T3O);
                    }
                    T3Q = VMUL(LDK(KP707106781), VSUB(T3M, T3P));
                    T4f = VADD(T49, T4a);
                    T3Y = VMUL(LDK(KP707106781), VADD(T3M, T3P));
                    T4b = VBYI(VSUB(T49, T4a));
               }
               {
                    V TA, TX, TD, TY;
                    {
                         V Ty, Tz, TB, TC;
                         Ty = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         Tz = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TA = VSUB(Ty, Tz);
                         TX = VADD(Ty, Tz);
                         TB = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TC = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                         TD = VSUB(TB, TC);
                         TY = VADD(TB, TC);
                    }
                    TE = VMUL(LDK(KP707106781), VSUB(TA, TD));
                    T13 = VADD(TX, TY);
                    TM = VMUL(LDK(KP707106781), VADD(TA, TD));
                    TZ = VBYI(VSUB(TX, TY));
               }
               {
                    V T17, T1u, T1a, T1v;
                    {
                         V T15, T16, T18, T19;
                         T15 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T16 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T17 = VSUB(T15, T16);
                         T1u = VADD(T15, T16);
                         T18 = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T19 = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                         T1a = VSUB(T18, T19);
                         T1v = VADD(T18, T19);
                    }
                    T1b = VMUL(LDK(KP707106781), VSUB(T17, T1a));
                    T1A = VADD(T1u, T1v);
                    T1j = VMUL(LDK(KP707106781), VADD(T17, T1a));
                    T1w = VBYI(VSUB(T1u, T1v));
               }
               {
                    V T2I, T35, T2L, T36;
                    {
                         V T2G, T2H, T2J, T2K;
                         T2G = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2H = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2I = VSUB(T2G, T2H);
                         T35 = VADD(T2G, T2H);
                         T2J = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2K = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
                         T2L = VSUB(T2J, T2K);
                         T36 = VADD(T2J, T2K);
                    }
                    T2M = VMUL(LDK(KP707106781), VSUB(T2I, T2L));
                    T3b = VADD(T35, T36);
                    T2U = VMUL(LDK(KP707106781), VADD(T2I, T2L));
                    T37 = VBYI(VSUB(T35, T36));
               }
               {
                    V T3f, T3C, T3i, T3D;
                    {
                         V T3d, T3e, T3g, T3h;
                         T3d = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3e = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3f = VSUB(T3d, T3e);
                         T3C = VADD(T3d, T3e);
                         T3g = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3h = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                         T3i = VSUB(T3g, T3h);
                         T3D = VADD(T3g, T3h);
                    }
                    T3j = VMUL(LDK(KP707106781), VSUB(T3f, T3i));
                    T3I = VADD(T3C, T3D);
                    T3r = VMUL(LDK(KP707106781), VADD(T3f, T3i));
                    T3E = VBYI(VSUB(T3C, T3D));
               }
               ST(&(x[0]), VADD(Tv, Tw), ms, &(x[0]));
               ST(&(x[WS(rs, 2)]), VADD(T1z, T1A), ms, &(x[0]));
               ST(&(x[WS(rs, 5)]), VADD(T3a, T3b), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 7)]), VADD(T4e, T4f), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 6)]), VADD(T3H, T3I), ms, &(x[0]));
               ST(&(x[WS(rs, 4)]), VADD(T2D, T2E), ms, &(x[0]));
               {
                    V Tt, T4c, T2B, T24;
                    ST(&(x[WS(rs, 3)]), VADD(T26, T27), ms, &(x[WS(rs, 1)]));
                    ST(&(x[WS(rs, 1)]), VADD(T12, T13), ms, &(x[WS(rs, 1)]));
                    Tt = BYTW(&(W[TWVL * 10]), VSUB(Tp, Ts));
                    ST(&(x[WS(vs, 6)]), Tt, ms, &(x[WS(vs, 6)]));
                    T4c = BYTW(&(W[TWVL * 10]), VSUB(T48, T4b));
                    ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T2B = BYTW(&(W[TWVL * 10]), VSUB(T2x, T2A));
                    ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 6)]));
                    T24 = BYTW(&(W[TWVL * 10]), VSUB(T20, T23));
                    ST(&(x[WS(vs, 6) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
               }
               {
                    V T10, T1x, T3F, T38, T1y, Tu;
                    T10 = BYTW(&(W[TWVL * 10]), VSUB(TW, TZ));
                    ST(&(x[WS(vs, 6) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T1x = BYTW(&(W[TWVL * 10]), VSUB(T1t, T1w));
                    ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 6)]));
                    T3F = BYTW(&(W[TWVL * 10]), VSUB(T3B, T3E));
                    ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 6)]));
                    T38 = BYTW(&(W[TWVL * 10]), VSUB(T34, T37));
                    ST(&(x[WS(vs, 6) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
                    T1y = BYTW(&(W[TWVL * 2]), VADD(T1t, T1w));
                    ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1y, ms, &(x[WS(vs, 2)]));
                    Tu = BYTW(&(W[TWVL * 2]), VADD(Tp, Ts));
                    ST(&(x[WS(vs, 2)]), Tu, ms, &(x[WS(vs, 2)]));
               }
               {
                    V T2C, T3G, T11, T25, T39, T4d;
                    T2C = BYTW(&(W[TWVL * 2]), VADD(T2x, T2A));
                    ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2C, ms, &(x[WS(vs, 2)]));
                    T3G = BYTW(&(W[TWVL * 2]), VADD(T3B, T3E));
                    ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3G, ms, &(x[WS(vs, 2)]));
                    T11 = BYTW(&(W[TWVL * 2]), VADD(TW, TZ));
                    ST(&(x[WS(vs, 2) + WS(rs, 1)]), T11, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T25 = BYTW(&(W[TWVL * 2]), VADD(T20, T23));
                    ST(&(x[WS(vs, 2) + WS(rs, 3)]), T25, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T39 = BYTW(&(W[TWVL * 2]), VADD(T34, T37));
                    ST(&(x[WS(vs, 2) + WS(rs, 5)]), T39, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
                    T4d = BYTW(&(W[TWVL * 2]), VADD(T48, T4b));
                    ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4d, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
               }
               {
                    V Tx, T1B, T3c, T4g, T3J, T2F;
                    Tx = BYTW(&(W[TWVL * 6]), VSUB(Tv, Tw));
                    ST(&(x[WS(vs, 4)]), Tx, ms, &(x[WS(vs, 4)]));
                    T1B = BYTW(&(W[TWVL * 6]), VSUB(T1z, T1A));
                    ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 4)]));
                    T3c = BYTW(&(W[TWVL * 6]), VSUB(T3a, T3b));
                    ST(&(x[WS(vs, 4) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T4g = BYTW(&(W[TWVL * 6]), VSUB(T4e, T4f));
                    ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
                    T3J = BYTW(&(W[TWVL * 6]), VSUB(T3H, T3I));
                    ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 4)]));
                    T2F = BYTW(&(W[TWVL * 6]), VSUB(T2D, T2E));
                    ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 4)]));
               }
               T28 = BYTW(&(W[TWVL * 6]), VSUB(T26, T27));
               ST(&(x[WS(vs, 4) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
               T14 = BYTW(&(W[TWVL * 6]), VSUB(T12, T13));
               ST(&(x[WS(vs, 4) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
               {
                    V Th, Ti, Tb, Tg;
                    Tb = VBYI(VSUB(T7, Ta));
                    Tg = VSUB(Te, Tf);
                    Th = BYTW(&(W[TWVL * 4]), VADD(Tb, Tg));
                    Ti = BYTW(&(W[TWVL * 8]), VSUB(Tg, Tb));
                    ST(&(x[WS(vs, 3)]), Th, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5)]), Ti, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T40, T41, T3U, T3Z;
                    T3U = VBYI(VSUB(T3Q, T3T));
                    T3Z = VSUB(T3X, T3Y);
                    T40 = BYTW(&(W[TWVL * 4]), VADD(T3U, T3Z));
                    T41 = BYTW(&(W[TWVL * 8]), VSUB(T3Z, T3U));
                    ST(&(x[WS(vs, 3) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T2p, T2q, T2j, T2o;
                    T2j = VBYI(VSUB(T2f, T2i));
                    T2o = VSUB(T2m, T2n);
                    T2p = BYTW(&(W[TWVL * 4]), VADD(T2j, T2o));
                    T2q = BYTW(&(W[TWVL * 8]), VSUB(T2o, T2j));
                    ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T1S, T1T, T1M, T1R;
                    T1M = VBYI(VSUB(T1I, T1L));
                    T1R = VSUB(T1P, T1Q);
                    T1S = BYTW(&(W[TWVL * 4]), VADD(T1M, T1R));
                    T1T = BYTW(&(W[TWVL * 8]), VSUB(T1R, T1M));
                    ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V TO, TP, TI, TN;
                    TI = VBYI(VSUB(TE, TH));
                    TN = VSUB(TL, TM);
                    TO = BYTW(&(W[TWVL * 4]), VADD(TI, TN));
                    TP = BYTW(&(W[TWVL * 8]), VSUB(TN, TI));
                    ST(&(x[WS(vs, 3) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T1l, T1m, T1f, T1k;
                    T1f = VBYI(VSUB(T1b, T1e));
                    T1k = VSUB(T1i, T1j);
                    T1l = BYTW(&(W[TWVL * 4]), VADD(T1f, T1k));
                    T1m = BYTW(&(W[TWVL * 8]), VSUB(T1k, T1f));
                    ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T3t, T3u, T3n, T3s;
                    T3n = VBYI(VSUB(T3j, T3m));
                    T3s = VSUB(T3q, T3r);
                    T3t = BYTW(&(W[TWVL * 4]), VADD(T3n, T3s));
                    T3u = BYTW(&(W[TWVL * 8]), VSUB(T3s, T3n));
                    ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 3)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 5)]));
               }
               {
                    V T2W, T2X, T2Q, T2V;
                    T2Q = VBYI(VSUB(T2M, T2P));
                    T2V = VSUB(T2T, T2U);
                    T2W = BYTW(&(W[TWVL * 4]), VADD(T2Q, T2V));
                    T2X = BYTW(&(W[TWVL * 8]), VSUB(T2V, T2Q));
                    ST(&(x[WS(vs, 3) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 5) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
               }
               {
                    V T1p, T1q, T1n, T1o;
                    T1n = VBYI(VADD(T1e, T1b));
                    T1o = VADD(T1i, T1j);
                    T1p = BYTW(&(W[0]), VADD(T1n, T1o));
                    T1q = BYTW(&(W[TWVL * 12]), VSUB(T1o, T1n));
                    ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 7)]));
               }
               {
                    V Tl, Tm, Tj, Tk;
                    Tj = VBYI(VADD(Ta, T7));
                    Tk = VADD(Te, Tf);
                    Tl = BYTW(&(W[0]), VADD(Tj, Tk));
                    Tm = BYTW(&(W[TWVL * 12]), VSUB(Tk, Tj));
                    ST(&(x[WS(vs, 1)]), Tl, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7)]), Tm, ms, &(x[WS(vs, 7)]));
               }
               {
                    V T2t, T2u, T2r, T2s;
                    T2r = VBYI(VADD(T2i, T2f));
                    T2s = VADD(T2m, T2n);
                    T2t = BYTW(&(W[0]), VADD(T2r, T2s));
                    T2u = BYTW(&(W[TWVL * 12]), VSUB(T2s, T2r));
                    ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 7)]));
               }
               {
                    V T3x, T3y, T3v, T3w;
                    T3v = VBYI(VADD(T3m, T3j));
                    T3w = VADD(T3q, T3r);
                    T3x = BYTW(&(W[0]), VADD(T3v, T3w));
                    T3y = BYTW(&(W[TWVL * 12]), VSUB(T3w, T3v));
                    ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 7)]));
               }
               {
                    V TS, TT, TQ, TR;
                    TQ = VBYI(VADD(TH, TE));
                    TR = VADD(TL, TM);
                    TS = BYTW(&(W[0]), VADD(TQ, TR));
                    TT = BYTW(&(W[TWVL * 12]), VSUB(TR, TQ));
                    ST(&(x[WS(vs, 1) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T1W, T1X, T1U, T1V;
                    T1U = VBYI(VADD(T1L, T1I));
                    T1V = VADD(T1P, T1Q);
                    T1W = BYTW(&(W[0]), VADD(T1U, T1V));
                    T1X = BYTW(&(W[TWVL * 12]), VSUB(T1V, T1U));
                    ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T30, T31, T2Y, T2Z;
                    T2Y = VBYI(VADD(T2P, T2M));
                    T2Z = VADD(T2T, T2U);
                    T30 = BYTW(&(W[0]), VADD(T2Y, T2Z));
                    T31 = BYTW(&(W[TWVL * 12]), VSUB(T2Z, T2Y));
                    ST(&(x[WS(vs, 1) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
               {
                    V T44, T45, T42, T43;
                    T42 = VBYI(VADD(T3T, T3Q));
                    T43 = VADD(T3X, T3Y);
                    T44 = BYTW(&(W[0]), VADD(T42, T43));
                    T45 = BYTW(&(W[TWVL * 12]), VSUB(T43, T42));
                    ST(&(x[WS(vs, 1) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
                    ST(&(x[WS(vs, 7) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
               }
          }
     }
     VLEAVE();
}

static const tw_instr twinstr[] = {
     VTW(0, 1),
     VTW(0, 2),
     VTW(0, 3),
     VTW(0, 4),
     VTW(0, 5),
     VTW(0, 6),
     VTW(0, 7),
     {TW_NEXT, VL, 0}
};

static const ct_desc desc = { 8, XSIMD_STRING("q1bv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };

void XSIMD(codelet_q1bv_8) (planner *p) {
     X(kdft_difsq_register) (p, q1bv_8, &desc);
}
#endif