annotate src/fftw-3.3.8/dft/simd/common/q1fv_8.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:06:14 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twidsq_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include dft/simd/q1f.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 264 FP additions, 192 FP multiplications,
Chris@82 32 * (or, 184 additions, 112 multiplications, 80 fused multiply/add),
Chris@82 33 * 77 stack variables, 1 constant, and 128 memory accesses
Chris@82 34 */
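/*
 * Accounting note on the totals above: the "264 additions, 192
 * multiplications" line counts each fused multiply/add as both an
 * addition and a multiplication. Split apart, 184 + 80 = 264 and
 * 112 + 80 = 192, i.e. 80 add/mul pairs issue as single FMA
 * instructions in this variant.
 */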
Chris@82 35 #include "dft/simd/q1f.h"
Chris@82 36
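/*
 * Parameter glossary (a sketch following FFTW's codelet conventions,
 * not a normative statement): ri/ii point to the real and imaginary
 * arrays, W is the precomputed twiddle table, rs is the stride between
 * elements of one size-8 subtransform, vs is the stride between the
 * eight subtransforms of the square, and mb/me/ms give the begin
 * index, end index, and stride of the loop over transform batches.
 */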
Chris@82 37 static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 38 {
Chris@82 39 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
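/* KP707106781 is 1/sqrt(2) = cos(pi/4), the only nontrivial constant
   a radix-8 butterfly needs. */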
Chris@82 40 {
Chris@82 41 INT m;
Chris@82 42 R *x;
Chris@82 43 x = ri;
Chris@82 44 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
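/* Each iteration advances W by TWVL * 14: 7 nontrivial twiddle
   factors (omega^1 .. omega^7, matching the VTW(0, 1..7) table
   below) times 2 reals each, while m advances by the vector
   length VL. */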
Chris@82 45 V T3, Tu, Te, Tp, T1E, T25, T1P, T20, T2b, T2C, T2m, T2x, T3M, T4d, T3X;
Chris@82 46 V T48, TA, T11, TL, TW, T17, T1y, T1i, T1t, T2I, T39, T2T, T34, T3f, T3G;
Chris@82 47 V T3q, T3B, Ta, Tv, Tf, Ts, T1L, T26, T1Q, T23, T2i, T2D, T2n, T2A, T3T;
Chris@82 48 V T4e, T3Y, T4b, TH, T12, TM, TZ, T1e, T1z, T1j, T1w, T2P, T3a, T2U, T37;
Chris@82 49 V T3m, T3H, T3r, T3E, T28, T14;
Chris@82 50 {
Chris@82 51 V T1, T2, Tn, Tc, Td, To;
Chris@82 52 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 53 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 54 Tn = VADD(T1, T2);
Chris@82 55 Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 56 Td = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 57 To = VADD(Tc, Td);
Chris@82 58 T3 = VSUB(T1, T2);
Chris@82 59 Tu = VSUB(Tn, To);
Chris@82 60 Te = VSUB(Tc, Td);
Chris@82 61 Tp = VADD(Tn, To);
Chris@82 62 }
Chris@82 63 {
Chris@82 64 V T1C, T1D, T1Y, T1N, T1O, T1Z;
Chris@82 65 T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@82 66 T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@82 67 T1Y = VADD(T1C, T1D);
Chris@82 68 T1N = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@82 69 T1O = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
Chris@82 70 T1Z = VADD(T1N, T1O);
Chris@82 71 T1E = VSUB(T1C, T1D);
Chris@82 72 T25 = VSUB(T1Y, T1Z);
Chris@82 73 T1P = VSUB(T1N, T1O);
Chris@82 74 T20 = VADD(T1Y, T1Z);
Chris@82 75 }
Chris@82 76 {
Chris@82 77 V T29, T2a, T2v, T2k, T2l, T2w;
Chris@82 78 T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 79 T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 80 T2v = VADD(T29, T2a);
Chris@82 81 T2k = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@82 82 T2l = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
Chris@82 83 T2w = VADD(T2k, T2l);
Chris@82 84 T2b = VSUB(T29, T2a);
Chris@82 85 T2C = VSUB(T2v, T2w);
Chris@82 86 T2m = VSUB(T2k, T2l);
Chris@82 87 T2x = VADD(T2v, T2w);
Chris@82 88 }
Chris@82 89 {
Chris@82 90 V T3K, T3L, T46, T3V, T3W, T47;
Chris@82 91 T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
Chris@82 92 T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
Chris@82 93 T46 = VADD(T3K, T3L);
Chris@82 94 T3V = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
Chris@82 95 T3W = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
Chris@82 96 T47 = VADD(T3V, T3W);
Chris@82 97 T3M = VSUB(T3K, T3L);
Chris@82 98 T4d = VSUB(T46, T47);
Chris@82 99 T3X = VSUB(T3V, T3W);
Chris@82 100 T48 = VADD(T46, T47);
Chris@82 101 }
Chris@82 102 {
Chris@82 103 V Ty, Tz, TU, TJ, TK, TV;
Chris@82 104 Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@82 105 Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@82 106 TU = VADD(Ty, Tz);
Chris@82 107 TJ = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@82 108 TK = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
Chris@82 109 TV = VADD(TJ, TK);
Chris@82 110 TA = VSUB(Ty, Tz);
Chris@82 111 T11 = VSUB(TU, TV);
Chris@82 112 TL = VSUB(TJ, TK);
Chris@82 113 TW = VADD(TU, TV);
Chris@82 114 }
Chris@82 115 {
Chris@82 116 V T15, T16, T1r, T1g, T1h, T1s;
Chris@82 117 T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 118 T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@82 119 T1r = VADD(T15, T16);
Chris@82 120 T1g = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 121 T1h = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
Chris@82 122 T1s = VADD(T1g, T1h);
Chris@82 123 T17 = VSUB(T15, T16);
Chris@82 124 T1y = VSUB(T1r, T1s);
Chris@82 125 T1i = VSUB(T1g, T1h);
Chris@82 126 T1t = VADD(T1r, T1s);
Chris@82 127 }
Chris@82 128 {
Chris@82 129 V T2G, T2H, T32, T2R, T2S, T33;
Chris@82 130 T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
Chris@82 131 T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
Chris@82 132 T32 = VADD(T2G, T2H);
Chris@82 133 T2R = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
Chris@82 134 T2S = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
Chris@82 135 T33 = VADD(T2R, T2S);
Chris@82 136 T2I = VSUB(T2G, T2H);
Chris@82 137 T39 = VSUB(T32, T33);
Chris@82 138 T2T = VSUB(T2R, T2S);
Chris@82 139 T34 = VADD(T32, T33);
Chris@82 140 }
Chris@82 141 {
Chris@82 142 V T3d, T3e, T3z, T3o, T3p, T3A;
Chris@82 143 T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@82 144 T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
Chris@82 145 T3z = VADD(T3d, T3e);
Chris@82 146 T3o = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
Chris@82 147 T3p = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@82 148 T3A = VADD(T3o, T3p);
Chris@82 149 T3f = VSUB(T3d, T3e);
Chris@82 150 T3G = VSUB(T3z, T3A);
Chris@82 151 T3q = VSUB(T3o, T3p);
Chris@82 152 T3B = VADD(T3z, T3A);
Chris@82 153 }
Chris@82 154 {
Chris@82 155 V T6, Tq, T9, Tr;
Chris@82 156 {
Chris@82 157 V T4, T5, T7, T8;
Chris@82 158 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 159 T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 160 T6 = VSUB(T4, T5);
Chris@82 161 Tq = VADD(T4, T5);
Chris@82 162 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 163 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 164 T9 = VSUB(T7, T8);
Chris@82 165 Tr = VADD(T7, T8);
Chris@82 166 }
Chris@82 167 Ta = VADD(T6, T9);
Chris@82 168 Tv = VSUB(Tr, Tq);
Chris@82 169 Tf = VSUB(T9, T6);
Chris@82 170 Ts = VADD(Tq, Tr);
Chris@82 171 }
Chris@82 172 {
Chris@82 173 V T1H, T21, T1K, T22;
Chris@82 174 {
Chris@82 175 V T1F, T1G, T1I, T1J;
Chris@82 176 T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 177 T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 178 T1H = VSUB(T1F, T1G);
Chris@82 179 T21 = VADD(T1F, T1G);
Chris@82 180 T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 181 T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 182 T1K = VSUB(T1I, T1J);
Chris@82 183 T22 = VADD(T1I, T1J);
Chris@82 184 }
Chris@82 185 T1L = VADD(T1H, T1K);
Chris@82 186 T26 = VSUB(T22, T21);
Chris@82 187 T1Q = VSUB(T1K, T1H);
Chris@82 188 T23 = VADD(T21, T22);
Chris@82 189 }
Chris@82 190 {
Chris@82 191 V T2e, T2y, T2h, T2z;
Chris@82 192 {
Chris@82 193 V T2c, T2d, T2f, T2g;
Chris@82 194 T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 195 T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 196 T2e = VSUB(T2c, T2d);
Chris@82 197 T2y = VADD(T2c, T2d);
Chris@82 198 T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 199 T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 200 T2h = VSUB(T2f, T2g);
Chris@82 201 T2z = VADD(T2f, T2g);
Chris@82 202 }
Chris@82 203 T2i = VADD(T2e, T2h);
Chris@82 204 T2D = VSUB(T2z, T2y);
Chris@82 205 T2n = VSUB(T2h, T2e);
Chris@82 206 T2A = VADD(T2y, T2z);
Chris@82 207 }
Chris@82 208 {
Chris@82 209 V T3P, T49, T3S, T4a;
Chris@82 210 {
Chris@82 211 V T3N, T3O, T3Q, T3R;
Chris@82 212 T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 213 T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 214 T3P = VSUB(T3N, T3O);
Chris@82 215 T49 = VADD(T3N, T3O);
Chris@82 216 T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 217 T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 218 T3S = VSUB(T3Q, T3R);
Chris@82 219 T4a = VADD(T3Q, T3R);
Chris@82 220 }
Chris@82 221 T3T = VADD(T3P, T3S);
Chris@82 222 T4e = VSUB(T4a, T49);
Chris@82 223 T3Y = VSUB(T3S, T3P);
Chris@82 224 T4b = VADD(T49, T4a);
Chris@82 225 }
Chris@82 226 {
Chris@82 227 V TD, TX, TG, TY;
Chris@82 228 {
Chris@82 229 V TB, TC, TE, TF;
Chris@82 230 TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 231 TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 232 TD = VSUB(TB, TC);
Chris@82 233 TX = VADD(TB, TC);
Chris@82 234 TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 235 TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 236 TG = VSUB(TE, TF);
Chris@82 237 TY = VADD(TE, TF);
Chris@82 238 }
Chris@82 239 TH = VADD(TD, TG);
Chris@82 240 T12 = VSUB(TY, TX);
Chris@82 241 TM = VSUB(TG, TD);
Chris@82 242 TZ = VADD(TX, TY);
Chris@82 243 }
Chris@82 244 {
Chris@82 245 V T1a, T1u, T1d, T1v;
Chris@82 246 {
Chris@82 247 V T18, T19, T1b, T1c;
Chris@82 248 T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 249 T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 250 T1a = VSUB(T18, T19);
Chris@82 251 T1u = VADD(T18, T19);
Chris@82 252 T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 253 T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 254 T1d = VSUB(T1b, T1c);
Chris@82 255 T1v = VADD(T1b, T1c);
Chris@82 256 }
Chris@82 257 T1e = VADD(T1a, T1d);
Chris@82 258 T1z = VSUB(T1v, T1u);
Chris@82 259 T1j = VSUB(T1d, T1a);
Chris@82 260 T1w = VADD(T1u, T1v);
Chris@82 261 }
Chris@82 262 {
Chris@82 263 V T2L, T35, T2O, T36;
Chris@82 264 {
Chris@82 265 V T2J, T2K, T2M, T2N;
Chris@82 266 T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 267 T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 268 T2L = VSUB(T2J, T2K);
Chris@82 269 T35 = VADD(T2J, T2K);
Chris@82 270 T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 271 T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 272 T2O = VSUB(T2M, T2N);
Chris@82 273 T36 = VADD(T2M, T2N);
Chris@82 274 }
Chris@82 275 T2P = VADD(T2L, T2O);
Chris@82 276 T3a = VSUB(T36, T35);
Chris@82 277 T2U = VSUB(T2O, T2L);
Chris@82 278 T37 = VADD(T35, T36);
Chris@82 279 }
Chris@82 280 {
Chris@82 281 V T3i, T3C, T3l, T3D;
Chris@82 282 {
Chris@82 283 V T3g, T3h, T3j, T3k;
Chris@82 284 T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 285 T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 286 T3i = VSUB(T3g, T3h);
Chris@82 287 T3C = VADD(T3g, T3h);
Chris@82 288 T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 289 T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 290 T3l = VSUB(T3j, T3k);
Chris@82 291 T3D = VADD(T3j, T3k);
Chris@82 292 }
Chris@82 293 T3m = VADD(T3i, T3l);
Chris@82 294 T3H = VSUB(T3D, T3C);
Chris@82 295 T3r = VSUB(T3l, T3i);
Chris@82 296 T3E = VADD(T3C, T3D);
Chris@82 297 }
Chris@82 298 ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
Chris@82 299 ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
Chris@82 300 ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
Chris@82 301 ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
Chris@82 302 ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
Chris@82 303 ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
Chris@82 304 {
Chris@82 305 V Tt, T4c, T2B, T24;
Chris@82 306 ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
Chris@82 307 ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
Chris@82 308 Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
Chris@82 309 ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
Chris@82 310 T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
Chris@82 311 ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 312 T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
Chris@82 313 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
Chris@82 314 T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
Chris@82 315 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 316 }
Chris@82 317 {
Chris@82 318 V T10, T1x, T3F, T38, T1A, Tw;
Chris@82 319 T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
Chris@82 320 ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 321 T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
Chris@82 322 ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
Chris@82 323 T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
Chris@82 324 ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
Chris@82 325 T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
Chris@82 326 ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 327 T1A = BYTWJ(&(W[TWVL * 10]), VFNMSI(T1z, T1y));
Chris@82 328 ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
Chris@82 329 Tw = BYTWJ(&(W[TWVL * 10]), VFNMSI(Tv, Tu));
Chris@82 330 ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
Chris@82 331 }
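/* VFMAI(b, c) and VFNMSI(b, c) compute c + i*b and c - i*b (in
   FFTW's simd-support headers they are, up to ISA detail,
   VADD(c, VBYI(b)) and VSUB(c, VBYI(b))), so the stores above and
   below pick off the +i and -i combinations of each half-size DFT. */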
Chris@82 332 {
Chris@82 333 V T2E, T3I, T13, T27, T3b, T4f;
Chris@82 334 T2E = BYTWJ(&(W[TWVL * 10]), VFNMSI(T2D, T2C));
Chris@82 335 ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
Chris@82 336 T3I = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3H, T3G));
Chris@82 337 ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
Chris@82 338 T13 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T12, T11));
Chris@82 339 ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 340 T27 = BYTWJ(&(W[TWVL * 10]), VFNMSI(T26, T25));
Chris@82 341 ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 342 T3b = BYTWJ(&(W[TWVL * 10]), VFNMSI(T3a, T39));
Chris@82 343 ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 344 T4f = BYTWJ(&(W[TWVL * 10]), VFNMSI(T4e, T4d));
Chris@82 345 ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 346 }
Chris@82 347 {
Chris@82 348 V Tx, T1B, T3c, T4g, T3J, T2F;
Chris@82 349 Tx = BYTWJ(&(W[TWVL * 2]), VFMAI(Tv, Tu));
Chris@82 350 ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
Chris@82 351 T1B = BYTWJ(&(W[TWVL * 2]), VFMAI(T1z, T1y));
Chris@82 352 ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@82 353 T3c = BYTWJ(&(W[TWVL * 2]), VFMAI(T3a, T39));
Chris@82 354 ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 355 T4g = BYTWJ(&(W[TWVL * 2]), VFMAI(T4e, T4d));
Chris@82 356 ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 357 T3J = BYTWJ(&(W[TWVL * 2]), VFMAI(T3H, T3G));
Chris@82 358 ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
Chris@82 359 T2F = BYTWJ(&(W[TWVL * 2]), VFMAI(T2D, T2C));
Chris@82 360 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
Chris@82 361 }
Chris@82 362 T28 = BYTWJ(&(W[TWVL * 2]), VFMAI(T26, T25));
Chris@82 363 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 364 T14 = BYTWJ(&(W[TWVL * 2]), VFMAI(T12, T11));
Chris@82 365 ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 366 {
Chris@82 367 V Th, Ti, Tb, Tg;
Chris@82 368 Tb = VFMA(LDK(KP707106781), Ta, T3);
Chris@82 369 Tg = VFNMS(LDK(KP707106781), Tf, Te);
Chris@82 370 Th = BYTWJ(&(W[0]), VFNMSI(Tg, Tb));
Chris@82 371 Ti = BYTWJ(&(W[TWVL * 12]), VFMAI(Tg, Tb));
Chris@82 372 ST(&(x[WS(vs, 1)]), Th, ms, &(x[WS(vs, 1)]));
Chris@82 373 ST(&(x[WS(vs, 7)]), Ti, ms, &(x[WS(vs, 7)]));
Chris@82 374 }
Chris@82 375 {
Chris@82 376 V T40, T41, T3U, T3Z;
Chris@82 377 T3U = VFMA(LDK(KP707106781), T3T, T3M);
Chris@82 378 T3Z = VFNMS(LDK(KP707106781), T3Y, T3X);
Chris@82 379 T40 = BYTWJ(&(W[0]), VFNMSI(T3Z, T3U));
Chris@82 380 T41 = BYTWJ(&(W[TWVL * 12]), VFMAI(T3Z, T3U));
Chris@82 381 ST(&(x[WS(vs, 1) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 382 ST(&(x[WS(vs, 7) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 383 }
Chris@82 384 {
Chris@82 385 V T2p, T2q, T2j, T2o;
Chris@82 386 T2j = VFMA(LDK(KP707106781), T2i, T2b);
Chris@82 387 T2o = VFNMS(LDK(KP707106781), T2n, T2m);
Chris@82 388 T2p = BYTWJ(&(W[0]), VFNMSI(T2o, T2j));
Chris@82 389 T2q = BYTWJ(&(W[TWVL * 12]), VFMAI(T2o, T2j));
Chris@82 390 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 1)]));
Chris@82 391 ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 7)]));
Chris@82 392 }
Chris@82 393 {
Chris@82 394 V T1S, T1T, T1M, T1R;
Chris@82 395 T1M = VFMA(LDK(KP707106781), T1L, T1E);
Chris@82 396 T1R = VFNMS(LDK(KP707106781), T1Q, T1P);
Chris@82 397 T1S = BYTWJ(&(W[0]), VFNMSI(T1R, T1M));
Chris@82 398 T1T = BYTWJ(&(W[TWVL * 12]), VFMAI(T1R, T1M));
Chris@82 399 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 400 ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 401 }
Chris@82 402 {
Chris@82 403 V TO, TP, TI, TN;
Chris@82 404 TI = VFMA(LDK(KP707106781), TH, TA);
Chris@82 405 TN = VFNMS(LDK(KP707106781), TM, TL);
Chris@82 406 TO = BYTWJ(&(W[0]), VFNMSI(TN, TI));
Chris@82 407 TP = BYTWJ(&(W[TWVL * 12]), VFMAI(TN, TI));
Chris@82 408 ST(&(x[WS(vs, 1) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 409 ST(&(x[WS(vs, 7) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 410 }
Chris@82 411 {
Chris@82 412 V T1l, T1m, T1f, T1k;
Chris@82 413 T1f = VFMA(LDK(KP707106781), T1e, T17);
Chris@82 414 T1k = VFNMS(LDK(KP707106781), T1j, T1i);
Chris@82 415 T1l = BYTWJ(&(W[0]), VFNMSI(T1k, T1f));
Chris@82 416 T1m = BYTWJ(&(W[TWVL * 12]), VFMAI(T1k, T1f));
Chris@82 417 ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 1)]));
Chris@82 418 ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 7)]));
Chris@82 419 }
Chris@82 420 {
Chris@82 421 V T3t, T3u, T3n, T3s;
Chris@82 422 T3n = VFMA(LDK(KP707106781), T3m, T3f);
Chris@82 423 T3s = VFNMS(LDK(KP707106781), T3r, T3q);
Chris@82 424 T3t = BYTWJ(&(W[0]), VFNMSI(T3s, T3n));
Chris@82 425 T3u = BYTWJ(&(W[TWVL * 12]), VFMAI(T3s, T3n));
Chris@82 426 ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 1)]));
Chris@82 427 ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 7)]));
Chris@82 428 }
Chris@82 429 {
Chris@82 430 V T2W, T2X, T2Q, T2V;
Chris@82 431 T2Q = VFMA(LDK(KP707106781), T2P, T2I);
Chris@82 432 T2V = VFNMS(LDK(KP707106781), T2U, T2T);
Chris@82 433 T2W = BYTWJ(&(W[0]), VFNMSI(T2V, T2Q));
Chris@82 434 T2X = BYTWJ(&(W[TWVL * 12]), VFMAI(T2V, T2Q));
Chris@82 435 ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 436 ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 437 }
Chris@82 438 {
Chris@82 439 V T1p, T1q, T1n, T1o;
Chris@82 440 T1n = VFNMS(LDK(KP707106781), T1e, T17);
Chris@82 441 T1o = VFMA(LDK(KP707106781), T1j, T1i);
Chris@82 442 T1p = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1o, T1n));
Chris@82 443 T1q = BYTWJ(&(W[TWVL * 4]), VFMAI(T1o, T1n));
Chris@82 444 ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
Chris@82 445 ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
Chris@82 446 }
Chris@82 447 {
Chris@82 448 V Tl, Tm, Tj, Tk;
Chris@82 449 Tj = VFNMS(LDK(KP707106781), Ta, T3);
Chris@82 450 Tk = VFMA(LDK(KP707106781), Tf, Te);
Chris@82 451 Tl = BYTWJ(&(W[TWVL * 8]), VFNMSI(Tk, Tj));
Chris@82 452 Tm = BYTWJ(&(W[TWVL * 4]), VFMAI(Tk, Tj));
Chris@82 453 ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
Chris@82 454 ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
Chris@82 455 }
Chris@82 456 {
Chris@82 457 V T2t, T2u, T2r, T2s;
Chris@82 458 T2r = VFNMS(LDK(KP707106781), T2i, T2b);
Chris@82 459 T2s = VFMA(LDK(KP707106781), T2n, T2m);
Chris@82 460 T2t = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2s, T2r));
Chris@82 461 T2u = BYTWJ(&(W[TWVL * 4]), VFMAI(T2s, T2r));
Chris@82 462 ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
Chris@82 463 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
Chris@82 464 }
Chris@82 465 {
Chris@82 466 V T3x, T3y, T3v, T3w;
Chris@82 467 T3v = VFNMS(LDK(KP707106781), T3m, T3f);
Chris@82 468 T3w = VFMA(LDK(KP707106781), T3r, T3q);
Chris@82 469 T3x = BYTWJ(&(W[TWVL * 8]), VFNMSI(T3w, T3v));
Chris@82 470 T3y = BYTWJ(&(W[TWVL * 4]), VFMAI(T3w, T3v));
Chris@82 471 ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
Chris@82 472 ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
Chris@82 473 }
Chris@82 474 {
Chris@82 475 V TS, TT, TQ, TR;
Chris@82 476 TQ = VFNMS(LDK(KP707106781), TH, TA);
Chris@82 477 TR = VFMA(LDK(KP707106781), TM, TL);
Chris@82 478 TS = BYTWJ(&(W[TWVL * 8]), VFNMSI(TR, TQ));
Chris@82 479 TT = BYTWJ(&(W[TWVL * 4]), VFMAI(TR, TQ));
Chris@82 480 ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 481 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 482 }
Chris@82 483 {
Chris@82 484 V T1W, T1X, T1U, T1V;
Chris@82 485 T1U = VFNMS(LDK(KP707106781), T1L, T1E);
Chris@82 486 T1V = VFMA(LDK(KP707106781), T1Q, T1P);
Chris@82 487 T1W = BYTWJ(&(W[TWVL * 8]), VFNMSI(T1V, T1U));
Chris@82 488 T1X = BYTWJ(&(W[TWVL * 4]), VFMAI(T1V, T1U));
Chris@82 489 ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 490 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 491 }
Chris@82 492 {
Chris@82 493 V T30, T31, T2Y, T2Z;
Chris@82 494 T2Y = VFNMS(LDK(KP707106781), T2P, T2I);
Chris@82 495 T2Z = VFMA(LDK(KP707106781), T2U, T2T);
Chris@82 496 T30 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T2Z, T2Y));
Chris@82 497 T31 = BYTWJ(&(W[TWVL * 4]), VFMAI(T2Z, T2Y));
Chris@82 498 ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 499 ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 500 }
Chris@82 501 {
Chris@82 502 V T44, T45, T42, T43;
Chris@82 503 T42 = VFNMS(LDK(KP707106781), T3T, T3M);
Chris@82 504 T43 = VFMA(LDK(KP707106781), T3Y, T3X);
Chris@82 505 T44 = BYTWJ(&(W[TWVL * 8]), VFNMSI(T43, T42));
Chris@82 506 T45 = BYTWJ(&(W[TWVL * 4]), VFMAI(T43, T42));
Chris@82 507 ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 508 ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 509 }
Chris@82 510 }
Chris@82 511 }
Chris@82 512 VLEAVE();
Chris@82 513 }
Chris@82 514
Chris@82 515 static const tw_instr twinstr[] = {
Chris@82 516 VTW(0, 1),
Chris@82 517 VTW(0, 2),
Chris@82 518 VTW(0, 3),
Chris@82 519 VTW(0, 4),
Chris@82 520 VTW(0, 5),
Chris@82 521 VTW(0, 6),
Chris@82 522 VTW(0, 7),
Chris@82 523 {TW_NEXT, VL, 0}
Chris@82 524 };
Chris@82 525
Chris@82 526 static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {184, 112, 80, 0}, 0, 0, 0 };
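/* Descriptor fields, on a reading of FFTW's ct_desc convention:
   radix 8, codelet name, twiddle program, then operation counts
   {adds, muls, fmas, other} = {184, 112, 80, 0}, matching the header
   comment above. */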
Chris@82 527
Chris@82 528 void XSIMD(codelet_q1fv_8) (planner *p) {
Chris@82 529 X(kdft_difsq_register) (p, q1fv_8, &desc);
Chris@82 530 }
Chris@82 531 #else
Chris@82 532
Chris@82 533 /* Generated by: ../../../genfft/gen_twidsq_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 8 -dif -name q1fv_8 -include dft/simd/q1f.h */
Chris@82 534
Chris@82 535 /*
Chris@82 536 * This function contains 264 FP additions, 128 FP multiplications,
Chris@82 537 * (or, 264 additions, 128 multiplications, 0 fused multiply/add),
Chris@82 538 * 77 stack variables, 1 constant, and 128 memory accesses
Chris@82 539 */
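/*
 * Compared with the FMA variant above, this version needs only 128
 * multiplications: each product by KP707106781 is computed once with
 * VMUL and then shared between a sum and a difference (e.g.
 * Tb = VADD(T3, Ta) and Tj = VSUB(T3, Ta) below), whereas the FMA
 * form folds the constant into separate VFMA/VFNMS operations.
 */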
Chris@82 540 #include "dft/simd/q1f.h"
Chris@82 541
Chris@82 542 static void q1fv_8(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
Chris@82 543 {
Chris@82 544 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 545 {
Chris@82 546 INT m;
Chris@82 547 R *x;
Chris@82 548 x = ri;
Chris@82 549 for (m = mb, W = W + (mb * ((TWVL / VL) * 14)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, vs)) {
Chris@82 550 V T3, Tu, Tf, Tp, T1E, T25, T1Q, T20, T2b, T2C, T2n, T2x, T3M, T4d, T3Y;
Chris@82 551 V T48, TA, T11, TM, TW, T17, T1y, T1j, T1t, T2I, T39, T2U, T34, T3f, T3G;
Chris@82 552 V T3r, T3B, Ta, Tv, Tc, Ts, T1L, T26, T1N, T23, T2i, T2D, T2k, T2A, T3T;
Chris@82 553 V T4e, T3V, T4b, TH, T12, TJ, TZ, T1e, T1z, T1g, T1w, T2P, T3a, T2R, T37;
Chris@82 554 V T3m, T3H, T3o, T3E, T28, T14;
Chris@82 555 {
Chris@82 556 V T1, T2, Tn, Td, Te, To;
Chris@82 557 T1 = LD(&(x[0]), ms, &(x[0]));
Chris@82 558 T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
Chris@82 559 Tn = VADD(T1, T2);
Chris@82 560 Td = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
Chris@82 561 Te = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
Chris@82 562 To = VADD(Td, Te);
Chris@82 563 T3 = VSUB(T1, T2);
Chris@82 564 Tu = VSUB(Tn, To);
Chris@82 565 Tf = VSUB(Td, Te);
Chris@82 566 Tp = VADD(Tn, To);
Chris@82 567 }
Chris@82 568 {
Chris@82 569 V T1C, T1D, T1Y, T1O, T1P, T1Z;
Chris@82 570 T1C = LD(&(x[WS(vs, 3)]), ms, &(x[WS(vs, 3)]));
Chris@82 571 T1D = LD(&(x[WS(vs, 3) + WS(rs, 4)]), ms, &(x[WS(vs, 3)]));
Chris@82 572 T1Y = VADD(T1C, T1D);
Chris@82 573 T1O = LD(&(x[WS(vs, 3) + WS(rs, 2)]), ms, &(x[WS(vs, 3)]));
Chris@82 574 T1P = LD(&(x[WS(vs, 3) + WS(rs, 6)]), ms, &(x[WS(vs, 3)]));
Chris@82 575 T1Z = VADD(T1O, T1P);
Chris@82 576 T1E = VSUB(T1C, T1D);
Chris@82 577 T25 = VSUB(T1Y, T1Z);
Chris@82 578 T1Q = VSUB(T1O, T1P);
Chris@82 579 T20 = VADD(T1Y, T1Z);
Chris@82 580 }
Chris@82 581 {
Chris@82 582 V T29, T2a, T2v, T2l, T2m, T2w;
Chris@82 583 T29 = LD(&(x[WS(vs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 584 T2a = LD(&(x[WS(vs, 4) + WS(rs, 4)]), ms, &(x[WS(vs, 4)]));
Chris@82 585 T2v = VADD(T29, T2a);
Chris@82 586 T2l = LD(&(x[WS(vs, 4) + WS(rs, 2)]), ms, &(x[WS(vs, 4)]));
Chris@82 587 T2m = LD(&(x[WS(vs, 4) + WS(rs, 6)]), ms, &(x[WS(vs, 4)]));
Chris@82 588 T2w = VADD(T2l, T2m);
Chris@82 589 T2b = VSUB(T29, T2a);
Chris@82 590 T2C = VSUB(T2v, T2w);
Chris@82 591 T2n = VSUB(T2l, T2m);
Chris@82 592 T2x = VADD(T2v, T2w);
Chris@82 593 }
Chris@82 594 {
Chris@82 595 V T3K, T3L, T46, T3W, T3X, T47;
Chris@82 596 T3K = LD(&(x[WS(vs, 7)]), ms, &(x[WS(vs, 7)]));
Chris@82 597 T3L = LD(&(x[WS(vs, 7) + WS(rs, 4)]), ms, &(x[WS(vs, 7)]));
Chris@82 598 T46 = VADD(T3K, T3L);
Chris@82 599 T3W = LD(&(x[WS(vs, 7) + WS(rs, 2)]), ms, &(x[WS(vs, 7)]));
Chris@82 600 T3X = LD(&(x[WS(vs, 7) + WS(rs, 6)]), ms, &(x[WS(vs, 7)]));
Chris@82 601 T47 = VADD(T3W, T3X);
Chris@82 602 T3M = VSUB(T3K, T3L);
Chris@82 603 T4d = VSUB(T46, T47);
Chris@82 604 T3Y = VSUB(T3W, T3X);
Chris@82 605 T48 = VADD(T46, T47);
Chris@82 606 }
Chris@82 607 {
Chris@82 608 V Ty, Tz, TU, TK, TL, TV;
Chris@82 609 Ty = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
Chris@82 610 Tz = LD(&(x[WS(vs, 1) + WS(rs, 4)]), ms, &(x[WS(vs, 1)]));
Chris@82 611 TU = VADD(Ty, Tz);
Chris@82 612 TK = LD(&(x[WS(vs, 1) + WS(rs, 2)]), ms, &(x[WS(vs, 1)]));
Chris@82 613 TL = LD(&(x[WS(vs, 1) + WS(rs, 6)]), ms, &(x[WS(vs, 1)]));
Chris@82 614 TV = VADD(TK, TL);
Chris@82 615 TA = VSUB(Ty, Tz);
Chris@82 616 T11 = VSUB(TU, TV);
Chris@82 617 TM = VSUB(TK, TL);
Chris@82 618 TW = VADD(TU, TV);
Chris@82 619 }
Chris@82 620 {
Chris@82 621 V T15, T16, T1r, T1h, T1i, T1s;
Chris@82 622 T15 = LD(&(x[WS(vs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 623 T16 = LD(&(x[WS(vs, 2) + WS(rs, 4)]), ms, &(x[WS(vs, 2)]));
Chris@82 624 T1r = VADD(T15, T16);
Chris@82 625 T1h = LD(&(x[WS(vs, 2) + WS(rs, 2)]), ms, &(x[WS(vs, 2)]));
Chris@82 626 T1i = LD(&(x[WS(vs, 2) + WS(rs, 6)]), ms, &(x[WS(vs, 2)]));
Chris@82 627 T1s = VADD(T1h, T1i);
Chris@82 628 T17 = VSUB(T15, T16);
Chris@82 629 T1y = VSUB(T1r, T1s);
Chris@82 630 T1j = VSUB(T1h, T1i);
Chris@82 631 T1t = VADD(T1r, T1s);
Chris@82 632 }
Chris@82 633 {
Chris@82 634 V T2G, T2H, T32, T2S, T2T, T33;
Chris@82 635 T2G = LD(&(x[WS(vs, 5)]), ms, &(x[WS(vs, 5)]));
Chris@82 636 T2H = LD(&(x[WS(vs, 5) + WS(rs, 4)]), ms, &(x[WS(vs, 5)]));
Chris@82 637 T32 = VADD(T2G, T2H);
Chris@82 638 T2S = LD(&(x[WS(vs, 5) + WS(rs, 2)]), ms, &(x[WS(vs, 5)]));
Chris@82 639 T2T = LD(&(x[WS(vs, 5) + WS(rs, 6)]), ms, &(x[WS(vs, 5)]));
Chris@82 640 T33 = VADD(T2S, T2T);
Chris@82 641 T2I = VSUB(T2G, T2H);
Chris@82 642 T39 = VSUB(T32, T33);
Chris@82 643 T2U = VSUB(T2S, T2T);
Chris@82 644 T34 = VADD(T32, T33);
Chris@82 645 }
Chris@82 646 {
Chris@82 647 V T3d, T3e, T3z, T3p, T3q, T3A;
Chris@82 648 T3d = LD(&(x[WS(vs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@82 649 T3e = LD(&(x[WS(vs, 6) + WS(rs, 4)]), ms, &(x[WS(vs, 6)]));
Chris@82 650 T3z = VADD(T3d, T3e);
Chris@82 651 T3p = LD(&(x[WS(vs, 6) + WS(rs, 2)]), ms, &(x[WS(vs, 6)]));
Chris@82 652 T3q = LD(&(x[WS(vs, 6) + WS(rs, 6)]), ms, &(x[WS(vs, 6)]));
Chris@82 653 T3A = VADD(T3p, T3q);
Chris@82 654 T3f = VSUB(T3d, T3e);
Chris@82 655 T3G = VSUB(T3z, T3A);
Chris@82 656 T3r = VSUB(T3p, T3q);
Chris@82 657 T3B = VADD(T3z, T3A);
Chris@82 658 }
Chris@82 659 {
Chris@82 660 V T6, Tq, T9, Tr;
Chris@82 661 {
Chris@82 662 V T4, T5, T7, T8;
Chris@82 663 T4 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
Chris@82 664 T5 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
Chris@82 665 T6 = VSUB(T4, T5);
Chris@82 666 Tq = VADD(T4, T5);
Chris@82 667 T7 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
Chris@82 668 T8 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
Chris@82 669 T9 = VSUB(T7, T8);
Chris@82 670 Tr = VADD(T7, T8);
Chris@82 671 }
Chris@82 672 Ta = VMUL(LDK(KP707106781), VADD(T6, T9));
Chris@82 673 Tv = VBYI(VSUB(Tr, Tq));
Chris@82 674 Tc = VMUL(LDK(KP707106781), VSUB(T9, T6));
Chris@82 675 Ts = VADD(Tq, Tr);
Chris@82 676 }
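/* VBYI(x) multiplies by i, so Tv = i*(Tr - Tq) here; downstream,
   VADD(Tu, Tv) and VSUB(Tu, Tv) realize the u + i*v and u - i*v
   outputs without an explicit complex multiply. */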
Chris@82 677 {
Chris@82 678 V T1H, T21, T1K, T22;
Chris@82 679 {
Chris@82 680 V T1F, T1G, T1I, T1J;
Chris@82 681 T1F = LD(&(x[WS(vs, 3) + WS(rs, 1)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 682 T1G = LD(&(x[WS(vs, 3) + WS(rs, 5)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 683 T1H = VSUB(T1F, T1G);
Chris@82 684 T21 = VADD(T1F, T1G);
Chris@82 685 T1I = LD(&(x[WS(vs, 3) + WS(rs, 7)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 686 T1J = LD(&(x[WS(vs, 3) + WS(rs, 3)]), ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 687 T1K = VSUB(T1I, T1J);
Chris@82 688 T22 = VADD(T1I, T1J);
Chris@82 689 }
Chris@82 690 T1L = VMUL(LDK(KP707106781), VADD(T1H, T1K));
Chris@82 691 T26 = VBYI(VSUB(T22, T21));
Chris@82 692 T1N = VMUL(LDK(KP707106781), VSUB(T1K, T1H));
Chris@82 693 T23 = VADD(T21, T22);
Chris@82 694 }
Chris@82 695 {
Chris@82 696 V T2e, T2y, T2h, T2z;
Chris@82 697 {
Chris@82 698 V T2c, T2d, T2f, T2g;
Chris@82 699 T2c = LD(&(x[WS(vs, 4) + WS(rs, 1)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 700 T2d = LD(&(x[WS(vs, 4) + WS(rs, 5)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 701 T2e = VSUB(T2c, T2d);
Chris@82 702 T2y = VADD(T2c, T2d);
Chris@82 703 T2f = LD(&(x[WS(vs, 4) + WS(rs, 7)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 704 T2g = LD(&(x[WS(vs, 4) + WS(rs, 3)]), ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 705 T2h = VSUB(T2f, T2g);
Chris@82 706 T2z = VADD(T2f, T2g);
Chris@82 707 }
Chris@82 708 T2i = VMUL(LDK(KP707106781), VADD(T2e, T2h));
Chris@82 709 T2D = VBYI(VSUB(T2z, T2y));
Chris@82 710 T2k = VMUL(LDK(KP707106781), VSUB(T2h, T2e));
Chris@82 711 T2A = VADD(T2y, T2z);
Chris@82 712 }
Chris@82 713 {
Chris@82 714 V T3P, T49, T3S, T4a;
Chris@82 715 {
Chris@82 716 V T3N, T3O, T3Q, T3R;
Chris@82 717 T3N = LD(&(x[WS(vs, 7) + WS(rs, 1)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 718 T3O = LD(&(x[WS(vs, 7) + WS(rs, 5)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 719 T3P = VSUB(T3N, T3O);
Chris@82 720 T49 = VADD(T3N, T3O);
Chris@82 721 T3Q = LD(&(x[WS(vs, 7) + WS(rs, 7)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 722 T3R = LD(&(x[WS(vs, 7) + WS(rs, 3)]), ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 723 T3S = VSUB(T3Q, T3R);
Chris@82 724 T4a = VADD(T3Q, T3R);
Chris@82 725 }
Chris@82 726 T3T = VMUL(LDK(KP707106781), VADD(T3P, T3S));
Chris@82 727 T4e = VBYI(VSUB(T4a, T49));
Chris@82 728 T3V = VMUL(LDK(KP707106781), VSUB(T3S, T3P));
Chris@82 729 T4b = VADD(T49, T4a);
Chris@82 730 }
Chris@82 731 {
Chris@82 732 V TD, TX, TG, TY;
Chris@82 733 {
Chris@82 734 V TB, TC, TE, TF;
Chris@82 735 TB = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 736 TC = LD(&(x[WS(vs, 1) + WS(rs, 5)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 737 TD = VSUB(TB, TC);
Chris@82 738 TX = VADD(TB, TC);
Chris@82 739 TE = LD(&(x[WS(vs, 1) + WS(rs, 7)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 740 TF = LD(&(x[WS(vs, 1) + WS(rs, 3)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 741 TG = VSUB(TE, TF);
Chris@82 742 TY = VADD(TE, TF);
Chris@82 743 }
Chris@82 744 TH = VMUL(LDK(KP707106781), VADD(TD, TG));
Chris@82 745 T12 = VBYI(VSUB(TY, TX));
Chris@82 746 TJ = VMUL(LDK(KP707106781), VSUB(TG, TD));
Chris@82 747 TZ = VADD(TX, TY);
Chris@82 748 }
Chris@82 749 {
Chris@82 750 V T1a, T1u, T1d, T1v;
Chris@82 751 {
Chris@82 752 V T18, T19, T1b, T1c;
Chris@82 753 T18 = LD(&(x[WS(vs, 2) + WS(rs, 1)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 754 T19 = LD(&(x[WS(vs, 2) + WS(rs, 5)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 755 T1a = VSUB(T18, T19);
Chris@82 756 T1u = VADD(T18, T19);
Chris@82 757 T1b = LD(&(x[WS(vs, 2) + WS(rs, 7)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 758 T1c = LD(&(x[WS(vs, 2) + WS(rs, 3)]), ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 759 T1d = VSUB(T1b, T1c);
Chris@82 760 T1v = VADD(T1b, T1c);
Chris@82 761 }
Chris@82 762 T1e = VMUL(LDK(KP707106781), VADD(T1a, T1d));
Chris@82 763 T1z = VBYI(VSUB(T1v, T1u));
Chris@82 764 T1g = VMUL(LDK(KP707106781), VSUB(T1d, T1a));
Chris@82 765 T1w = VADD(T1u, T1v);
Chris@82 766 }
Chris@82 767 {
Chris@82 768 V T2L, T35, T2O, T36;
Chris@82 769 {
Chris@82 770 V T2J, T2K, T2M, T2N;
Chris@82 771 T2J = LD(&(x[WS(vs, 5) + WS(rs, 1)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 772 T2K = LD(&(x[WS(vs, 5) + WS(rs, 5)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 773 T2L = VSUB(T2J, T2K);
Chris@82 774 T35 = VADD(T2J, T2K);
Chris@82 775 T2M = LD(&(x[WS(vs, 5) + WS(rs, 7)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 776 T2N = LD(&(x[WS(vs, 5) + WS(rs, 3)]), ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 777 T2O = VSUB(T2M, T2N);
Chris@82 778 T36 = VADD(T2M, T2N);
Chris@82 779 }
Chris@82 780 T2P = VMUL(LDK(KP707106781), VADD(T2L, T2O));
Chris@82 781 T3a = VBYI(VSUB(T36, T35));
Chris@82 782 T2R = VMUL(LDK(KP707106781), VSUB(T2O, T2L));
Chris@82 783 T37 = VADD(T35, T36);
Chris@82 784 }
Chris@82 785 {
Chris@82 786 V T3i, T3C, T3l, T3D;
Chris@82 787 {
Chris@82 788 V T3g, T3h, T3j, T3k;
Chris@82 789 T3g = LD(&(x[WS(vs, 6) + WS(rs, 1)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 790 T3h = LD(&(x[WS(vs, 6) + WS(rs, 5)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 791 T3i = VSUB(T3g, T3h);
Chris@82 792 T3C = VADD(T3g, T3h);
Chris@82 793 T3j = LD(&(x[WS(vs, 6) + WS(rs, 7)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 794 T3k = LD(&(x[WS(vs, 6) + WS(rs, 3)]), ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 795 T3l = VSUB(T3j, T3k);
Chris@82 796 T3D = VADD(T3j, T3k);
Chris@82 797 }
Chris@82 798 T3m = VMUL(LDK(KP707106781), VADD(T3i, T3l));
Chris@82 799 T3H = VBYI(VSUB(T3D, T3C));
Chris@82 800 T3o = VMUL(LDK(KP707106781), VSUB(T3l, T3i));
Chris@82 801 T3E = VADD(T3C, T3D);
Chris@82 802 }
Chris@82 803 ST(&(x[0]), VADD(Tp, Ts), ms, &(x[0]));
Chris@82 804 ST(&(x[WS(rs, 2)]), VADD(T1t, T1w), ms, &(x[0]));
Chris@82 805 ST(&(x[WS(rs, 5)]), VADD(T34, T37), ms, &(x[WS(rs, 1)]));
Chris@82 806 ST(&(x[WS(rs, 7)]), VADD(T48, T4b), ms, &(x[WS(rs, 1)]));
Chris@82 807 ST(&(x[WS(rs, 6)]), VADD(T3B, T3E), ms, &(x[0]));
Chris@82 808 ST(&(x[WS(rs, 4)]), VADD(T2x, T2A), ms, &(x[0]));
Chris@82 809 {
Chris@82 810 V Tt, T4c, T2B, T24;
Chris@82 811 ST(&(x[WS(rs, 3)]), VADD(T20, T23), ms, &(x[WS(rs, 1)]));
Chris@82 812 ST(&(x[WS(rs, 1)]), VADD(TW, TZ), ms, &(x[WS(rs, 1)]));
Chris@82 813 Tt = BYTWJ(&(W[TWVL * 6]), VSUB(Tp, Ts));
Chris@82 814 ST(&(x[WS(vs, 4)]), Tt, ms, &(x[WS(vs, 4)]));
Chris@82 815 T4c = BYTWJ(&(W[TWVL * 6]), VSUB(T48, T4b));
Chris@82 816 ST(&(x[WS(vs, 4) + WS(rs, 7)]), T4c, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 817 T2B = BYTWJ(&(W[TWVL * 6]), VSUB(T2x, T2A));
Chris@82 818 ST(&(x[WS(vs, 4) + WS(rs, 4)]), T2B, ms, &(x[WS(vs, 4)]));
Chris@82 819 T24 = BYTWJ(&(W[TWVL * 6]), VSUB(T20, T23));
Chris@82 820 ST(&(x[WS(vs, 4) + WS(rs, 3)]), T24, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 821 }
Chris@82 822 {
Chris@82 823 V T10, T1x, T3F, T38, T1A, Tw;
Chris@82 824 T10 = BYTWJ(&(W[TWVL * 6]), VSUB(TW, TZ));
Chris@82 825 ST(&(x[WS(vs, 4) + WS(rs, 1)]), T10, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 826 T1x = BYTWJ(&(W[TWVL * 6]), VSUB(T1t, T1w));
Chris@82 827 ST(&(x[WS(vs, 4) + WS(rs, 2)]), T1x, ms, &(x[WS(vs, 4)]));
Chris@82 828 T3F = BYTWJ(&(W[TWVL * 6]), VSUB(T3B, T3E));
Chris@82 829 ST(&(x[WS(vs, 4) + WS(rs, 6)]), T3F, ms, &(x[WS(vs, 4)]));
Chris@82 830 T38 = BYTWJ(&(W[TWVL * 6]), VSUB(T34, T37));
Chris@82 831 ST(&(x[WS(vs, 4) + WS(rs, 5)]), T38, ms, &(x[WS(vs, 4) + WS(rs, 1)]));
Chris@82 832 T1A = BYTWJ(&(W[TWVL * 10]), VSUB(T1y, T1z));
Chris@82 833 ST(&(x[WS(vs, 6) + WS(rs, 2)]), T1A, ms, &(x[WS(vs, 6)]));
Chris@82 834 Tw = BYTWJ(&(W[TWVL * 10]), VSUB(Tu, Tv));
Chris@82 835 ST(&(x[WS(vs, 6)]), Tw, ms, &(x[WS(vs, 6)]));
Chris@82 836 }
Chris@82 837 {
Chris@82 838 V T2E, T3I, T13, T27, T3b, T4f;
Chris@82 839 T2E = BYTWJ(&(W[TWVL * 10]), VSUB(T2C, T2D));
Chris@82 840 ST(&(x[WS(vs, 6) + WS(rs, 4)]), T2E, ms, &(x[WS(vs, 6)]));
Chris@82 841 T3I = BYTWJ(&(W[TWVL * 10]), VSUB(T3G, T3H));
Chris@82 842 ST(&(x[WS(vs, 6) + WS(rs, 6)]), T3I, ms, &(x[WS(vs, 6)]));
Chris@82 843 T13 = BYTWJ(&(W[TWVL * 10]), VSUB(T11, T12));
Chris@82 844 ST(&(x[WS(vs, 6) + WS(rs, 1)]), T13, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 845 T27 = BYTWJ(&(W[TWVL * 10]), VSUB(T25, T26));
Chris@82 846 ST(&(x[WS(vs, 6) + WS(rs, 3)]), T27, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 847 T3b = BYTWJ(&(W[TWVL * 10]), VSUB(T39, T3a));
Chris@82 848 ST(&(x[WS(vs, 6) + WS(rs, 5)]), T3b, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 849 T4f = BYTWJ(&(W[TWVL * 10]), VSUB(T4d, T4e));
Chris@82 850 ST(&(x[WS(vs, 6) + WS(rs, 7)]), T4f, ms, &(x[WS(vs, 6) + WS(rs, 1)]));
Chris@82 851 }
Chris@82 852 {
Chris@82 853 V Tx, T1B, T3c, T4g, T3J, T2F;
Chris@82 854 Tx = BYTWJ(&(W[TWVL * 2]), VADD(Tu, Tv));
Chris@82 855 ST(&(x[WS(vs, 2)]), Tx, ms, &(x[WS(vs, 2)]));
Chris@82 856 T1B = BYTWJ(&(W[TWVL * 2]), VADD(T1y, T1z));
Chris@82 857 ST(&(x[WS(vs, 2) + WS(rs, 2)]), T1B, ms, &(x[WS(vs, 2)]));
Chris@82 858 T3c = BYTWJ(&(W[TWVL * 2]), VADD(T39, T3a));
Chris@82 859 ST(&(x[WS(vs, 2) + WS(rs, 5)]), T3c, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 860 T4g = BYTWJ(&(W[TWVL * 2]), VADD(T4d, T4e));
Chris@82 861 ST(&(x[WS(vs, 2) + WS(rs, 7)]), T4g, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 862 T3J = BYTWJ(&(W[TWVL * 2]), VADD(T3G, T3H));
Chris@82 863 ST(&(x[WS(vs, 2) + WS(rs, 6)]), T3J, ms, &(x[WS(vs, 2)]));
Chris@82 864 T2F = BYTWJ(&(W[TWVL * 2]), VADD(T2C, T2D));
Chris@82 865 ST(&(x[WS(vs, 2) + WS(rs, 4)]), T2F, ms, &(x[WS(vs, 2)]));
Chris@82 866 }
Chris@82 867 T28 = BYTWJ(&(W[TWVL * 2]), VADD(T25, T26));
Chris@82 868 ST(&(x[WS(vs, 2) + WS(rs, 3)]), T28, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 869 T14 = BYTWJ(&(W[TWVL * 2]), VADD(T11, T12));
Chris@82 870 ST(&(x[WS(vs, 2) + WS(rs, 1)]), T14, ms, &(x[WS(vs, 2) + WS(rs, 1)]));
Chris@82 871 {
Chris@82 872 V Th, Ti, Tb, Tg;
Chris@82 873 Tb = VADD(T3, Ta);
Chris@82 874 Tg = VBYI(VSUB(Tc, Tf));
Chris@82 875 Th = BYTWJ(&(W[TWVL * 12]), VSUB(Tb, Tg));
Chris@82 876 Ti = BYTWJ(&(W[0]), VADD(Tb, Tg));
Chris@82 877 ST(&(x[WS(vs, 7)]), Th, ms, &(x[WS(vs, 7)]));
Chris@82 878 ST(&(x[WS(vs, 1)]), Ti, ms, &(x[WS(vs, 1)]));
Chris@82 879 }
Chris@82 880 {
Chris@82 881 V T40, T41, T3U, T3Z;
Chris@82 882 T3U = VADD(T3M, T3T);
Chris@82 883 T3Z = VBYI(VSUB(T3V, T3Y));
Chris@82 884 T40 = BYTWJ(&(W[TWVL * 12]), VSUB(T3U, T3Z));
Chris@82 885 T41 = BYTWJ(&(W[0]), VADD(T3U, T3Z));
Chris@82 886 ST(&(x[WS(vs, 7) + WS(rs, 7)]), T40, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 887 ST(&(x[WS(vs, 1) + WS(rs, 7)]), T41, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 888 }
Chris@82 889 {
Chris@82 890 V T2p, T2q, T2j, T2o;
Chris@82 891 T2j = VADD(T2b, T2i);
Chris@82 892 T2o = VBYI(VSUB(T2k, T2n));
Chris@82 893 T2p = BYTWJ(&(W[TWVL * 12]), VSUB(T2j, T2o));
Chris@82 894 T2q = BYTWJ(&(W[0]), VADD(T2j, T2o));
Chris@82 895 ST(&(x[WS(vs, 7) + WS(rs, 4)]), T2p, ms, &(x[WS(vs, 7)]));
Chris@82 896 ST(&(x[WS(vs, 1) + WS(rs, 4)]), T2q, ms, &(x[WS(vs, 1)]));
Chris@82 897 }
Chris@82 898 {
Chris@82 899 V T1S, T1T, T1M, T1R;
Chris@82 900 T1M = VADD(T1E, T1L);
Chris@82 901 T1R = VBYI(VSUB(T1N, T1Q));
Chris@82 902 T1S = BYTWJ(&(W[TWVL * 12]), VSUB(T1M, T1R));
Chris@82 903 T1T = BYTWJ(&(W[0]), VADD(T1M, T1R));
Chris@82 904 ST(&(x[WS(vs, 7) + WS(rs, 3)]), T1S, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 905 ST(&(x[WS(vs, 1) + WS(rs, 3)]), T1T, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 906 }
Chris@82 907 {
Chris@82 908 V TO, TP, TI, TN;
Chris@82 909 TI = VADD(TA, TH);
Chris@82 910 TN = VBYI(VSUB(TJ, TM));
Chris@82 911 TO = BYTWJ(&(W[TWVL * 12]), VSUB(TI, TN));
Chris@82 912 TP = BYTWJ(&(W[0]), VADD(TI, TN));
Chris@82 913 ST(&(x[WS(vs, 7) + WS(rs, 1)]), TO, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 914 ST(&(x[WS(vs, 1) + WS(rs, 1)]), TP, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 915 }
Chris@82 916 {
Chris@82 917 V T1l, T1m, T1f, T1k;
Chris@82 918 T1f = VADD(T17, T1e);
Chris@82 919 T1k = VBYI(VSUB(T1g, T1j));
Chris@82 920 T1l = BYTWJ(&(W[TWVL * 12]), VSUB(T1f, T1k));
Chris@82 921 T1m = BYTWJ(&(W[0]), VADD(T1f, T1k));
Chris@82 922 ST(&(x[WS(vs, 7) + WS(rs, 2)]), T1l, ms, &(x[WS(vs, 7)]));
Chris@82 923 ST(&(x[WS(vs, 1) + WS(rs, 2)]), T1m, ms, &(x[WS(vs, 1)]));
Chris@82 924 }
Chris@82 925 {
Chris@82 926 V T3t, T3u, T3n, T3s;
Chris@82 927 T3n = VADD(T3f, T3m);
Chris@82 928 T3s = VBYI(VSUB(T3o, T3r));
Chris@82 929 T3t = BYTWJ(&(W[TWVL * 12]), VSUB(T3n, T3s));
Chris@82 930 T3u = BYTWJ(&(W[0]), VADD(T3n, T3s));
Chris@82 931 ST(&(x[WS(vs, 7) + WS(rs, 6)]), T3t, ms, &(x[WS(vs, 7)]));
Chris@82 932 ST(&(x[WS(vs, 1) + WS(rs, 6)]), T3u, ms, &(x[WS(vs, 1)]));
Chris@82 933 }
Chris@82 934 {
Chris@82 935 V T2W, T2X, T2Q, T2V;
Chris@82 936 T2Q = VADD(T2I, T2P);
Chris@82 937 T2V = VBYI(VSUB(T2R, T2U));
Chris@82 938 T2W = BYTWJ(&(W[TWVL * 12]), VSUB(T2Q, T2V));
Chris@82 939 T2X = BYTWJ(&(W[0]), VADD(T2Q, T2V));
Chris@82 940 ST(&(x[WS(vs, 7) + WS(rs, 5)]), T2W, ms, &(x[WS(vs, 7) + WS(rs, 1)]));
Chris@82 941 ST(&(x[WS(vs, 1) + WS(rs, 5)]), T2X, ms, &(x[WS(vs, 1) + WS(rs, 1)]));
Chris@82 942 }
Chris@82 943 {
Chris@82 944 V T1p, T1q, T1n, T1o;
Chris@82 945 T1n = VSUB(T17, T1e);
Chris@82 946 T1o = VBYI(VADD(T1j, T1g));
Chris@82 947 T1p = BYTWJ(&(W[TWVL * 8]), VSUB(T1n, T1o));
Chris@82 948 T1q = BYTWJ(&(W[TWVL * 4]), VADD(T1n, T1o));
Chris@82 949 ST(&(x[WS(vs, 5) + WS(rs, 2)]), T1p, ms, &(x[WS(vs, 5)]));
Chris@82 950 ST(&(x[WS(vs, 3) + WS(rs, 2)]), T1q, ms, &(x[WS(vs, 3)]));
Chris@82 951 }
Chris@82 952 {
Chris@82 953 V Tl, Tm, Tj, Tk;
Chris@82 954 Tj = VSUB(T3, Ta);
Chris@82 955 Tk = VBYI(VADD(Tf, Tc));
Chris@82 956 Tl = BYTWJ(&(W[TWVL * 8]), VSUB(Tj, Tk));
Chris@82 957 Tm = BYTWJ(&(W[TWVL * 4]), VADD(Tj, Tk));
Chris@82 958 ST(&(x[WS(vs, 5)]), Tl, ms, &(x[WS(vs, 5)]));
Chris@82 959 ST(&(x[WS(vs, 3)]), Tm, ms, &(x[WS(vs, 3)]));
Chris@82 960 }
Chris@82 961 {
Chris@82 962 V T2t, T2u, T2r, T2s;
Chris@82 963 T2r = VSUB(T2b, T2i);
Chris@82 964 T2s = VBYI(VADD(T2n, T2k));
Chris@82 965 T2t = BYTWJ(&(W[TWVL * 8]), VSUB(T2r, T2s));
Chris@82 966 T2u = BYTWJ(&(W[TWVL * 4]), VADD(T2r, T2s));
Chris@82 967 ST(&(x[WS(vs, 5) + WS(rs, 4)]), T2t, ms, &(x[WS(vs, 5)]));
Chris@82 968 ST(&(x[WS(vs, 3) + WS(rs, 4)]), T2u, ms, &(x[WS(vs, 3)]));
Chris@82 969 }
Chris@82 970 {
Chris@82 971 V T3x, T3y, T3v, T3w;
Chris@82 972 T3v = VSUB(T3f, T3m);
Chris@82 973 T3w = VBYI(VADD(T3r, T3o));
Chris@82 974 T3x = BYTWJ(&(W[TWVL * 8]), VSUB(T3v, T3w));
Chris@82 975 T3y = BYTWJ(&(W[TWVL * 4]), VADD(T3v, T3w));
Chris@82 976 ST(&(x[WS(vs, 5) + WS(rs, 6)]), T3x, ms, &(x[WS(vs, 5)]));
Chris@82 977 ST(&(x[WS(vs, 3) + WS(rs, 6)]), T3y, ms, &(x[WS(vs, 3)]));
Chris@82 978 }
Chris@82 979 {
Chris@82 980 V TS, TT, TQ, TR;
Chris@82 981 TQ = VSUB(TA, TH);
Chris@82 982 TR = VBYI(VADD(TM, TJ));
Chris@82 983 TS = BYTWJ(&(W[TWVL * 8]), VSUB(TQ, TR));
Chris@82 984 TT = BYTWJ(&(W[TWVL * 4]), VADD(TQ, TR));
Chris@82 985 ST(&(x[WS(vs, 5) + WS(rs, 1)]), TS, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 986 ST(&(x[WS(vs, 3) + WS(rs, 1)]), TT, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 987 }
Chris@82 988 {
Chris@82 989 V T1W, T1X, T1U, T1V;
Chris@82 990 T1U = VSUB(T1E, T1L);
Chris@82 991 T1V = VBYI(VADD(T1Q, T1N));
Chris@82 992 T1W = BYTWJ(&(W[TWVL * 8]), VSUB(T1U, T1V));
Chris@82 993 T1X = BYTWJ(&(W[TWVL * 4]), VADD(T1U, T1V));
Chris@82 994 ST(&(x[WS(vs, 5) + WS(rs, 3)]), T1W, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 995 ST(&(x[WS(vs, 3) + WS(rs, 3)]), T1X, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 996 }
Chris@82 997 {
Chris@82 998 V T30, T31, T2Y, T2Z;
Chris@82 999 T2Y = VSUB(T2I, T2P);
Chris@82 1000 T2Z = VBYI(VADD(T2U, T2R));
Chris@82 1001 T30 = BYTWJ(&(W[TWVL * 8]), VSUB(T2Y, T2Z));
Chris@82 1002 T31 = BYTWJ(&(W[TWVL * 4]), VADD(T2Y, T2Z));
Chris@82 1003 ST(&(x[WS(vs, 5) + WS(rs, 5)]), T30, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 1004 ST(&(x[WS(vs, 3) + WS(rs, 5)]), T31, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 1005 }
Chris@82 1006 {
Chris@82 1007 V T44, T45, T42, T43;
Chris@82 1008 T42 = VSUB(T3M, T3T);
Chris@82 1009 T43 = VBYI(VADD(T3Y, T3V));
Chris@82 1010 T44 = BYTWJ(&(W[TWVL * 8]), VSUB(T42, T43));
Chris@82 1011 T45 = BYTWJ(&(W[TWVL * 4]), VADD(T42, T43));
Chris@82 1012 ST(&(x[WS(vs, 5) + WS(rs, 7)]), T44, ms, &(x[WS(vs, 5) + WS(rs, 1)]));
Chris@82 1013 ST(&(x[WS(vs, 3) + WS(rs, 7)]), T45, ms, &(x[WS(vs, 3) + WS(rs, 1)]));
Chris@82 1014 }
Chris@82 1015 }
Chris@82 1016 }
Chris@82 1017 VLEAVE();
Chris@82 1018 }
Chris@82 1019
Chris@82 1020 static const tw_instr twinstr[] = {
Chris@82 1021 VTW(0, 1),
Chris@82 1022 VTW(0, 2),
Chris@82 1023 VTW(0, 3),
Chris@82 1024 VTW(0, 4),
Chris@82 1025 VTW(0, 5),
Chris@82 1026 VTW(0, 6),
Chris@82 1027 VTW(0, 7),
Chris@82 1028 {TW_NEXT, VL, 0}
Chris@82 1029 };
Chris@82 1030
Chris@82 1031 static const ct_desc desc = { 8, XSIMD_STRING("q1fv_8"), twinstr, &GENUS, {264, 128, 0, 0}, 0, 0, 0 };
Chris@82 1032
Chris@82 1033 void XSIMD(codelet_q1fv_8) (planner *p) {
Chris@82 1034 X(kdft_difsq_register) (p, q1fv_8, &desc);
Chris@82 1035 }
Chris@82 1036 #endif
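
The codelet above is internal machinery: the planner selects it automatically when a size-8 twiddle step of this shape appears in a plan. As a rough illustration only (standard FFTW public API; whether this particular codelet is actually chosen depends on the build's SIMD support, the precision, and the planner's measurements), a transform whose size factors through 8 can be planned and executed like this:

#include <stdio.h>
#include <fftw3.h>

int main(void)
{
    const int n = 64;               /* 64 = 8 * 8, so radix-8 steps are natural */
    fftw_complex *in  = fftw_malloc(sizeof(fftw_complex) * n);
    fftw_complex *out = fftw_malloc(sizeof(fftw_complex) * n);

    /* Plan first: FFTW_MEASURE benchmarks candidate codelets and may
       overwrite the arrays while doing so. */
    fftw_plan p = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_MEASURE);

    for (int k = 0; k < n; ++k) {   /* an arbitrary test signal */
        in[k][0] = (double) k;      /* real part */
        in[k][1] = 0.0;             /* imaginary part */
    }

    fftw_execute(p);
    printf("DC bin: %g + %gi\n", out[0][0], out[0][1]);

    fftw_destroy_plan(p);
    fftw_free(in);
    fftw_free(out);
    return 0;
}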