annotate src/fftw-3.3.8/dft/simd/common/n2sv_32.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:05:20 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_notw.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 372 FP additions, 136 FP multiplications,
Chris@82 32 * (or, 236 additions, 0 multiplications, 136 fused multiply/add),
Chris@82 33 * 138 stack variables, 7 constants, and 144 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/simd/n2s.h"
Chris@82 36
Chris@82 37 static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 38 {
Chris@82 39 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 40 DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
Chris@82 41 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 42 DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
Chris@82 43 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 44 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 45 DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 46 {
Chris@82 47 INT i;
Chris@82 48 for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 49 V T7, T4r, T4Z, T18, T1z, T3t, T3T, T2T, Te, T1f, T50, T4s, T2W, T3u, T1G;
Chris@82 50 V T3U, Tm, T1n, T1O, T2Z, T3y, T3X, T4w, T53, Tt, T1u, T1V, T2Y, T3B, T3W;
Chris@82 51 V T4z, T52, T2t, T3L, T3O, T2K, TR, TY, T5F, T5G, T5H, T5I, T4R, T5k, T2E;
Chris@82 52 V T3M, T4W, T5j, T2N, T3P, T22, T3E, T3H, T2j, TC, TJ, T5A, T5B, T5C, T5D;
Chris@82 53 V T4G, T5h, T2d, T3F, T4L, T5g, T2m, T3I;
Chris@82 54 {
Chris@82 55 V T3, T1x, T14, T2R, T6, T2S, T17, T1y;
Chris@82 56 {
Chris@82 57 V T1, T2, T12, T13;
Chris@82 58 T1 = LD(&(ri[0]), ivs, &(ri[0]));
Chris@82 59 T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
Chris@82 60 T3 = VADD(T1, T2);
Chris@82 61 T1x = VSUB(T1, T2);
Chris@82 62 T12 = LD(&(ii[0]), ivs, &(ii[0]));
Chris@82 63 T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
Chris@82 64 T14 = VADD(T12, T13);
Chris@82 65 T2R = VSUB(T12, T13);
Chris@82 66 }
Chris@82 67 {
Chris@82 68 V T4, T5, T15, T16;
Chris@82 69 T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
Chris@82 70 T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
Chris@82 71 T6 = VADD(T4, T5);
Chris@82 72 T2S = VSUB(T4, T5);
Chris@82 73 T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
Chris@82 74 T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
Chris@82 75 T17 = VADD(T15, T16);
Chris@82 76 T1y = VSUB(T15, T16);
Chris@82 77 }
Chris@82 78 T7 = VADD(T3, T6);
Chris@82 79 T4r = VSUB(T3, T6);
Chris@82 80 T4Z = VSUB(T14, T17);
Chris@82 81 T18 = VADD(T14, T17);
Chris@82 82 T1z = VADD(T1x, T1y);
Chris@82 83 T3t = VSUB(T1x, T1y);
Chris@82 84 T3T = VADD(T2S, T2R);
Chris@82 85 T2T = VSUB(T2R, T2S);
Chris@82 86 }
Chris@82 87 {
Chris@82 88 V Ta, T1A, T1b, T1B, Td, T1D, T1e, T1E;
Chris@82 89 {
Chris@82 90 V T8, T9, T19, T1a;
Chris@82 91 T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
Chris@82 92 T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
Chris@82 93 Ta = VADD(T8, T9);
Chris@82 94 T1A = VSUB(T8, T9);
Chris@82 95 T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
Chris@82 96 T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
Chris@82 97 T1b = VADD(T19, T1a);
Chris@82 98 T1B = VSUB(T19, T1a);
Chris@82 99 }
Chris@82 100 {
Chris@82 101 V Tb, Tc, T1c, T1d;
Chris@82 102 Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
Chris@82 103 Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
Chris@82 104 Td = VADD(Tb, Tc);
Chris@82 105 T1D = VSUB(Tb, Tc);
Chris@82 106 T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
Chris@82 107 T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
Chris@82 108 T1e = VADD(T1c, T1d);
Chris@82 109 T1E = VSUB(T1c, T1d);
Chris@82 110 }
Chris@82 111 Te = VADD(Ta, Td);
Chris@82 112 T1f = VADD(T1b, T1e);
Chris@82 113 T50 = VSUB(Td, Ta);
Chris@82 114 T4s = VSUB(T1b, T1e);
Chris@82 115 {
Chris@82 116 V T2U, T2V, T1C, T1F;
Chris@82 117 T2U = VSUB(T1B, T1A);
Chris@82 118 T2V = VADD(T1D, T1E);
Chris@82 119 T2W = VADD(T2U, T2V);
Chris@82 120 T3u = VSUB(T2U, T2V);
Chris@82 121 T1C = VADD(T1A, T1B);
Chris@82 122 T1F = VSUB(T1D, T1E);
Chris@82 123 T1G = VADD(T1C, T1F);
Chris@82 124 T3U = VSUB(T1F, T1C);
Chris@82 125 }
Chris@82 126 }
Chris@82 127 {
Chris@82 128 V Ti, T1L, T1j, T1I, Tl, T1J, T1m, T1M, T1K, T1N;
Chris@82 129 {
Chris@82 130 V Tg, Th, T1h, T1i;
Chris@82 131 Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
Chris@82 132 Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
Chris@82 133 Ti = VADD(Tg, Th);
Chris@82 134 T1L = VSUB(Tg, Th);
Chris@82 135 T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
Chris@82 136 T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
Chris@82 137 T1j = VADD(T1h, T1i);
Chris@82 138 T1I = VSUB(T1h, T1i);
Chris@82 139 }
Chris@82 140 {
Chris@82 141 V Tj, Tk, T1k, T1l;
Chris@82 142 Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
Chris@82 143 Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
Chris@82 144 Tl = VADD(Tj, Tk);
Chris@82 145 T1J = VSUB(Tj, Tk);
Chris@82 146 T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
Chris@82 147 T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
Chris@82 148 T1m = VADD(T1k, T1l);
Chris@82 149 T1M = VSUB(T1k, T1l);
Chris@82 150 }
Chris@82 151 Tm = VADD(Ti, Tl);
Chris@82 152 T1n = VADD(T1j, T1m);
Chris@82 153 T1K = VSUB(T1I, T1J);
Chris@82 154 T1N = VADD(T1L, T1M);
Chris@82 155 T1O = VFNMS(LDK(KP414213562), T1N, T1K);
Chris@82 156 T2Z = VFMA(LDK(KP414213562), T1K, T1N);
Chris@82 157 {
Chris@82 158 V T3w, T3x, T4u, T4v;
Chris@82 159 T3w = VADD(T1J, T1I);
Chris@82 160 T3x = VSUB(T1L, T1M);
Chris@82 161 T3y = VFMA(LDK(KP414213562), T3x, T3w);
Chris@82 162 T3X = VFNMS(LDK(KP414213562), T3w, T3x);
Chris@82 163 T4u = VSUB(T1j, T1m);
Chris@82 164 T4v = VSUB(Ti, Tl);
Chris@82 165 T4w = VSUB(T4u, T4v);
Chris@82 166 T53 = VADD(T4v, T4u);
Chris@82 167 }
Chris@82 168 }
Chris@82 169 {
Chris@82 170 V Tp, T1S, T1q, T1P, Ts, T1Q, T1t, T1T, T1R, T1U;
Chris@82 171 {
Chris@82 172 V Tn, To, T1o, T1p;
Chris@82 173 Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
Chris@82 174 To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
Chris@82 175 Tp = VADD(Tn, To);
Chris@82 176 T1S = VSUB(Tn, To);
Chris@82 177 T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
Chris@82 178 T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
Chris@82 179 T1q = VADD(T1o, T1p);
Chris@82 180 T1P = VSUB(T1o, T1p);
Chris@82 181 }
Chris@82 182 {
Chris@82 183 V Tq, Tr, T1r, T1s;
Chris@82 184 Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
Chris@82 185 Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
Chris@82 186 Ts = VADD(Tq, Tr);
Chris@82 187 T1Q = VSUB(Tq, Tr);
Chris@82 188 T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
Chris@82 189 T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
Chris@82 190 T1t = VADD(T1r, T1s);
Chris@82 191 T1T = VSUB(T1r, T1s);
Chris@82 192 }
Chris@82 193 Tt = VADD(Tp, Ts);
Chris@82 194 T1u = VADD(T1q, T1t);
Chris@82 195 T1R = VSUB(T1P, T1Q);
Chris@82 196 T1U = VADD(T1S, T1T);
Chris@82 197 T1V = VFMA(LDK(KP414213562), T1U, T1R);
Chris@82 198 T2Y = VFNMS(LDK(KP414213562), T1R, T1U);
Chris@82 199 {
Chris@82 200 V T3z, T3A, T4x, T4y;
Chris@82 201 T3z = VADD(T1Q, T1P);
Chris@82 202 T3A = VSUB(T1S, T1T);
Chris@82 203 T3B = VFNMS(LDK(KP414213562), T3A, T3z);
Chris@82 204 T3W = VFMA(LDK(KP414213562), T3z, T3A);
Chris@82 205 T4x = VSUB(Tp, Ts);
Chris@82 206 T4y = VSUB(T1q, T1t);
Chris@82 207 T4z = VADD(T4x, T4y);
Chris@82 208 T52 = VSUB(T4x, T4y);
Chris@82 209 }
Chris@82 210 }
Chris@82 211 {
Chris@82 212 V TN, T2G, T2r, T4N, TQ, T2s, T2J, T4O, TU, T2x, T2w, T4T, TX, T2z, T2C;
Chris@82 213 V T4U;
Chris@82 214 {
Chris@82 215 V TL, TM, T2p, T2q;
Chris@82 216 TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
Chris@82 217 TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
Chris@82 218 TN = VADD(TL, TM);
Chris@82 219 T2G = VSUB(TL, TM);
Chris@82 220 T2p = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
Chris@82 221 T2q = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
Chris@82 222 T2r = VSUB(T2p, T2q);
Chris@82 223 T4N = VADD(T2p, T2q);
Chris@82 224 }
Chris@82 225 {
Chris@82 226 V TO, TP, T2H, T2I;
Chris@82 227 TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
Chris@82 228 TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
Chris@82 229 TQ = VADD(TO, TP);
Chris@82 230 T2s = VSUB(TO, TP);
Chris@82 231 T2H = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
Chris@82 232 T2I = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
Chris@82 233 T2J = VSUB(T2H, T2I);
Chris@82 234 T4O = VADD(T2H, T2I);
Chris@82 235 }
Chris@82 236 {
Chris@82 237 V TS, TT, T2u, T2v;
Chris@82 238 TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
Chris@82 239 TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
Chris@82 240 TU = VADD(TS, TT);
Chris@82 241 T2x = VSUB(TS, TT);
Chris@82 242 T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
Chris@82 243 T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
Chris@82 244 T2w = VSUB(T2u, T2v);
Chris@82 245 T4T = VADD(T2u, T2v);
Chris@82 246 }
Chris@82 247 {
Chris@82 248 V TV, TW, T2A, T2B;
Chris@82 249 TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
Chris@82 250 TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
Chris@82 251 TX = VADD(TV, TW);
Chris@82 252 T2z = VSUB(TV, TW);
Chris@82 253 T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
Chris@82 254 T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
Chris@82 255 T2C = VSUB(T2A, T2B);
Chris@82 256 T4U = VADD(T2A, T2B);
Chris@82 257 }
Chris@82 258 T2t = VSUB(T2r, T2s);
Chris@82 259 T3L = VSUB(T2G, T2J);
Chris@82 260 T3O = VADD(T2s, T2r);
Chris@82 261 T2K = VADD(T2G, T2J);
Chris@82 262 TR = VADD(TN, TQ);
Chris@82 263 TY = VADD(TU, TX);
Chris@82 264 T5F = VSUB(TR, TY);
Chris@82 265 {
Chris@82 266 V T4P, T4Q, T2y, T2D;
Chris@82 267 T5G = VADD(T4N, T4O);
Chris@82 268 T5H = VADD(T4T, T4U);
Chris@82 269 T5I = VSUB(T5G, T5H);
Chris@82 270 T4P = VSUB(T4N, T4O);
Chris@82 271 T4Q = VSUB(TX, TU);
Chris@82 272 T4R = VSUB(T4P, T4Q);
Chris@82 273 T5k = VADD(T4Q, T4P);
Chris@82 274 T2y = VSUB(T2w, T2x);
Chris@82 275 T2D = VADD(T2z, T2C);
Chris@82 276 T2E = VADD(T2y, T2D);
Chris@82 277 T3M = VSUB(T2D, T2y);
Chris@82 278 {
Chris@82 279 V T4S, T4V, T2L, T2M;
Chris@82 280 T4S = VSUB(TN, TQ);
Chris@82 281 T4V = VSUB(T4T, T4U);
Chris@82 282 T4W = VSUB(T4S, T4V);
Chris@82 283 T5j = VADD(T4S, T4V);
Chris@82 284 T2L = VADD(T2x, T2w);
Chris@82 285 T2M = VSUB(T2z, T2C);
Chris@82 286 T2N = VADD(T2L, T2M);
Chris@82 287 T3P = VSUB(T2L, T2M);
Chris@82 288 }
Chris@82 289 }
Chris@82 290 }
Chris@82 291 {
Chris@82 292 V Ty, T2f, T20, T4C, TB, T21, T2i, T4D, TF, T26, T25, T4I, TI, T28, T2b;
Chris@82 293 V T4J;
Chris@82 294 {
Chris@82 295 V Tw, Tx, T1Y, T1Z;
Chris@82 296 Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
Chris@82 297 Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
Chris@82 298 Ty = VADD(Tw, Tx);
Chris@82 299 T2f = VSUB(Tw, Tx);
Chris@82 300 T1Y = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
Chris@82 301 T1Z = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
Chris@82 302 T20 = VSUB(T1Y, T1Z);
Chris@82 303 T4C = VADD(T1Y, T1Z);
Chris@82 304 }
Chris@82 305 {
Chris@82 306 V Tz, TA, T2g, T2h;
Chris@82 307 Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
Chris@82 308 TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
Chris@82 309 TB = VADD(Tz, TA);
Chris@82 310 T21 = VSUB(Tz, TA);
Chris@82 311 T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
Chris@82 312 T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
Chris@82 313 T2i = VSUB(T2g, T2h);
Chris@82 314 T4D = VADD(T2g, T2h);
Chris@82 315 }
Chris@82 316 {
Chris@82 317 V TD, TE, T23, T24;
Chris@82 318 TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
Chris@82 319 TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
Chris@82 320 TF = VADD(TD, TE);
Chris@82 321 T26 = VSUB(TD, TE);
Chris@82 322 T23 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
Chris@82 323 T24 = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
Chris@82 324 T25 = VSUB(T23, T24);
Chris@82 325 T4I = VADD(T23, T24);
Chris@82 326 }
Chris@82 327 {
Chris@82 328 V TG, TH, T29, T2a;
Chris@82 329 TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
Chris@82 330 TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
Chris@82 331 TI = VADD(TG, TH);
Chris@82 332 T28 = VSUB(TG, TH);
Chris@82 333 T29 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
Chris@82 334 T2a = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
Chris@82 335 T2b = VSUB(T29, T2a);
Chris@82 336 T4J = VADD(T29, T2a);
Chris@82 337 }
Chris@82 338 T22 = VSUB(T20, T21);
Chris@82 339 T3E = VSUB(T2f, T2i);
Chris@82 340 T3H = VADD(T21, T20);
Chris@82 341 T2j = VADD(T2f, T2i);
Chris@82 342 TC = VADD(Ty, TB);
Chris@82 343 TJ = VADD(TF, TI);
Chris@82 344 T5A = VSUB(TC, TJ);
Chris@82 345 {
Chris@82 346 V T4E, T4F, T27, T2c;
Chris@82 347 T5B = VADD(T4C, T4D);
Chris@82 348 T5C = VADD(T4I, T4J);
Chris@82 349 T5D = VSUB(T5B, T5C);
Chris@82 350 T4E = VSUB(T4C, T4D);
Chris@82 351 T4F = VSUB(TI, TF);
Chris@82 352 T4G = VSUB(T4E, T4F);
Chris@82 353 T5h = VADD(T4F, T4E);
Chris@82 354 T27 = VSUB(T25, T26);
Chris@82 355 T2c = VADD(T28, T2b);
Chris@82 356 T2d = VADD(T27, T2c);
Chris@82 357 T3F = VSUB(T2c, T27);
Chris@82 358 {
Chris@82 359 V T4H, T4K, T2k, T2l;
Chris@82 360 T4H = VSUB(Ty, TB);
Chris@82 361 T4K = VSUB(T4I, T4J);
Chris@82 362 T4L = VSUB(T4H, T4K);
Chris@82 363 T5g = VADD(T4H, T4K);
Chris@82 364 T2k = VADD(T26, T25);
Chris@82 365 T2l = VSUB(T28, T2b);
Chris@82 366 T2m = VADD(T2k, T2l);
Chris@82 367 T3I = VSUB(T2k, T2l);
Chris@82 368 }
Chris@82 369 }
Chris@82 370 }
Chris@82 371 {
Chris@82 372 V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
Chris@82 373 V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
Chris@82 374 V T6v, T6w;
Chris@82 375 {
Chris@82 376 V T4B, T5b, T5a, T5c, T4Y, T56, T55, T57;
Chris@82 377 {
Chris@82 378 V T4t, T4A, T58, T59;
Chris@82 379 T4t = VSUB(T4r, T4s);
Chris@82 380 T4A = VSUB(T4w, T4z);
Chris@82 381 T4B = VFMA(LDK(KP707106781), T4A, T4t);
Chris@82 382 T5b = VFNMS(LDK(KP707106781), T4A, T4t);
Chris@82 383 T58 = VFMA(LDK(KP414213562), T4R, T4W);
Chris@82 384 T59 = VFNMS(LDK(KP414213562), T4G, T4L);
Chris@82 385 T5a = VSUB(T58, T59);
Chris@82 386 T5c = VADD(T59, T58);
Chris@82 387 }
Chris@82 388 {
Chris@82 389 V T4M, T4X, T51, T54;
Chris@82 390 T4M = VFMA(LDK(KP414213562), T4L, T4G);
Chris@82 391 T4X = VFNMS(LDK(KP414213562), T4W, T4R);
Chris@82 392 T4Y = VSUB(T4M, T4X);
Chris@82 393 T56 = VADD(T4M, T4X);
Chris@82 394 T51 = VSUB(T4Z, T50);
Chris@82 395 T54 = VSUB(T52, T53);
Chris@82 396 T55 = VFNMS(LDK(KP707106781), T54, T51);
Chris@82 397 T57 = VFMA(LDK(KP707106781), T54, T51);
Chris@82 398 }
Chris@82 399 T61 = VFNMS(LDK(KP923879532), T4Y, T4B);
Chris@82 400 STM4(&(ro[22]), T61, ovs, &(ro[0]));
Chris@82 401 T62 = VFNMS(LDK(KP923879532), T5a, T57);
Chris@82 402 STM4(&(io[22]), T62, ovs, &(io[0]));
Chris@82 403 T63 = VFMA(LDK(KP923879532), T4Y, T4B);
Chris@82 404 STM4(&(ro[6]), T63, ovs, &(ro[0]));
Chris@82 405 T64 = VFMA(LDK(KP923879532), T5a, T57);
Chris@82 406 STM4(&(io[6]), T64, ovs, &(io[0]));
Chris@82 407 T65 = VFNMS(LDK(KP923879532), T56, T55);
Chris@82 408 STM4(&(io[14]), T65, ovs, &(io[0]));
Chris@82 409 T66 = VFNMS(LDK(KP923879532), T5c, T5b);
Chris@82 410 STM4(&(ro[14]), T66, ovs, &(ro[0]));
Chris@82 411 T67 = VFMA(LDK(KP923879532), T56, T55);
Chris@82 412 STM4(&(io[30]), T67, ovs, &(io[0]));
Chris@82 413 T68 = VFMA(LDK(KP923879532), T5c, T5b);
Chris@82 414 STM4(&(ro[30]), T68, ovs, &(ro[0]));
Chris@82 415 }
Chris@82 416 {
Chris@82 417 V T5f, T5r, T5u, T5w, T5m, T5q, T5p, T5v;
Chris@82 418 {
Chris@82 419 V T5d, T5e, T5s, T5t;
Chris@82 420 T5d = VADD(T4r, T4s);
Chris@82 421 T5e = VADD(T53, T52);
Chris@82 422 T5f = VFMA(LDK(KP707106781), T5e, T5d);
Chris@82 423 T5r = VFNMS(LDK(KP707106781), T5e, T5d);
Chris@82 424 T5s = VFNMS(LDK(KP414213562), T5g, T5h);
Chris@82 425 T5t = VFMA(LDK(KP414213562), T5j, T5k);
Chris@82 426 T5u = VSUB(T5s, T5t);
Chris@82 427 T5w = VADD(T5s, T5t);
Chris@82 428 }
Chris@82 429 {
Chris@82 430 V T5i, T5l, T5n, T5o;
Chris@82 431 T5i = VFMA(LDK(KP414213562), T5h, T5g);
Chris@82 432 T5l = VFNMS(LDK(KP414213562), T5k, T5j);
Chris@82 433 T5m = VADD(T5i, T5l);
Chris@82 434 T5q = VSUB(T5l, T5i);
Chris@82 435 T5n = VADD(T50, T4Z);
Chris@82 436 T5o = VADD(T4w, T4z);
Chris@82 437 T5p = VFNMS(LDK(KP707106781), T5o, T5n);
Chris@82 438 T5v = VFMA(LDK(KP707106781), T5o, T5n);
Chris@82 439 }
Chris@82 440 T69 = VFNMS(LDK(KP923879532), T5m, T5f);
Chris@82 441 STM4(&(ro[18]), T69, ovs, &(ro[0]));
Chris@82 442 T6a = VFNMS(LDK(KP923879532), T5w, T5v);
Chris@82 443 STM4(&(io[18]), T6a, ovs, &(io[0]));
Chris@82 444 T6b = VFMA(LDK(KP923879532), T5m, T5f);
Chris@82 445 STM4(&(ro[2]), T6b, ovs, &(ro[0]));
Chris@82 446 T6c = VFMA(LDK(KP923879532), T5w, T5v);
Chris@82 447 STM4(&(io[2]), T6c, ovs, &(io[0]));
Chris@82 448 T6d = VFNMS(LDK(KP923879532), T5q, T5p);
Chris@82 449 STM4(&(io[26]), T6d, ovs, &(io[0]));
Chris@82 450 T6e = VFNMS(LDK(KP923879532), T5u, T5r);
Chris@82 451 STM4(&(ro[26]), T6e, ovs, &(ro[0]));
Chris@82 452 T6f = VFMA(LDK(KP923879532), T5q, T5p);
Chris@82 453 STM4(&(io[10]), T6f, ovs, &(io[0]));
Chris@82 454 T6g = VFMA(LDK(KP923879532), T5u, T5r);
Chris@82 455 STM4(&(ro[10]), T6g, ovs, &(ro[0]));
Chris@82 456 }
Chris@82 457 {
Chris@82 458 V T5z, T5P, T5S, T5U, T5K, T5O, T5N, T5T;
Chris@82 459 {
Chris@82 460 V T5x, T5y, T5Q, T5R;
Chris@82 461 T5x = VSUB(T7, Te);
Chris@82 462 T5y = VSUB(T1n, T1u);
Chris@82 463 T5z = VADD(T5x, T5y);
Chris@82 464 T5P = VSUB(T5x, T5y);
Chris@82 465 T5Q = VSUB(T5D, T5A);
Chris@82 466 T5R = VADD(T5F, T5I);
Chris@82 467 T5S = VSUB(T5Q, T5R);
Chris@82 468 T5U = VADD(T5Q, T5R);
Chris@82 469 }
Chris@82 470 {
Chris@82 471 V T5E, T5J, T5L, T5M;
Chris@82 472 T5E = VADD(T5A, T5D);
Chris@82 473 T5J = VSUB(T5F, T5I);
Chris@82 474 T5K = VADD(T5E, T5J);
Chris@82 475 T5O = VSUB(T5J, T5E);
Chris@82 476 T5L = VSUB(T18, T1f);
Chris@82 477 T5M = VSUB(Tt, Tm);
Chris@82 478 T5N = VSUB(T5L, T5M);
Chris@82 479 T5T = VADD(T5M, T5L);
Chris@82 480 }
Chris@82 481 T6h = VFNMS(LDK(KP707106781), T5K, T5z);
Chris@82 482 STM4(&(ro[20]), T6h, ovs, &(ro[0]));
Chris@82 483 T6i = VFNMS(LDK(KP707106781), T5U, T5T);
Chris@82 484 STM4(&(io[20]), T6i, ovs, &(io[0]));
Chris@82 485 T6j = VFMA(LDK(KP707106781), T5K, T5z);
Chris@82 486 STM4(&(ro[4]), T6j, ovs, &(ro[0]));
Chris@82 487 T6k = VFMA(LDK(KP707106781), T5U, T5T);
Chris@82 488 STM4(&(io[4]), T6k, ovs, &(io[0]));
Chris@82 489 T6l = VFNMS(LDK(KP707106781), T5O, T5N);
Chris@82 490 STM4(&(io[28]), T6l, ovs, &(io[0]));
Chris@82 491 T6m = VFNMS(LDK(KP707106781), T5S, T5P);
Chris@82 492 STM4(&(ro[28]), T6m, ovs, &(ro[0]));
Chris@82 493 T6n = VFMA(LDK(KP707106781), T5O, T5N);
Chris@82 494 STM4(&(io[12]), T6n, ovs, &(io[0]));
Chris@82 495 T6o = VFMA(LDK(KP707106781), T5S, T5P);
Chris@82 496 STM4(&(ro[12]), T6o, ovs, &(ro[0]));
Chris@82 497 }
Chris@82 498 {
Chris@82 499 V Tv, T5V, T5Y, T60, T10, T11, T1w, T5Z;
Chris@82 500 {
Chris@82 501 V Tf, Tu, T5W, T5X;
Chris@82 502 Tf = VADD(T7, Te);
Chris@82 503 Tu = VADD(Tm, Tt);
Chris@82 504 Tv = VADD(Tf, Tu);
Chris@82 505 T5V = VSUB(Tf, Tu);
Chris@82 506 T5W = VADD(T5B, T5C);
Chris@82 507 T5X = VADD(T5G, T5H);
Chris@82 508 T5Y = VSUB(T5W, T5X);
Chris@82 509 T60 = VADD(T5W, T5X);
Chris@82 510 }
Chris@82 511 {
Chris@82 512 V TK, TZ, T1g, T1v;
Chris@82 513 TK = VADD(TC, TJ);
Chris@82 514 TZ = VADD(TR, TY);
Chris@82 515 T10 = VADD(TK, TZ);
Chris@82 516 T11 = VSUB(TZ, TK);
Chris@82 517 T1g = VADD(T18, T1f);
Chris@82 518 T1v = VADD(T1n, T1u);
Chris@82 519 T1w = VSUB(T1g, T1v);
Chris@82 520 T5Z = VADD(T1g, T1v);
Chris@82 521 }
Chris@82 522 T6p = VSUB(Tv, T10);
Chris@82 523 STM4(&(ro[16]), T6p, ovs, &(ro[0]));
Chris@82 524 T6q = VSUB(T5Z, T60);
Chris@82 525 STM4(&(io[16]), T6q, ovs, &(io[0]));
Chris@82 526 T6r = VADD(Tv, T10);
Chris@82 527 STM4(&(ro[0]), T6r, ovs, &(ro[0]));
Chris@82 528 T6s = VADD(T5Z, T60);
Chris@82 529 STM4(&(io[0]), T6s, ovs, &(io[0]));
Chris@82 530 T6t = VADD(T11, T1w);
Chris@82 531 STM4(&(io[8]), T6t, ovs, &(io[0]));
Chris@82 532 T6u = VADD(T5V, T5Y);
Chris@82 533 STM4(&(ro[8]), T6u, ovs, &(ro[0]));
Chris@82 534 T6v = VSUB(T1w, T11);
Chris@82 535 STM4(&(io[24]), T6v, ovs, &(io[0]));
Chris@82 536 T6w = VSUB(T5V, T5Y);
Chris@82 537 STM4(&(ro[24]), T6w, ovs, &(ro[0]));
Chris@82 538 }
Chris@82 539 {
Chris@82 540 V T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E, T6F, T6G, T6H, T6I, T6J, T6K, T6L;
Chris@82 541 V T6M;
Chris@82 542 {
Chris@82 543 V T1X, T37, T31, T33, T2o, T35, T2P, T34;
Chris@82 544 {
Chris@82 545 V T1H, T1W, T2X, T30;
Chris@82 546 T1H = VFNMS(LDK(KP707106781), T1G, T1z);
Chris@82 547 T1W = VSUB(T1O, T1V);
Chris@82 548 T1X = VFMA(LDK(KP923879532), T1W, T1H);
Chris@82 549 T37 = VFNMS(LDK(KP923879532), T1W, T1H);
Chris@82 550 T2X = VFNMS(LDK(KP707106781), T2W, T2T);
Chris@82 551 T30 = VSUB(T2Y, T2Z);
Chris@82 552 T31 = VFNMS(LDK(KP923879532), T30, T2X);
Chris@82 553 T33 = VFMA(LDK(KP923879532), T30, T2X);
Chris@82 554 }
Chris@82 555 {
Chris@82 556 V T2e, T2n, T2F, T2O;
Chris@82 557 T2e = VFNMS(LDK(KP707106781), T2d, T22);
Chris@82 558 T2n = VFNMS(LDK(KP707106781), T2m, T2j);
Chris@82 559 T2o = VFMA(LDK(KP668178637), T2n, T2e);
Chris@82 560 T35 = VFNMS(LDK(KP668178637), T2e, T2n);
Chris@82 561 T2F = VFNMS(LDK(KP707106781), T2E, T2t);
Chris@82 562 T2O = VFNMS(LDK(KP707106781), T2N, T2K);
Chris@82 563 T2P = VFNMS(LDK(KP668178637), T2O, T2F);
Chris@82 564 T34 = VFMA(LDK(KP668178637), T2F, T2O);
Chris@82 565 }
Chris@82 566 {
Chris@82 567 V T2Q, T36, T32, T38;
Chris@82 568 T2Q = VSUB(T2o, T2P);
Chris@82 569 T6x = VFNMS(LDK(KP831469612), T2Q, T1X);
Chris@82 570 STM4(&(ro[21]), T6x, ovs, &(ro[1]));
Chris@82 571 T6y = VFMA(LDK(KP831469612), T2Q, T1X);
Chris@82 572 STM4(&(ro[5]), T6y, ovs, &(ro[1]));
Chris@82 573 T36 = VSUB(T34, T35);
Chris@82 574 T6z = VFNMS(LDK(KP831469612), T36, T33);
Chris@82 575 STM4(&(io[21]), T6z, ovs, &(io[1]));
Chris@82 576 T6A = VFMA(LDK(KP831469612), T36, T33);
Chris@82 577 STM4(&(io[5]), T6A, ovs, &(io[1]));
Chris@82 578 T32 = VADD(T2o, T2P);
Chris@82 579 T6B = VFNMS(LDK(KP831469612), T32, T31);
Chris@82 580 STM4(&(io[13]), T6B, ovs, &(io[1]));
Chris@82 581 T6C = VFMA(LDK(KP831469612), T32, T31);
Chris@82 582 STM4(&(io[29]), T6C, ovs, &(io[1]));
Chris@82 583 T38 = VADD(T35, T34);
Chris@82 584 T6D = VFNMS(LDK(KP831469612), T38, T37);
Chris@82 585 STM4(&(ro[13]), T6D, ovs, &(ro[1]));
Chris@82 586 T6E = VFMA(LDK(KP831469612), T38, T37);
Chris@82 587 STM4(&(ro[29]), T6E, ovs, &(ro[1]));
Chris@82 588 }
Chris@82 589 }
Chris@82 590 {
Chris@82 591 V T3D, T41, T3Z, T45, T3K, T42, T3R, T43;
Chris@82 592 {
Chris@82 593 V T3v, T3C, T3V, T3Y;
Chris@82 594 T3v = VFMA(LDK(KP707106781), T3u, T3t);
Chris@82 595 T3C = VSUB(T3y, T3B);
Chris@82 596 T3D = VFMA(LDK(KP923879532), T3C, T3v);
Chris@82 597 T41 = VFNMS(LDK(KP923879532), T3C, T3v);
Chris@82 598 T3V = VFMA(LDK(KP707106781), T3U, T3T);
Chris@82 599 T3Y = VSUB(T3W, T3X);
Chris@82 600 T3Z = VFNMS(LDK(KP923879532), T3Y, T3V);
Chris@82 601 T45 = VFMA(LDK(KP923879532), T3Y, T3V);
Chris@82 602 }
Chris@82 603 {
Chris@82 604 V T3G, T3J, T3N, T3Q;
Chris@82 605 T3G = VFNMS(LDK(KP707106781), T3F, T3E);
Chris@82 606 T3J = VFNMS(LDK(KP707106781), T3I, T3H);
Chris@82 607 T3K = VFMA(LDK(KP668178637), T3J, T3G);
Chris@82 608 T42 = VFNMS(LDK(KP668178637), T3G, T3J);
Chris@82 609 T3N = VFNMS(LDK(KP707106781), T3M, T3L);
Chris@82 610 T3Q = VFNMS(LDK(KP707106781), T3P, T3O);
Chris@82 611 T3R = VFNMS(LDK(KP668178637), T3Q, T3N);
Chris@82 612 T43 = VFMA(LDK(KP668178637), T3N, T3Q);
Chris@82 613 }
Chris@82 614 {
Chris@82 615 V T3S, T46, T40, T44;
Chris@82 616 T3S = VADD(T3K, T3R);
Chris@82 617 T6F = VFNMS(LDK(KP831469612), T3S, T3D);
Chris@82 618 STM4(&(ro[19]), T6F, ovs, &(ro[1]));
Chris@82 619 T6G = VFMA(LDK(KP831469612), T3S, T3D);
Chris@82 620 STM4(&(ro[3]), T6G, ovs, &(ro[1]));
Chris@82 621 T46 = VADD(T42, T43);
Chris@82 622 T6H = VFNMS(LDK(KP831469612), T46, T45);
Chris@82 623 STM4(&(io[19]), T6H, ovs, &(io[1]));
Chris@82 624 T6I = VFMA(LDK(KP831469612), T46, T45);
Chris@82 625 STM4(&(io[3]), T6I, ovs, &(io[1]));
Chris@82 626 T40 = VSUB(T3R, T3K);
Chris@82 627 T6J = VFNMS(LDK(KP831469612), T40, T3Z);
Chris@82 628 STM4(&(io[27]), T6J, ovs, &(io[1]));
Chris@82 629 T6K = VFMA(LDK(KP831469612), T40, T3Z);
Chris@82 630 STM4(&(io[11]), T6K, ovs, &(io[1]));
Chris@82 631 T44 = VSUB(T42, T43);
Chris@82 632 T6L = VFNMS(LDK(KP831469612), T44, T41);
Chris@82 633 STM4(&(ro[27]), T6L, ovs, &(ro[1]));
Chris@82 634 T6M = VFMA(LDK(KP831469612), T44, T41);
Chris@82 635 STM4(&(ro[11]), T6M, ovs, &(ro[1]));
Chris@82 636 }
Chris@82 637 }
Chris@82 638 {
Chris@82 639 V T49, T4p, T4j, T4l, T4c, T4n, T4f, T4m;
Chris@82 640 {
Chris@82 641 V T47, T48, T4h, T4i;
Chris@82 642 T47 = VFNMS(LDK(KP707106781), T3u, T3t);
Chris@82 643 T48 = VADD(T3X, T3W);
Chris@82 644 T49 = VFNMS(LDK(KP923879532), T48, T47);
Chris@82 645 T4p = VFMA(LDK(KP923879532), T48, T47);
Chris@82 646 T4h = VFNMS(LDK(KP707106781), T3U, T3T);
Chris@82 647 T4i = VADD(T3y, T3B);
Chris@82 648 T4j = VFMA(LDK(KP923879532), T4i, T4h);
Chris@82 649 T4l = VFNMS(LDK(KP923879532), T4i, T4h);
Chris@82 650 }
Chris@82 651 {
Chris@82 652 V T4a, T4b, T4d, T4e;
Chris@82 653 T4a = VFMA(LDK(KP707106781), T3I, T3H);
Chris@82 654 T4b = VFMA(LDK(KP707106781), T3F, T3E);
Chris@82 655 T4c = VFMA(LDK(KP198912367), T4b, T4a);
Chris@82 656 T4n = VFNMS(LDK(KP198912367), T4a, T4b);
Chris@82 657 T4d = VFMA(LDK(KP707106781), T3P, T3O);
Chris@82 658 T4e = VFMA(LDK(KP707106781), T3M, T3L);
Chris@82 659 T4f = VFNMS(LDK(KP198912367), T4e, T4d);
Chris@82 660 T4m = VFMA(LDK(KP198912367), T4d, T4e);
Chris@82 661 }
Chris@82 662 {
Chris@82 663 V T4g, T6N, T6O, T4o, T6P, T6Q;
Chris@82 664 T4g = VSUB(T4c, T4f);
Chris@82 665 T6N = VFNMS(LDK(KP980785280), T4g, T49);
Chris@82 666 STM4(&(ro[23]), T6N, ovs, &(ro[1]));
Chris@82 667 STN4(&(ro[20]), T6h, T6x, T61, T6N, ovs);
Chris@82 668 T6O = VFMA(LDK(KP980785280), T4g, T49);
Chris@82 669 STM4(&(ro[7]), T6O, ovs, &(ro[1]));
Chris@82 670 STN4(&(ro[4]), T6j, T6y, T63, T6O, ovs);
Chris@82 671 T4o = VSUB(T4m, T4n);
Chris@82 672 T6P = VFNMS(LDK(KP980785280), T4o, T4l);
Chris@82 673 STM4(&(io[23]), T6P, ovs, &(io[1]));
Chris@82 674 STN4(&(io[20]), T6i, T6z, T62, T6P, ovs);
Chris@82 675 T6Q = VFMA(LDK(KP980785280), T4o, T4l);
Chris@82 676 STM4(&(io[7]), T6Q, ovs, &(io[1]));
Chris@82 677 STN4(&(io[4]), T6k, T6A, T64, T6Q, ovs);
Chris@82 678 }
Chris@82 679 {
Chris@82 680 V T4k, T6R, T6S, T4q, T6T, T6U;
Chris@82 681 T4k = VADD(T4c, T4f);
Chris@82 682 T6R = VFNMS(LDK(KP980785280), T4k, T4j);
Chris@82 683 STM4(&(io[15]), T6R, ovs, &(io[1]));
Chris@82 684 STN4(&(io[12]), T6n, T6B, T65, T6R, ovs);
Chris@82 685 T6S = VFMA(LDK(KP980785280), T4k, T4j);
Chris@82 686 STM4(&(io[31]), T6S, ovs, &(io[1]));
Chris@82 687 STN4(&(io[28]), T6l, T6C, T67, T6S, ovs);
Chris@82 688 T4q = VADD(T4n, T4m);
Chris@82 689 T6T = VFNMS(LDK(KP980785280), T4q, T4p);
Chris@82 690 STM4(&(ro[15]), T6T, ovs, &(ro[1]));
Chris@82 691 STN4(&(ro[12]), T6o, T6D, T66, T6T, ovs);
Chris@82 692 T6U = VFMA(LDK(KP980785280), T4q, T4p);
Chris@82 693 STM4(&(ro[31]), T6U, ovs, &(ro[1]));
Chris@82 694 STN4(&(ro[28]), T6m, T6E, T68, T6U, ovs);
Chris@82 695 }
Chris@82 696 }
Chris@82 697 {
Chris@82 698 V T3b, T3n, T3l, T3r, T3e, T3o, T3h, T3p;
Chris@82 699 {
Chris@82 700 V T39, T3a, T3j, T3k;
Chris@82 701 T39 = VFMA(LDK(KP707106781), T1G, T1z);
Chris@82 702 T3a = VADD(T2Z, T2Y);
Chris@82 703 T3b = VFMA(LDK(KP923879532), T3a, T39);
Chris@82 704 T3n = VFNMS(LDK(KP923879532), T3a, T39);
Chris@82 705 T3j = VFMA(LDK(KP707106781), T2W, T2T);
Chris@82 706 T3k = VADD(T1O, T1V);
Chris@82 707 T3l = VFNMS(LDK(KP923879532), T3k, T3j);
Chris@82 708 T3r = VFMA(LDK(KP923879532), T3k, T3j);
Chris@82 709 }
Chris@82 710 {
Chris@82 711 V T3c, T3d, T3f, T3g;
Chris@82 712 T3c = VFMA(LDK(KP707106781), T2m, T2j);
Chris@82 713 T3d = VFMA(LDK(KP707106781), T2d, T22);
Chris@82 714 T3e = VFMA(LDK(KP198912367), T3d, T3c);
Chris@82 715 T3o = VFNMS(LDK(KP198912367), T3c, T3d);
Chris@82 716 T3f = VFMA(LDK(KP707106781), T2N, T2K);
Chris@82 717 T3g = VFMA(LDK(KP707106781), T2E, T2t);
Chris@82 718 T3h = VFNMS(LDK(KP198912367), T3g, T3f);
Chris@82 719 T3p = VFMA(LDK(KP198912367), T3f, T3g);
Chris@82 720 }
Chris@82 721 {
Chris@82 722 V T3i, T6V, T6W, T3s, T6X, T6Y;
Chris@82 723 T3i = VADD(T3e, T3h);
Chris@82 724 T6V = VFNMS(LDK(KP980785280), T3i, T3b);
Chris@82 725 STM4(&(ro[17]), T6V, ovs, &(ro[1]));
Chris@82 726 STN4(&(ro[16]), T6p, T6V, T69, T6F, ovs);
Chris@82 727 T6W = VFMA(LDK(KP980785280), T3i, T3b);
Chris@82 728 STM4(&(ro[1]), T6W, ovs, &(ro[1]));
Chris@82 729 STN4(&(ro[0]), T6r, T6W, T6b, T6G, ovs);
Chris@82 730 T3s = VADD(T3o, T3p);
Chris@82 731 T6X = VFNMS(LDK(KP980785280), T3s, T3r);
Chris@82 732 STM4(&(io[17]), T6X, ovs, &(io[1]));
Chris@82 733 STN4(&(io[16]), T6q, T6X, T6a, T6H, ovs);
Chris@82 734 T6Y = VFMA(LDK(KP980785280), T3s, T3r);
Chris@82 735 STM4(&(io[1]), T6Y, ovs, &(io[1]));
Chris@82 736 STN4(&(io[0]), T6s, T6Y, T6c, T6I, ovs);
Chris@82 737 }
Chris@82 738 {
Chris@82 739 V T3m, T6Z, T70, T3q, T71, T72;
Chris@82 740 T3m = VSUB(T3h, T3e);
Chris@82 741 T6Z = VFNMS(LDK(KP980785280), T3m, T3l);
Chris@82 742 STM4(&(io[25]), T6Z, ovs, &(io[1]));
Chris@82 743 STN4(&(io[24]), T6v, T6Z, T6d, T6J, ovs);
Chris@82 744 T70 = VFMA(LDK(KP980785280), T3m, T3l);
Chris@82 745 STM4(&(io[9]), T70, ovs, &(io[1]));
Chris@82 746 STN4(&(io[8]), T6t, T70, T6f, T6K, ovs);
Chris@82 747 T3q = VSUB(T3o, T3p);
Chris@82 748 T71 = VFNMS(LDK(KP980785280), T3q, T3n);
Chris@82 749 STM4(&(ro[25]), T71, ovs, &(ro[1]));
Chris@82 750 STN4(&(ro[24]), T6w, T71, T6e, T6L, ovs);
Chris@82 751 T72 = VFMA(LDK(KP980785280), T3q, T3n);
Chris@82 752 STM4(&(ro[9]), T72, ovs, &(ro[1]));
Chris@82 753 STN4(&(ro[8]), T6u, T72, T6g, T6M, ovs);
Chris@82 754 }
Chris@82 755 }
Chris@82 756 }
Chris@82 757 }
Chris@82 758 }
Chris@82 759 }
Chris@82 760 VLEAVE();
Chris@82 761 }
Chris@82 762
Chris@82 763 static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {236, 0, 136, 0}, &GENUS, 0, 1, 0, 0 };
Chris@82 764
Chris@82 765 void XSIMD(codelet_n2sv_32) (planner *p) {
Chris@82 766 X(kdft_register) (p, n2sv_32, &desc);
Chris@82 767 }
Chris@82 768
Chris@82 769 #else
Chris@82 770
Chris@82 771 /* Generated by: ../../../genfft/gen_notw.native -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name n2sv_32 -with-ostride 1 -include dft/simd/n2s.h -store-multiple 4 */
Chris@82 772
Chris@82 773 /*
Chris@82 774 * This function contains 372 FP additions, 84 FP multiplications,
Chris@82 775 * (or, 340 additions, 52 multiplications, 32 fused multiply/add),
Chris@82 776 * 130 stack variables, 7 constants, and 144 memory accesses
Chris@82 777 */
Chris@82 778 #include "dft/simd/n2s.h"
Chris@82 779
/*
 * Size-32 complex DFT kernel (non-FMA variant), machine-generated by
 * genfft (see the banner above: gen_notw.native -simd -compact ...).
 * Split-format I/O: ri/ii are the real/imag input arrays (stride `is`),
 * ro/io the real/imag outputs (stride `os`, generated -with-ostride 1).
 * `v` transforms are performed, with per-transform input/output vector
 * strides ivs/ovs.  DO NOT hand-edit the arithmetic: statement order and
 * temporary reuse come from the generator's scheduler, and the op counts
 * {340 adds, 52 muls, 32 fmas} are recorded in `desc` below.
 */
Chris@82 780 static void n2sv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
Chris@82 781 {
/* SIMD constants: the literal values are 1/sqrt(2) (KP707106781) and
   cos/sin of multiples of pi/16 (e.g. cos(pi/16)=.98079, sin(pi/16)=.19509,
   cos(pi/8)=.92388, sin(pi/8)=.38268, cos(3pi/16)=.83147, sin(3pi/16)=.55557)
   — the radix-32 butterfly coefficients. */
Chris@82 782 DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
Chris@82 783 DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
Chris@82 784 DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
Chris@82 785 DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
Chris@82 786 DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 787 DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 788 DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 789 {
Chris@82 790 INT i;
/* Each pass consumes 2*VL transforms (VL = SIMD vector length), advancing
   all four base pointers by 2*VL vector strides.  MAKE_VOLATILE_STRIDE is
   a generator idiom to inhibit unwanted stride optimizations. */
Chris@82 791 for (i = v; i > 0; i = i - (2 * VL), ri = ri + ((2 * VL) * ivs), ii = ii + ((2 * VL) * ivs), ro = ro + ((2 * VL) * ovs), io = io + ((2 * VL) * ovs), MAKE_VOLATILE_STRIDE(128, is), MAKE_VOLATILE_STRIDE(128, os)) {
Chris@82 792 V T7, T4r, T4Z, T18, T1z, T3t, T3T, T2T, Te, T1f, T50, T4s, T2W, T3u, T1G;
Chris@82 793 V T3U, Tm, T1n, T1O, T2Z, T3y, T3X, T4w, T53, Tt, T1u, T1V, T2Y, T3B, T3W;
Chris@82 794 V T4z, T52, T2t, T3L, T3O, T2K, TR, TY, T5F, T5G, T5H, T5I, T4R, T5j, T2E;
Chris@82 795 V T3P, T4W, T5k, T2N, T3M, T22, T3E, T3H, T2j, TC, TJ, T5A, T5B, T5C, T5D;
Chris@82 796 V T4G, T5g, T2d, T3F, T4L, T5h, T2m, T3I;
/* Input stage, even indices: load ri/ii[0,16] and ri/ii[8,24] and form
   their sums and differences (radix-2 butterflies). */
Chris@82 797 {
Chris@82 798 V T3, T1x, T14, T2S, T6, T2R, T17, T1y;
Chris@82 799 {
Chris@82 800 V T1, T2, T12, T13;
Chris@82 801 T1 = LD(&(ri[0]), ivs, &(ri[0]));
Chris@82 802 T2 = LD(&(ri[WS(is, 16)]), ivs, &(ri[0]));
Chris@82 803 T3 = VADD(T1, T2);
Chris@82 804 T1x = VSUB(T1, T2);
Chris@82 805 T12 = LD(&(ii[0]), ivs, &(ii[0]));
Chris@82 806 T13 = LD(&(ii[WS(is, 16)]), ivs, &(ii[0]));
Chris@82 807 T14 = VADD(T12, T13);
Chris@82 808 T2S = VSUB(T12, T13);
Chris@82 809 }
Chris@82 810 {
Chris@82 811 V T4, T5, T15, T16;
Chris@82 812 T4 = LD(&(ri[WS(is, 8)]), ivs, &(ri[0]));
Chris@82 813 T5 = LD(&(ri[WS(is, 24)]), ivs, &(ri[0]));
Chris@82 814 T6 = VADD(T4, T5);
Chris@82 815 T2R = VSUB(T4, T5);
Chris@82 816 T15 = LD(&(ii[WS(is, 8)]), ivs, &(ii[0]));
Chris@82 817 T16 = LD(&(ii[WS(is, 24)]), ivs, &(ii[0]));
Chris@82 818 T17 = VADD(T15, T16);
Chris@82 819 T1y = VSUB(T15, T16);
Chris@82 820 }
Chris@82 821 T7 = VADD(T3, T6);
Chris@82 822 T4r = VSUB(T3, T6);
Chris@82 823 T4Z = VSUB(T14, T17);
Chris@82 824 T18 = VADD(T14, T17);
Chris@82 825 T1z = VSUB(T1x, T1y);
Chris@82 826 T3t = VADD(T1x, T1y);
Chris@82 827 T3T = VSUB(T2S, T2R);
Chris@82 828 T2T = VADD(T2R, T2S);
Chris@82 829 }
/* Input stage: butterflies on indices 4,20 and 28,12; the combined
   differences are rotated by 1/sqrt(2) (45-degree twiddle). */
Chris@82 830 {
Chris@82 831 V Ta, T1B, T1b, T1A, Td, T1D, T1e, T1E;
Chris@82 832 {
Chris@82 833 V T8, T9, T19, T1a;
Chris@82 834 T8 = LD(&(ri[WS(is, 4)]), ivs, &(ri[0]));
Chris@82 835 T9 = LD(&(ri[WS(is, 20)]), ivs, &(ri[0]));
Chris@82 836 Ta = VADD(T8, T9);
Chris@82 837 T1B = VSUB(T8, T9);
Chris@82 838 T19 = LD(&(ii[WS(is, 4)]), ivs, &(ii[0]));
Chris@82 839 T1a = LD(&(ii[WS(is, 20)]), ivs, &(ii[0]));
Chris@82 840 T1b = VADD(T19, T1a);
Chris@82 841 T1A = VSUB(T19, T1a);
Chris@82 842 }
Chris@82 843 {
Chris@82 844 V Tb, Tc, T1c, T1d;
Chris@82 845 Tb = LD(&(ri[WS(is, 28)]), ivs, &(ri[0]));
Chris@82 846 Tc = LD(&(ri[WS(is, 12)]), ivs, &(ri[0]));
Chris@82 847 Td = VADD(Tb, Tc);
Chris@82 848 T1D = VSUB(Tb, Tc);
Chris@82 849 T1c = LD(&(ii[WS(is, 28)]), ivs, &(ii[0]));
Chris@82 850 T1d = LD(&(ii[WS(is, 12)]), ivs, &(ii[0]));
Chris@82 851 T1e = VADD(T1c, T1d);
Chris@82 852 T1E = VSUB(T1c, T1d);
Chris@82 853 }
Chris@82 854 Te = VADD(Ta, Td);
Chris@82 855 T1f = VADD(T1b, T1e);
Chris@82 856 T50 = VSUB(Td, Ta);
Chris@82 857 T4s = VSUB(T1b, T1e);
Chris@82 858 {
Chris@82 859 V T2U, T2V, T1C, T1F;
Chris@82 860 T2U = VSUB(T1D, T1E);
Chris@82 861 T2V = VADD(T1B, T1A);
Chris@82 862 T2W = VMUL(LDK(KP707106781), VSUB(T2U, T2V));
Chris@82 863 T3u = VMUL(LDK(KP707106781), VADD(T2V, T2U));
Chris@82 864 T1C = VSUB(T1A, T1B);
Chris@82 865 T1F = VADD(T1D, T1E);
Chris@82 866 T1G = VMUL(LDK(KP707106781), VSUB(T1C, T1F));
Chris@82 867 T3U = VMUL(LDK(KP707106781), VADD(T1C, T1F));
Chris@82 868 }
Chris@82 869 }
/* Input stage: butterflies on indices 2,18 and 10,26, twiddled by
   cos/sin(pi/8) (KP923879532 / KP382683432). */
Chris@82 870 {
Chris@82 871 V Ti, T1L, T1j, T1J, Tl, T1I, T1m, T1M, T1K, T1N;
Chris@82 872 {
Chris@82 873 V Tg, Th, T1h, T1i;
Chris@82 874 Tg = LD(&(ri[WS(is, 2)]), ivs, &(ri[0]));
Chris@82 875 Th = LD(&(ri[WS(is, 18)]), ivs, &(ri[0]));
Chris@82 876 Ti = VADD(Tg, Th);
Chris@82 877 T1L = VSUB(Tg, Th);
Chris@82 878 T1h = LD(&(ii[WS(is, 2)]), ivs, &(ii[0]));
Chris@82 879 T1i = LD(&(ii[WS(is, 18)]), ivs, &(ii[0]));
Chris@82 880 T1j = VADD(T1h, T1i);
Chris@82 881 T1J = VSUB(T1h, T1i);
Chris@82 882 }
Chris@82 883 {
Chris@82 884 V Tj, Tk, T1k, T1l;
Chris@82 885 Tj = LD(&(ri[WS(is, 10)]), ivs, &(ri[0]));
Chris@82 886 Tk = LD(&(ri[WS(is, 26)]), ivs, &(ri[0]));
Chris@82 887 Tl = VADD(Tj, Tk);
Chris@82 888 T1I = VSUB(Tj, Tk);
Chris@82 889 T1k = LD(&(ii[WS(is, 10)]), ivs, &(ii[0]));
Chris@82 890 T1l = LD(&(ii[WS(is, 26)]), ivs, &(ii[0]));
Chris@82 891 T1m = VADD(T1k, T1l);
Chris@82 892 T1M = VSUB(T1k, T1l);
Chris@82 893 }
Chris@82 894 Tm = VADD(Ti, Tl);
Chris@82 895 T1n = VADD(T1j, T1m);
Chris@82 896 T1K = VADD(T1I, T1J);
Chris@82 897 T1N = VSUB(T1L, T1M);
Chris@82 898 T1O = VFNMS(LDK(KP923879532), T1N, VMUL(LDK(KP382683432), T1K));
Chris@82 899 T2Z = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1N));
Chris@82 900 {
Chris@82 901 V T3w, T3x, T4u, T4v;
Chris@82 902 T3w = VSUB(T1J, T1I);
Chris@82 903 T3x = VADD(T1L, T1M);
Chris@82 904 T3y = VFNMS(LDK(KP382683432), T3x, VMUL(LDK(KP923879532), T3w));
Chris@82 905 T3X = VFMA(LDK(KP382683432), T3w, VMUL(LDK(KP923879532), T3x));
Chris@82 906 T4u = VSUB(T1j, T1m);
Chris@82 907 T4v = VSUB(Ti, Tl);
Chris@82 908 T4w = VSUB(T4u, T4v);
Chris@82 909 T53 = VADD(T4v, T4u);
Chris@82 910 }
Chris@82 911 }
/* Input stage: butterflies on indices 30,14 and 6,22, same pi/8 twiddles
   with the complementary sign pattern. */
Chris@82 912 {
Chris@82 913 V Tp, T1S, T1q, T1Q, Ts, T1P, T1t, T1T, T1R, T1U;
Chris@82 914 {
Chris@82 915 V Tn, To, T1o, T1p;
Chris@82 916 Tn = LD(&(ri[WS(is, 30)]), ivs, &(ri[0]));
Chris@82 917 To = LD(&(ri[WS(is, 14)]), ivs, &(ri[0]));
Chris@82 918 Tp = VADD(Tn, To);
Chris@82 919 T1S = VSUB(Tn, To);
Chris@82 920 T1o = LD(&(ii[WS(is, 30)]), ivs, &(ii[0]));
Chris@82 921 T1p = LD(&(ii[WS(is, 14)]), ivs, &(ii[0]));
Chris@82 922 T1q = VADD(T1o, T1p);
Chris@82 923 T1Q = VSUB(T1o, T1p);
Chris@82 924 }
Chris@82 925 {
Chris@82 926 V Tq, Tr, T1r, T1s;
Chris@82 927 Tq = LD(&(ri[WS(is, 6)]), ivs, &(ri[0]));
Chris@82 928 Tr = LD(&(ri[WS(is, 22)]), ivs, &(ri[0]));
Chris@82 929 Ts = VADD(Tq, Tr);
Chris@82 930 T1P = VSUB(Tq, Tr);
Chris@82 931 T1r = LD(&(ii[WS(is, 6)]), ivs, &(ii[0]));
Chris@82 932 T1s = LD(&(ii[WS(is, 22)]), ivs, &(ii[0]));
Chris@82 933 T1t = VADD(T1r, T1s);
Chris@82 934 T1T = VSUB(T1r, T1s);
Chris@82 935 }
Chris@82 936 Tt = VADD(Tp, Ts);
Chris@82 937 T1u = VADD(T1q, T1t);
Chris@82 938 T1R = VADD(T1P, T1Q);
Chris@82 939 T1U = VSUB(T1S, T1T);
Chris@82 940 T1V = VFMA(LDK(KP382683432), T1R, VMUL(LDK(KP923879532), T1U));
Chris@82 941 T2Y = VFNMS(LDK(KP923879532), T1R, VMUL(LDK(KP382683432), T1U));
Chris@82 942 {
Chris@82 943 V T3z, T3A, T4x, T4y;
Chris@82 944 T3z = VSUB(T1Q, T1P);
Chris@82 945 T3A = VADD(T1S, T1T);
Chris@82 946 T3B = VFMA(LDK(KP923879532), T3z, VMUL(LDK(KP382683432), T3A));
Chris@82 947 T3W = VFNMS(LDK(KP382683432), T3z, VMUL(LDK(KP923879532), T3A));
Chris@82 948 T4x = VSUB(Tp, Ts);
Chris@82 949 T4y = VSUB(T1q, T1t);
Chris@82 950 T4z = VADD(T4x, T4y);
Chris@82 951 T52 = VSUB(T4x, T4y);
Chris@82 952 }
Chris@82 953 }
/* Input stage, odd indices (first half): butterflies on 31,15 / 7,23 /
   3,19 / 27,11 with 1/sqrt(2) rotations of the combined differences. */
Chris@82 954 {
Chris@82 955 V TN, T2p, T2J, T4S, TQ, T2G, T2s, T4T, TU, T2x, T2w, T4O, TX, T2z, T2C;
Chris@82 956 V T4P;
Chris@82 957 {
Chris@82 958 V TL, TM, T2H, T2I;
Chris@82 959 TL = LD(&(ri[WS(is, 31)]), ivs, &(ri[WS(is, 1)]));
Chris@82 960 TM = LD(&(ri[WS(is, 15)]), ivs, &(ri[WS(is, 1)]));
Chris@82 961 TN = VADD(TL, TM);
Chris@82 962 T2p = VSUB(TL, TM);
Chris@82 963 T2H = LD(&(ii[WS(is, 31)]), ivs, &(ii[WS(is, 1)]));
Chris@82 964 T2I = LD(&(ii[WS(is, 15)]), ivs, &(ii[WS(is, 1)]));
Chris@82 965 T2J = VSUB(T2H, T2I);
Chris@82 966 T4S = VADD(T2H, T2I);
Chris@82 967 }
Chris@82 968 {
Chris@82 969 V TO, TP, T2q, T2r;
Chris@82 970 TO = LD(&(ri[WS(is, 7)]), ivs, &(ri[WS(is, 1)]));
Chris@82 971 TP = LD(&(ri[WS(is, 23)]), ivs, &(ri[WS(is, 1)]));
Chris@82 972 TQ = VADD(TO, TP);
Chris@82 973 T2G = VSUB(TO, TP);
Chris@82 974 T2q = LD(&(ii[WS(is, 7)]), ivs, &(ii[WS(is, 1)]));
Chris@82 975 T2r = LD(&(ii[WS(is, 23)]), ivs, &(ii[WS(is, 1)]));
Chris@82 976 T2s = VSUB(T2q, T2r);
Chris@82 977 T4T = VADD(T2q, T2r);
Chris@82 978 }
Chris@82 979 {
Chris@82 980 V TS, TT, T2u, T2v;
Chris@82 981 TS = LD(&(ri[WS(is, 3)]), ivs, &(ri[WS(is, 1)]));
Chris@82 982 TT = LD(&(ri[WS(is, 19)]), ivs, &(ri[WS(is, 1)]));
Chris@82 983 TU = VADD(TS, TT);
Chris@82 984 T2x = VSUB(TS, TT);
Chris@82 985 T2u = LD(&(ii[WS(is, 3)]), ivs, &(ii[WS(is, 1)]));
Chris@82 986 T2v = LD(&(ii[WS(is, 19)]), ivs, &(ii[WS(is, 1)]));
Chris@82 987 T2w = VSUB(T2u, T2v);
Chris@82 988 T4O = VADD(T2u, T2v);
Chris@82 989 }
Chris@82 990 {
Chris@82 991 V TV, TW, T2A, T2B;
Chris@82 992 TV = LD(&(ri[WS(is, 27)]), ivs, &(ri[WS(is, 1)]));
Chris@82 993 TW = LD(&(ri[WS(is, 11)]), ivs, &(ri[WS(is, 1)]));
Chris@82 994 TX = VADD(TV, TW);
Chris@82 995 T2z = VSUB(TV, TW);
Chris@82 996 T2A = LD(&(ii[WS(is, 27)]), ivs, &(ii[WS(is, 1)]));
Chris@82 997 T2B = LD(&(ii[WS(is, 11)]), ivs, &(ii[WS(is, 1)]));
Chris@82 998 T2C = VSUB(T2A, T2B);
Chris@82 999 T4P = VADD(T2A, T2B);
Chris@82 1000 }
Chris@82 1001 T2t = VSUB(T2p, T2s);
Chris@82 1002 T3L = VADD(T2p, T2s);
Chris@82 1003 T3O = VSUB(T2J, T2G);
Chris@82 1004 T2K = VADD(T2G, T2J);
Chris@82 1005 TR = VADD(TN, TQ);
Chris@82 1006 TY = VADD(TU, TX);
Chris@82 1007 T5F = VSUB(TR, TY);
Chris@82 1008 {
Chris@82 1009 V T4N, T4Q, T2y, T2D;
Chris@82 1010 T5G = VADD(T4S, T4T);
Chris@82 1011 T5H = VADD(T4O, T4P);
Chris@82 1012 T5I = VSUB(T5G, T5H);
Chris@82 1013 T4N = VSUB(TN, TQ);
Chris@82 1014 T4Q = VSUB(T4O, T4P);
Chris@82 1015 T4R = VSUB(T4N, T4Q);
Chris@82 1016 T5j = VADD(T4N, T4Q);
Chris@82 1017 T2y = VSUB(T2w, T2x);
Chris@82 1018 T2D = VADD(T2z, T2C);
Chris@82 1019 T2E = VMUL(LDK(KP707106781), VSUB(T2y, T2D));
Chris@82 1020 T3P = VMUL(LDK(KP707106781), VADD(T2y, T2D));
Chris@82 1021 {
Chris@82 1022 V T4U, T4V, T2L, T2M;
Chris@82 1023 T4U = VSUB(T4S, T4T);
Chris@82 1024 T4V = VSUB(TX, TU);
Chris@82 1025 T4W = VSUB(T4U, T4V);
Chris@82 1026 T5k = VADD(T4V, T4U);
Chris@82 1027 T2L = VSUB(T2z, T2C);
Chris@82 1028 T2M = VADD(T2x, T2w);
Chris@82 1029 T2N = VMUL(LDK(KP707106781), VSUB(T2L, T2M));
Chris@82 1030 T3M = VMUL(LDK(KP707106781), VADD(T2M, T2L));
Chris@82 1031 }
Chris@82 1032 }
Chris@82 1033 }
/* Input stage, odd indices (second half): butterflies on 1,17 / 9,25 /
   5,21 / 29,13, mirroring the block above. */
Chris@82 1034 {
Chris@82 1035 V Ty, T2f, T21, T4C, TB, T1Y, T2i, T4D, TF, T28, T2b, T4I, TI, T23, T26;
Chris@82 1036 V T4J;
Chris@82 1037 {
Chris@82 1038 V Tw, Tx, T1Z, T20;
Chris@82 1039 Tw = LD(&(ri[WS(is, 1)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1040 Tx = LD(&(ri[WS(is, 17)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1041 Ty = VADD(Tw, Tx);
Chris@82 1042 T2f = VSUB(Tw, Tx);
Chris@82 1043 T1Z = LD(&(ii[WS(is, 1)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1044 T20 = LD(&(ii[WS(is, 17)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1045 T21 = VSUB(T1Z, T20);
Chris@82 1046 T4C = VADD(T1Z, T20);
Chris@82 1047 }
Chris@82 1048 {
Chris@82 1049 V Tz, TA, T2g, T2h;
Chris@82 1050 Tz = LD(&(ri[WS(is, 9)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1051 TA = LD(&(ri[WS(is, 25)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1052 TB = VADD(Tz, TA);
Chris@82 1053 T1Y = VSUB(Tz, TA);
Chris@82 1054 T2g = LD(&(ii[WS(is, 9)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1055 T2h = LD(&(ii[WS(is, 25)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1056 T2i = VSUB(T2g, T2h);
Chris@82 1057 T4D = VADD(T2g, T2h);
Chris@82 1058 }
Chris@82 1059 {
Chris@82 1060 V TD, TE, T29, T2a;
Chris@82 1061 TD = LD(&(ri[WS(is, 5)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1062 TE = LD(&(ri[WS(is, 21)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1063 TF = VADD(TD, TE);
Chris@82 1064 T28 = VSUB(TD, TE);
Chris@82 1065 T29 = LD(&(ii[WS(is, 5)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1066 T2a = LD(&(ii[WS(is, 21)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1067 T2b = VSUB(T29, T2a);
Chris@82 1068 T4I = VADD(T29, T2a);
Chris@82 1069 }
Chris@82 1070 {
Chris@82 1071 V TG, TH, T24, T25;
Chris@82 1072 TG = LD(&(ri[WS(is, 29)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1073 TH = LD(&(ri[WS(is, 13)]), ivs, &(ri[WS(is, 1)]));
Chris@82 1074 TI = VADD(TG, TH);
Chris@82 1075 T23 = VSUB(TG, TH);
Chris@82 1076 T24 = LD(&(ii[WS(is, 29)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1077 T25 = LD(&(ii[WS(is, 13)]), ivs, &(ii[WS(is, 1)]));
Chris@82 1078 T26 = VSUB(T24, T25);
Chris@82 1079 T4J = VADD(T24, T25);
Chris@82 1080 }
Chris@82 1081 T22 = VADD(T1Y, T21);
Chris@82 1082 T3E = VADD(T2f, T2i);
Chris@82 1083 T3H = VSUB(T21, T1Y);
Chris@82 1084 T2j = VSUB(T2f, T2i);
Chris@82 1085 TC = VADD(Ty, TB);
Chris@82 1086 TJ = VADD(TF, TI);
Chris@82 1087 T5A = VSUB(TC, TJ);
Chris@82 1088 {
Chris@82 1089 V T4E, T4F, T27, T2c;
Chris@82 1090 T5B = VADD(T4C, T4D);
Chris@82 1091 T5C = VADD(T4I, T4J);
Chris@82 1092 T5D = VSUB(T5B, T5C);
Chris@82 1093 T4E = VSUB(T4C, T4D);
Chris@82 1094 T4F = VSUB(TI, TF);
Chris@82 1095 T4G = VSUB(T4E, T4F);
Chris@82 1096 T5g = VADD(T4F, T4E);
Chris@82 1097 T27 = VSUB(T23, T26);
Chris@82 1098 T2c = VADD(T28, T2b);
Chris@82 1099 T2d = VMUL(LDK(KP707106781), VSUB(T27, T2c));
Chris@82 1100 T3F = VMUL(LDK(KP707106781), VADD(T2c, T27));
Chris@82 1101 {
Chris@82 1102 V T4H, T4K, T2k, T2l;
Chris@82 1103 T4H = VSUB(Ty, TB);
Chris@82 1104 T4K = VSUB(T4I, T4J);
Chris@82 1105 T4L = VSUB(T4H, T4K);
Chris@82 1106 T5h = VADD(T4H, T4K);
Chris@82 1107 T2k = VSUB(T2b, T28);
Chris@82 1108 T2l = VADD(T23, T26);
Chris@82 1109 T2m = VMUL(LDK(KP707106781), VSUB(T2k, T2l));
Chris@82 1110 T3I = VMUL(LDK(KP707106781), VADD(T2k, T2l));
Chris@82 1111 }
Chris@82 1112 }
Chris@82 1113 }
/* Output stage: recombine the intermediates and write all 32 complex
   outputs.  STM4 stores one result vector; STN4 then commits groups of
   four adjacent outputs (generated with -store-multiple 4, ostride 1). */
Chris@82 1114 {
Chris@82 1115 V T61, T62, T63, T64, T65, T66, T67, T68, T69, T6a, T6b, T6c, T6d, T6e, T6f;
Chris@82 1116 V T6g, T6h, T6i, T6j, T6k, T6l, T6m, T6n, T6o, T6p, T6q, T6r, T6s, T6t, T6u;
Chris@82 1117 V T6v, T6w;
Chris@82 1118 {
Chris@82 1119 V T4B, T57, T5a, T5c, T4Y, T56, T55, T5b;
Chris@82 1120 {
Chris@82 1121 V T4t, T4A, T58, T59;
Chris@82 1122 T4t = VSUB(T4r, T4s);
Chris@82 1123 T4A = VMUL(LDK(KP707106781), VSUB(T4w, T4z));
Chris@82 1124 T4B = VADD(T4t, T4A);
Chris@82 1125 T57 = VSUB(T4t, T4A);
Chris@82 1126 T58 = VFNMS(LDK(KP923879532), T4L, VMUL(LDK(KP382683432), T4G));
Chris@82 1127 T59 = VFMA(LDK(KP382683432), T4W, VMUL(LDK(KP923879532), T4R));
Chris@82 1128 T5a = VSUB(T58, T59);
Chris@82 1129 T5c = VADD(T58, T59);
Chris@82 1130 }
Chris@82 1131 {
Chris@82 1132 V T4M, T4X, T51, T54;
Chris@82 1133 T4M = VFMA(LDK(KP923879532), T4G, VMUL(LDK(KP382683432), T4L));
Chris@82 1134 T4X = VFNMS(LDK(KP923879532), T4W, VMUL(LDK(KP382683432), T4R));
Chris@82 1135 T4Y = VADD(T4M, T4X);
Chris@82 1136 T56 = VSUB(T4X, T4M);
Chris@82 1137 T51 = VSUB(T4Z, T50);
Chris@82 1138 T54 = VMUL(LDK(KP707106781), VSUB(T52, T53));
Chris@82 1139 T55 = VSUB(T51, T54);
Chris@82 1140 T5b = VADD(T51, T54);
Chris@82 1141 }
Chris@82 1142 T61 = VSUB(T4B, T4Y);
Chris@82 1143 STM4(&(ro[22]), T61, ovs, &(ro[0]));
Chris@82 1144 T62 = VSUB(T5b, T5c);
Chris@82 1145 STM4(&(io[22]), T62, ovs, &(io[0]));
Chris@82 1146 T63 = VADD(T4B, T4Y);
Chris@82 1147 STM4(&(ro[6]), T63, ovs, &(ro[0]));
Chris@82 1148 T64 = VADD(T5b, T5c);
Chris@82 1149 STM4(&(io[6]), T64, ovs, &(io[0]));
Chris@82 1150 T65 = VSUB(T55, T56);
Chris@82 1151 STM4(&(io[30]), T65, ovs, &(io[0]));
Chris@82 1152 T66 = VSUB(T57, T5a);
Chris@82 1153 STM4(&(ro[30]), T66, ovs, &(ro[0]));
Chris@82 1154 T67 = VADD(T55, T56);
Chris@82 1155 STM4(&(io[14]), T67, ovs, &(io[0]));
Chris@82 1156 T68 = VADD(T57, T5a);
Chris@82 1157 STM4(&(ro[14]), T68, ovs, &(ro[0]));
Chris@82 1158 }
Chris@82 1159 {
Chris@82 1160 V T5f, T5r, T5u, T5w, T5m, T5q, T5p, T5v;
Chris@82 1161 {
Chris@82 1162 V T5d, T5e, T5s, T5t;
Chris@82 1163 T5d = VADD(T4r, T4s);
Chris@82 1164 T5e = VMUL(LDK(KP707106781), VADD(T53, T52));
Chris@82 1165 T5f = VADD(T5d, T5e);
Chris@82 1166 T5r = VSUB(T5d, T5e);
Chris@82 1167 T5s = VFNMS(LDK(KP382683432), T5h, VMUL(LDK(KP923879532), T5g));
Chris@82 1168 T5t = VFMA(LDK(KP923879532), T5k, VMUL(LDK(KP382683432), T5j));
Chris@82 1169 T5u = VSUB(T5s, T5t);
Chris@82 1170 T5w = VADD(T5s, T5t);
Chris@82 1171 }
Chris@82 1172 {
Chris@82 1173 V T5i, T5l, T5n, T5o;
Chris@82 1174 T5i = VFMA(LDK(KP382683432), T5g, VMUL(LDK(KP923879532), T5h));
Chris@82 1175 T5l = VFNMS(LDK(KP382683432), T5k, VMUL(LDK(KP923879532), T5j));
Chris@82 1176 T5m = VADD(T5i, T5l);
Chris@82 1177 T5q = VSUB(T5l, T5i);
Chris@82 1178 T5n = VADD(T50, T4Z);
Chris@82 1179 T5o = VMUL(LDK(KP707106781), VADD(T4w, T4z));
Chris@82 1180 T5p = VSUB(T5n, T5o);
Chris@82 1181 T5v = VADD(T5n, T5o);
Chris@82 1182 }
Chris@82 1183 T69 = VSUB(T5f, T5m);
Chris@82 1184 STM4(&(ro[18]), T69, ovs, &(ro[0]));
Chris@82 1185 T6a = VSUB(T5v, T5w);
Chris@82 1186 STM4(&(io[18]), T6a, ovs, &(io[0]));
Chris@82 1187 T6b = VADD(T5f, T5m);
Chris@82 1188 STM4(&(ro[2]), T6b, ovs, &(ro[0]));
Chris@82 1189 T6c = VADD(T5v, T5w);
Chris@82 1190 STM4(&(io[2]), T6c, ovs, &(io[0]));
Chris@82 1191 T6d = VSUB(T5p, T5q);
Chris@82 1192 STM4(&(io[26]), T6d, ovs, &(io[0]));
Chris@82 1193 T6e = VSUB(T5r, T5u);
Chris@82 1194 STM4(&(ro[26]), T6e, ovs, &(ro[0]));
Chris@82 1195 T6f = VADD(T5p, T5q);
Chris@82 1196 STM4(&(io[10]), T6f, ovs, &(io[0]));
Chris@82 1197 T6g = VADD(T5r, T5u);
Chris@82 1198 STM4(&(ro[10]), T6g, ovs, &(ro[0]));
Chris@82 1199 }
Chris@82 1200 {
Chris@82 1201 V T5z, T5P, T5S, T5U, T5K, T5O, T5N, T5T;
Chris@82 1202 {
Chris@82 1203 V T5x, T5y, T5Q, T5R;
Chris@82 1204 T5x = VSUB(T7, Te);
Chris@82 1205 T5y = VSUB(T1n, T1u);
Chris@82 1206 T5z = VADD(T5x, T5y);
Chris@82 1207 T5P = VSUB(T5x, T5y);
Chris@82 1208 T5Q = VSUB(T5D, T5A);
Chris@82 1209 T5R = VADD(T5F, T5I);
Chris@82 1210 T5S = VMUL(LDK(KP707106781), VSUB(T5Q, T5R));
Chris@82 1211 T5U = VMUL(LDK(KP707106781), VADD(T5Q, T5R));
Chris@82 1212 }
Chris@82 1213 {
Chris@82 1214 V T5E, T5J, T5L, T5M;
Chris@82 1215 T5E = VADD(T5A, T5D);
Chris@82 1216 T5J = VSUB(T5F, T5I);
Chris@82 1217 T5K = VMUL(LDK(KP707106781), VADD(T5E, T5J));
Chris@82 1218 T5O = VMUL(LDK(KP707106781), VSUB(T5J, T5E));
Chris@82 1219 T5L = VSUB(T18, T1f);
Chris@82 1220 T5M = VSUB(Tt, Tm);
Chris@82 1221 T5N = VSUB(T5L, T5M);
Chris@82 1222 T5T = VADD(T5M, T5L);
Chris@82 1223 }
Chris@82 1224 T6h = VSUB(T5z, T5K);
Chris@82 1225 STM4(&(ro[20]), T6h, ovs, &(ro[0]));
Chris@82 1226 T6i = VSUB(T5T, T5U);
Chris@82 1227 STM4(&(io[20]), T6i, ovs, &(io[0]));
Chris@82 1228 T6j = VADD(T5z, T5K);
Chris@82 1229 STM4(&(ro[4]), T6j, ovs, &(ro[0]));
Chris@82 1230 T6k = VADD(T5T, T5U);
Chris@82 1231 STM4(&(io[4]), T6k, ovs, &(io[0]));
Chris@82 1232 T6l = VSUB(T5N, T5O);
Chris@82 1233 STM4(&(io[28]), T6l, ovs, &(io[0]));
Chris@82 1234 T6m = VSUB(T5P, T5S);
Chris@82 1235 STM4(&(ro[28]), T6m, ovs, &(ro[0]));
Chris@82 1236 T6n = VADD(T5N, T5O);
Chris@82 1237 STM4(&(io[12]), T6n, ovs, &(io[0]));
Chris@82 1238 T6o = VADD(T5P, T5S);
Chris@82 1239 STM4(&(ro[12]), T6o, ovs, &(ro[0]));
Chris@82 1240 }
Chris@82 1241 {
Chris@82 1242 V Tv, T5V, T5Y, T60, T10, T11, T1w, T5Z;
Chris@82 1243 {
Chris@82 1244 V Tf, Tu, T5W, T5X;
Chris@82 1245 Tf = VADD(T7, Te);
Chris@82 1246 Tu = VADD(Tm, Tt);
Chris@82 1247 Tv = VADD(Tf, Tu);
Chris@82 1248 T5V = VSUB(Tf, Tu);
Chris@82 1249 T5W = VADD(T5B, T5C);
Chris@82 1250 T5X = VADD(T5G, T5H);
Chris@82 1251 T5Y = VSUB(T5W, T5X);
Chris@82 1252 T60 = VADD(T5W, T5X);
Chris@82 1253 }
Chris@82 1254 {
Chris@82 1255 V TK, TZ, T1g, T1v;
Chris@82 1256 TK = VADD(TC, TJ);
Chris@82 1257 TZ = VADD(TR, TY);
Chris@82 1258 T10 = VADD(TK, TZ);
Chris@82 1259 T11 = VSUB(TZ, TK);
Chris@82 1260 T1g = VADD(T18, T1f);
Chris@82 1261 T1v = VADD(T1n, T1u);
Chris@82 1262 T1w = VSUB(T1g, T1v);
Chris@82 1263 T5Z = VADD(T1g, T1v);
Chris@82 1264 }
Chris@82 1265 T6p = VSUB(Tv, T10);
Chris@82 1266 STM4(&(ro[16]), T6p, ovs, &(ro[0]));
Chris@82 1267 T6q = VSUB(T5Z, T60);
Chris@82 1268 STM4(&(io[16]), T6q, ovs, &(io[0]));
Chris@82 1269 T6r = VADD(Tv, T10);
Chris@82 1270 STM4(&(ro[0]), T6r, ovs, &(ro[0]));
Chris@82 1271 T6s = VADD(T5Z, T60);
Chris@82 1272 STM4(&(io[0]), T6s, ovs, &(io[0]));
Chris@82 1273 T6t = VADD(T11, T1w);
Chris@82 1274 STM4(&(io[8]), T6t, ovs, &(io[0]));
Chris@82 1275 T6u = VADD(T5V, T5Y);
Chris@82 1276 STM4(&(ro[8]), T6u, ovs, &(ro[0]));
Chris@82 1277 T6v = VSUB(T1w, T11);
Chris@82 1278 STM4(&(io[24]), T6v, ovs, &(io[0]));
Chris@82 1279 T6w = VSUB(T5V, T5Y);
Chris@82 1280 STM4(&(ro[24]), T6w, ovs, &(ro[0]));
Chris@82 1281 }
Chris@82 1282 {
Chris@82 1283 V T6x, T6y, T6z, T6A, T6B, T6C, T6D, T6E;
Chris@82 1284 {
Chris@82 1285 V T1X, T33, T31, T37, T2o, T34, T2P, T35;
Chris@82 1286 {
Chris@82 1287 V T1H, T1W, T2X, T30;
Chris@82 1288 T1H = VSUB(T1z, T1G);
Chris@82 1289 T1W = VSUB(T1O, T1V);
Chris@82 1290 T1X = VADD(T1H, T1W);
Chris@82 1291 T33 = VSUB(T1H, T1W);
Chris@82 1292 T2X = VSUB(T2T, T2W);
Chris@82 1293 T30 = VSUB(T2Y, T2Z);
Chris@82 1294 T31 = VSUB(T2X, T30);
Chris@82 1295 T37 = VADD(T2X, T30);
Chris@82 1296 }
Chris@82 1297 {
Chris@82 1298 V T2e, T2n, T2F, T2O;
Chris@82 1299 T2e = VSUB(T22, T2d);
Chris@82 1300 T2n = VSUB(T2j, T2m);
Chris@82 1301 T2o = VFMA(LDK(KP980785280), T2e, VMUL(LDK(KP195090322), T2n));
Chris@82 1302 T34 = VFNMS(LDK(KP980785280), T2n, VMUL(LDK(KP195090322), T2e));
Chris@82 1303 T2F = VSUB(T2t, T2E);
Chris@82 1304 T2O = VSUB(T2K, T2N);
Chris@82 1305 T2P = VFNMS(LDK(KP980785280), T2O, VMUL(LDK(KP195090322), T2F));
Chris@82 1306 T35 = VFMA(LDK(KP195090322), T2O, VMUL(LDK(KP980785280), T2F));
Chris@82 1307 }
Chris@82 1308 {
Chris@82 1309 V T2Q, T38, T32, T36;
Chris@82 1310 T2Q = VADD(T2o, T2P);
Chris@82 1311 T6x = VSUB(T1X, T2Q);
Chris@82 1312 STM4(&(ro[23]), T6x, ovs, &(ro[1]));
Chris@82 1313 T6y = VADD(T1X, T2Q);
Chris@82 1314 STM4(&(ro[7]), T6y, ovs, &(ro[1]));
Chris@82 1315 T38 = VADD(T34, T35);
Chris@82 1316 T6z = VSUB(T37, T38);
Chris@82 1317 STM4(&(io[23]), T6z, ovs, &(io[1]));
Chris@82 1318 T6A = VADD(T37, T38);
Chris@82 1319 STM4(&(io[7]), T6A, ovs, &(io[1]));
Chris@82 1320 T32 = VSUB(T2P, T2o);
Chris@82 1321 T6B = VSUB(T31, T32);
Chris@82 1322 STM4(&(io[31]), T6B, ovs, &(io[1]));
Chris@82 1323 T6C = VADD(T31, T32);
Chris@82 1324 STM4(&(io[15]), T6C, ovs, &(io[1]));
Chris@82 1325 T36 = VSUB(T34, T35);
Chris@82 1326 T6D = VSUB(T33, T36);
Chris@82 1327 STM4(&(ro[31]), T6D, ovs, &(ro[1]));
Chris@82 1328 T6E = VADD(T33, T36);
Chris@82 1329 STM4(&(ro[15]), T6E, ovs, &(ro[1]));
Chris@82 1330 }
Chris@82 1331 }
Chris@82 1332 {
Chris@82 1333 V T3D, T41, T3Z, T45, T3K, T42, T3R, T43;
Chris@82 1334 {
Chris@82 1335 V T3v, T3C, T3V, T3Y;
Chris@82 1336 T3v = VSUB(T3t, T3u);
Chris@82 1337 T3C = VSUB(T3y, T3B);
Chris@82 1338 T3D = VADD(T3v, T3C);
Chris@82 1339 T41 = VSUB(T3v, T3C);
Chris@82 1340 T3V = VSUB(T3T, T3U);
Chris@82 1341 T3Y = VSUB(T3W, T3X);
Chris@82 1342 T3Z = VSUB(T3V, T3Y);
Chris@82 1343 T45 = VADD(T3V, T3Y);
Chris@82 1344 }
Chris@82 1345 {
Chris@82 1346 V T3G, T3J, T3N, T3Q;
Chris@82 1347 T3G = VSUB(T3E, T3F);
Chris@82 1348 T3J = VSUB(T3H, T3I);
Chris@82 1349 T3K = VFMA(LDK(KP555570233), T3G, VMUL(LDK(KP831469612), T3J));
Chris@82 1350 T42 = VFNMS(LDK(KP831469612), T3G, VMUL(LDK(KP555570233), T3J));
Chris@82 1351 T3N = VSUB(T3L, T3M);
Chris@82 1352 T3Q = VSUB(T3O, T3P);
Chris@82 1353 T3R = VFNMS(LDK(KP831469612), T3Q, VMUL(LDK(KP555570233), T3N));
Chris@82 1354 T43 = VFMA(LDK(KP831469612), T3N, VMUL(LDK(KP555570233), T3Q));
Chris@82 1355 }
Chris@82 1356 {
Chris@82 1357 V T3S, T6F, T6G, T46, T6H, T6I;
Chris@82 1358 T3S = VADD(T3K, T3R);
Chris@82 1359 T6F = VSUB(T3D, T3S);
Chris@82 1360 STM4(&(ro[21]), T6F, ovs, &(ro[1]));
Chris@82 1361 STN4(&(ro[20]), T6h, T6F, T61, T6x, ovs);
Chris@82 1362 T6G = VADD(T3D, T3S);
Chris@82 1363 STM4(&(ro[5]), T6G, ovs, &(ro[1]));
Chris@82 1364 STN4(&(ro[4]), T6j, T6G, T63, T6y, ovs);
Chris@82 1365 T46 = VADD(T42, T43);
Chris@82 1366 T6H = VSUB(T45, T46);
Chris@82 1367 STM4(&(io[21]), T6H, ovs, &(io[1]));
Chris@82 1368 STN4(&(io[20]), T6i, T6H, T62, T6z, ovs);
Chris@82 1369 T6I = VADD(T45, T46);
Chris@82 1370 STM4(&(io[5]), T6I, ovs, &(io[1]));
Chris@82 1371 STN4(&(io[4]), T6k, T6I, T64, T6A, ovs);
Chris@82 1372 }
Chris@82 1373 {
Chris@82 1374 V T40, T6J, T6K, T44, T6L, T6M;
Chris@82 1375 T40 = VSUB(T3R, T3K);
Chris@82 1376 T6J = VSUB(T3Z, T40);
Chris@82 1377 STM4(&(io[29]), T6J, ovs, &(io[1]));
Chris@82 1378 STN4(&(io[28]), T6l, T6J, T65, T6B, ovs);
Chris@82 1379 T6K = VADD(T3Z, T40);
Chris@82 1380 STM4(&(io[13]), T6K, ovs, &(io[1]));
Chris@82 1381 STN4(&(io[12]), T6n, T6K, T67, T6C, ovs);
Chris@82 1382 T44 = VSUB(T42, T43);
Chris@82 1383 T6L = VSUB(T41, T44);
Chris@82 1384 STM4(&(ro[29]), T6L, ovs, &(ro[1]));
Chris@82 1385 STN4(&(ro[28]), T6m, T6L, T66, T6D, ovs);
Chris@82 1386 T6M = VADD(T41, T44);
Chris@82 1387 STM4(&(ro[13]), T6M, ovs, &(ro[1]));
Chris@82 1388 STN4(&(ro[12]), T6o, T6M, T68, T6E, ovs);
Chris@82 1389 }
Chris@82 1390 }
Chris@82 1391 }
Chris@82 1392 {
Chris@82 1393 V T6N, T6O, T6P, T6Q, T6R, T6S, T6T, T6U;
Chris@82 1394 {
Chris@82 1395 V T49, T4l, T4j, T4p, T4c, T4m, T4f, T4n;
Chris@82 1396 {
Chris@82 1397 V T47, T48, T4h, T4i;
Chris@82 1398 T47 = VADD(T3t, T3u);
Chris@82 1399 T48 = VADD(T3X, T3W);
Chris@82 1400 T49 = VADD(T47, T48);
Chris@82 1401 T4l = VSUB(T47, T48);
Chris@82 1402 T4h = VADD(T3T, T3U);
Chris@82 1403 T4i = VADD(T3y, T3B);
Chris@82 1404 T4j = VSUB(T4h, T4i);
Chris@82 1405 T4p = VADD(T4h, T4i);
Chris@82 1406 }
Chris@82 1407 {
Chris@82 1408 V T4a, T4b, T4d, T4e;
Chris@82 1409 T4a = VADD(T3E, T3F);
Chris@82 1410 T4b = VADD(T3H, T3I);
Chris@82 1411 T4c = VFMA(LDK(KP980785280), T4a, VMUL(LDK(KP195090322), T4b));
Chris@82 1412 T4m = VFNMS(LDK(KP195090322), T4a, VMUL(LDK(KP980785280), T4b));
Chris@82 1413 T4d = VADD(T3L, T3M);
Chris@82 1414 T4e = VADD(T3O, T3P);
Chris@82 1415 T4f = VFNMS(LDK(KP195090322), T4e, VMUL(LDK(KP980785280), T4d));
Chris@82 1416 T4n = VFMA(LDK(KP195090322), T4d, VMUL(LDK(KP980785280), T4e));
Chris@82 1417 }
Chris@82 1418 {
Chris@82 1419 V T4g, T4q, T4k, T4o;
Chris@82 1420 T4g = VADD(T4c, T4f);
Chris@82 1421 T6N = VSUB(T49, T4g);
Chris@82 1422 STM4(&(ro[17]), T6N, ovs, &(ro[1]));
Chris@82 1423 T6O = VADD(T49, T4g);
Chris@82 1424 STM4(&(ro[1]), T6O, ovs, &(ro[1]));
Chris@82 1425 T4q = VADD(T4m, T4n);
Chris@82 1426 T6P = VSUB(T4p, T4q);
Chris@82 1427 STM4(&(io[17]), T6P, ovs, &(io[1]));
Chris@82 1428 T6Q = VADD(T4p, T4q);
Chris@82 1429 STM4(&(io[1]), T6Q, ovs, &(io[1]));
Chris@82 1430 T4k = VSUB(T4f, T4c);
Chris@82 1431 T6R = VSUB(T4j, T4k);
Chris@82 1432 STM4(&(io[25]), T6R, ovs, &(io[1]));
Chris@82 1433 T6S = VADD(T4j, T4k);
Chris@82 1434 STM4(&(io[9]), T6S, ovs, &(io[1]));
Chris@82 1435 T4o = VSUB(T4m, T4n);
Chris@82 1436 T6T = VSUB(T4l, T4o);
Chris@82 1437 STM4(&(ro[25]), T6T, ovs, &(ro[1]));
Chris@82 1438 T6U = VADD(T4l, T4o);
Chris@82 1439 STM4(&(ro[9]), T6U, ovs, &(ro[1]));
Chris@82 1440 }
Chris@82 1441 }
Chris@82 1442 {
Chris@82 1443 V T3b, T3n, T3l, T3r, T3e, T3o, T3h, T3p;
Chris@82 1444 {
Chris@82 1445 V T39, T3a, T3j, T3k;
Chris@82 1446 T39 = VADD(T1z, T1G);
Chris@82 1447 T3a = VADD(T2Z, T2Y);
Chris@82 1448 T3b = VADD(T39, T3a);
Chris@82 1449 T3n = VSUB(T39, T3a);
Chris@82 1450 T3j = VADD(T2T, T2W);
Chris@82 1451 T3k = VADD(T1O, T1V);
Chris@82 1452 T3l = VSUB(T3j, T3k);
Chris@82 1453 T3r = VADD(T3j, T3k);
Chris@82 1454 }
Chris@82 1455 {
Chris@82 1456 V T3c, T3d, T3f, T3g;
Chris@82 1457 T3c = VADD(T22, T2d);
Chris@82 1458 T3d = VADD(T2j, T2m);
Chris@82 1459 T3e = VFMA(LDK(KP555570233), T3c, VMUL(LDK(KP831469612), T3d));
Chris@82 1460 T3o = VFNMS(LDK(KP555570233), T3d, VMUL(LDK(KP831469612), T3c));
Chris@82 1461 T3f = VADD(T2t, T2E);
Chris@82 1462 T3g = VADD(T2K, T2N);
Chris@82 1463 T3h = VFNMS(LDK(KP555570233), T3g, VMUL(LDK(KP831469612), T3f));
Chris@82 1464 T3p = VFMA(LDK(KP831469612), T3g, VMUL(LDK(KP555570233), T3f));
Chris@82 1465 }
Chris@82 1466 {
Chris@82 1467 V T3i, T6V, T6W, T3s, T6X, T6Y;
Chris@82 1468 T3i = VADD(T3e, T3h);
Chris@82 1469 T6V = VSUB(T3b, T3i);
Chris@82 1470 STM4(&(ro[19]), T6V, ovs, &(ro[1]));
Chris@82 1471 STN4(&(ro[16]), T6p, T6N, T69, T6V, ovs);
Chris@82 1472 T6W = VADD(T3b, T3i);
Chris@82 1473 STM4(&(ro[3]), T6W, ovs, &(ro[1]));
Chris@82 1474 STN4(&(ro[0]), T6r, T6O, T6b, T6W, ovs);
Chris@82 1475 T3s = VADD(T3o, T3p);
Chris@82 1476 T6X = VSUB(T3r, T3s);
Chris@82 1477 STM4(&(io[19]), T6X, ovs, &(io[1]));
Chris@82 1478 STN4(&(io[16]), T6q, T6P, T6a, T6X, ovs);
Chris@82 1479 T6Y = VADD(T3r, T3s);
Chris@82 1480 STM4(&(io[3]), T6Y, ovs, &(io[1]));
Chris@82 1481 STN4(&(io[0]), T6s, T6Q, T6c, T6Y, ovs);
Chris@82 1482 }
Chris@82 1483 {
Chris@82 1484 V T3m, T6Z, T70, T3q, T71, T72;
Chris@82 1485 T3m = VSUB(T3h, T3e);
Chris@82 1486 T6Z = VSUB(T3l, T3m);
Chris@82 1487 STM4(&(io[27]), T6Z, ovs, &(io[1]));
Chris@82 1488 STN4(&(io[24]), T6v, T6R, T6d, T6Z, ovs);
Chris@82 1489 T70 = VADD(T3l, T3m);
Chris@82 1490 STM4(&(io[11]), T70, ovs, &(io[1]));
Chris@82 1491 STN4(&(io[8]), T6t, T6S, T6f, T70, ovs);
Chris@82 1492 T3q = VSUB(T3o, T3p);
Chris@82 1493 T71 = VSUB(T3n, T3q);
Chris@82 1494 STM4(&(ro[27]), T71, ovs, &(ro[1]));
Chris@82 1495 STN4(&(ro[24]), T6w, T6T, T6e, T71, ovs);
Chris@82 1496 T72 = VADD(T3n, T3q);
Chris@82 1497 STM4(&(ro[11]), T72, ovs, &(ro[1]));
Chris@82 1498 STN4(&(ro[8]), T6u, T6U, T6g, T72, ovs);
Chris@82 1499 }
Chris@82 1500 }
Chris@82 1501 }
Chris@82 1502 }
Chris@82 1503 }
Chris@82 1504 }
Chris@82 1505 VLEAVE();
Chris@82 1506 }
Chris@82 1507
/* Codelet descriptor for the non-FMA variant: size-32 DFT named "n2sv_32";
   op counts {340 adds, 52 muls, 32 fused multiply/adds, 0 other} — these
   match the generator banner in this #else branch. */
Chris@82 1508 static const kdft_desc desc = { 32, XSIMD_STRING("n2sv_32"), {340, 52, 32, 0}, &GENUS, 0, 1, 0, 0 };
Chris@82 1509
/* Entry point: registers the non-FMA n2sv_32 kernel (via its descriptor)
   with the FFTW planner so it can be selected at plan time. */
Chris@82 1510 void XSIMD(codelet_n2sv_32) (planner *p) {
Chris@82 1511 X(kdft_register) (p, n2sv_32, &desc);
Chris@82 1512 }
Chris@82 1513
Chris@82 1514 #endif