annotate src/fftw-3.3.5/kernel/transpose.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 #include "ifftw.h"
Chris@42 22
Chris@42 23 /* in place square transposition, iterative */
Chris@42 24 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
Chris@42 25 {
Chris@42 26 INT i0, i1, v;
Chris@42 27
Chris@42 28 switch (vl) {
Chris@42 29 case 1:
Chris@42 30 for (i1 = 1; i1 < n; ++i1) {
Chris@42 31 for (i0 = 0; i0 < i1; ++i0) {
Chris@42 32 R x0 = I[i1 * s0 + i0 * s1];
Chris@42 33 R y0 = I[i1 * s1 + i0 * s0];
Chris@42 34 I[i1 * s1 + i0 * s0] = x0;
Chris@42 35 I[i1 * s0 + i0 * s1] = y0;
Chris@42 36 }
Chris@42 37 }
Chris@42 38 break;
Chris@42 39 case 2:
Chris@42 40 for (i1 = 1; i1 < n; ++i1) {
Chris@42 41 for (i0 = 0; i0 < i1; ++i0) {
Chris@42 42 R x0 = I[i1 * s0 + i0 * s1];
Chris@42 43 R x1 = I[i1 * s0 + i0 * s1 + 1];
Chris@42 44 R y0 = I[i1 * s1 + i0 * s0];
Chris@42 45 R y1 = I[i1 * s1 + i0 * s0 + 1];
Chris@42 46 I[i1 * s1 + i0 * s0] = x0;
Chris@42 47 I[i1 * s1 + i0 * s0 + 1] = x1;
Chris@42 48 I[i1 * s0 + i0 * s1] = y0;
Chris@42 49 I[i1 * s0 + i0 * s1 + 1] = y1;
Chris@42 50 }
Chris@42 51 }
Chris@42 52 break;
Chris@42 53 default:
Chris@42 54 for (i1 = 1; i1 < n; ++i1) {
Chris@42 55 for (i0 = 0; i0 < i1; ++i0) {
Chris@42 56 for (v = 0; v < vl; ++v) {
Chris@42 57 R x0 = I[i1 * s0 + i0 * s1 + v];
Chris@42 58 R y0 = I[i1 * s1 + i0 * s0 + v];
Chris@42 59 I[i1 * s1 + i0 * s0 + v] = x0;
Chris@42 60 I[i1 * s0 + i0 * s1 + v] = y0;
Chris@42 61 }
Chris@42 62 }
Chris@42 63 }
Chris@42 64 break;
Chris@42 65 }
Chris@42 66 }
Chris@42 67
Chris@42 68 struct transpose_closure {
Chris@42 69 R *I;
Chris@42 70 INT s0, s1, vl, tilesz;
Chris@42 71 R *buf0, *buf1;
Chris@42 72 };
Chris@42 73
Chris@42 74 static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
Chris@42 75 {
Chris@42 76 struct transpose_closure *k = (struct transpose_closure *)args;
Chris@42 77 R *I = k->I;
Chris@42 78 INT s0 = k->s0, s1 = k->s1, vl = k->vl;
Chris@42 79 INT i0, i1, v;
Chris@42 80
Chris@42 81 switch (vl) {
Chris@42 82 case 1:
Chris@42 83 for (i1 = n1l; i1 < n1u; ++i1) {
Chris@42 84 for (i0 = n0l; i0 < n0u; ++i0) {
Chris@42 85 R x0 = I[i1 * s0 + i0 * s1];
Chris@42 86 R y0 = I[i1 * s1 + i0 * s0];
Chris@42 87 I[i1 * s1 + i0 * s0] = x0;
Chris@42 88 I[i1 * s0 + i0 * s1] = y0;
Chris@42 89 }
Chris@42 90 }
Chris@42 91 break;
Chris@42 92 case 2:
Chris@42 93 for (i1 = n1l; i1 < n1u; ++i1) {
Chris@42 94 for (i0 = n0l; i0 < n0u; ++i0) {
Chris@42 95 R x0 = I[i1 * s0 + i0 * s1];
Chris@42 96 R x1 = I[i1 * s0 + i0 * s1 + 1];
Chris@42 97 R y0 = I[i1 * s1 + i0 * s0];
Chris@42 98 R y1 = I[i1 * s1 + i0 * s0 + 1];
Chris@42 99 I[i1 * s1 + i0 * s0] = x0;
Chris@42 100 I[i1 * s1 + i0 * s0 + 1] = x1;
Chris@42 101 I[i1 * s0 + i0 * s1] = y0;
Chris@42 102 I[i1 * s0 + i0 * s1 + 1] = y1;
Chris@42 103 }
Chris@42 104 }
Chris@42 105 break;
Chris@42 106 default:
Chris@42 107 for (i1 = n1l; i1 < n1u; ++i1) {
Chris@42 108 for (i0 = n0l; i0 < n0u; ++i0) {
Chris@42 109 for (v = 0; v < vl; ++v) {
Chris@42 110 R x0 = I[i1 * s0 + i0 * s1 + v];
Chris@42 111 R y0 = I[i1 * s1 + i0 * s0 + v];
Chris@42 112 I[i1 * s1 + i0 * s0 + v] = x0;
Chris@42 113 I[i1 * s0 + i0 * s1 + v] = y0;
Chris@42 114 }
Chris@42 115 }
Chris@42 116 }
Chris@42 117 }
Chris@42 118 }
Chris@42 119
Chris@42 120 static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
Chris@42 121 {
Chris@42 122 struct transpose_closure *k = (struct transpose_closure *)args;
Chris@42 123 X(cpy2d_ci)(k->I + n0l * k->s0 + n1l * k->s1,
Chris@42 124 k->buf0,
Chris@42 125 n0u - n0l, k->s0, k->vl,
Chris@42 126 n1u - n1l, k->s1, k->vl * (n0u - n0l),
Chris@42 127 k->vl);
Chris@42 128 X(cpy2d_ci)(k->I + n0l * k->s1 + n1l * k->s0,
Chris@42 129 k->buf1,
Chris@42 130 n0u - n0l, k->s1, k->vl,
Chris@42 131 n1u - n1l, k->s0, k->vl * (n0u - n0l),
Chris@42 132 k->vl);
Chris@42 133 X(cpy2d_co)(k->buf1,
Chris@42 134 k->I + n0l * k->s0 + n1l * k->s1,
Chris@42 135 n0u - n0l, k->vl, k->s0,
Chris@42 136 n1u - n1l, k->vl * (n0u - n0l), k->s1,
Chris@42 137 k->vl);
Chris@42 138 X(cpy2d_co)(k->buf0,
Chris@42 139 k->I + n0l * k->s1 + n1l * k->s0,
Chris@42 140 n0u - n0l, k->vl, k->s1,
Chris@42 141 n1u - n1l, k->vl * (n0u - n0l), k->s0,
Chris@42 142 k->vl);
Chris@42 143 }
Chris@42 144
Chris@42 145 static void transpose_rec(R *I, INT n,
Chris@42 146 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u,
Chris@42 147 void *args),
Chris@42 148 struct transpose_closure *k)
Chris@42 149 {
Chris@42 150 tail:
Chris@42 151 if (n > 1) {
Chris@42 152 INT n2 = n / 2;
Chris@42 153 k->I = I;
Chris@42 154 X(tile2d)(0, n2, n2, n, k->tilesz, f, k);
Chris@42 155 transpose_rec(I, n2, f, k);
Chris@42 156 I += n2 * (k->s0 + k->s1); n -= n2; goto tail;
Chris@42 157 }
Chris@42 158 }
Chris@42 159
Chris@42 160 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl)
Chris@42 161 {
Chris@42 162 struct transpose_closure k;
Chris@42 163 k.s0 = s0;
Chris@42 164 k.s1 = s1;
Chris@42 165 k.vl = vl;
Chris@42 166 /* two blocks must be in cache, to be swapped */
Chris@42 167 k.tilesz = X(compute_tilesz)(vl, 2);
Chris@42 168 k.buf0 = k.buf1 = 0; /* unused */
Chris@42 169 transpose_rec(I, n, dotile, &k);
Chris@42 170 }
Chris@42 171
Chris@42 172 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl)
Chris@42 173 {
Chris@42 174 struct transpose_closure k;
Chris@42 175 /* Assume that the the rows of I conflict into the same cache
Chris@42 176 lines, and therefore we don't need to reserve cache space for
Chris@42 177 the input. If the rows don't conflict, there is no reason
Chris@42 178 to use tiledbuf at all.*/
Chris@42 179 R buf0[CACHESIZE / (2 * sizeof(R))];
Chris@42 180 R buf1[CACHESIZE / (2 * sizeof(R))];
Chris@42 181 k.s0 = s0;
Chris@42 182 k.s1 = s1;
Chris@42 183 k.vl = vl;
Chris@42 184 k.tilesz = X(compute_tilesz)(vl, 2);
Chris@42 185 k.buf0 = buf0;
Chris@42 186 k.buf1 = buf1;
Chris@42 187 A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf0));
Chris@42 188 A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf1));
Chris@42 189 transpose_rec(I, n, dotile_buf, &k);
Chris@42 190 }
Chris@42 191