comparison src/fftw-3.3.3/kernel/transpose.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 #include "ifftw.h"
22
23 /* in place square transposition, iterative */
24 void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl)
25 {
26 INT i0, i1, v;
27
28 switch (vl) {
29 case 1:
30 for (i1 = 1; i1 < n; ++i1) {
31 for (i0 = 0; i0 < i1; ++i0) {
32 R x0 = I[i1 * s0 + i0 * s1];
33 R y0 = I[i1 * s1 + i0 * s0];
34 I[i1 * s1 + i0 * s0] = x0;
35 I[i1 * s0 + i0 * s1] = y0;
36 }
37 }
38 break;
39 case 2:
40 for (i1 = 1; i1 < n; ++i1) {
41 for (i0 = 0; i0 < i1; ++i0) {
42 R x0 = I[i1 * s0 + i0 * s1];
43 R x1 = I[i1 * s0 + i0 * s1 + 1];
44 R y0 = I[i1 * s1 + i0 * s0];
45 R y1 = I[i1 * s1 + i0 * s0 + 1];
46 I[i1 * s1 + i0 * s0] = x0;
47 I[i1 * s1 + i0 * s0 + 1] = x1;
48 I[i1 * s0 + i0 * s1] = y0;
49 I[i1 * s0 + i0 * s1 + 1] = y1;
50 }
51 }
52 break;
53 default:
54 for (i1 = 1; i1 < n; ++i1) {
55 for (i0 = 0; i0 < i1; ++i0) {
56 for (v = 0; v < vl; ++v) {
57 R x0 = I[i1 * s0 + i0 * s1 + v];
58 R y0 = I[i1 * s1 + i0 * s0 + v];
59 I[i1 * s1 + i0 * s0 + v] = x0;
60 I[i1 * s0 + i0 * s1 + v] = y0;
61 }
62 }
63 }
64 break;
65 }
66 }
67
68 struct transpose_closure {
69 R *I;
70 INT s0, s1, vl, tilesz;
71 R *buf0, *buf1;
72 };
73
74 static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
75 {
76 struct transpose_closure *k = (struct transpose_closure *)args;
77 R *I = k->I;
78 INT s0 = k->s0, s1 = k->s1, vl = k->vl;
79 INT i0, i1, v;
80
81 switch (vl) {
82 case 1:
83 for (i1 = n1l; i1 < n1u; ++i1) {
84 for (i0 = n0l; i0 < n0u; ++i0) {
85 R x0 = I[i1 * s0 + i0 * s1];
86 R y0 = I[i1 * s1 + i0 * s0];
87 I[i1 * s1 + i0 * s0] = x0;
88 I[i1 * s0 + i0 * s1] = y0;
89 }
90 }
91 break;
92 case 2:
93 for (i1 = n1l; i1 < n1u; ++i1) {
94 for (i0 = n0l; i0 < n0u; ++i0) {
95 R x0 = I[i1 * s0 + i0 * s1];
96 R x1 = I[i1 * s0 + i0 * s1 + 1];
97 R y0 = I[i1 * s1 + i0 * s0];
98 R y1 = I[i1 * s1 + i0 * s0 + 1];
99 I[i1 * s1 + i0 * s0] = x0;
100 I[i1 * s1 + i0 * s0 + 1] = x1;
101 I[i1 * s0 + i0 * s1] = y0;
102 I[i1 * s0 + i0 * s1 + 1] = y1;
103 }
104 }
105 break;
106 default:
107 for (i1 = n1l; i1 < n1u; ++i1) {
108 for (i0 = n0l; i0 < n0u; ++i0) {
109 for (v = 0; v < vl; ++v) {
110 R x0 = I[i1 * s0 + i0 * s1 + v];
111 R y0 = I[i1 * s1 + i0 * s0 + v];
112 I[i1 * s1 + i0 * s0 + v] = x0;
113 I[i1 * s0 + i0 * s1 + v] = y0;
114 }
115 }
116 }
117 }
118 }
119
120 static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
121 {
122 struct transpose_closure *k = (struct transpose_closure *)args;
123 X(cpy2d_ci)(k->I + n0l * k->s0 + n1l * k->s1,
124 k->buf0,
125 n0u - n0l, k->s0, k->vl,
126 n1u - n1l, k->s1, k->vl * (n0u - n0l),
127 k->vl);
128 X(cpy2d_ci)(k->I + n0l * k->s1 + n1l * k->s0,
129 k->buf1,
130 n0u - n0l, k->s1, k->vl,
131 n1u - n1l, k->s0, k->vl * (n0u - n0l),
132 k->vl);
133 X(cpy2d_co)(k->buf1,
134 k->I + n0l * k->s0 + n1l * k->s1,
135 n0u - n0l, k->vl, k->s0,
136 n1u - n1l, k->vl * (n0u - n0l), k->s1,
137 k->vl);
138 X(cpy2d_co)(k->buf0,
139 k->I + n0l * k->s1 + n1l * k->s0,
140 n0u - n0l, k->vl, k->s1,
141 n1u - n1l, k->vl * (n0u - n0l), k->s0,
142 k->vl);
143 }
144
145 static void transpose_rec(R *I, INT n,
146 void (*f)(INT n0l, INT n0u, INT n1l, INT n1u,
147 void *args),
148 struct transpose_closure *k)
149 {
150 tail:
151 if (n > 1) {
152 INT n2 = n / 2;
153 k->I = I;
154 X(tile2d)(0, n2, n2, n, k->tilesz, f, k);
155 transpose_rec(I, n2, f, k);
156 I += n2 * (k->s0 + k->s1); n -= n2; goto tail;
157 }
158 }
159
160 void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl)
161 {
162 struct transpose_closure k;
163 k.s0 = s0;
164 k.s1 = s1;
165 k.vl = vl;
166 /* two blocks must be in cache, to be swapped */
167 k.tilesz = X(compute_tilesz)(vl, 2);
168 k.buf0 = k.buf1 = 0; /* unused */
169 transpose_rec(I, n, dotile, &k);
170 }
171
172 void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl)
173 {
174 struct transpose_closure k;
175 /* Assume that the the rows of I conflict into the same cache
176 lines, and therefore we don't need to reserve cache space for
177 the input. If the rows don't conflict, there is no reason
178 to use tiledbuf at all.*/
179 R buf0[CACHESIZE / (2 * sizeof(R))];
180 R buf1[CACHESIZE / (2 * sizeof(R))];
181 k.s0 = s0;
182 k.s1 = s1;
183 k.vl = vl;
184 k.tilesz = X(compute_tilesz)(vl, 2);
185 k.buf0 = buf0;
186 k.buf1 = buf1;
187 A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf0));
188 A(k.tilesz * k.tilesz * vl * sizeof(R) <= sizeof(buf1));
189 transpose_rec(I, n, dotile_buf, &k);
190 }
191