comparison src/fftw-3.3.5/kernel/cpy2d.c @ 127:7867fa7e1b6b

Current fftw source
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 18 Oct 2016 13:40:26 +0100
parents
children
comparison
equal deleted inserted replaced
126:4a7071416412 127:7867fa7e1b6b
1 /*
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21 /* out of place 2D copy routines */
22 #include "ifftw.h"
23
/*
 * WIDE_TYPE is the type the vl == 2 loop of cpy2d may use to move a
 * pair of R values with a single load/store.  On x86-64 builds that
 * have <xmmintrin.h> it is the 128-bit __m128.
 */
#if (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)) \
    && defined(HAVE_XMMINTRIN_H)
#  include <xmmintrin.h>
#  define WIDE_TYPE __m128
#endif

#if !defined(WIDE_TYPE)
/* No wide type selected: use double.  The wide path in cpy2d requires
   sizeof(WIDE_TYPE) > sizeof(double), so it is never taken then. */
#  define WIDE_TYPE double
#endif
35
36 void X(cpy2d)(R *I, R *O,
37 INT n0, INT is0, INT os0,
38 INT n1, INT is1, INT os1,
39 INT vl)
40 {
41 INT i0, i1, v;
42
43 switch (vl) {
44 case 1:
45 for (i1 = 0; i1 < n1; ++i1)
46 for (i0 = 0; i0 < n0; ++i0) {
47 R x0 = I[i0 * is0 + i1 * is1];
48 O[i0 * os0 + i1 * os1] = x0;
49 }
50 break;
51 case 2:
52 if (1
53 && (2 * sizeof(R) == sizeof(WIDE_TYPE))
54 && (sizeof(WIDE_TYPE) > sizeof(double))
55 && (((size_t)I) % sizeof(WIDE_TYPE) == 0)
56 && (((size_t)O) % sizeof(WIDE_TYPE) == 0)
57 && ((is0 & 1) == 0)
58 && ((is1 & 1) == 0)
59 && ((os0 & 1) == 0)
60 && ((os1 & 1) == 0)) {
61 /* copy R[2] as WIDE_TYPE if WIDE_TYPE is large
62 enough to hold R[2], and if the input is
63 properly aligned. This is a win when R==double
64 and WIDE_TYPE is 128 bits. */
65 for (i1 = 0; i1 < n1; ++i1)
66 for (i0 = 0; i0 < n0; ++i0) {
67 *(WIDE_TYPE *)&O[i0 * os0 + i1 * os1] =
68 *(WIDE_TYPE *)&I[i0 * is0 + i1 * is1];
69 }
70 } else if (1
71 && (2 * sizeof(R) == sizeof(double))
72 && (((size_t)I) % sizeof(double) == 0)
73 && (((size_t)O) % sizeof(double) == 0)
74 && ((is0 & 1) == 0)
75 && ((is1 & 1) == 0)
76 && ((os0 & 1) == 0)
77 && ((os1 & 1) == 0)) {
78 /* copy R[2] as double if double is large enough to
79 hold R[2], and if the input is properly aligned.
80 This case applies when R==float */
81 for (i1 = 0; i1 < n1; ++i1)
82 for (i0 = 0; i0 < n0; ++i0) {
83 *(double *)&O[i0 * os0 + i1 * os1] =
84 *(double *)&I[i0 * is0 + i1 * is1];
85 }
86 } else {
87 for (i1 = 0; i1 < n1; ++i1)
88 for (i0 = 0; i0 < n0; ++i0) {
89 R x0 = I[i0 * is0 + i1 * is1];
90 R x1 = I[i0 * is0 + i1 * is1 + 1];
91 O[i0 * os0 + i1 * os1] = x0;
92 O[i0 * os0 + i1 * os1 + 1] = x1;
93 }
94 }
95 break;
96 default:
97 for (i1 = 0; i1 < n1; ++i1)
98 for (i0 = 0; i0 < n0; ++i0)
99 for (v = 0; v < vl; ++v) {
100 R x0 = I[i0 * is0 + i1 * is1 + v];
101 O[i0 * os0 + i1 * os1 + v] = x0;
102 }
103 break;
104 }
105 }
106
107 /* like cpy2d, but read input contiguously if possible */
108 void X(cpy2d_ci)(R *I, R *O,
109 INT n0, INT is0, INT os0,
110 INT n1, INT is1, INT os1,
111 INT vl)
112 {
113 if (IABS(is0) < IABS(is1)) /* inner loop is for n0 */
114 X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
115 else
116 X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
117 }
118
119 /* like cpy2d, but write output contiguously if possible */
120 void X(cpy2d_co)(R *I, R *O,
121 INT n0, INT is0, INT os0,
122 INT n1, INT is1, INT os1,
123 INT vl)
124 {
125 if (IABS(os0) < IABS(os1)) /* inner loop is for n0 */
126 X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
127 else
128 X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
129 }
130
131
132 /* tiled copy routines */
133 struct cpy2d_closure {
134 R *I, *O;
135 INT is0, os0, is1, os1, vl;
136 R *buf;
137 };
138
139 static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
140 {
141 struct cpy2d_closure *k = (struct cpy2d_closure *)args;
142 X(cpy2d)(k->I + n0l * k->is0 + n1l * k->is1,
143 k->O + n0l * k->os0 + n1l * k->os1,
144 n0u - n0l, k->is0, k->os0,
145 n1u - n1l, k->is1, k->os1,
146 k->vl);
147 }
148
149 static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
150 {
151 struct cpy2d_closure *k = (struct cpy2d_closure *)args;
152
153 /* copy from I to buf */
154 X(cpy2d_ci)(k->I + n0l * k->is0 + n1l * k->is1,
155 k->buf,
156 n0u - n0l, k->is0, k->vl,
157 n1u - n1l, k->is1, k->vl * (n0u - n0l),
158 k->vl);
159
160 /* copy from buf to O */
161 X(cpy2d_co)(k->buf,
162 k->O + n0l * k->os0 + n1l * k->os1,
163 n0u - n0l, k->vl, k->os0,
164 n1u - n1l, k->vl * (n0u - n0l), k->os1,
165 k->vl);
166 }
167
168
169 void X(cpy2d_tiled)(R *I, R *O,
170 INT n0, INT is0, INT os0,
171 INT n1, INT is1, INT os1, INT vl)
172 {
173 INT tilesz = X(compute_tilesz)(vl,
174 1 /* input array */
175 + 1 /* ouput array */);
176 struct cpy2d_closure k;
177 k.I = I;
178 k.O = O;
179 k.is0 = is0;
180 k.os0 = os0;
181 k.is1 = is1;
182 k.os1 = os1;
183 k.vl = vl;
184 k.buf = 0; /* unused */
185 X(tile2d)(0, n0, 0, n1, tilesz, dotile, &k);
186 }
187
188 void X(cpy2d_tiledbuf)(R *I, R *O,
189 INT n0, INT is0, INT os0,
190 INT n1, INT is1, INT os1, INT vl)
191 {
192 R buf[CACHESIZE / (2 * sizeof(R))];
193 /* input and buffer in cache, or
194 output and buffer in cache */
195 INT tilesz = X(compute_tilesz)(vl, 2);
196 struct cpy2d_closure k;
197 k.I = I;
198 k.O = O;
199 k.is0 = is0;
200 k.os0 = os0;
201 k.is1 = is1;
202 k.os1 = os1;
203 k.vl = vl;
204 k.buf = buf;
205 A(tilesz * tilesz * vl * sizeof(R) <= sizeof(buf));
206 X(tile2d)(0, n0, 0, n1, tilesz, dotile_buf, &k);
207 }