annotate src/fftw-3.3.5/kernel/cpy2d.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* out of place 2D copy routines */
Chris@42 22 #include "ifftw.h"
Chris@42 23
/*
 * WIDE_TYPE is the type cpy2d uses to move a pair of R values with one
 * load and one store.  On x86-64 builds that have <xmmintrin.h> it is
 * __m128.  Otherwise it falls back to double, and the WIDE_TYPE path in
 * cpy2d is then never taken because that path requires
 * sizeof(WIDE_TYPE) > sizeof(double).
 */
#if (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)) \
    && defined(HAVE_XMMINTRIN_H)
#  include <xmmintrin.h>
#  define WIDE_TYPE __m128
#endif

#if !defined(WIDE_TYPE)
#  define WIDE_TYPE double
#endif
Chris@42 35
Chris@42 36 void X(cpy2d)(R *I, R *O,
Chris@42 37 INT n0, INT is0, INT os0,
Chris@42 38 INT n1, INT is1, INT os1,
Chris@42 39 INT vl)
Chris@42 40 {
Chris@42 41 INT i0, i1, v;
Chris@42 42
Chris@42 43 switch (vl) {
Chris@42 44 case 1:
Chris@42 45 for (i1 = 0; i1 < n1; ++i1)
Chris@42 46 for (i0 = 0; i0 < n0; ++i0) {
Chris@42 47 R x0 = I[i0 * is0 + i1 * is1];
Chris@42 48 O[i0 * os0 + i1 * os1] = x0;
Chris@42 49 }
Chris@42 50 break;
Chris@42 51 case 2:
Chris@42 52 if (1
Chris@42 53 && (2 * sizeof(R) == sizeof(WIDE_TYPE))
Chris@42 54 && (sizeof(WIDE_TYPE) > sizeof(double))
Chris@42 55 && (((size_t)I) % sizeof(WIDE_TYPE) == 0)
Chris@42 56 && (((size_t)O) % sizeof(WIDE_TYPE) == 0)
Chris@42 57 && ((is0 & 1) == 0)
Chris@42 58 && ((is1 & 1) == 0)
Chris@42 59 && ((os0 & 1) == 0)
Chris@42 60 && ((os1 & 1) == 0)) {
Chris@42 61 /* copy R[2] as WIDE_TYPE if WIDE_TYPE is large
Chris@42 62 enough to hold R[2], and if the input is
Chris@42 63 properly aligned. This is a win when R==double
Chris@42 64 and WIDE_TYPE is 128 bits. */
Chris@42 65 for (i1 = 0; i1 < n1; ++i1)
Chris@42 66 for (i0 = 0; i0 < n0; ++i0) {
Chris@42 67 *(WIDE_TYPE *)&O[i0 * os0 + i1 * os1] =
Chris@42 68 *(WIDE_TYPE *)&I[i0 * is0 + i1 * is1];
Chris@42 69 }
Chris@42 70 } else if (1
Chris@42 71 && (2 * sizeof(R) == sizeof(double))
Chris@42 72 && (((size_t)I) % sizeof(double) == 0)
Chris@42 73 && (((size_t)O) % sizeof(double) == 0)
Chris@42 74 && ((is0 & 1) == 0)
Chris@42 75 && ((is1 & 1) == 0)
Chris@42 76 && ((os0 & 1) == 0)
Chris@42 77 && ((os1 & 1) == 0)) {
Chris@42 78 /* copy R[2] as double if double is large enough to
Chris@42 79 hold R[2], and if the input is properly aligned.
Chris@42 80 This case applies when R==float */
Chris@42 81 for (i1 = 0; i1 < n1; ++i1)
Chris@42 82 for (i0 = 0; i0 < n0; ++i0) {
Chris@42 83 *(double *)&O[i0 * os0 + i1 * os1] =
Chris@42 84 *(double *)&I[i0 * is0 + i1 * is1];
Chris@42 85 }
Chris@42 86 } else {
Chris@42 87 for (i1 = 0; i1 < n1; ++i1)
Chris@42 88 for (i0 = 0; i0 < n0; ++i0) {
Chris@42 89 R x0 = I[i0 * is0 + i1 * is1];
Chris@42 90 R x1 = I[i0 * is0 + i1 * is1 + 1];
Chris@42 91 O[i0 * os0 + i1 * os1] = x0;
Chris@42 92 O[i0 * os0 + i1 * os1 + 1] = x1;
Chris@42 93 }
Chris@42 94 }
Chris@42 95 break;
Chris@42 96 default:
Chris@42 97 for (i1 = 0; i1 < n1; ++i1)
Chris@42 98 for (i0 = 0; i0 < n0; ++i0)
Chris@42 99 for (v = 0; v < vl; ++v) {
Chris@42 100 R x0 = I[i0 * is0 + i1 * is1 + v];
Chris@42 101 O[i0 * os0 + i1 * os1 + v] = x0;
Chris@42 102 }
Chris@42 103 break;
Chris@42 104 }
Chris@42 105 }
Chris@42 106
Chris@42 107 /* like cpy2d, but read input contiguously if possible */
Chris@42 108 void X(cpy2d_ci)(R *I, R *O,
Chris@42 109 INT n0, INT is0, INT os0,
Chris@42 110 INT n1, INT is1, INT os1,
Chris@42 111 INT vl)
Chris@42 112 {
Chris@42 113 if (IABS(is0) < IABS(is1)) /* inner loop is for n0 */
Chris@42 114 X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
Chris@42 115 else
Chris@42 116 X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
Chris@42 117 }
Chris@42 118
Chris@42 119 /* like cpy2d, but write output contiguously if possible */
Chris@42 120 void X(cpy2d_co)(R *I, R *O,
Chris@42 121 INT n0, INT is0, INT os0,
Chris@42 122 INT n1, INT is1, INT os1,
Chris@42 123 INT vl)
Chris@42 124 {
Chris@42 125 if (IABS(os0) < IABS(os1)) /* inner loop is for n0 */
Chris@42 126 X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl);
Chris@42 127 else
Chris@42 128 X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl);
Chris@42 129 }
Chris@42 130
Chris@42 131
Chris@42 132 /* tiled copy routines */
Chris@42 133 struct cpy2d_closure {
Chris@42 134 R *I, *O;
Chris@42 135 INT is0, os0, is1, os1, vl;
Chris@42 136 R *buf;
Chris@42 137 };
Chris@42 138
Chris@42 139 static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
Chris@42 140 {
Chris@42 141 struct cpy2d_closure *k = (struct cpy2d_closure *)args;
Chris@42 142 X(cpy2d)(k->I + n0l * k->is0 + n1l * k->is1,
Chris@42 143 k->O + n0l * k->os0 + n1l * k->os1,
Chris@42 144 n0u - n0l, k->is0, k->os0,
Chris@42 145 n1u - n1l, k->is1, k->os1,
Chris@42 146 k->vl);
Chris@42 147 }
Chris@42 148
Chris@42 149 static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args)
Chris@42 150 {
Chris@42 151 struct cpy2d_closure *k = (struct cpy2d_closure *)args;
Chris@42 152
Chris@42 153 /* copy from I to buf */
Chris@42 154 X(cpy2d_ci)(k->I + n0l * k->is0 + n1l * k->is1,
Chris@42 155 k->buf,
Chris@42 156 n0u - n0l, k->is0, k->vl,
Chris@42 157 n1u - n1l, k->is1, k->vl * (n0u - n0l),
Chris@42 158 k->vl);
Chris@42 159
Chris@42 160 /* copy from buf to O */
Chris@42 161 X(cpy2d_co)(k->buf,
Chris@42 162 k->O + n0l * k->os0 + n1l * k->os1,
Chris@42 163 n0u - n0l, k->vl, k->os0,
Chris@42 164 n1u - n1l, k->vl * (n0u - n0l), k->os1,
Chris@42 165 k->vl);
Chris@42 166 }
Chris@42 167
Chris@42 168
Chris@42 169 void X(cpy2d_tiled)(R *I, R *O,
Chris@42 170 INT n0, INT is0, INT os0,
Chris@42 171 INT n1, INT is1, INT os1, INT vl)
Chris@42 172 {
Chris@42 173 INT tilesz = X(compute_tilesz)(vl,
Chris@42 174 1 /* input array */
Chris@42 175 + 1 /* ouput array */);
Chris@42 176 struct cpy2d_closure k;
Chris@42 177 k.I = I;
Chris@42 178 k.O = O;
Chris@42 179 k.is0 = is0;
Chris@42 180 k.os0 = os0;
Chris@42 181 k.is1 = is1;
Chris@42 182 k.os1 = os1;
Chris@42 183 k.vl = vl;
Chris@42 184 k.buf = 0; /* unused */
Chris@42 185 X(tile2d)(0, n0, 0, n1, tilesz, dotile, &k);
Chris@42 186 }
Chris@42 187
Chris@42 188 void X(cpy2d_tiledbuf)(R *I, R *O,
Chris@42 189 INT n0, INT is0, INT os0,
Chris@42 190 INT n1, INT is1, INT os1, INT vl)
Chris@42 191 {
Chris@42 192 R buf[CACHESIZE / (2 * sizeof(R))];
Chris@42 193 /* input and buffer in cache, or
Chris@42 194 output and buffer in cache */
Chris@42 195 INT tilesz = X(compute_tilesz)(vl, 2);
Chris@42 196 struct cpy2d_closure k;
Chris@42 197 k.I = I;
Chris@42 198 k.O = O;
Chris@42 199 k.is0 = is0;
Chris@42 200 k.os0 = os0;
Chris@42 201 k.is1 = is1;
Chris@42 202 k.os1 = os1;
Chris@42 203 k.vl = vl;
Chris@42 204 k.buf = buf;
Chris@42 205 A(tilesz * tilesz * vl * sizeof(R) <= sizeof(buf));
Chris@42 206 X(tile2d)(0, n0, 0, n1, tilesz, dotile_buf, &k);
Chris@42 207 }